contrib/texinfo/info/search.c

   1 /* search.c -- searching large bodies of text.
   2    $Id: search.c,v 1.9 2008/06/11 09:55:42 gray Exp $
   3
   4    Copyright (C) 1993, 1997, 1998, 2002, 2004, 2007, 2008
   5    Free Software Foundation, Inc.
   6
   7    This program is free software: you can redistribute it and/or modify
   8    it under the terms of the GNU General Public License as published by
   9    the Free Software Foundation, either version 3 of the License, or
  10    (at your option) any later version.
  11
  12    This program is distributed in the hope that it will be useful,
  13    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15    GNU General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License
  18    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19
  20    Originally written by Brian Fox (bfox@ai.mit.edu). */
  21
  22 #include "info.h"
  23 #include <regex.h>
  24
  25 #include "search.h"
  26 #include "nodes.h"
  27
  28 /* The search functions take two arguments:
  29
  30      1) a string to search for, and
  31
  32      2) a pointer to a SEARCH_BINDING which contains the buffer, start,
  33         and end of the search.
  34
  35    They return a long, which is the offset from the start of the buffer
  36    at which the match was found.  An offset of -1 indicates failure. */
  37
  38 /* A function which makes a binding with buffer and bounds. */
  39 SEARCH_BINDING *
  40 make_binding (char *buffer, long int start, long int end)
  41 {
  42   SEARCH_BINDING *binding;
  43
  44   binding = xmalloc (sizeof (SEARCH_BINDING));
  45   binding->buffer = buffer;
  46   binding->start = start;
  47   binding->end = end;
  48   binding->flags = 0;
  49
  50   return binding;
  51 }
  52
  53 /* Make a copy of BINDING without duplicating the data. */
  54 SEARCH_BINDING *
  55 copy_binding (SEARCH_BINDING *binding)
  56 {
  57   SEARCH_BINDING *copy;
  58
  59   copy = make_binding (binding->buffer, binding->start, binding->end);
  60   copy->flags = binding->flags;
  61   return copy;
  62 }
  63
  64 \f
  65 /* **************************************************************** */
  66 /*                                                                  */
  67 /*                 The Actual Searching Functions                   */
  68 /*                                                                  */
  69 /* **************************************************************** */
  70
  71 /* Search forwards or backwards for the text delimited by BINDING.
  72    The search is forwards if BINDING->start is greater than BINDING->end. */
  73 long
  74 search (char *string, SEARCH_BINDING *binding)
  75 {
  76   long result;
  77
  78   /* If the search is backwards, then search backwards, otherwise forwards. */
  79   if (binding->start > binding->end)
  80     result = search_backward (string, binding);
  81   else
  82     result = search_forward (string, binding);
  83
  84   return result;
  85 }
  86
  87 /* Search forwards or backwards for anything matching the regexp in the text
  88    delimited by BINDING. The search is forwards if BINDING->start is greater
  89    than BINDING->end.
  90
  91    If PRET is specified, it receives a copy of BINDING at the end of a
  92    succeded search.  Its START and END fields contain bounds of the found
  93    string instance.
  94 */
  95 long
  96 regexp_search (char *regexp, SEARCH_BINDING *binding, long length,
  97                SEARCH_BINDING *pret)
  98 {
  99   static char *previous_regexp = NULL;
 100   static char *previous_content = NULL;
 101   static int was_insensitive = 0;
 102   static regex_t preg;
 103   static regmatch_t *matches;
 104   static int match_alloc = 0;
 105   static int match_count = 0;
 106   regoff_t pos;
 107
 108   if (previous_regexp == NULL
 109       || ((binding->flags & S_FoldCase) != was_insensitive)
 110       || (strcmp (previous_regexp, regexp) != 0))
 111     {
 112       /* need to compile a new regexp */
 113       int result;
 114       char *unescaped_regexp;
 115       char *p, *q;
 116
 117       previous_content = NULL;
 118
 119       if (previous_regexp != NULL)
 120         {
 121           free (previous_regexp);
 122           previous_regexp = NULL;
 123           regfree (&preg);
 124         }
 125
 126       was_insensitive = binding->flags & S_FoldCase;
 127
 128       /* expand the \n and \t in regexp */
 129       unescaped_regexp = xmalloc (1 + strlen (regexp));
 130       for (p = regexp, q = unescaped_regexp; *p != '\0'; p++, q++)
 131         {
 132           if (*p == '\\')
 133             switch(*++p)
 134               {
 135               case 'n':
 136                 *q = '\n';
 137                 break;
 138               case 't':
 139                 *q = '\t';
 140                 break;
 141               case '\0':
 142                 *q = '\\';
 143                 p--;
 144                 break;
 145               default:
 146                 *q++ = '\\';
 147                 *q = *p;
 148                 break;
 149               }
 150           else
 151             *q = *p;
 152         }
 153       *q = '\0';
 154
 155       result = regcomp (&preg, unescaped_regexp,
 156                        REG_EXTENDED|
 157                        REG_NEWLINE|
 158                        (was_insensitive ? REG_ICASE : 0));
 159       free (unescaped_regexp);
 160
 161       if (result != 0)
 162         {
 163           int size = regerror (result, &preg, NULL, 0);
 164           char *buf = xmalloc (size);
 165           regerror (result, &preg, buf, size);
 166           info_error (_("regexp error: %s"), buf, NULL);
 167           return -1;
 168         }
 169
 170       previous_regexp = xstrdup(regexp);
 171     }
 172
 173   if (previous_content != binding->buffer)
 174     {
 175       /* new buffer to search in, let's scan it */
 176       regoff_t start = 0;
 177       char saved_char;
 178
 179       previous_content = binding->buffer;
 180       saved_char = previous_content[length-1];
 181       previous_content[length-1] = '\0';
 182
 183       for (match_count = 0; start < length; )
 184         {
 185           int result = 0;
 186           if (match_count >= match_alloc)
 187             {
 188               /* match list full. Initially allocate 256 entries, then double
 189                  every time we fill it */
 190               match_alloc = (match_alloc > 0 ? match_alloc * 2 : 256);
 191               matches = xrealloc (matches,
 192                                   match_alloc * sizeof(regmatch_t));
 193             }
 194
 195           result = regexec (&preg, &previous_content[start],
 196                             1, &matches[match_count], 0);
 197           if (result == 0)
 198             {
 199               if (matches[match_count].rm_eo == 0)
 200                 {
 201                   /* ignore empty matches */
 202                   start++;
 203                 }
 204               else
 205                 {
 206                   matches[match_count].rm_so += start;
 207                   matches[match_count].rm_eo += start;
 208                   start = matches[match_count++].rm_eo;
 209                 }
 210             }
 211           else
 212             {
 213               break;
 214             }
 215         }
 216       previous_content[length-1] = saved_char;
 217     }
 218
 219   pos = binding->start;
 220   if (pos > binding->end)
 221     {
 222       /* searching backward */
 223       int i;
 224       for (i = match_count - 1; i >= 0; i--)
 225         {
 226           if (matches[i].rm_so <= pos)
 227             {
 228               if (pret)
 229                 {
 230                   pret->buffer = binding->buffer;
 231                   pret->flags = binding->flags;
 232                   pret->start = matches[i].rm_so;
 233                   pret->end = matches[i].rm_eo;
 234                 }
 235               return matches[i].rm_so;
 236             }
 237         }
 238     }
 239   else
 240     {
 241       /* searching forward */
 242       int i;
 243       for (i = 0; i < match_count; i++)
 244         {
 245           if (matches[i].rm_so >= pos)
 246             {
 247               if (pret)
 248                 {
 249                   pret->buffer = binding->buffer;
 250                   pret->flags = binding->flags;
 251                   pret->start = matches[i].rm_so;
 252                   pret->end = matches[i].rm_eo;
 253                 }
 254               if (binding->flags & S_SkipDest)
 255                 return matches[i].rm_eo;
 256               else
 257                 return matches[i].rm_so;
 258             }
 259         }
 260     }
 261
 262   /* not found */
 263   return -1;
 264 }
 265
 266 /* Search forwards for STRING through the text delimited in BINDING. */
 267 long
 268 search_forward (char *string, SEARCH_BINDING *binding)
 269 {
 270   register int c, i, len;
 271   register char *buff, *end;
 272   char *alternate = NULL;
 273
 274   len = strlen (string);
 275
 276   /* We match characters in the search buffer against STRING and ALTERNATE.
 277      ALTERNATE is a case reversed version of STRING; this is cheaper than
 278      case folding each character before comparison.   Alternate is only
 279      used if the case folding bit is turned on in the passed BINDING. */
 280
 281   if (binding->flags & S_FoldCase)
 282     {
 283       alternate = xstrdup (string);
 284
 285       for (i = 0; i < len; i++)
 286         {
 287           if (islower (alternate[i]))
 288             alternate[i] = toupper (alternate[i]);
 289           else if (isupper (alternate[i]))
 290             alternate[i] = tolower (alternate[i]);
 291         }
 292     }
 293
 294   buff = binding->buffer + binding->start;
 295   end = binding->buffer + binding->end + 1;
 296
 297   while (buff < (end - len))
 298     {
 299       for (i = 0; i < len; i++)
 300         {
 301           c = buff[i];
 302
 303           if ((c != string[i]) && (!alternate || c != alternate[i]))
 304             break;
 305         }
 306
 307       if (!string[i])
 308         {
 309           if (alternate)
 310             free (alternate);
 311           if (binding->flags & S_SkipDest)
 312             buff += len;
 313           return buff - binding->buffer;
 314         }
 315
 316       buff++;
 317     }
 318
 319   if (alternate)
 320     free (alternate);
 321
 322   return -1;
 323 }
 324
 325 /* Search for STRING backwards through the text delimited in BINDING. */
 326 long
 327 search_backward (char *input_string, SEARCH_BINDING *binding)
 328 {
 329   register int c, i, len;
 330   register char *buff, *end;
 331   char *string;
 332   char *alternate = NULL;
 333
 334   len = strlen (input_string);
 335
 336   /* Reverse the characters in the search string. */
 337   string = xmalloc (1 + len);
 338   for (c = 0, i = len - 1; input_string[c]; c++, i--)
 339     string[i] = input_string[c];
 340
 341   string[c] = '\0';
 342
 343   /* We match characters in the search buffer against STRING and ALTERNATE.
 344      ALTERNATE is a case reversed version of STRING; this is cheaper than
 345      case folding each character before comparison.   ALTERNATE is only
 346      used if the case folding bit is turned on in the passed BINDING. */
 347
 348   if (binding->flags & S_FoldCase)
 349     {
 350       alternate = xstrdup (string);
 351
 352       for (i = 0; i < len; i++)
 353         {
 354           if (islower (alternate[i]))
 355             alternate[i] = toupper (alternate[i]);
 356           else if (isupper (alternate[i]))
 357             alternate[i] = tolower (alternate[i]);
 358         }
 359     }
 360
 361   buff = binding->buffer + binding->start - 1;
 362   end = binding->buffer + binding->end;
 363
 364   while (buff > (end + len))
 365     {
 366       for (i = 0; i < len; i++)
 367         {
 368           c = *(buff - i);
 369
 370           if (c != string[i] && (!alternate || c != alternate[i]))
 371             break;
 372         }
 373
 374       if (!string[i])
 375         {
 376           free (string);
 377           if (alternate)
 378             free (alternate);
 379
 380           if (binding->flags & S_SkipDest)
 381             buff -= len;
 382           return 1 + buff - binding->buffer;
 383         }
 384
 385       buff--;
 386     }
 387
 388   free (string);
 389   if (alternate)
 390     free (alternate);
 391
 392   return -1;
 393 }
 394
 395 /* Find STRING in LINE, returning the offset of the end of the string.
 396    Return an offset of -1 if STRING does not appear in LINE.  The search
 397    is bound by the end of the line (i.e., either NEWLINE or 0). */
 398 int
 399 string_in_line (char *string, char *line)
 400 {
 401   register int end;
 402   SEARCH_BINDING binding;
 403
 404   /* Find the end of the line. */
 405   for (end = 0; line[end] && line[end] != '\n'; end++);
 406
 407   /* Search for STRING within these confines. */
 408   binding.buffer = line;
 409   binding.start = 0;
 410   binding.end = end;
 411   binding.flags = S_FoldCase | S_SkipDest;
 412
 413   return search_forward (string, &binding);
 414 }
 415
 416 /* Return non-zero if STRING is the first text to appear at BINDING. */
 417 int
 418 looking_at (char *string, SEARCH_BINDING *binding)
 419 {
 420   long search_end;
 421
 422   search_end = search (string, binding);
 423
 424   /* If the string was not found, SEARCH_END is -1.  If the string was found,
 425      but not right away, SEARCH_END is != binding->start.  Otherwise, the
 426      string was found at binding->start. */
 427   return search_end == binding->start;
 428 }
 429 \f
 430 /* **************************************************************** */
 431 /*                                                                  */
 432 /*                    Small String Searches                         */
 433 /*                                                                  */
 434 /* **************************************************************** */
 435
 436 /* Function names that start with "skip" are passed a string, and return
 437    an offset from the start of that string.  Function names that start
 438    with "find" are passed a SEARCH_BINDING, and return an absolute position
 439    marker of the item being searched for.  "Find" functions return a value
 440    of -1 if the item being looked for couldn't be found. */
 441
 442 /* Return the index of the first non-whitespace character in STRING. */
 443 int
 444 skip_whitespace (char *string)
 445 {
 446   register int i;
 447
 448   for (i = 0; string && whitespace (string[i]); i++);
 449   return i;
 450 }
 451
 452 /* Return the index of the first non-whitespace or newline character in
 453    STRING. */
 454 int
 455 skip_whitespace_and_newlines (char *string)
 456 {
 457   register int i;
 458
 459   for (i = 0; string && whitespace_or_newline (string[i]); i++);
 460   return i;
 461 }
 462
 463 /* Return the index of the first whitespace character in STRING. */
 464 int
 465 skip_non_whitespace (char *string)
 466 {
 467   register int i;
 468
 469   for (i = 0; string && string[i] && !whitespace (string[i]); i++);
 470   return i;
 471 }
 472
 473 /* Return the index of the first non-node character in STRING.  Note that
 474    this function contains quite a bit of hair to ignore periods in some
 475    special cases.  This is because we here at GNU ship some info files which
 476    contain nodenames that contain periods.  No such nodename can start with
 477    a period, or continue with whitespace, newline, or ')' immediately following
 478    the period.  If second argument NEWLINES_OKAY is non-zero, newlines should
 479    be skipped while parsing out the nodename specification. */
 480 int
 481 skip_node_characters (char *string, int newlines_okay)
 482 {
 483   register int c, i = 0;
 484   int paren_seen = 0;
 485   int paren = 0;
 486
 487   /* Handle special case.  This is when another function has parsed out the
 488      filename component of the node name, and we just want to parse out the
 489      nodename proper.  In that case, a period at the start of the nodename
 490      indicates an empty nodename. */
 491   if (string && *string == '.')
 492     return 0;
 493
 494   if (string && *string == '(')
 495     {
 496       paren++;
 497       paren_seen++;
 498       i++;
 499     }
 500
 501   for (; string && (c = string[i]); i++)
 502     {
 503       if (paren)
 504         {
 505           if (c == '(')
 506             paren++;
 507           else if (c == ')')
 508             paren--;
 509
 510           continue;
 511         }
 512
 513       /* If the character following the close paren is a space or period,
 514          then this node name has no more characters associated with it. */
 515       if (c == '\t' ||
 516           c == ','  ||
 517           c == INFO_TAGSEP ||
 518           ((!newlines_okay) && (c == '\n')) ||
 519           ((paren_seen && string[i - 1] == ')') &&
 520            (c == ' ' || c == '.')) ||
 521           (c == '.' &&
 522            (
 523 #if 0
 524 /* This test causes a node name ending in a period, like `This.', not to
 525    be found.  The trailing . is stripped.  This occurs in the jargon
 526    file (`I see no X here.' is a node name).  */
 527            (!string[i + 1]) ||
 528 #endif
 529             (whitespace_or_newline (string[i + 1])) ||
 530             (string[i + 1] == ')'))))
 531         break;
 532     }
 533   return i;
 534 }
 535
 536 \f
 537 /* **************************************************************** */
 538 /*                                                                  */
 539 /*                   Searching FILE_BUFFER's                        */
 540 /*                                                                  */
 541 /* **************************************************************** */
 542
 543 /* Return the absolute position of the first occurence of a node separator in
 544    BINDING-buffer.  The search starts at BINDING->start.  Return -1 if no node
 545    separator was found. */
 546 long
 547 find_node_separator (SEARCH_BINDING *binding)
 548 {
 549   register long i;
 550   char *body;
 551
 552   body = binding->buffer;
 553
 554   /* A node is started by [^L]^_[^L]\n.  That is to say, the C-l's are
 555      optional, but the DELETE and NEWLINE are not.  This separator holds
 556      true for all separated elements in an Info file, including the tags
 557      table (if present) and the indirect tags table (if present). */
 558   for (i = binding->start; i < binding->end - 1; i++)
 559     if (((body[i] == INFO_FF && body[i + 1] == INFO_COOKIE) &&
 560          (body[i + 2] == '\n' ||
 561           (body[i + 2] == INFO_FF && body[i + 3] == '\n'))) ||
 562         ((body[i] == INFO_COOKIE) &&
 563          (body[i + 1] == '\n' ||
 564           (body[i + 1] == INFO_FF && body[i + 2] == '\n'))))
 565       return i;
 566   return -1;
 567 }
 568
 569 /* Return the length of the node separator characters that BODY is
 570    currently pointing at. */
 571 int
 572 skip_node_separator (char *body)
 573 {
 574   register int i;
 575
 576   i = 0;
 577
 578   if (body[i] == INFO_FF)
 579     i++;
 580
 581   if (body[i++] != INFO_COOKIE)
 582     return 0;
 583
 584   if (body[i] == INFO_FF)
 585     i++;
 586
 587   if (body[i++] != '\n')
 588     return 0;
 589
 590   return i;
 591 }
 592
 593 /* Return the number of characters from STRING to the start of
 594    the next line. */
 595 int
 596 skip_line (char *string)
 597 {
 598   register int i;
 599
 600   for (i = 0; string && string[i] && string[i] != '\n'; i++);
 601
 602   if (string[i] == '\n')
 603     i++;
 604
 605   return i;
 606 }
 607
 608 /* Return the absolute position of the beginning of a tags table in this
 609    binding starting the search at binding->start. */
 610 long
 611 find_tags_table (SEARCH_BINDING *binding)
 612 {
 613   SEARCH_BINDING tmp_search;
 614   long position;
 615
 616   tmp_search.buffer = binding->buffer;
 617   tmp_search.start = binding->start;
 618   tmp_search.end = binding->end;
 619   tmp_search.flags = S_FoldCase;
 620
 621   while ((position = find_node_separator (&tmp_search)) != -1 )
 622     {
 623       tmp_search.start = position;
 624       tmp_search.start += skip_node_separator (tmp_search.buffer
 625           + tmp_search.start);
 626
 627       if (looking_at (TAGS_TABLE_BEG_LABEL, &tmp_search))
 628         return position;
 629     }
 630   return -1;
 631 }
 632
 633 /* Return the absolute position of the node named NODENAME in BINDING.
 634    This is a brute force search, and we wish to avoid it when possible.
 635    This function is called when a tag (indirect or otherwise) doesn't
 636    really point to the right node.  It returns the absolute position of
 637    the separator preceding the node. */
 638 long
 639 find_node_in_binding (char *nodename, SEARCH_BINDING *binding)
 640 {
 641   long position;
 642   int offset, namelen;
 643   SEARCH_BINDING tmp_search;
 644
 645   namelen = strlen (nodename);
 646
 647   tmp_search.buffer = binding->buffer;
 648   tmp_search.start = binding->start;
 649   tmp_search.end = binding->end;
 650   tmp_search.flags = 0;
 651
 652   while ((position = find_node_separator (&tmp_search)) != -1)
 653     {
 654       tmp_search.start = position;
 655       tmp_search.start += skip_node_separator
 656         (tmp_search.buffer + tmp_search.start);
 657
 658       offset = string_in_line
 659         (INFO_NODE_LABEL, tmp_search.buffer + tmp_search.start);
 660
 661       if (offset == -1)
 662         continue;
 663
 664       tmp_search.start += offset;
 665       tmp_search.start += skip_whitespace (tmp_search.buffer + tmp_search.start);
 666       offset = skip_node_characters
 667         (tmp_search.buffer + tmp_search.start, DONT_SKIP_NEWLINES);
 668
 669       /* Notice that this is an exact match.  You cannot grovel through
 670          the buffer with this function looking for random nodes. */
 671        if ((offset == namelen) &&
 672            (tmp_search.buffer[tmp_search.start] == nodename[0]) &&
 673            (strncmp (tmp_search.buffer + tmp_search.start, nodename, offset) == 0))
 674          return position;
 675     }
 676   return -1;
 677 }