contrib/diffutils-2.8/src/io.c

   1 /* File I/O for GNU DIFF.
   2
   3    Copyright (C) 1988, 1989, 1992, 1993, 1994, 1995, 1998, 2001, 2002,
   4    2004 Free Software Foundation, Inc.
   5
   6    This file is part of GNU DIFF.
   7
   8    GNU DIFF is free software; you can redistribute it and/or modify
   9    it under the terms of the GNU General Public License as published by
  10    the Free Software Foundation; either version 2, or (at your option)
  11    any later version.
  12
  13    GNU DIFF is distributed in the hope that it will be useful,
  14    but WITHOUT ANY WARRANTY; without even the implied warranty of
  15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16    GNU General Public License for more details.
  17
  18    You should have received a copy of the GNU General Public License
  19    along with this program; see the file COPYING.
  20    If not, write to the Free Software Foundation,
  21    59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
  22
  23 #include "diff.h"
  24 #include <cmpbuf.h>
  25 #include <file-type.h>
  26 #include <setmode.h>
  27 #include <xalloc.h>
  28
  29 /* Rotate an unsigned value to the left.  */
  30 #define ROL(v, n) ((v) << (n) | (v) >> (sizeof (v) * CHAR_BIT - (n)))
  31
  32 /* Given a hash value and a new character, return a new hash value.  */
  33 #define HASH(h, c) ((c) + ROL (h, 7))
  34 \f
  35 /* The type of a hash value.  */
  36 typedef size_t hash_value;
  37 verify (hash_value_is_unsigned, ! TYPE_SIGNED (hash_value));
  38
  39 /* Lines are put into equivalence classes of lines that match in lines_differ.
  40    Each equivalence class is represented by one of these structures,
  41    but only while the classes are being computed.
  42    Afterward, each class is represented by a number.  */
  43 struct equivclass
  44 {
  45   lin next;             /* Next item in this bucket.  */
  46   hash_value hash;      /* Hash of lines in this class.  */
  47   char const *line;     /* A line that fits this class.  */
  48   size_t length;        /* That line's length, not counting its newline.  */
  49 };
  50
  51 /* Hash-table: array of buckets, each being a chain of equivalence classes.
  52    buckets[-1] is reserved for incomplete lines.  */
  53 static lin *buckets;
  54
  55 /* Number of buckets in the hash table array, not counting buckets[-1].  */
  56 static size_t nbuckets;
  57
  58 /* Array in which the equivalence classes are allocated.
  59    The bucket-chains go through the elements in this array.
  60    The number of an equivalence class is its index in this array.  */
  61 static struct equivclass *equivs;
  62
  63 /* Index of first free element in the array `equivs'.  */
  64 static lin equivs_index;
  65
  66 /* Number of elements allocated in the array `equivs'.  */
  67 static lin equivs_alloc;
  68 \f
  69 /* Read a block of data into a file buffer, checking for EOF and error.  */
  70
  71 void
  72 file_block_read (struct file_data *current, size_t size)
  73 {
  74   if (size && ! current->eof)
  75     {
  76       size_t s = block_read (current->desc,
  77                              FILE_BUFFER (current) + current->buffered, size);
  78       if (s == SIZE_MAX)
  79         pfatal_with_name (current->name);
  80       current->buffered += s;
  81       current->eof = s < size;
  82     }
  83 }
  84 \f
  85 /* Check for binary files and compare them for exact identity.  */
  86
  87 /* Return 1 if BUF contains a non text character.
  88    SIZE is the number of characters in BUF.  */
  89
  90 #define binary_file_p(buf, size) (memchr (buf, 0, size) != 0)
  91
  92 /* Get ready to read the current file.
  93    Return nonzero if SKIP_TEST is zero,
  94    and if it appears to be a binary file.  */
  95
  96 static bool
  97 sip (struct file_data *current, bool skip_test)
  98 {
  99   /* If we have a nonexistent file at this stage, treat it as empty.  */
 100   if (current->desc < 0)
 101     {
 102       /* Leave room for a sentinel.  */
 103       current->bufsize = sizeof (word);
 104       current->buffer = xmalloc (current->bufsize);
 105     }
 106   else
 107     {
 108       current->bufsize = buffer_lcm (sizeof (word),
 109                                      STAT_BLOCKSIZE (current->stat),
 110                                      PTRDIFF_MAX - 2 * sizeof (word));
 111       current->buffer = xmalloc (current->bufsize);
 112
 113       if (! skip_test)
 114         {
 115           /* Check first part of file to see if it's a binary file.  */
 116
 117           bool was_binary = set_binary_mode (current->desc, true);
 118           off_t buffered;
 119           file_block_read (current, current->bufsize);
 120           buffered = current->buffered;
 121
 122           if (! was_binary)
 123             {
 124               /* Revert to text mode and seek back to the beginning to
 125                  reread the file.  Use relative seek, since file
 126                  descriptors like stdin might not start at offset
 127                  zero.  */
 128
 129               if (lseek (current->desc, - buffered, SEEK_CUR) == -1)
 130                 pfatal_with_name (current->name);
 131               set_binary_mode (current->desc, false);
 132               current->buffered = 0;
 133               current->eof = false;
 134             }
 135
 136           return binary_file_p (current->buffer, buffered);
 137         }
 138     }
 139
 140   current->buffered = 0;
 141   current->eof = false;
 142   return false;
 143 }
 144
 145 /* Slurp the rest of the current file completely into memory.  */
 146
 147 static void
 148 slurp (struct file_data *current)
 149 {
 150   size_t cc;
 151
 152   if (current->desc < 0)
 153     {
 154       /* The file is nonexistent.  */
 155       return;
 156     }
 157
 158   if (S_ISREG (current->stat.st_mode))
 159     {
 160       /* It's a regular file; slurp in the rest all at once.  */
 161
 162       /* Get the size out of the stat block.
 163          Allocate just enough room for appended newline plus word sentinel,
 164          plus word-alignment since we want the buffer word-aligned.  */
 165       size_t file_size = current->stat.st_size;
 166       cc = file_size + 2 * sizeof (word) - file_size % sizeof (word);
 167       if (file_size != current->stat.st_size || cc < file_size
 168           || PTRDIFF_MAX <= cc)
 169         xalloc_die ();
 170
 171       if (current->bufsize < cc)
 172         {
 173           current->bufsize = cc;
 174           current->buffer = xrealloc (current->buffer, cc);
 175         }
 176
 177       /* Try to read at least 1 more byte than the size indicates, to
 178          detect whether the file is growing.  This is a nicety for
 179          users who run 'diff' on files while they are changing.  */
 180
 181       if (current->buffered <= file_size)
 182         {
 183           file_block_read (current, file_size + 1 - current->buffered);
 184           if (current->buffered <= file_size)
 185             return;
 186         }
 187     }
 188
 189   /* It's not a regular file, or it's a growing regular file; read it,
 190      growing the buffer as needed.  */
 191
 192   file_block_read (current, current->bufsize - current->buffered);
 193
 194   if (current->buffered)
 195     {
 196       while (current->buffered == current->bufsize)
 197         {
 198           if (PTRDIFF_MAX / 2 - sizeof (word) < current->bufsize)
 199             xalloc_die ();
 200           current->bufsize *= 2;
 201           current->buffer = xrealloc (current->buffer, current->bufsize);
 202           file_block_read (current, current->bufsize - current->buffered);
 203         }
 204
 205       /* Allocate just enough room for appended newline plus word
 206          sentinel, plus word-alignment.  */
 207       cc = current->buffered + 2 * sizeof (word);
 208       current->bufsize = cc - cc % sizeof (word);
 209       current->buffer = xrealloc (current->buffer, current->bufsize);
 210     }
 211 }
 212 \f
 213 /* Split the file into lines, simultaneously computing the equivalence
 214    class for each line.  */
 215
 216 static void
 217 find_and_hash_each_line (struct file_data *current)
 218 {
 219   hash_value h;
 220   char const *p = current->prefix_end;
 221   unsigned char c;
 222   lin i, *bucket;
 223   size_t length;
 224
 225   /* Cache often-used quantities in local variables to help the compiler.  */
 226   char const **linbuf = current->linbuf;
 227   lin alloc_lines = current->alloc_lines;
 228   lin line = 0;
 229   lin linbuf_base = current->linbuf_base;
 230   lin *cureqs = xmalloc (alloc_lines * sizeof *cureqs);
 231   struct equivclass *eqs = equivs;
 232   lin eqs_index = equivs_index;
 233   lin eqs_alloc = equivs_alloc;
 234   char const *suffix_begin = current->suffix_begin;
 235   char const *bufend = FILE_BUFFER (current) + current->buffered;
 236   bool diff_length_compare_anyway =
 237     ignore_white_space != IGNORE_NO_WHITE_SPACE;
 238   bool same_length_diff_contents_compare_anyway =
 239     diff_length_compare_anyway | ignore_case;
 240
 241   while (p < suffix_begin)
 242     {
 243       char const *ip = p;
 244
 245       h = 0;
 246
 247       /* Hash this line until we find a newline.  */
 248       if (ignore_case)
 249         switch (ignore_white_space)
 250           {
 251           case IGNORE_ALL_SPACE:
 252             while ((c = *p++) != '\n')
 253               if (! isspace (c))
 254                 h = HASH (h, tolower (c));
 255             break;
 256
 257           case IGNORE_SPACE_CHANGE:
 258             while ((c = *p++) != '\n')
 259               {
 260                 if (isspace (c))
 261                   {
 262                     do
 263                       if ((c = *p++) == '\n')
 264                         goto hashing_done;
 265                     while (isspace (c));
 266
 267                     h = HASH (h, ' ');
 268                   }
 269
 270                 /* C is now the first non-space.  */
 271                 h = HASH (h, tolower (c));
 272               }
 273             break;
 274
 275           case IGNORE_TAB_EXPANSION:
 276             {
 277               size_t column = 0;
 278               while ((c = *p++) != '\n')
 279                 {
 280                   size_t repetitions = 1;
 281
 282                   switch (c)
 283                     {
 284                     case '\b':
 285                       column -= 0 < column;
 286                       break;
 287
 288                     case '\t':
 289                       c = ' ';
 290                       repetitions = tabsize - column % tabsize;
 291                       column = (column + repetitions < column
 292                                 ? 0
 293                                 : column + repetitions);
 294                       break;
 295
 296                     case '\r':
 297                       column = 0;
 298                       break;
 299
 300                     default:
 301                       c = tolower (c);
 302                       column++;
 303                       break;
 304                     }
 305
 306                   do
 307                     h = HASH (h, c);
 308                   while (--repetitions != 0);
 309                 }
 310             }
 311             break;
 312
 313           default:
 314             while ((c = *p++) != '\n')
 315               h = HASH (h, tolower (c));
 316             break;
 317           }
 318       else
 319         switch (ignore_white_space)
 320           {
 321           case IGNORE_ALL_SPACE:
 322             while ((c = *p++) != '\n')
 323               if (! isspace (c))
 324                 h = HASH (h, c);
 325             break;
 326
 327           case IGNORE_SPACE_CHANGE:
 328             while ((c = *p++) != '\n')
 329               {
 330                 if (isspace (c))
 331                   {
 332                     do
 333                       if ((c = *p++) == '\n')
 334                         goto hashing_done;
 335                     while (isspace (c));
 336
 337                     h = HASH (h, ' ');
 338                   }
 339
 340                 /* C is now the first non-space.  */
 341                 h = HASH (h, c);
 342               }
 343             break;
 344
 345           case IGNORE_TAB_EXPANSION:
 346             {
 347               size_t column = 0;
 348               while ((c = *p++) != '\n')
 349                 {
 350                   size_t repetitions = 1;
 351
 352                   switch (c)
 353                     {
 354                     case '\b':
 355                       column -= 0 < column;
 356                       break;
 357
 358                     case '\t':
 359                       c = ' ';
 360                       repetitions = tabsize - column % tabsize;
 361                       column = (column + repetitions < column
 362                                 ? 0
 363                                 : column + repetitions);
 364                       break;
 365
 366                     case '\r':
 367                       column = 0;
 368                       break;
 369
 370                     default:
 371                       column++;
 372                       break;
 373                     }
 374
 375                   do
 376                     h = HASH (h, c);
 377                   while (--repetitions != 0);
 378                 }
 379             }
 380             break;
 381
 382           default:
 383             while ((c = *p++) != '\n')
 384               h = HASH (h, c);
 385             break;
 386           }
 387
 388    hashing_done:;
 389
 390       bucket = &buckets[h % nbuckets];
 391       length = p - ip - 1;
 392
 393       if (p == bufend
 394           && current->missing_newline
 395           && ROBUST_OUTPUT_STYLE (output_style))
 396         {
 397           /* This line is incomplete.  If this is significant,
 398              put the line into buckets[-1].  */
 399           if (ignore_white_space < IGNORE_SPACE_CHANGE)
 400             bucket = &buckets[-1];
 401
 402           /* Omit the inserted newline when computing linbuf later.  */
 403           p--;
 404           bufend = suffix_begin = p;
 405         }
 406
 407       for (i = *bucket;  ;  i = eqs[i].next)
 408         if (!i)
 409           {
 410             /* Create a new equivalence class in this bucket.  */
 411             i = eqs_index++;
 412             if (i == eqs_alloc)
 413               {
 414                 if (PTRDIFF_MAX / (2 * sizeof *eqs) <= eqs_alloc)
 415                   xalloc_die ();
 416                 eqs_alloc *= 2;
 417                 eqs = xrealloc (eqs, eqs_alloc * sizeof *eqs);
 418               }
 419             eqs[i].next = *bucket;
 420             eqs[i].hash = h;
 421             eqs[i].line = ip;
 422             eqs[i].length = length;
 423             *bucket = i;
 424             break;
 425           }
 426         else if (eqs[i].hash == h)
 427           {
 428             char const *eqline = eqs[i].line;
 429
 430             /* Reuse existing class if lines_differ reports the lines
 431                equal.  */
 432             if (eqs[i].length == length)
 433               {
 434                 /* Reuse existing equivalence class if the lines are identical.
 435                    This detects the common case of exact identity
 436                    faster than lines_differ would.  */
 437                 if (memcmp (eqline, ip, length) == 0)
 438                   break;
 439                 if (!same_length_diff_contents_compare_anyway)
 440                   continue;
 441               }
 442             else if (!diff_length_compare_anyway)
 443               continue;
 444
 445             if (! lines_differ (eqline, ip))
 446               break;
 447           }
 448
 449       /* Maybe increase the size of the line table.  */
 450       if (line == alloc_lines)
 451         {
 452           /* Double (alloc_lines - linbuf_base) by adding to alloc_lines.  */
 453           if (PTRDIFF_MAX / 3 <= alloc_lines
 454               || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base
 455               || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base)
 456             xalloc_die ();
 457           alloc_lines = 2 * alloc_lines - linbuf_base;
 458           cureqs = xrealloc (cureqs, alloc_lines * sizeof *cureqs);
 459           linbuf += linbuf_base;
 460           linbuf = xrealloc (linbuf,
 461                              (alloc_lines - linbuf_base) * sizeof *linbuf);
 462           linbuf -= linbuf_base;
 463         }
 464       linbuf[line] = ip;
 465       cureqs[line] = i;
 466       ++line;
 467     }
 468
 469   current->buffered_lines = line;
 470
 471   for (i = 0;  ;  i++)
 472     {
 473       /* Record the line start for lines in the suffix that we care about.
 474          Record one more line start than lines,
 475          so that we can compute the length of any buffered line.  */
 476       if (line == alloc_lines)
 477         {
 478           /* Double (alloc_lines - linbuf_base) by adding to alloc_lines.  */
 479           if (PTRDIFF_MAX / 3 <= alloc_lines
 480               || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base
 481               || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base)
 482             xalloc_die ();
 483           alloc_lines = 2 * alloc_lines - linbuf_base;
 484           linbuf += linbuf_base;
 485           linbuf = xrealloc (linbuf,
 486                              (alloc_lines - linbuf_base) * sizeof *linbuf);
 487           linbuf -= linbuf_base;
 488         }
 489       linbuf[line] = p;
 490
 491       if (p == bufend)
 492         break;
 493
 494       if (context <= i && no_diff_means_no_output)
 495         break;
 496
 497       line++;
 498
 499       while (*p++ != '\n')
 500         continue;
 501     }
 502
 503   /* Done with cache in local variables.  */
 504   current->linbuf = linbuf;
 505   current->valid_lines = line;
 506   current->alloc_lines = alloc_lines;
 507   current->equivs = cureqs;
 508   equivs = eqs;
 509   equivs_alloc = eqs_alloc;
 510   equivs_index = eqs_index;
 511 }
 512 \f
 513 /* Prepare the text.  Make sure the text end is initialized.
 514    Make sure text ends in a newline,
 515    but remember that we had to add one.
 516    Strip trailing CRs, if that was requested.  */
 517
 518 static void
 519 prepare_text (struct file_data *current)
 520 {
 521   size_t buffered = current->buffered;
 522   char *p = FILE_BUFFER (current);
 523   char *dst;
 524
 525   if (buffered == 0 || p[buffered - 1] == '\n')
 526     current->missing_newline = false;
 527   else
 528     {
 529       p[buffered++] = '\n';
 530       current->missing_newline = true;
 531     }
 532
 533   if (!p)
 534     return;
 535
 536   /* Don't use uninitialized storage when planting or using sentinels.  */
 537   memset (p + buffered, 0, sizeof (word));
 538
 539   if (strip_trailing_cr && (dst = memchr (p, '\r', buffered)))
 540     {
 541       char const *src = dst;
 542       char const *srclim = p + buffered;
 543
 544       do
 545         dst += ! ((*dst = *src++) == '\r' && *src == '\n');
 546       while (src < srclim);
 547
 548       buffered -= src - dst;
 549     }
 550
 551   current->buffered = buffered;
 552 }
 553 \f
 554 /* We have found N lines in a buffer of size S; guess the
 555    proportionate number of lines that will be found in a buffer of
 556    size T.  However, do not guess a number of lines so large that the
 557    resulting line table might cause overflow in size calculations.  */
 558 static lin
 559 guess_lines (lin n, size_t s, size_t t)
 560 {
 561   size_t guessed_bytes_per_line = n < 10 ? 32 : s / (n - 1);
 562   lin guessed_lines = MAX (1, t / guessed_bytes_per_line);
 563   return MIN (guessed_lines, PTRDIFF_MAX / (2 * sizeof (char *) + 1) - 5) + 5;
 564 }
 565
 566 /* Given a vector of two file_data objects, find the identical
 567    prefixes and suffixes of each object.  */
 568
 569 static void
 570 find_identical_ends (struct file_data filevec[])
 571 {
 572   word *w0, *w1;
 573   char *p0, *p1, *buffer0, *buffer1;
 574   char const *end0, *beg0;
 575   char const **linbuf0, **linbuf1;
 576   lin i, lines;
 577   size_t n0, n1;
 578   lin alloc_lines0, alloc_lines1;
 579   lin buffered_prefix, prefix_count, prefix_mask;
 580   lin middle_guess, suffix_guess;
 581
 582   slurp (&filevec[0]);
 583   prepare_text (&filevec[0]);
 584   if (filevec[0].desc != filevec[1].desc)
 585     {
 586       slurp (&filevec[1]);
 587       prepare_text (&filevec[1]);
 588     }
 589   else
 590     {
 591       filevec[1].buffer = filevec[0].buffer;
 592       filevec[1].bufsize = filevec[0].bufsize;
 593       filevec[1].buffered = filevec[0].buffered;
 594       filevec[1].missing_newline = filevec[0].missing_newline;
 595     }
 596
 597   /* Find identical prefix.  */
 598
 599   w0 = filevec[0].buffer;
 600   w1 = filevec[1].buffer;
 601   p0 = buffer0 = (char *) w0;
 602   p1 = buffer1 = (char *) w1;
 603   n0 = filevec[0].buffered;
 604   n1 = filevec[1].buffered;
 605
 606   if (p0 == p1)
 607     /* The buffers are the same; sentinels won't work.  */
 608     p0 = p1 += n1;
 609   else
 610     {
 611       /* Insert end sentinels, in this case characters that are guaranteed
 612          to make the equality test false, and thus terminate the loop.  */
 613
 614       if (n0 < n1)
 615         p0[n0] = ~p1[n0];
 616       else
 617         p1[n1] = ~p0[n1];
 618
 619       /* Loop until first mismatch, or to the sentinel characters.  */
 620
 621       /* Compare a word at a time for speed.  */
 622       while (*w0 == *w1)
 623         w0++, w1++;
 624
 625       /* Do the last few bytes of comparison a byte at a time.  */
 626       p0 = (char *) w0;
 627       p1 = (char *) w1;
 628       while (*p0 == *p1)
 629         p0++, p1++;
 630
 631       /* Don't mistakenly count missing newline as part of prefix.  */
 632       if (ROBUST_OUTPUT_STYLE (output_style)
 633           && ((buffer0 + n0 - filevec[0].missing_newline < p0)
 634               !=
 635               (buffer1 + n1 - filevec[1].missing_newline < p1)))
 636         p0--, p1--;
 637     }
 638
 639   /* Now P0 and P1 point at the first nonmatching characters.  */
 640
 641   /* Skip back to last line-beginning in the prefix,
 642      and then discard up to HORIZON_LINES lines from the prefix.  */
 643   i = horizon_lines;
 644   while (p0 != buffer0 && (p0[-1] != '\n' || i--))
 645     p0--, p1--;
 646
 647   /* Record the prefix.  */
 648   filevec[0].prefix_end = p0;
 649   filevec[1].prefix_end = p1;
 650
 651   /* Find identical suffix.  */
 652
 653   /* P0 and P1 point beyond the last chars not yet compared.  */
 654   p0 = buffer0 + n0;
 655   p1 = buffer1 + n1;
 656
 657   if (! ROBUST_OUTPUT_STYLE (output_style)
 658       || filevec[0].missing_newline == filevec[1].missing_newline)
 659     {
 660       end0 = p0;        /* Addr of last char in file 0.  */
 661
 662       /* Get value of P0 at which we should stop scanning backward:
 663          this is when either P0 or P1 points just past the last char
 664          of the identical prefix.  */
 665       beg0 = filevec[0].prefix_end + (n0 < n1 ? 0 : n0 - n1);
 666
 667       /* Scan back until chars don't match or we reach that point.  */
 668       for (; p0 != beg0; p0--, p1--)
 669         if (*p0 != *p1)
 670           {
 671             /* Point at the first char of the matching suffix.  */
 672             beg0 = p0;
 673             break;
 674           }
 675
 676       /* Are we at a line-beginning in both files?  If not, add the rest of
 677          this line to the main body.  Discard up to HORIZON_LINES lines from
 678          the identical suffix.  Also, discard one extra line,
 679          because shift_boundaries may need it.  */
 680       i = horizon_lines + !((buffer0 == p0 || p0[-1] == '\n')
 681                             &&
 682                             (buffer1 == p1 || p1[-1] == '\n'));
 683       while (i-- && p0 != end0)
 684         while (*p0++ != '\n')
 685           continue;
 686
 687       p1 += p0 - beg0;
 688     }
 689
 690   /* Record the suffix.  */
 691   filevec[0].suffix_begin = p0;
 692   filevec[1].suffix_begin = p1;
 693
 694   /* Calculate number of lines of prefix to save.
 695
 696      prefix_count == 0 means save the whole prefix;
 697      we need this for options like -D that output the whole file,
 698      or for enormous contexts (to avoid worrying about arithmetic overflow).
 699      We also need it for options like -F that output some preceding line;
 700      at least we will need to find the last few lines,
 701      but since we don't know how many, it's easiest to find them all.
 702
 703      Otherwise, prefix_count != 0.  Save just prefix_count lines at start
 704      of the line buffer; they'll be moved to the proper location later.
 705      Handle 1 more line than the context says (because we count 1 too many),
 706      rounded up to the next power of 2 to speed index computation.  */
 707
 708   if (no_diff_means_no_output && ! function_regexp.fastmap
 709       && context < LIN_MAX / 4 && context < n0)
 710     {
 711       middle_guess = guess_lines (0, 0, p0 - filevec[0].prefix_end);
 712       suffix_guess = guess_lines (0, 0, buffer0 + n0 - p0);
 713       for (prefix_count = 1;  prefix_count <= context;  prefix_count *= 2)
 714         continue;
 715       alloc_lines0 = (prefix_count + middle_guess
 716                       + MIN (context, suffix_guess));
 717     }
 718   else
 719     {
 720       prefix_count = 0;
 721       alloc_lines0 = guess_lines (0, 0, n0);
 722     }
 723
 724   prefix_mask = prefix_count - 1;
 725   lines = 0;
 726   linbuf0 = xmalloc (alloc_lines0 * sizeof *linbuf0);
 727   p0 = buffer0;
 728
 729   /* If the prefix is needed, find the prefix lines.  */
 730   if (! (no_diff_means_no_output
 731          && filevec[0].prefix_end == p0
 732          && filevec[1].prefix_end == p1))
 733     {
 734       end0 = filevec[0].prefix_end;
 735       while (p0 != end0)
 736         {
 737           lin l = lines++ & prefix_mask;
 738           if (l == alloc_lines0)
 739             {
 740               if (PTRDIFF_MAX / (2 * sizeof *linbuf0) <= alloc_lines0)
 741                 xalloc_die ();
 742               alloc_lines0 *= 2;
 743               linbuf0 = xrealloc (linbuf0, alloc_lines0 * sizeof *linbuf0);
 744             }
 745           linbuf0[l] = p0;
 746           while (*p0++ != '\n')
 747             continue;
 748         }
 749     }
 750   buffered_prefix = prefix_count && context < lines ? context : lines;
 751
 752   /* Allocate line buffer 1.  */
 753
 754   middle_guess = guess_lines (lines, p0 - buffer0, p1 - filevec[1].prefix_end);
 755   suffix_guess = guess_lines (lines, p0 - buffer0, buffer1 + n1 - p1);
 756   alloc_lines1 = buffered_prefix + middle_guess + MIN (context, suffix_guess);
 757   if (alloc_lines1 < buffered_prefix
 758       || PTRDIFF_MAX / sizeof *linbuf1 <= alloc_lines1)
 759     xalloc_die ();
 760   linbuf1 = xmalloc (alloc_lines1 * sizeof *linbuf1);
 761
 762   if (buffered_prefix != lines)
 763     {
 764       /* Rotate prefix lines to proper location.  */
 765       for (i = 0;  i < buffered_prefix;  i++)
 766         linbuf1[i] = linbuf0[(lines - context + i) & prefix_mask];
 767       for (i = 0;  i < buffered_prefix;  i++)
 768         linbuf0[i] = linbuf1[i];
 769     }
 770
 771   /* Initialize line buffer 1 from line buffer 0.  */
 772   for (i = 0; i < buffered_prefix; i++)
 773     linbuf1[i] = linbuf0[i] - buffer0 + buffer1;
 774
 775   /* Record the line buffer, adjusted so that
 776      linbuf[0] points at the first differing line.  */
 777   filevec[0].linbuf = linbuf0 + buffered_prefix;
 778   filevec[1].linbuf = linbuf1 + buffered_prefix;
 779   filevec[0].linbuf_base = filevec[1].linbuf_base = - buffered_prefix;
 780   filevec[0].alloc_lines = alloc_lines0 - buffered_prefix;
 781   filevec[1].alloc_lines = alloc_lines1 - buffered_prefix;
 782   filevec[0].prefix_lines = filevec[1].prefix_lines = lines;
 783 }
 784 \f
 785 /* If 1 < k, then (2**k - prime_offset[k]) is the largest prime less
 786    than 2**k.  This table is derived from Chris K. Caldwell's list
 787    <http://www.utm.edu/research/primes/lists/2small/>.  */
 788
 789 static unsigned char const prime_offset[] =
 790 {
 791   0, 0, 1, 1, 3, 1, 3, 1, 5, 3, 3, 9, 3, 1, 3, 19, 15, 1, 5, 1, 3, 9, 3,
 792   15, 3, 39, 5, 39, 57, 3, 35, 1, 5, 9, 41, 31, 5, 25, 45, 7, 87, 21,
 793   11, 57, 17, 55, 21, 115, 59, 81, 27, 129, 47, 111, 33, 55, 5, 13, 27,
 794   55, 93, 1, 57, 25
 795 };
 796
 797 /* Verify that this host's size_t is not too wide for the above table.  */
 798
 799 verify (enough_prime_offsets,
 800         sizeof (size_t) * CHAR_BIT <= sizeof prime_offset);
 801
 802 /* Given a vector of two file_data objects, read the file associated
 803    with each one, and build the table of equivalence classes.
 804    Return nonzero if either file appears to be a binary file.
 805    If PRETEND_BINARY is nonzero, pretend they are binary regardless.  */
 806
 807 bool
 808 read_files (struct file_data filevec[], bool pretend_binary)
 809 {
 810   int i;
 811   bool skip_test = text | pretend_binary;
 812   bool appears_binary = pretend_binary | sip (&filevec[0], skip_test);
 813
 814   if (filevec[0].desc != filevec[1].desc)
 815     appears_binary |= sip (&filevec[1], skip_test | appears_binary);
 816   else
 817     {
 818       filevec[1].buffer = filevec[0].buffer;
 819       filevec[1].bufsize = filevec[0].bufsize;
 820       filevec[1].buffered = filevec[0].buffered;
 821     }
 822   if (appears_binary)
 823     {
 824       set_binary_mode (filevec[0].desc, true);
 825       set_binary_mode (filevec[1].desc, true);
 826       return true;
 827     }
 828
 829   find_identical_ends (filevec);
 830
 831   equivs_alloc = filevec[0].alloc_lines + filevec[1].alloc_lines + 1;
 832   if (PTRDIFF_MAX / sizeof *equivs <= equivs_alloc)
 833     xalloc_die ();
 834   equivs = xmalloc (equivs_alloc * sizeof *equivs);
 835   /* Equivalence class 0 is permanently safe for lines that were not
 836      hashed.  Real equivalence classes start at 1.  */
 837   equivs_index = 1;
 838
 839   /* Allocate (one plus) a prime number of hash buckets.  Use a prime
 840      number between 1/3 and 2/3 of the value of equiv_allocs,
 841      approximately.  */
 842   for (i = 9; (size_t) 1 << i < equivs_alloc / 3; i++)
 843     continue;
 844   nbuckets = ((size_t) 1 << i) - prime_offset[i];
 845   if (PTRDIFF_MAX / sizeof *buckets <= nbuckets)
 846     xalloc_die ();
 847   buckets = zalloc ((nbuckets + 1) * sizeof *buckets);
 848   buckets++;
 849
 850   for (i = 0; i < 2; i++)
 851     find_and_hash_each_line (&filevec[i]);
 852
 853   filevec[0].equiv_max = filevec[1].equiv_max = equivs_index;
 854
 855   free (equivs);
 856   free (buckets - 1);
 857
 858   return false;
 859 }