contrib/diffutils-2.8.1/src/io.c

   1 /* File I/O for GNU DIFF.
   2
   3    Copyright (C) 1988, 1989, 1992, 1993, 1994, 1995, 1998, 2001, 2002
   4    Free Software Foundation, Inc.
   5
   6    This file is part of GNU DIFF.
   7
   8    GNU DIFF is free software; you can redistribute it and/or modify
   9    it under the terms of the GNU General Public License as published by
  10    the Free Software Foundation; either version 2, or (at your option)
  11    any later version.
  12
  13    GNU DIFF is distributed in the hope that it will be useful,
  14    but WITHOUT ANY WARRANTY; without even the implied warranty of
  15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16    GNU General Public License for more details.
  17
  18    You should have received a copy of the GNU General Public License
  19    along with this program; see the file COPYING.
  20    If not, write to the Free Software Foundation,
  21    59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
  22
  23 #include "diff.h"
  24 #include <cmpbuf.h>
  25 #include <regex.h>
  26 #include <setmode.h>
  27 #include <xalloc.h>
  28
  29 /* Rotate an unsigned value to the left.  */
  30 #define ROL(v, n) ((v) << (n) | (v) >> (sizeof (v) * CHAR_BIT - (n)))
  31
  32 /* Given a hash value and a new character, return a new hash value.  */
  33 #define HASH(h, c) ((c) + ROL (h, 7))
  34 \f
  35 /* The type of a hash value.  */
  36 typedef size_t hash_value;
  37 verify (hash_value_is_unsigned, ! TYPE_SIGNED (hash_value));
  38
  39 /* Lines are put into equivalence classes of lines that match in lines_differ.
  40    Each equivalence class is represented by one of these structures,
  41    but only while the classes are being computed.
  42    Afterward, each class is represented by a number.  */
  43 struct equivclass
  44 {
  45   lin next;             /* Next item in this bucket.  */
  46   hash_value hash;      /* Hash of lines in this class.  */
  47   char const *line;     /* A line that fits this class.  */
  48   size_t length;        /* That line's length, not counting its newline.  */
  49 };
  50
  51 /* Hash-table: array of buckets, each being a chain of equivalence classes.
  52    buckets[-1] is reserved for incomplete lines.  */
  53 static lin *buckets;
  54
  55 /* Number of buckets in the hash table array, not counting buckets[-1].  */
  56 static size_t nbuckets;
  57
  58 /* Array in which the equivalence classes are allocated.
  59    The bucket-chains go through the elements in this array.
  60    The number of an equivalence class is its index in this array.  */
  61 static struct equivclass *equivs;
  62
  63 /* Index of first free element in the array `equivs'.  */
  64 static lin equivs_index;
  65
  66 /* Number of elements allocated in the array `equivs'.  */
  67 static lin equivs_alloc;
  68 \f
  69 /* Read a block of data into a file buffer, checking for EOF and error.  */
  70
  71 void
  72 file_block_read (struct file_data *current, size_t size)
  73 {
  74   if (size && ! current->eof)
  75     {
  76       size_t s = block_read (current->desc,
  77                              FILE_BUFFER (current) + current->buffered, size);
  78       if (s == SIZE_MAX)
  79         pfatal_with_name (current->name);
  80       current->buffered += s;
  81       current->eof = s < size;
  82     }
  83 }
  84 \f
  85 /* Check for binary files and compare them for exact identity.  */
  86
  87 /* Return 1 if BUF contains a non text character.
  88    SIZE is the number of characters in BUF.  */
  89
  90 #define binary_file_p(buf, size) (memchr (buf, 0, size) != 0)
  91
  92 /* Get ready to read the current file.
  93    Return nonzero if SKIP_TEST is zero,
  94    and if it appears to be a binary file.  */
  95
  96 static bool
  97 sip (struct file_data *current, bool skip_test)
  98 {
  99   /* If we have a nonexistent file at this stage, treat it as empty.  */
 100   if (current->desc < 0)
 101     {
 102       /* Leave room for a sentinel.  */
 103       current->bufsize = sizeof (word);
 104       current->buffer = xmalloc (current->bufsize);
 105     }
 106   else
 107     {
 108       current->bufsize = buffer_lcm (sizeof (word),
 109                                      STAT_BLOCKSIZE (current->stat),
 110                                      PTRDIFF_MAX - 2 * sizeof (word));
 111       current->buffer = xmalloc (current->bufsize);
 112
 113       if (! skip_test)
 114         {
 115           /* Check first part of file to see if it's a binary file.  */
 116
 117           bool was_binary = set_binary_mode (current->desc, 1);
 118           off_t buffered;
 119           file_block_read (current, current->bufsize);
 120           buffered = current->buffered;
 121
 122           if (! was_binary)
 123             {
 124               /* Revert to text mode and seek back to the beginning to
 125                  reread the file.  Use relative seek, since file
 126                  descriptors like stdin might not start at offset
 127                  zero.  */
 128
 129               if (lseek (current->desc, - buffered, SEEK_CUR) == -1)
 130                 pfatal_with_name (current->name);
 131               set_binary_mode (current->desc, 0);
 132               current->buffered = 0;
 133               current->eof = 0;
 134             }
 135
 136           return binary_file_p (current->buffer, buffered);
 137         }
 138     }
 139
 140   current->buffered = 0;
 141   current->eof = 0;
 142   return 0;
 143 }
 144
 145 /* Slurp the rest of the current file completely into memory.  */
 146
 147 static void
 148 slurp (struct file_data *current)
 149 {
 150   size_t cc;
 151
 152   if (current->desc < 0)
 153     {
 154       /* The file is nonexistent.  */
 155       return;
 156     }
 157
 158   if (S_ISREG (current->stat.st_mode))
 159     {
 160       /* It's a regular file; slurp in the rest all at once.  */
 161
 162       /* Get the size out of the stat block.
 163          Allocate just enough room for appended newline plus word sentinel,
 164          plus word-alignment since we want the buffer word-aligned.  */
 165       size_t file_size = current->stat.st_size;
 166       cc = file_size + 2 * sizeof (word) - file_size % sizeof (word);
 167       if (file_size != current->stat.st_size || cc < file_size
 168           || PTRDIFF_MAX <= cc)
 169         xalloc_die ();
 170
 171       if (current->bufsize < cc)
 172         {
 173           current->bufsize = cc;
 174           current->buffer = xrealloc (current->buffer, cc);
 175         }
 176
 177       /* Try to read at least 1 more byte than the size indicates, to
 178          detect whether the file is growing.  This is a nicety for
 179          users who run 'diff' on files while they are changing.  */
 180
 181       if (current->buffered <= file_size)
 182         {
 183           file_block_read (current, file_size + 1 - current->buffered);
 184           if (current->buffered <= file_size)
 185             return;
 186         }
 187     }
 188
 189   /* It's not a regular file, or it's a growing regular file; read it,
 190      growing the buffer as needed.  */
 191
 192   file_block_read (current, current->bufsize - current->buffered);
 193
 194   if (current->buffered)
 195     {
 196       while (current->buffered == current->bufsize)
 197         {
 198           if (PTRDIFF_MAX / 2 - sizeof (word) < current->bufsize)
 199             xalloc_die ();
 200           current->bufsize *= 2;
 201           current->buffer = xrealloc (current->buffer, current->bufsize);
 202           file_block_read (current, current->bufsize - current->buffered);
 203         }
 204
 205       /* Allocate just enough room for appended newline plus word
 206          sentinel, plus word-alignment.  */
 207       cc = current->buffered + 2 * sizeof (word);
 208       current->bufsize = cc - cc % sizeof (word);
 209       current->buffer = xrealloc (current->buffer, current->bufsize);
 210     }
 211 }
 212 \f
 213 /* Split the file into lines, simultaneously computing the equivalence
 214    class for each line.  */
 215
 216 static void
 217 find_and_hash_each_line (struct file_data *current)
 218 {
 219   hash_value h;
 220   unsigned char const *p = (unsigned char const *) current->prefix_end;
 221   unsigned char c;
 222   lin i, *bucket;
 223   size_t length;
 224
 225   /* Cache often-used quantities in local variables to help the compiler.  */
 226   char const **linbuf = current->linbuf;
 227   lin alloc_lines = current->alloc_lines;
 228   lin line = 0;
 229   lin linbuf_base = current->linbuf_base;
 230   lin *cureqs = xmalloc (alloc_lines * sizeof *cureqs);
 231   struct equivclass *eqs = equivs;
 232   lin eqs_index = equivs_index;
 233   lin eqs_alloc = equivs_alloc;
 234   char const *suffix_begin = current->suffix_begin;
 235   char const *bufend = FILE_BUFFER (current) + current->buffered;
 236   bool diff_length_compare_anyway =
 237     ignore_white_space != IGNORE_NO_WHITE_SPACE;
 238   bool same_length_diff_contents_compare_anyway =
 239     diff_length_compare_anyway | ignore_case;
 240
 241   while ((char const *) p < suffix_begin)
 242     {
 243       char const *ip = (char const *) p;
 244
 245       h = 0;
 246
 247       /* Hash this line until we find a newline.  */
 248       if (ignore_case)
 249         switch (ignore_white_space)
 250           {
 251           case IGNORE_ALL_SPACE:
 252             while ((c = *p++) != '\n')
 253               if (! ISSPACE (c))
 254                 h = HASH (h, TOLOWER (c));
 255             break;
 256
 257           case IGNORE_SPACE_CHANGE:
 258             while ((c = *p++) != '\n')
 259               {
 260                 if (ISSPACE (c))
 261                   {
 262                     do
 263                       if ((c = *p++) == '\n')
 264                         goto hashing_done;
 265                     while (ISSPACE (c));
 266
 267                     h = HASH (h, ' ');
 268                   }
 269
 270                 /* C is now the first non-space.  */
 271                 h = HASH (h, TOLOWER (c));
 272               }
 273             break;
 274
 275           case IGNORE_TAB_EXPANSION:
 276             {
 277               size_t column = 0;
 278               while ((c = *p++) != '\n')
 279                 {
 280                   int repetitions = 1;
 281
 282                   switch (c)
 283                     {
 284                     case '\b':
 285                       column -= 0 < column;
 286                       break;
 287
 288                     case '\t':
 289                       c = ' ';
 290                       repetitions = TAB_WIDTH - column % TAB_WIDTH;
 291                       column += repetitions;
 292                       break;
 293
 294                     case '\r':
 295                       column = 0;
 296                       break;
 297
 298                     default:
 299                       c = TOLOWER (c);
 300                       column++;
 301                       break;
 302                     }
 303
 304                   do
 305                     h = HASH (h, c);
 306                   while (--repetitions != 0);
 307                 }
 308             }
 309             break;
 310
 311           default:
 312             while ((c = *p++) != '\n')
 313               h = HASH (h, TOLOWER (c));
 314             break;
 315           }
 316       else
 317         switch (ignore_white_space)
 318           {
 319           case IGNORE_ALL_SPACE:
 320             while ((c = *p++) != '\n')
 321               if (! ISSPACE (c))
 322                 h = HASH (h, c);
 323             break;
 324
 325           case IGNORE_SPACE_CHANGE:
 326             while ((c = *p++) != '\n')
 327               {
 328                 if (ISSPACE (c))
 329                   {
 330                     do
 331                       if ((c = *p++) == '\n')
 332                         goto hashing_done;
 333                     while (ISSPACE (c));
 334
 335                     h = HASH (h, ' ');
 336                   }
 337
 338                 /* C is now the first non-space.  */
 339                 h = HASH (h, c);
 340               }
 341             break;
 342
 343           case IGNORE_TAB_EXPANSION:
 344             {
 345               size_t column = 0;
 346               while ((c = *p++) != '\n')
 347                 {
 348                   int repetitions = 1;
 349
 350                   switch (c)
 351                     {
 352                     case '\b':
 353                       column -= 0 < column;
 354                       break;
 355
 356                     case '\t':
 357                       c = ' ';
 358                       repetitions = TAB_WIDTH - column % TAB_WIDTH;
 359                       column += repetitions;
 360                       break;
 361
 362                     case '\r':
 363                       column = 0;
 364                       break;
 365
 366                     default:
 367                       column++;
 368                       break;
 369                     }
 370
 371                   do
 372                     h = HASH (h, c);
 373                   while (--repetitions != 0);
 374                 }
 375             }
 376             break;
 377
 378           default:
 379             while ((c = *p++) != '\n')
 380               h = HASH (h, c);
 381             break;
 382           }
 383
 384    hashing_done:;
 385
 386       bucket = &buckets[h % nbuckets];
 387       length = (char const *) p - ip - 1;
 388
 389       if ((char const *) p == bufend
 390           && current->missing_newline
 391           && ROBUST_OUTPUT_STYLE (output_style))
 392         {
 393           /* This line is incomplete.  If this is significant,
 394              put the line into buckets[-1].  */
 395           if (ignore_white_space < IGNORE_SPACE_CHANGE)
 396             bucket = &buckets[-1];
 397
 398           /* Omit the inserted newline when computing linbuf later.  */
 399           p--;
 400           bufend = suffix_begin = (char const *) p;
 401         }
 402
 403       for (i = *bucket;  ;  i = eqs[i].next)
 404         if (!i)
 405           {
 406             /* Create a new equivalence class in this bucket.  */
 407             i = eqs_index++;
 408             if (i == eqs_alloc)
 409               {
 410                 if (PTRDIFF_MAX / (2 * sizeof *eqs) <= eqs_alloc)
 411                   xalloc_die ();
 412                 eqs_alloc *= 2;
 413                 eqs = xrealloc (eqs, eqs_alloc * sizeof *eqs);
 414               }
 415             eqs[i].next = *bucket;
 416             eqs[i].hash = h;
 417             eqs[i].line = ip;
 418             eqs[i].length = length;
 419             *bucket = i;
 420             break;
 421           }
 422         else if (eqs[i].hash == h)
 423           {
 424             char const *eqline = eqs[i].line;
 425
 426             /* Reuse existing class if lines_differ reports the lines
 427                equal.  */
 428             if (eqs[i].length == length)
 429               {
 430                 /* Reuse existing equivalence class if the lines are identical.
 431                    This detects the common case of exact identity
 432                    faster than lines_differ would.  */
 433                 if (memcmp (eqline, ip, length) == 0)
 434                   break;
 435                 if (!same_length_diff_contents_compare_anyway)
 436                   continue;
 437               }
 438             else if (!diff_length_compare_anyway)
 439               continue;
 440
 441             if (! lines_differ (eqline, ip))
 442               break;
 443           }
 444
 445       /* Maybe increase the size of the line table.  */
 446       if (line == alloc_lines)
 447         {
 448           /* Double (alloc_lines - linbuf_base) by adding to alloc_lines.  */
 449           if (PTRDIFF_MAX / 3 <= alloc_lines
 450               || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base
 451               || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base)
 452             xalloc_die ();
 453           alloc_lines = 2 * alloc_lines - linbuf_base;
 454           cureqs = xrealloc (cureqs, alloc_lines * sizeof *cureqs);
 455           linbuf += linbuf_base;
 456           linbuf = xrealloc (linbuf,
 457                              (alloc_lines - linbuf_base) * sizeof *linbuf);
 458           linbuf -= linbuf_base;
 459         }
 460       linbuf[line] = ip;
 461       cureqs[line] = i;
 462       ++line;
 463     }
 464
 465   current->buffered_lines = line;
 466
 467   for (i = 0;  ;  i++)
 468     {
 469       /* Record the line start for lines in the suffix that we care about.
 470          Record one more line start than lines,
 471          so that we can compute the length of any buffered line.  */
 472       if (line == alloc_lines)
 473         {
 474           /* Double (alloc_lines - linbuf_base) by adding to alloc_lines.  */
 475           if (PTRDIFF_MAX / 3 <= alloc_lines
 476               || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base
 477               || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base)
 478             xalloc_die ();
 479           alloc_lines = 2 * alloc_lines - linbuf_base;
 480           linbuf += linbuf_base;
 481           linbuf = xrealloc (linbuf,
 482                              (alloc_lines - linbuf_base) * sizeof *linbuf);
 483           linbuf -= linbuf_base;
 484         }
 485       linbuf[line] = (char const *) p;
 486
 487       if ((char const *) p == bufend)
 488         break;
 489
 490       if (context <= i && no_diff_means_no_output)
 491         break;
 492
 493       line++;
 494
 495       while (*p++ != '\n')
 496         continue;
 497     }
 498
 499   /* Done with cache in local variables.  */
 500   current->linbuf = linbuf;
 501   current->valid_lines = line;
 502   current->alloc_lines = alloc_lines;
 503   current->equivs = cureqs;
 504   equivs = eqs;
 505   equivs_alloc = eqs_alloc;
 506   equivs_index = eqs_index;
 507 }
 508 \f
 509 /* Prepare the text.  Make sure the text end is initialized.
 510    Make sure text ends in a newline,
 511    but remember that we had to add one.
 512    Strip trailing CRs, if that was requested.  */
 513
 514 static void
 515 prepare_text (struct file_data *current)
 516 {
 517   size_t buffered = current->buffered;
 518   char *p = FILE_BUFFER (current);
 519   char *dst;
 520
 521   if (buffered == 0 || p[buffered - 1] == '\n')
 522     current->missing_newline = 0;
 523   else
 524     {
 525       p[buffered++] = '\n';
 526       current->missing_newline = 1;
 527     }
 528
 529   if (!p)
 530     return;
 531
 532   /* Don't use uninitialized storage when planting or using sentinels.  */
 533   memset (p + buffered, 0, sizeof (word));
 534
 535   if (strip_trailing_cr && (dst = memchr (p, '\r', buffered)))
 536     {
 537       char const *src = dst;
 538       char const *srclim = p + buffered;
 539
 540       do
 541         dst += ! ((*dst = *src++) == '\r' && *src == '\n');
 542       while (src < srclim);
 543
 544       buffered -= src - dst;
 545     }
 546
 547   current->buffered = buffered;
 548 }
 549 \f
 550 /* We have found N lines in a buffer of size S; guess the
 551    proportionate number of lines that will be found in a buffer of
 552    size T.  However, do not guess a number of lines so large that the
 553    resulting line table might cause overflow in size calculations.  */
 554 static lin
 555 guess_lines (lin n, size_t s, size_t t)
 556 {
 557   size_t guessed_bytes_per_line = n < 10 ? 32 : s / (n - 1);
 558   lin guessed_lines = MAX (1, t / guessed_bytes_per_line);
 559   return MIN (guessed_lines, PTRDIFF_MAX / (2 * sizeof (char *) + 1) - 5) + 5;
 560 }
 561
 562 /* Given a vector of two file_data objects, find the identical
 563    prefixes and suffixes of each object.  */
 564
 565 static void
 566 find_identical_ends (struct file_data filevec[])
 567 {
 568   word *w0, *w1;
 569   char *p0, *p1, *buffer0, *buffer1;
 570   char const *end0, *beg0;
 571   char const **linbuf0, **linbuf1;
 572   lin i, lines;
 573   size_t n0, n1;
 574   lin alloc_lines0, alloc_lines1;
 575   lin buffered_prefix, prefix_count, prefix_mask;
 576   lin middle_guess, suffix_guess;
 577
 578   slurp (&filevec[0]);
 579   prepare_text (&filevec[0]);
 580   if (filevec[0].desc != filevec[1].desc)
 581     {
 582       slurp (&filevec[1]);
 583       prepare_text (&filevec[1]);
 584     }
 585   else
 586     {
 587       filevec[1].buffer = filevec[0].buffer;
 588       filevec[1].bufsize = filevec[0].bufsize;
 589       filevec[1].buffered = filevec[0].buffered;
 590       filevec[1].missing_newline = filevec[0].missing_newline;
 591     }
 592
 593   /* Find identical prefix.  */
 594
 595   w0 = filevec[0].buffer;
 596   w1 = filevec[1].buffer;
 597   p0 = buffer0 = (char *) w0;
 598   p1 = buffer1 = (char *) w1;
 599   n0 = filevec[0].buffered;
 600   n1 = filevec[1].buffered;
 601
 602   if (p0 == p1)
 603     /* The buffers are the same; sentinels won't work.  */
 604     p0 = p1 += n1;
 605   else
 606     {
 607       /* Insert end sentinels, in this case characters that are guaranteed
 608          to make the equality test false, and thus terminate the loop.  */
 609
 610       if (n0 < n1)
 611         p0[n0] = ~p1[n0];
 612       else
 613         p1[n1] = ~p0[n1];
 614
 615       /* Loop until first mismatch, or to the sentinel characters.  */
 616
 617       /* Compare a word at a time for speed.  */
 618       while (*w0 == *w1)
 619         w0++, w1++;
 620
 621       /* Do the last few bytes of comparison a byte at a time.  */
 622       p0 = (char *) w0;
 623       p1 = (char *) w1;
 624       while (*p0 == *p1)
 625         p0++, p1++;
 626
 627       /* Don't mistakenly count missing newline as part of prefix.  */
 628       if (ROBUST_OUTPUT_STYLE (output_style)
 629           && ((buffer0 + n0 - filevec[0].missing_newline < p0)
 630               !=
 631               (buffer1 + n1 - filevec[1].missing_newline < p1)))
 632         p0--, p1--;
 633     }
 634
 635   /* Now P0 and P1 point at the first nonmatching characters.  */
 636
 637   /* Skip back to last line-beginning in the prefix,
 638      and then discard up to HORIZON_LINES lines from the prefix.  */
 639   i = horizon_lines;
 640   while (p0 != buffer0 && (p0[-1] != '\n' || i--))
 641     p0--, p1--;
 642
 643   /* Record the prefix.  */
 644   filevec[0].prefix_end = p0;
 645   filevec[1].prefix_end = p1;
 646
 647   /* Find identical suffix.  */
 648
 649   /* P0 and P1 point beyond the last chars not yet compared.  */
 650   p0 = buffer0 + n0;
 651   p1 = buffer1 + n1;
 652
 653   if (! ROBUST_OUTPUT_STYLE (output_style)
 654       || filevec[0].missing_newline == filevec[1].missing_newline)
 655     {
 656       end0 = p0;        /* Addr of last char in file 0.  */
 657
 658       /* Get value of P0 at which we should stop scanning backward:
 659          this is when either P0 or P1 points just past the last char
 660          of the identical prefix.  */
 661       beg0 = filevec[0].prefix_end + (n0 < n1 ? 0 : n0 - n1);
 662
 663       /* Scan back until chars don't match or we reach that point.  */
 664       for (; p0 != beg0; p0--, p1--)
 665         if (*p0 != *p1)
 666           {
 667             /* Point at the first char of the matching suffix.  */
 668             beg0 = p0;
 669             break;
 670           }
 671
 672       /* Are we at a line-beginning in both files?  If not, add the rest of
 673          this line to the main body.  Discard up to HORIZON_LINES lines from
 674          the identical suffix.  Also, discard one extra line,
 675          because shift_boundaries may need it.  */
 676       i = horizon_lines + !((buffer0 == p0 || p0[-1] == '\n')
 677                             &&
 678                             (buffer1 == p1 || p1[-1] == '\n'));
 679       while (i-- && p0 != end0)
 680         while (*p0++ != '\n')
 681           continue;
 682
 683       p1 += p0 - beg0;
 684     }
 685
 686   /* Record the suffix.  */
 687   filevec[0].suffix_begin = p0;
 688   filevec[1].suffix_begin = p1;
 689
 690   /* Calculate number of lines of prefix to save.
 691
 692      prefix_count == 0 means save the whole prefix;
 693      we need this for options like -D that output the whole file,
 694      or for enormous contexts (to avoid worrying about arithmetic overflow).
 695      We also need it for options like -F that output some preceding line;
 696      at least we will need to find the last few lines,
 697      but since we don't know how many, it's easiest to find them all.
 698
 699      Otherwise, prefix_count != 0.  Save just prefix_count lines at start
 700      of the line buffer; they'll be moved to the proper location later.
 701      Handle 1 more line than the context says (because we count 1 too many),
 702      rounded up to the next power of 2 to speed index computation.  */
 703
 704   if (no_diff_means_no_output && ! function_regexp.fastmap
 705       && context < LIN_MAX / 4 && context < n0)
 706     {
 707       middle_guess = guess_lines (0, 0, p0 - filevec[0].prefix_end);
 708       suffix_guess = guess_lines (0, 0, buffer0 + n0 - p0);
 709       for (prefix_count = 1;  prefix_count <= context;  prefix_count *= 2)
 710         continue;
 711       alloc_lines0 = (prefix_count + middle_guess
 712                       + MIN (context, suffix_guess));
 713     }
 714   else
 715     {
 716       prefix_count = 0;
 717       alloc_lines0 = guess_lines (0, 0, n0);
 718     }
 719
 720   prefix_mask = prefix_count - 1;
 721   lines = 0;
 722   linbuf0 = xmalloc (alloc_lines0 * sizeof *linbuf0);
 723   p0 = buffer0;
 724
 725   /* If the prefix is needed, find the prefix lines.  */
 726   if (! (no_diff_means_no_output
 727          && filevec[0].prefix_end == p0
 728          && filevec[1].prefix_end == p1))
 729     {
 730       end0 = filevec[0].prefix_end;
 731       while (p0 != end0)
 732         {
 733           lin l = lines++ & prefix_mask;
 734           if (l == alloc_lines0)
 735             {
 736               if (PTRDIFF_MAX / (2 * sizeof *linbuf0) <= alloc_lines0)
 737                 xalloc_die ();
 738               alloc_lines0 *= 2;
 739               linbuf0 = xrealloc (linbuf0, alloc_lines0 * sizeof *linbuf0);
 740             }
 741           linbuf0[l] = p0;
 742           while (*p0++ != '\n')
 743             continue;
 744         }
 745     }
 746   buffered_prefix = prefix_count && context < lines ? context : lines;
 747
 748   /* Allocate line buffer 1.  */
 749
 750   middle_guess = guess_lines (lines, p0 - buffer0, p1 - filevec[1].prefix_end);
 751   suffix_guess = guess_lines (lines, p0 - buffer0, buffer1 + n1 - p1);
 752   alloc_lines1 = buffered_prefix + middle_guess + MIN (context, suffix_guess);
 753   if (alloc_lines1 < buffered_prefix
 754       || PTRDIFF_MAX / sizeof *linbuf1 <= alloc_lines1)
 755     xalloc_die ();
 756   linbuf1 = xmalloc (alloc_lines1 * sizeof *linbuf1);
 757
 758   if (buffered_prefix != lines)
 759     {
 760       /* Rotate prefix lines to proper location.  */
 761       for (i = 0;  i < buffered_prefix;  i++)
 762         linbuf1[i] = linbuf0[(lines - context + i) & prefix_mask];
 763       for (i = 0;  i < buffered_prefix;  i++)
 764         linbuf0[i] = linbuf1[i];
 765     }
 766
 767   /* Initialize line buffer 1 from line buffer 0.  */
 768   for (i = 0; i < buffered_prefix; i++)
 769     linbuf1[i] = linbuf0[i] - buffer0 + buffer1;
 770
 771   /* Record the line buffer, adjusted so that
 772      linbuf[0] points at the first differing line.  */
 773   filevec[0].linbuf = linbuf0 + buffered_prefix;
 774   filevec[1].linbuf = linbuf1 + buffered_prefix;
 775   filevec[0].linbuf_base = filevec[1].linbuf_base = - buffered_prefix;
 776   filevec[0].alloc_lines = alloc_lines0 - buffered_prefix;
 777   filevec[1].alloc_lines = alloc_lines1 - buffered_prefix;
 778   filevec[0].prefix_lines = filevec[1].prefix_lines = lines;
 779 }
 780 \f
 781 /* If 1 < k, then (2**k - prime_offset[k]) is the largest prime less
 782    than 2**k.  This table is derived from Chris K. Caldwell's list
 783    <http://www.utm.edu/research/primes/lists/2small/>.  */
 784
 785 static unsigned char const prime_offset[] =
 786 {
 787   0, 0, 1, 1, 3, 1, 3, 1, 5, 3, 3, 9, 3, 1, 3, 19, 15, 1, 5, 1, 3, 9, 3,
 788   15, 3, 39, 5, 39, 57, 3, 35, 1, 5, 9, 41, 31, 5, 25, 45, 7, 87, 21,
 789   11, 57, 17, 55, 21, 115, 59, 81, 27, 129, 47, 111, 33, 55, 5, 13, 27,
 790   55, 93, 1, 57, 25
 791 };
 792
 793 /* Verify that this host's size_t is not too wide for the above table.  */
 794
 795 verify (enough_prime_offsets,
 796         sizeof (size_t) * CHAR_BIT <= sizeof prime_offset);
 797
 798 /* Given a vector of two file_data objects, read the file associated
 799    with each one, and build the table of equivalence classes.
 800    Return nonzero if either file appears to be a binary file.
 801    If PRETEND_BINARY is nonzero, pretend they are binary regardless.  */
 802
 803 bool
 804 read_files (struct file_data filevec[], bool pretend_binary)
 805 {
 806   int i;
 807   bool skip_test = text | pretend_binary;
 808   bool appears_binary = pretend_binary | sip (&filevec[0], skip_test);
 809
 810   if (filevec[0].desc != filevec[1].desc)
 811     appears_binary |= sip (&filevec[1], skip_test | appears_binary);
 812   else
 813     {
 814       filevec[1].buffer = filevec[0].buffer;
 815       filevec[1].bufsize = filevec[0].bufsize;
 816       filevec[1].buffered = filevec[0].buffered;
 817     }
 818   if (appears_binary)
 819     {
 820       set_binary_mode (filevec[0].desc, 1);
 821       set_binary_mode (filevec[1].desc, 1);
 822       return 1;
 823     }
 824
 825   find_identical_ends (filevec);
 826
 827   equivs_alloc = filevec[0].alloc_lines + filevec[1].alloc_lines + 1;
 828   if (PTRDIFF_MAX / sizeof *equivs <= equivs_alloc)
 829     xalloc_die ();
 830   equivs = xmalloc (equivs_alloc * sizeof *equivs);
 831   /* Equivalence class 0 is permanently safe for lines that were not
 832      hashed.  Real equivalence classes start at 1.  */
 833   equivs_index = 1;
 834
 835   /* Allocate (one plus) a prime number of hash buckets.  Use a prime
 836      number between 1/3 and 2/3 of the value of equiv_allocs,
 837      approximately.  */
 838   for (i = 9; (size_t) 1 << i < equivs_alloc / 3; i++)
 839     continue;
 840   nbuckets = ((size_t) 1 << i) - prime_offset[i];
 841   if (PTRDIFF_MAX / sizeof *buckets <= nbuckets)
 842     xalloc_die ();
 843   buckets = zalloc ((nbuckets + 1) * sizeof *buckets);
 844   buckets++;
 845
 846   for (i = 0; i < 2; i++)
 847     find_and_hash_each_line (&filevec[i]);
 848
 849   filevec[0].equiv_max = filevec[1].equiv_max = equivs_index;
 850
 851   free (equivs);
 852   free (buckets - 1);
 853
 854   return 0;
 855 }