contrib/diffutils/src/io.c

   1 /* File I/O for GNU DIFF.
   2
   3    Copyright (C) 1988-1989, 1992-1995, 1998, 2001-2002, 2004, 2006, 2009-2011
   4    Free Software Foundation, Inc.
   5
   6    This file is part of GNU DIFF.
   7
   8    This program is free software: you can redistribute it and/or modify
   9    it under the terms of the GNU General Public License as published by
  10    the Free Software Foundation, either version 3 of the License, or
  11    (at your option) any later version.
  12
  13    This program is distributed in the hope that it will be useful,
  14    but WITHOUT ANY WARRANTY; without even the implied warranty of
  15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16    GNU General Public License for more details.
  17
  18    You should have received a copy of the GNU General Public License
  19    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  20
  21 #include "diff.h"
  22 #include <cmpbuf.h>
  23 #include <file-type.h>
  24 #include <xalloc.h>
  25
  26 /* Rotate an unsigned value to the left.  */
  27 #define ROL(v, n) ((v) << (n) | (v) >> (sizeof (v) * CHAR_BIT - (n)))
  28
  29 /* Given a hash value and a new character, return a new hash value.  */
  30 #define HASH(h, c) ((c) + ROL (h, 7))
  31 \f
  32 /* The type of a hash value.  */
  33 typedef size_t hash_value;
  34 verify (! TYPE_SIGNED (hash_value));
  35
  36 /* Lines are put into equivalence classes of lines that match in lines_differ.
  37    Each equivalence class is represented by one of these structures,
  38    but only while the classes are being computed.
  39    Afterward, each class is represented by a number.  */
  40 struct equivclass
  41 {
  42   lin next;             /* Next item in this bucket.  */
  43   hash_value hash;      /* Hash of lines in this class.  */
  44   char const *line;     /* A line that fits this class.  */
  45   size_t length;        /* That line's length, not counting its newline.  */
  46 };
  47
  48 /* Hash-table: array of buckets, each being a chain of equivalence classes.
  49    buckets[-1] is reserved for incomplete lines.  */
  50 static lin *buckets;
  51
  52 /* Number of buckets in the hash table array, not counting buckets[-1].  */
  53 static size_t nbuckets;
  54
  55 /* Array in which the equivalence classes are allocated.
  56    The bucket-chains go through the elements in this array.
  57    The number of an equivalence class is its index in this array.  */
  58 static struct equivclass *equivs;
  59
  60 /* Index of first free element in the array `equivs'.  */
  61 static lin equivs_index;
  62
  63 /* Number of elements allocated in the array `equivs'.  */
  64 static lin equivs_alloc;
  65 \f
  66 /* Read a block of data into a file buffer, checking for EOF and error.  */
  67
  68 void
  69 file_block_read (struct file_data *current, size_t size)
  70 {
  71   if (size && ! current->eof)
  72     {
  73       size_t s = block_read (current->desc,
  74                              FILE_BUFFER (current) + current->buffered, size);
  75       if (s == SIZE_MAX)
  76         pfatal_with_name (current->name);
  77       current->buffered += s;
  78       current->eof = s < size;
  79     }
  80 }
  81 \f
  82 /* Check for binary files and compare them for exact identity.  */
  83
  84 /* Return 1 if BUF contains a non text character.
  85    SIZE is the number of characters in BUF.  */
  86
  87 #define binary_file_p(buf, size) (memchr (buf, 0, size) != 0)
  88
  89 /* Get ready to read the current file.
  90    Return nonzero if SKIP_TEST is zero,
  91    and if it appears to be a binary file.  */
  92
  93 static bool
  94 sip (struct file_data *current, bool skip_test)
  95 {
  96   /* If we have a nonexistent file at this stage, treat it as empty.  */
  97   if (current->desc < 0)
  98     {
  99       /* Leave room for a sentinel.  */
 100       current->bufsize = sizeof (word);
 101       current->buffer = xmalloc (current->bufsize);
 102     }
 103   else
 104     {
 105       current->bufsize = buffer_lcm (sizeof (word),
 106                                      STAT_BLOCKSIZE (current->stat),
 107                                      PTRDIFF_MAX - 2 * sizeof (word));
 108       current->buffer = xmalloc (current->bufsize);
 109
 110       if (! skip_test)
 111         {
 112           /* Check first part of file to see if it's a binary file.  */
 113
 114           /* FIXME: if O_BINARY, this should revert to text mode
 115              if the file is not binary.  */
 116
 117           file_block_read (current, current->bufsize);
 118           return binary_file_p (current->buffer, current->buffered);
 119         }
 120     }
 121
 122   current->buffered = 0;
 123   current->eof = false;
 124   return false;
 125 }
 126
 127 /* Slurp the rest of the current file completely into memory.  */
 128
 129 static void
 130 slurp (struct file_data *current)
 131 {
 132   size_t cc;
 133
 134   if (current->desc < 0)
 135     {
 136       /* The file is nonexistent.  */
 137       return;
 138     }
 139
 140   if (S_ISREG (current->stat.st_mode))
 141     {
 142       /* It's a regular file; slurp in the rest all at once.  */
 143
 144       /* Get the size out of the stat block.
 145          Allocate just enough room for appended newline plus word sentinel,
 146          plus word-alignment since we want the buffer word-aligned.  */
 147       size_t file_size = current->stat.st_size;
 148       cc = file_size + 2 * sizeof (word) - file_size % sizeof (word);
 149       if (file_size != current->stat.st_size || cc < file_size
 150           || PTRDIFF_MAX <= cc)
 151         xalloc_die ();
 152
 153       if (current->bufsize < cc)
 154         {
 155           current->bufsize = cc;
 156           current->buffer = xrealloc (current->buffer, cc);
 157         }
 158
 159       /* Try to read at least 1 more byte than the size indicates, to
 160          detect whether the file is growing.  This is a nicety for
 161          users who run 'diff' on files while they are changing.  */
 162
 163       if (current->buffered <= file_size)
 164         {
 165           file_block_read (current, file_size + 1 - current->buffered);
 166           if (current->buffered <= file_size)
 167             return;
 168         }
 169     }
 170
 171   /* It's not a regular file, or it's a growing regular file; read it,
 172      growing the buffer as needed.  */
 173
 174   file_block_read (current, current->bufsize - current->buffered);
 175
 176   if (current->buffered)
 177     {
 178       while (current->buffered == current->bufsize)
 179         {
 180           if (PTRDIFF_MAX / 2 - sizeof (word) < current->bufsize)
 181             xalloc_die ();
 182           current->bufsize *= 2;
 183           current->buffer = xrealloc (current->buffer, current->bufsize);
 184           file_block_read (current, current->bufsize - current->buffered);
 185         }
 186
 187       /* Allocate just enough room for appended newline plus word
 188          sentinel, plus word-alignment.  */
 189       cc = current->buffered + 2 * sizeof (word);
 190       current->bufsize = cc - cc % sizeof (word);
 191       current->buffer = xrealloc (current->buffer, current->bufsize);
 192     }
 193 }
 194 \f
 195 /* Split the file into lines, simultaneously computing the equivalence
 196    class for each line.  */
 197
 198 static void
 199 find_and_hash_each_line (struct file_data *current)
 200 {
 201   char const *p = current->prefix_end;
 202   lin i, *bucket;
 203   size_t length;
 204
 205   /* Cache often-used quantities in local variables to help the compiler.  */
 206   char const **linbuf = current->linbuf;
 207   lin alloc_lines = current->alloc_lines;
 208   lin line = 0;
 209   lin linbuf_base = current->linbuf_base;
 210   lin *cureqs = xmalloc (alloc_lines * sizeof *cureqs);
 211   struct equivclass *eqs = equivs;
 212   lin eqs_index = equivs_index;
 213   lin eqs_alloc = equivs_alloc;
 214   char const *suffix_begin = current->suffix_begin;
 215   char const *bufend = FILE_BUFFER (current) + current->buffered;
 216   bool ig_case = ignore_case;
 217   enum DIFF_white_space ig_white_space = ignore_white_space;
 218   bool diff_length_compare_anyway =
 219     ig_white_space != IGNORE_NO_WHITE_SPACE;
 220   bool same_length_diff_contents_compare_anyway =
 221     diff_length_compare_anyway | ig_case;
 222
 223   while (p < suffix_begin)
 224     {
 225       char const *ip = p;
 226       hash_value h = 0;
 227       unsigned char c;
 228
 229       /* Hash this line until we find a newline.  */
 230       switch (ig_white_space)
 231         {
 232         case IGNORE_ALL_SPACE:
 233           while ((c = *p++) != '\n')
 234             if (! isspace (c))
 235               h = HASH (h, ig_case ? tolower (c) : c);
 236           break;
 237
 238         case IGNORE_SPACE_CHANGE:
 239           while ((c = *p++) != '\n')
 240             {
 241               if (isspace (c))
 242                 {
 243                   do
 244                     if ((c = *p++) == '\n')
 245                       goto hashing_done;
 246                   while (isspace (c));
 247
 248                   h = HASH (h, ' ');
 249                 }
 250
 251               /* C is now the first non-space.  */
 252               h = HASH (h, ig_case ? tolower (c) : c);
 253             }
 254           break;
 255
 256         case IGNORE_TAB_EXPANSION:
 257         case IGNORE_TAB_EXPANSION_AND_TRAILING_SPACE:
 258         case IGNORE_TRAILING_SPACE:
 259           {
 260             size_t column = 0;
 261             while ((c = *p++) != '\n')
 262               {
 263                 if (ig_white_space & IGNORE_TRAILING_SPACE
 264                     && isspace (c))
 265                   {
 266                     char const *p1 = p;
 267                     unsigned char c1;
 268                     do
 269                       if ((c1 = *p1++) == '\n')
 270                         {
 271                           p = p1;
 272                           goto hashing_done;
 273                         }
 274                     while (isspace (c1));
 275                   }
 276
 277                 size_t repetitions = 1;
 278
 279                 if (ig_white_space & IGNORE_TAB_EXPANSION)
 280                   switch (c)
 281                     {
 282                     case '\b':
 283                       column -= 0 < column;
 284                       break;
 285
 286                     case '\t':
 287                       c = ' ';
 288                       repetitions = tabsize - column % tabsize;
 289                       column = (column + repetitions < column
 290                                 ? 0
 291                                 : column + repetitions);
 292                       break;
 293
 294                     case '\r':
 295                       column = 0;
 296                       break;
 297
 298                     default:
 299                       column++;
 300                       break;
 301                     }
 302
 303                 if (ig_case)
 304                   c = tolower (c);
 305
 306                 do
 307                   h = HASH (h, c);
 308                 while (--repetitions != 0);
 309               }
 310           }
 311           break;
 312
 313         default:
 314           if (ig_case)
 315             while ((c = *p++) != '\n')
 316               h = HASH (h, tolower (c));
 317           else
 318             while ((c = *p++) != '\n')
 319               h = HASH (h, c);
 320           break;
 321         }
 322
 323    hashing_done:;
 324
 325       bucket = &buckets[h % nbuckets];
 326       length = p - ip - 1;
 327
 328       if (p == bufend
 329           && current->missing_newline
 330           && ROBUST_OUTPUT_STYLE (output_style))
 331         {
 332           /* The last line is incomplete and we do not silently
 333              complete lines.  If the line cannot compare equal to any
 334              complete line, put it into buckets[-1] so that it can
 335              compare equal only to the other file's incomplete line
 336              (if one exists).  */
 337           if (ig_white_space < IGNORE_TRAILING_SPACE)
 338             bucket = &buckets[-1];
 339         }
 340
 341       for (i = *bucket;  ;  i = eqs[i].next)
 342         if (!i)
 343           {
 344             /* Create a new equivalence class in this bucket.  */
 345             i = eqs_index++;
 346             if (i == eqs_alloc)
 347               {
 348                 if (PTRDIFF_MAX / (2 * sizeof *eqs) <= eqs_alloc)
 349                   xalloc_die ();
 350                 eqs_alloc *= 2;
 351                 eqs = xrealloc (eqs, eqs_alloc * sizeof *eqs);
 352               }
 353             eqs[i].next = *bucket;
 354             eqs[i].hash = h;
 355             eqs[i].line = ip;
 356             eqs[i].length = length;
 357             *bucket = i;
 358             break;
 359           }
 360         else if (eqs[i].hash == h)
 361           {
 362             char const *eqline = eqs[i].line;
 363
 364             /* Reuse existing class if lines_differ reports the lines
 365                equal.  */
 366             if (eqs[i].length == length)
 367               {
 368                 /* Reuse existing equivalence class if the lines are identical.
 369                    This detects the common case of exact identity
 370                    faster than lines_differ would.  */
 371                 if (memcmp (eqline, ip, length) == 0)
 372                   break;
 373                 if (!same_length_diff_contents_compare_anyway)
 374                   continue;
 375               }
 376             else if (!diff_length_compare_anyway)
 377               continue;
 378
 379             if (! lines_differ (eqline, ip))
 380               break;
 381           }
 382
 383       /* Maybe increase the size of the line table.  */
 384       if (line == alloc_lines)
 385         {
 386           /* Double (alloc_lines - linbuf_base) by adding to alloc_lines.  */
 387           if (PTRDIFF_MAX / 3 <= alloc_lines
 388               || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base
 389               || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base)
 390             xalloc_die ();
 391           alloc_lines = 2 * alloc_lines - linbuf_base;
 392           cureqs = xrealloc (cureqs, alloc_lines * sizeof *cureqs);
 393           linbuf += linbuf_base;
 394           linbuf = xrealloc (linbuf,
 395                              (alloc_lines - linbuf_base) * sizeof *linbuf);
 396           linbuf -= linbuf_base;
 397         }
 398       linbuf[line] = ip;
 399       cureqs[line] = i;
 400       ++line;
 401     }
 402
 403   current->buffered_lines = line;
 404
 405   for (i = 0;  ;  i++)
 406     {
 407       /* Record the line start for lines in the suffix that we care about.
 408          Record one more line start than lines,
 409          so that we can compute the length of any buffered line.  */
 410       if (line == alloc_lines)
 411         {
 412           /* Double (alloc_lines - linbuf_base) by adding to alloc_lines.  */
 413           if (PTRDIFF_MAX / 3 <= alloc_lines
 414               || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base
 415               || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base)
 416             xalloc_die ();
 417           alloc_lines = 2 * alloc_lines - linbuf_base;
 418           linbuf += linbuf_base;
 419           linbuf = xrealloc (linbuf,
 420                              (alloc_lines - linbuf_base) * sizeof *linbuf);
 421           linbuf -= linbuf_base;
 422         }
 423       linbuf[line] = p;
 424
 425       if (p == bufend)
 426         {
 427           /* If the last line is incomplete and we do not silently
 428              complete lines, don't count its appended newline.  */
 429           if (current->missing_newline && ROBUST_OUTPUT_STYLE (output_style))
 430             linbuf[line]--;
 431           break;
 432         }
 433
 434       if (context <= i && no_diff_means_no_output)
 435         break;
 436
 437       line++;
 438
 439       while (*p++ != '\n')
 440         continue;
 441     }
 442
 443   /* Done with cache in local variables.  */
 444   current->linbuf = linbuf;
 445   current->valid_lines = line;
 446   current->alloc_lines = alloc_lines;
 447   current->equivs = cureqs;
 448   equivs = eqs;
 449   equivs_alloc = eqs_alloc;
 450   equivs_index = eqs_index;
 451 }
 452 \f
 453 /* Prepare the text.  Make sure the text end is initialized.
 454    Make sure text ends in a newline,
 455    but remember that we had to add one.
 456    Strip trailing CRs, if that was requested.  */
 457
 458 static void
 459 prepare_text (struct file_data *current)
 460 {
 461   size_t buffered = current->buffered;
 462   char *p = FILE_BUFFER (current);
 463   char *dst;
 464
 465   if (buffered == 0 || p[buffered - 1] == '\n')
 466     current->missing_newline = false;
 467   else
 468     {
 469       p[buffered++] = '\n';
 470       current->missing_newline = true;
 471     }
 472
 473   if (!p)
 474     return;
 475
 476   /* Don't use uninitialized storage when planting or using sentinels.  */
 477   memset (p + buffered, 0, sizeof (word));
 478
 479   if (strip_trailing_cr && (dst = memchr (p, '\r', buffered)))
 480     {
 481       char const *src = dst;
 482       char const *srclim = p + buffered;
 483
 484       do
 485         dst += ! ((*dst = *src++) == '\r' && *src == '\n');
 486       while (src < srclim);
 487
 488       buffered -= src - dst;
 489     }
 490
 491   current->buffered = buffered;
 492 }
 493 \f
 494 /* We have found N lines in a buffer of size S; guess the
 495    proportionate number of lines that will be found in a buffer of
 496    size T.  However, do not guess a number of lines so large that the
 497    resulting line table might cause overflow in size calculations.  */
 498 static lin
 499 guess_lines (lin n, size_t s, size_t t)
 500 {
 501   size_t guessed_bytes_per_line = n < 10 ? 32 : s / (n - 1);
 502   lin guessed_lines = MAX (1, t / guessed_bytes_per_line);
 503   return MIN (guessed_lines, PTRDIFF_MAX / (2 * sizeof (char *) + 1) - 5) + 5;
 504 }
 505
 506 /* Given a vector of two file_data objects, find the identical
 507    prefixes and suffixes of each object.  */
 508
 509 static void
 510 find_identical_ends (struct file_data filevec[])
 511 {
 512   word *w0, *w1;
 513   char *p0, *p1, *buffer0, *buffer1;
 514   char const *end0, *beg0;
 515   char const **linbuf0, **linbuf1;
 516   lin i, lines;
 517   size_t n0, n1;
 518   lin alloc_lines0, alloc_lines1;
 519   lin buffered_prefix, prefix_count, prefix_mask;
 520   lin middle_guess, suffix_guess;
 521
 522   slurp (&filevec[0]);
 523   prepare_text (&filevec[0]);
 524   if (filevec[0].desc != filevec[1].desc)
 525     {
 526       slurp (&filevec[1]);
 527       prepare_text (&filevec[1]);
 528     }
 529   else
 530     {
 531       filevec[1].buffer = filevec[0].buffer;
 532       filevec[1].bufsize = filevec[0].bufsize;
 533       filevec[1].buffered = filevec[0].buffered;
 534       filevec[1].missing_newline = filevec[0].missing_newline;
 535     }
 536
 537   /* Find identical prefix.  */
 538
 539   w0 = filevec[0].buffer;
 540   w1 = filevec[1].buffer;
 541   p0 = buffer0 = (char *) w0;
 542   p1 = buffer1 = (char *) w1;
 543   n0 = filevec[0].buffered;
 544   n1 = filevec[1].buffered;
 545
 546   if (p0 == p1)
 547     /* The buffers are the same; sentinels won't work.  */
 548     p0 = p1 += n1;
 549   else
 550     {
 551       /* Insert end sentinels, in this case characters that are guaranteed
 552          to make the equality test false, and thus terminate the loop.  */
 553
 554       if (n0 < n1)
 555         p0[n0] = ~p1[n0];
 556       else
 557         p1[n1] = ~p0[n1];
 558
 559       /* Loop until first mismatch, or to the sentinel characters.  */
 560
 561       /* Compare a word at a time for speed.  */
 562       while (*w0 == *w1)
 563         w0++, w1++;
 564
 565       /* Do the last few bytes of comparison a byte at a time.  */
 566       p0 = (char *) w0;
 567       p1 = (char *) w1;
 568       while (*p0 == *p1)
 569         p0++, p1++;
 570
 571       /* Don't mistakenly count missing newline as part of prefix.  */
 572       if (ROBUST_OUTPUT_STYLE (output_style)
 573           && ((buffer0 + n0 - filevec[0].missing_newline < p0)
 574               !=
 575               (buffer1 + n1 - filevec[1].missing_newline < p1)))
 576         p0--, p1--;
 577     }
 578
 579   /* Now P0 and P1 point at the first nonmatching characters.  */
 580
 581   /* Skip back to last line-beginning in the prefix,
 582      and then discard up to HORIZON_LINES lines from the prefix.  */
 583   i = horizon_lines;
 584   while (p0 != buffer0 && (p0[-1] != '\n' || i--))
 585     p0--, p1--;
 586
 587   /* Record the prefix.  */
 588   filevec[0].prefix_end = p0;
 589   filevec[1].prefix_end = p1;
 590
 591   /* Find identical suffix.  */
 592
 593   /* P0 and P1 point beyond the last chars not yet compared.  */
 594   p0 = buffer0 + n0;
 595   p1 = buffer1 + n1;
 596
 597   if (! ROBUST_OUTPUT_STYLE (output_style)
 598       || filevec[0].missing_newline == filevec[1].missing_newline)
 599     {
 600       end0 = p0;        /* Addr of last char in file 0.  */
 601
 602       /* Get value of P0 at which we should stop scanning backward:
 603          this is when either P0 or P1 points just past the last char
 604          of the identical prefix.  */
 605       beg0 = filevec[0].prefix_end + (n0 < n1 ? 0 : n0 - n1);
 606
 607       /* Scan back until chars don't match or we reach that point.  */
 608       while (p0 != beg0)
 609         if (*--p0 != *--p1)
 610           {
 611             /* Point at the first char of the matching suffix.  */
 612             ++p0, ++p1;
 613             beg0 = p0;
 614             break;
 615           }
 616
 617       /* Are we at a line-beginning in both files?  If not, add the rest of
 618          this line to the main body.  Discard up to HORIZON_LINES lines from
 619          the identical suffix.  Also, discard one extra line,
 620          because shift_boundaries may need it.  */
 621       i = horizon_lines + !((buffer0 == p0 || p0[-1] == '\n')
 622                             &&
 623                             (buffer1 == p1 || p1[-1] == '\n'));
 624       while (i-- && p0 != end0)
 625         while (*p0++ != '\n')
 626           continue;
 627
 628       p1 += p0 - beg0;
 629     }
 630
 631   /* Record the suffix.  */
 632   filevec[0].suffix_begin = p0;
 633   filevec[1].suffix_begin = p1;
 634
 635   /* Calculate number of lines of prefix to save.
 636
 637      prefix_count == 0 means save the whole prefix;
 638      we need this for options like -D that output the whole file,
 639      or for enormous contexts (to avoid worrying about arithmetic overflow).
 640      We also need it for options like -F that output some preceding line;
 641      at least we will need to find the last few lines,
 642      but since we don't know how many, it's easiest to find them all.
 643
 644      Otherwise, prefix_count != 0.  Save just prefix_count lines at start
 645      of the line buffer; they'll be moved to the proper location later.
 646      Handle 1 more line than the context says (because we count 1 too many),
 647      rounded up to the next power of 2 to speed index computation.  */
 648
 649   if (no_diff_means_no_output && ! function_regexp.fastmap
 650       && context < LIN_MAX / 4 && context < n0)
 651     {
 652       middle_guess = guess_lines (0, 0, p0 - filevec[0].prefix_end);
 653       suffix_guess = guess_lines (0, 0, buffer0 + n0 - p0);
 654       for (prefix_count = 1;  prefix_count <= context;  prefix_count *= 2)
 655         continue;
 656       alloc_lines0 = (prefix_count + middle_guess
 657                       + MIN (context, suffix_guess));
 658     }
 659   else
 660     {
 661       prefix_count = 0;
 662       alloc_lines0 = guess_lines (0, 0, n0);
 663     }
 664
 665   prefix_mask = prefix_count - 1;
 666   lines = 0;
 667   linbuf0 = xmalloc (alloc_lines0 * sizeof *linbuf0);
 668   p0 = buffer0;
 669
 670   /* If the prefix is needed, find the prefix lines.  */
 671   if (! (no_diff_means_no_output
 672          && filevec[0].prefix_end == p0
 673          && filevec[1].prefix_end == p1))
 674     {
 675       end0 = filevec[0].prefix_end;
 676       while (p0 != end0)
 677         {
 678           lin l = lines++ & prefix_mask;
 679           if (l == alloc_lines0)
 680             {
 681               if (PTRDIFF_MAX / (2 * sizeof *linbuf0) <= alloc_lines0)
 682                 xalloc_die ();
 683               alloc_lines0 *= 2;
 684               linbuf0 = xrealloc (linbuf0, alloc_lines0 * sizeof *linbuf0);
 685             }
 686           linbuf0[l] = p0;
 687           while (*p0++ != '\n')
 688             continue;
 689         }
 690     }
 691   buffered_prefix = prefix_count && context < lines ? context : lines;
 692
 693   /* Allocate line buffer 1.  */
 694
 695   middle_guess = guess_lines (lines, p0 - buffer0, p1 - filevec[1].prefix_end);
 696   suffix_guess = guess_lines (lines, p0 - buffer0, buffer1 + n1 - p1);
 697   alloc_lines1 = buffered_prefix + middle_guess + MIN (context, suffix_guess);
 698   if (alloc_lines1 < buffered_prefix
 699       || PTRDIFF_MAX / sizeof *linbuf1 <= alloc_lines1)
 700     xalloc_die ();
 701   linbuf1 = xmalloc (alloc_lines1 * sizeof *linbuf1);
 702
 703   if (buffered_prefix != lines)
 704     {
 705       /* Rotate prefix lines to proper location.  */
 706       for (i = 0;  i < buffered_prefix;  i++)
 707         linbuf1[i] = linbuf0[(lines - context + i) & prefix_mask];
 708       for (i = 0;  i < buffered_prefix;  i++)
 709         linbuf0[i] = linbuf1[i];
 710     }
 711
 712   /* Initialize line buffer 1 from line buffer 0.  */
 713   for (i = 0; i < buffered_prefix; i++)
 714     linbuf1[i] = linbuf0[i] - buffer0 + buffer1;
 715
 716   /* Record the line buffer, adjusted so that
 717      linbuf[0] points at the first differing line.  */
 718   filevec[0].linbuf = linbuf0 + buffered_prefix;
 719   filevec[1].linbuf = linbuf1 + buffered_prefix;
 720   filevec[0].linbuf_base = filevec[1].linbuf_base = - buffered_prefix;
 721   filevec[0].alloc_lines = alloc_lines0 - buffered_prefix;
 722   filevec[1].alloc_lines = alloc_lines1 - buffered_prefix;
 723   filevec[0].prefix_lines = filevec[1].prefix_lines = lines;
 724 }
 725 \f
 726 /* If 1 < k, then (2**k - prime_offset[k]) is the largest prime less
 727    than 2**k.  This table is derived from Chris K. Caldwell's list
 728    <http://www.utm.edu/research/primes/lists/2small/>.  */
 729
 730 static unsigned char const prime_offset[] =
 731 {
 732   0, 0, 1, 1, 3, 1, 3, 1, 5, 3, 3, 9, 3, 1, 3, 19, 15, 1, 5, 1, 3, 9, 3,
 733   15, 3, 39, 5, 39, 57, 3, 35, 1, 5, 9, 41, 31, 5, 25, 45, 7, 87, 21,
 734   11, 57, 17, 55, 21, 115, 59, 81, 27, 129, 47, 111, 33, 55, 5, 13, 27,
 735   55, 93, 1, 57, 25
 736 };
 737
 738 /* Verify that this host's size_t is not too wide for the above table.  */
 739
 740 verify (sizeof (size_t) * CHAR_BIT <= sizeof prime_offset);
 741
 742 /* Given a vector of two file_data objects, read the file associated
 743    with each one, and build the table of equivalence classes.
 744    Return nonzero if either file appears to be a binary file.
 745    If PRETEND_BINARY is nonzero, pretend they are binary regardless.  */
 746
 747 bool
 748 read_files (struct file_data filevec[], bool pretend_binary)
 749 {
 750   int i;
 751   bool skip_test = text | pretend_binary;
 752   bool appears_binary = pretend_binary | sip (&filevec[0], skip_test);
 753
 754   if (filevec[0].desc != filevec[1].desc)
 755     appears_binary |= sip (&filevec[1], skip_test | appears_binary);
 756   else
 757     {
 758       filevec[1].buffer = filevec[0].buffer;
 759       filevec[1].bufsize = filevec[0].bufsize;
 760       filevec[1].buffered = filevec[0].buffered;
 761     }
 762   if (appears_binary)
 763     {
 764       /* FIXME: If O_BINARY, this should set both files to binary mode.  */
 765       return true;
 766     }
 767
 768   find_identical_ends (filevec);
 769
 770   equivs_alloc = filevec[0].alloc_lines + filevec[1].alloc_lines + 1;
 771   if (PTRDIFF_MAX / sizeof *equivs <= equivs_alloc)
 772     xalloc_die ();
 773   equivs = xmalloc (equivs_alloc * sizeof *equivs);
 774   /* Equivalence class 0 is permanently safe for lines that were not
 775      hashed.  Real equivalence classes start at 1.  */
 776   equivs_index = 1;
 777
 778   /* Allocate (one plus) a prime number of hash buckets.  Use a prime
 779      number between 1/3 and 2/3 of the value of equiv_allocs,
 780      approximately.  */
 781   for (i = 9; (size_t) 1 << i < equivs_alloc / 3; i++)
 782     continue;
 783   nbuckets = ((size_t) 1 << i) - prime_offset[i];
 784   if (PTRDIFF_MAX / sizeof *buckets <= nbuckets)
 785     xalloc_die ();
 786   buckets = zalloc ((nbuckets + 1) * sizeof *buckets);
 787   buckets++;
 788
 789   for (i = 0; i < 2; i++)
 790     find_and_hash_each_line (&filevec[i]);
 791
 792   filevec[0].equiv_max = filevec[1].equiv_max = equivs_index;
 793
 794   free (equivs);
 795   free (buckets - 1);
 796
 797   return false;
 798 }