sys/i386/i386/in_cksum.c

   1 /*-
   2  * Copyright (c) 1990 The Regents of the University of California.
   3  * All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  * 3. All advertising materials mentioning features or use of this software
  14  *    must display the following acknowledgement:
  15  *      This product includes software developed by the University of
  16  *      California, Berkeley and its contributors.
  17  * 4. Neither the name of the University nor the names of its contributors
  18  *    may be used to endorse or promote products derived from this software
  19  *    without specific prior written permission.
  20  *
  21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  31  * SUCH DAMAGE.
  32  *
  33  *      from tahoe:     in_cksum.c      1.2     86/01/05
  34  *      from:           @(#)in_cksum.c  1.3 (Berkeley) 1/19/91
  35  * $FreeBSD: src/sys/i386/i386/in_cksum.c,v 1.17.2.3 2002/07/02 04:03:00 jdp Exp $
  36  */
  37
  38 #include <sys/param.h>
  39 #include <sys/systm.h>
  40 #include <sys/mbuf.h>
  41
  42 #include <netinet/in.h>
  43 #include <netinet/in_systm.h>
  44 #include <netinet/ip.h>
  45
  46 #include <machine/in_cksum.h>
  47
/*
 * Checksum routine for Internet Protocol family headers.
 *
 * This routine is very heavily used in the network
 * code and should be modified for each CPU to be as fast as possible.
 *
 * This implementation is the 386 version.
 */
  56
  57 #undef  ADDCARRY
  58 #define ADDCARRY(x)     if ((x) > 0xffff) (x) -= 0xffff
  59 #define REDUCE          {sum = (sum & 0xffff) + (sum >> 16); ADDCARRY(sum);}
  60
  61 /*
  62  * These asm statements require __volatile because they pass information
  63  * via the condition codes.  GCC does not currently provide a way to specify
  64  * the condition codes as an input or output operand.
  65  *
  66  * The LOAD macro below is effectively a prefetch into cache.  GCC will
  67  * load the value into a register but will not use it.  Since modern CPUs
  68  * reorder operations, this will generally take place in parallel with
  69  * other calculations.
  70  */
  71 #define ADD(n)  __asm __volatile \
  72                 ("addl %1, %0" : "+r" (sum) : \
  73                 "g" (((const u_int32_t *)w)[n / 4]))
  74 #define ADDC(n) __asm __volatile \
  75                 ("adcl %1, %0" : "+r" (sum) : \
  76                 "g" (((const u_int32_t *)w)[n / 4]))
  77 #define LOAD(n) __asm __volatile \
  78                 ("" : : "r" (((const u_int32_t *)w)[n / 4]))
  79 #define MOP     __asm __volatile \
  80                 ("adcl         $0, %0" : "+r" (sum))
  81
  82 int
  83 in_cksum(m, len)
  84         register struct mbuf *m;
  85         register int len;
  86 {
  87         register u_short *w;
  88         register unsigned sum = 0;
  89         register int mlen = 0;
  90         int byte_swapped = 0;
  91         union { char    c[2]; u_short   s; } su;
  92
  93         for (;m && len; m = m->m_next) {
  94                 if (m->m_len == 0)
  95                         continue;
  96                 w = mtod(m, u_short *);
  97                 if (mlen == -1) {
  98                         /*
  99                          * The first byte of this mbuf is the continuation
 100                          * of a word spanning between this mbuf and the
 101                          * last mbuf.
 102                          */
 103
 104                         /* su.c[0] is already saved when scanning previous
 105                          * mbuf.  sum was REDUCEd when we found mlen == -1
 106                          */
 107                         su.c[1] = *(u_char *)w;
 108                         sum += su.s;
 109                         w = (u_short *)((char *)w + 1);
 110                         mlen = m->m_len - 1;
 111                         len--;
 112                 } else
 113                         mlen = m->m_len;
 114                 if (len < mlen)
 115                         mlen = len;
 116                 len -= mlen;
 117                 /*
 118                  * Force to long boundary so we do longword aligned
 119                  * memory operations
 120                  */
 121                 if (3 & (int) w) {
 122                         REDUCE;
 123                         if ((1 & (int) w) && (mlen > 0)) {
 124                                 sum <<= 8;
 125                                 su.c[0] = *(char *)w;
 126                                 w = (u_short *)((char *)w + 1);
 127                                 mlen--;
 128                                 byte_swapped = 1;
 129                         }
 130                         if ((2 & (int) w) && (mlen >= 2)) {
 131                                 sum += *w++;
 132                                 mlen -= 2;
 133                         }
 134                 }
 135                 /*
 136                  * Advance to a 486 cache line boundary.
 137                  */
 138                 if (4 & (int) w && mlen >= 4) {
 139                         ADD(0);
 140                         MOP;
 141                         w += 2;
 142                         mlen -= 4;
 143                 }
 144                 if (8 & (int) w && mlen >= 8) {
 145                         ADD(0);
 146                         ADDC(4);
 147                         MOP;
 148                         w += 4;
 149                         mlen -= 8;
 150                 }
 151                 /*
 152                  * Do as much of the checksum as possible 32 bits at at time.
 153                  * In fact, this loop is unrolled to make overhead from
 154                  * branches &c small.
 155                  */
 156                 mlen -= 1;
 157                 while ((mlen -= 32) >= 0) {
 158                         /*
 159                          * Add with carry 16 words and fold in the last
 160                          * carry by adding a 0 with carry.
 161                          *
 162                          * The early ADD(16) and the LOAD(32) are to load
 163                          * the next 2 cache lines in advance on 486's.  The
 164                          * 486 has a penalty of 2 clock cycles for loading
 165                          * a cache line, plus whatever time the external
 166                          * memory takes to load the first word(s) addressed.
 167                          * These penalties are unavoidable.  Subsequent
 168                          * accesses to a cache line being loaded (and to
 169                          * other external memory?) are delayed until the
 170                          * whole load finishes.  These penalties are mostly
 171                          * avoided by not accessing external memory for
 172                          * 8 cycles after the ADD(16) and 12 cycles after
 173                          * the LOAD(32).  The loop terminates when mlen
 174                          * is initially 33 (not 32) to guaranteed that
 175                          * the LOAD(32) is within bounds.
 176                          */
 177                         ADD(16);
 178                         ADDC(0);
 179                         ADDC(4);
 180                         ADDC(8);
 181                         ADDC(12);
 182                         LOAD(32);
 183                         ADDC(20);
 184                         ADDC(24);
 185                         ADDC(28);
 186                         MOP;
 187                         w += 16;
 188                 }
 189                 mlen += 32 + 1;
 190                 if (mlen >= 32) {
 191                         ADD(16);
 192                         ADDC(0);
 193                         ADDC(4);
 194                         ADDC(8);
 195                         ADDC(12);
 196                         ADDC(20);
 197                         ADDC(24);
 198                         ADDC(28);
 199                         MOP;
 200                         w += 16;
 201                         mlen -= 32;
 202                 }
 203                 if (mlen >= 16) {
 204                         ADD(0);
 205                         ADDC(4);
 206                         ADDC(8);
 207                         ADDC(12);
 208                         MOP;
 209                         w += 8;
 210                         mlen -= 16;
 211                 }
 212                 if (mlen >= 8) {
 213                         ADD(0);
 214                         ADDC(4);
 215                         MOP;
 216                         w += 4;
 217                         mlen -= 8;
 218                 }
 219                 if (mlen == 0 && byte_swapped == 0)
 220                         continue;       /* worth 1% maybe ?? */
 221                 REDUCE;
 222                 while ((mlen -= 2) >= 0) {
 223                         sum += *w++;
 224                 }
 225                 if (byte_swapped) {
 226                         sum <<= 8;
 227                         byte_swapped = 0;
 228                         if (mlen == -1) {
 229                                 su.c[1] = *(char *)w;
 230                                 sum += su.s;
 231                                 mlen = 0;
 232                         } else
 233                                 mlen = -1;
 234                 } else if (mlen == -1)
 235                         /*
 236                          * This mbuf has odd number of bytes.
 237                          * There could be a word split betwen
 238                          * this mbuf and the next mbuf.
 239                          * Save the last byte (to prepend to next mbuf).
 240                          */
 241                         su.c[0] = *(char *)w;
 242         }
 243
 244         if (len)
 245                 printf("%s: out of data by %d\n", __func__, len);
 246         if (mlen == -1) {
 247                 /* The last mbuf has odd # of bytes. Follow the
 248                    standard (the odd byte is shifted left by 8 bits) */
 249                 su.c[1] = 0;
 250                 sum += su.s;
 251         }
 252         REDUCE;
 253         return (~sum & 0xffff);
 254 }
 255
 256 u_short
 257 in_cksum_skip(m, len, skip)
 258         struct mbuf *m;
 259         int len;
 260         int skip;
 261 {
 262         register u_short *w;
 263         register unsigned sum = 0;
 264         register int mlen = 0;
 265         int byte_swapped = 0;
 266         union { char    c[2]; u_short   s; } su;
 267
 268         len -= skip;
 269         for (; skip && m; m = m->m_next) {
 270                 if (m->m_len > skip) {
 271                         mlen = m->m_len - skip;
 272                         w = (u_short *)(mtod(m, u_char *) + skip);
 273                         goto skip_start;
 274                 } else {
 275                         skip -= m->m_len;
 276                 }
 277         }
 278
 279         for (;m && len; m = m->m_next) {
 280                 if (m->m_len == 0)
 281                         continue;
 282                 w = mtod(m, u_short *);
 283                 if (mlen == -1) {
 284                         /*
 285                          * The first byte of this mbuf is the continuation
 286                          * of a word spanning between this mbuf and the
 287                          * last mbuf.
 288                          */
 289
 290                         /* su.c[0] is already saved when scanning previous
 291                          * mbuf.  sum was REDUCEd when we found mlen == -1
 292                          */
 293                         su.c[1] = *(u_char *)w;
 294                         sum += su.s;
 295                         w = (u_short *)((char *)w + 1);
 296                         mlen = m->m_len - 1;
 297                         len--;
 298                 } else
 299                         mlen = m->m_len;
 300 skip_start:
 301                 if (len < mlen)
 302                         mlen = len;
 303                 len -= mlen;
 304                 /*
 305                  * Force to long boundary so we do longword aligned
 306                  * memory operations
 307                  */
 308                 if (3 & (int) w) {
 309                         REDUCE;
 310                         if ((1 & (int) w) && (mlen > 0)) {
 311                                 sum <<= 8;
 312                                 su.c[0] = *(char *)w;
 313                                 w = (u_short *)((char *)w + 1);
 314                                 mlen--;
 315                                 byte_swapped = 1;
 316                         }
 317                         if ((2 & (int) w) && (mlen >= 2)) {
 318                                 sum += *w++;
 319                                 mlen -= 2;
 320                         }
 321                 }
 322                 /*
 323                  * Advance to a 486 cache line boundary.
 324                  */
 325                 if (4 & (int) w && mlen >= 4) {
 326                         ADD(0);
 327                         MOP;
 328                         w += 2;
 329                         mlen -= 4;
 330                 }
 331                 if (8 & (int) w && mlen >= 8) {
 332                         ADD(0);
 333                         ADDC(4);
 334                         MOP;
 335                         w += 4;
 336                         mlen -= 8;
 337                 }
 338                 /*
 339                  * Do as much of the checksum as possible 32 bits at at time.
 340                  * In fact, this loop is unrolled to make overhead from
 341                  * branches &c small.
 342                  */
 343                 mlen -= 1;
 344                 while ((mlen -= 32) >= 0) {
 345                         /*
 346                          * Add with carry 16 words and fold in the last
 347                          * carry by adding a 0 with carry.
 348                          *
 349                          * The early ADD(16) and the LOAD(32) are to load
 350                          * the next 2 cache lines in advance on 486's.  The
 351                          * 486 has a penalty of 2 clock cycles for loading
 352                          * a cache line, plus whatever time the external
 353                          * memory takes to load the first word(s) addressed.
 354                          * These penalties are unavoidable.  Subsequent
 355                          * accesses to a cache line being loaded (and to
 356                          * other external memory?) are delayed until the
 357                          * whole load finishes.  These penalties are mostly
 358                          * avoided by not accessing external memory for
 359                          * 8 cycles after the ADD(16) and 12 cycles after
 360                          * the LOAD(32).  The loop terminates when mlen
 361                          * is initially 33 (not 32) to guaranteed that
 362                          * the LOAD(32) is within bounds.
 363                          */
 364                         ADD(16);
 365                         ADDC(0);
 366                         ADDC(4);
 367                         ADDC(8);
 368                         ADDC(12);
 369                         LOAD(32);
 370                         ADDC(20);
 371                         ADDC(24);
 372                         ADDC(28);
 373                         MOP;
 374                         w += 16;
 375                 }
 376                 mlen += 32 + 1;
 377                 if (mlen >= 32) {
 378                         ADD(16);
 379                         ADDC(0);
 380                         ADDC(4);
 381                         ADDC(8);
 382                         ADDC(12);
 383                         ADDC(20);
 384                         ADDC(24);
 385                         ADDC(28);
 386                         MOP;
 387                         w += 16;
 388                         mlen -= 32;
 389                 }
 390                 if (mlen >= 16) {
 391                         ADD(0);
 392                         ADDC(4);
 393                         ADDC(8);
 394                         ADDC(12);
 395                         MOP;
 396                         w += 8;
 397                         mlen -= 16;
 398                 }
 399                 if (mlen >= 8) {
 400                         ADD(0);
 401                         ADDC(4);
 402                         MOP;
 403                         w += 4;
 404                         mlen -= 8;
 405                 }
 406                 if (mlen == 0 && byte_swapped == 0)
 407                         continue;       /* worth 1% maybe ?? */
 408                 REDUCE;
 409                 while ((mlen -= 2) >= 0) {
 410                         sum += *w++;
 411                 }
 412                 if (byte_swapped) {
 413                         sum <<= 8;
 414                         byte_swapped = 0;
 415                         if (mlen == -1) {
 416                                 su.c[1] = *(char *)w;
 417                                 sum += su.s;
 418                                 mlen = 0;
 419                         } else
 420                                 mlen = -1;
 421                 } else if (mlen == -1)
 422                         /*
 423                          * This mbuf has odd number of bytes.
 424                          * There could be a word split betwen
 425                          * this mbuf and the next mbuf.
 426                          * Save the last byte (to prepend to next mbuf).
 427                          */
 428                         su.c[0] = *(char *)w;
 429         }
 430
 431         if (len)
 432                 printf("%s: out of data by %d\n", __func__, len);
 433         if (mlen == -1) {
 434                 /* The last mbuf has odd # of bytes. Follow the
 435                    standard (the odd byte is shifted left by 8 bits) */
 436                 su.c[1] = 0;
 437                 sum += su.s;
 438         }
 439         REDUCE;
 440         return (~sum & 0xffff);
 441 }
 442
 443 /*
 444  * This is the exact same algorithm as above with a few exceptions:
 445  * (1) it is designed to operate on buffers, not mbufs
 446  * (2) it returns an intermediate form of the sum which has to be
 447  *     explicitly finalized (but this can be delayed)
 448  * (3) it accepts an intermediate sum
 449  *
 450  * This is particularly useful when building packets quickly,
 451  * since one can compute the checksum of the pseudoheader ahead of
 452  * time and then use this function to complete the work.  That way,
 453  * the pseudoheader never actually has to exist in the packet buffer,
 454  * which avoids needless duplication of work.
 455  */
 456 in_psum_t
 457 in_cksum_partial(psum, w, len)
 458         in_psum_t psum;
 459         const u_short *w;
 460         int len;
 461 {
 462         register in_psum_t sum = psum;
 463         int byte_swapped = 0;
 464         union { char    c[2]; u_short   s; } su;
 465
 466         /*
 467          * Force to long boundary so we do longword aligned
 468          * memory operations
 469          */
 470         if (3 & (int) w) {
 471                 REDUCE;
 472                 if ((1 & (int) w) && (len > 0)) {
 473                         sum <<= 8;
 474                         su.c[0] = *(const char *)w;
 475                         w = (const u_short *)((const char *)w + 1);
 476                         len--;
 477                         byte_swapped = 1;
 478                 }
 479                 if ((2 & (int) w) && (len >= 2)) {
 480                         sum += *w++;
 481                         len -= 2;
 482                 }
 483         }
 484         /*
 485          * Advance to a 486 cache line boundary.
 486          */
 487         if (4 & (int) w && len >= 4) {
 488                 ADD(0);
 489                 MOP;
 490                 w += 2;
 491                 len -= 4;
 492         }
 493         if (8 & (int) w && len >= 8) {
 494                 ADD(0);
 495                 ADDC(4);
 496                 MOP;
 497                 w += 4;
 498                 len -= 8;
 499         }
 500         /*
 501          * Do as much of the checksum as possible 32 bits at at time.
 502          * In fact, this loop is unrolled to make overhead from
 503          * branches &c small.
 504          */
 505         len -= 1;
 506         while ((len -= 32) >= 0) {
 507                 /*
 508                  * Add with carry 16 words and fold in the last
 509                  * carry by adding a 0 with carry.
 510                  *
 511                  * The early ADD(16) and the LOAD(32) are to load
 512                  * the next 2 cache lines in advance on 486's.  The
 513                  * 486 has a penalty of 2 clock cycles for loading
 514                  * a cache line, plus whatever time the external
 515                  * memory takes to load the first word(s) addressed.
 516                  * These penalties are unavoidable.  Subsequent
 517                  * accesses to a cache line being loaded (and to
 518                  * other external memory?) are delayed until the
 519                  * whole load finishes.  These penalties are mostly
 520                  * avoided by not accessing external memory for
 521                  * 8 cycles after the ADD(16) and 12 cycles after
 522                  * the LOAD(32).  The loop terminates when len
 523                  * is initially 33 (not 32) to guaranteed that
 524                  * the LOAD(32) is within bounds.
 525                  */
 526                 ADD(16);
 527                 ADDC(0);
 528                 ADDC(4);
 529                 ADDC(8);
 530                 ADDC(12);
 531                 LOAD(32);
 532                 ADDC(20);
 533                 ADDC(24);
 534                 ADDC(28);
 535                 MOP;
 536                 w += 16;
 537         }
 538         len += 32 + 1;
 539         if (len >= 32) {
 540                 ADD(16);
 541                 ADDC(0);
 542                 ADDC(4);
 543                 ADDC(8);
 544                 ADDC(12);
 545                 ADDC(20);
 546                 ADDC(24);
 547                 ADDC(28);
 548                 MOP;
 549                 w += 16;
 550                 len -= 32;
 551         }
 552         if (len >= 16) {
 553                 ADD(0);
 554                 ADDC(4);
 555                 ADDC(8);
 556                 ADDC(12);
 557                 MOP;
 558                 w += 8;
 559                 len -= 16;
 560         }
 561         if (len >= 8) {
 562                 ADD(0);
 563                 ADDC(4);
 564                 MOP;
 565                 w += 4;
 566                 len -= 8;
 567         }
 568         if (len == 0 && byte_swapped == 0)
 569                 goto out;
 570         REDUCE;
 571         while ((len -= 2) >= 0) {
 572                 sum += *w++;
 573         }
 574         if (byte_swapped) {
 575                 sum <<= 8;
 576                 byte_swapped = 0;
 577                 if (len == -1) {
 578                         su.c[1] = *(const char *)w;
 579                         sum += su.s;
 580                         len = 0;
 581                 } else
 582                         len = -1;
 583         } else if (len == -1) {
 584                 /*
 585                  * This buffer has odd number of bytes.
 586                  * There could be a word split betwen
 587                  * this buffer and the next.
 588                  */
 589                 su.c[0] = *(const char *)w;
 590         }
 591 out:
 592         if (len == -1) {
 593                 /* The last buffer has odd # of bytes. Follow the
 594                    standard (the odd byte is shifted left by 8 bits) */
 595                 su.c[1] = 0;
 596                 sum += su.s;
 597         }
 598         return sum;
 599 }
 600
 601 int
 602 in_cksum_finalize(psum)
 603         in_psum_t psum;
 604 {
 605         in_psum_t sum = psum;
 606         REDUCE;
 607         return (~sum & 0xffff);
 608 }