From 60dd8bbf5d17ffac927886fdc12049d6867a29eb Mon Sep 17 00:00:00 2001
From: Matthew Dillon
Date: Sat, 14 Feb 2004 02:09:28 +0000
Subject: [PATCH] Rewrite the IP checksum code. Get rid of all the inline
 assembly garbage, get rid of old APIs that are no longer used, and build a
 new 'core' checksum function in C capable of handling any alignment. Add an
 assembly procedure (in its own file) for handling large blocks of 32-bit
 aligned data. This new code is better suited for modern CPUs and should be
 at least as good as the previous code, and it is far more portable. Note
 that in_cksum_hdr() still assumes 32-bit alignment. This will be fixed soon.
---
 sys/conf/files.i386                  |   3 +-
 sys/cpu/i386/misc/in_cksum2.s        |  81 ++++
 sys/i386/i386/in_cksum.c             | 696 ++++++---------------------
 sys/i386/i386/in_cksum2.s            |  81 ++++
 sys/i386/include/in_cksum.h          | 104 ++--
 sys/netinet/igmp.c                   |   4 +-
 sys/netinet/in.h                     |   3 +-
 sys/netinet/ip_icmp.c                |   4 +-
 sys/platform/pc32/i386/in_cksum.c    | 696 ++++++---------------------
 sys/platform/pc32/include/in_cksum.h | 104 ++--
 10 files changed, 519 insertions(+), 1257 deletions(-)
 create mode 100644 sys/cpu/i386/misc/in_cksum2.s
 create mode 100644 sys/i386/i386/in_cksum2.s

diff --git a/sys/conf/files.i386 b/sys/conf/files.i386
index af9e721675..690ad90772 100644
--- a/sys/conf/files.i386
+++ b/sys/conf/files.i386
@@ -2,7 +2,7 @@
 # files marked standard are always included.
 #
 # $FreeBSD: src/sys/conf/files.i386,v 1.307.2.38 2003/01/02 20:41:33 kan Exp $
-# $DragonFly: src/sys/conf/Attic/files.i386,v 1.11 2004/02/13 21:15:12 joerg Exp $
+# $DragonFly: src/sys/conf/Attic/files.i386,v 1.12 2004/02/14 02:09:25 dillon Exp $
 #
 # The long compile-with and dependency lines are required because of
 # limitations in config: backslash-newline doesn't work in strings, and
@@ -196,6 +196,7 @@ i386/i386/i386-gdbstub.c optional ddb
 i386/i386/i686_mem.c standard
 i386/i386/identcpu.c standard
 i386/i386/in_cksum.c optional inet
+i386/i386/in_cksum2.s optional inet
 i386/i386/initcpu.c standard
 i386/i386/k6_mem.c standard
 # locore.s needs to be handled in Makefile to put it first. Otherwise it's
diff --git a/sys/cpu/i386/misc/in_cksum2.s b/sys/cpu/i386/misc/in_cksum2.s
new file mode 100644
index 0000000000..11255060e4
--- /dev/null
+++ b/sys/cpu/i386/misc/in_cksum2.s
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2003 Matthew Dillon
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $DragonFly: src/sys/cpu/i386/misc/in_cksum2.s,v 1.1 2004/02/14 02:09:26 dillon Exp $
+ */
+
+#include <machine/asmacros.h>		/* miscellaneous asm macros */
+#include
+#include
+
+#include "assym.s"
+
+	.text
+
+	/*
+	 * asm_ones32(32bitalignedbuffer, numberof32bitwords)
+	 *
+	 * Returns the 32 bit one's complement partial checksum. This is
+	 * basically a 1's complement checksum without the inversion (~)
+	 * at the end. A 32 bit value is returned. If the caller is
+	 * calculating a 16 bit 1's complement checksum the caller must
+	 * collapse the 32 bit return value via:
+	 *
+	 *	result = (result >> 16) + (result & 0xFFFF)
+	 *	if (result > 0xFFFF)
+	 *		result -= 0xFFFF;	<<< same as (result + 1) & 0xFFFF
+	 *					    within the range of result.
+	 * Note that worst case 0xFFFFFFFF + 0xFFFFFFFF = 0xFFFFFFFE + CARRY,
+	 * so no double-carry ever occurs.
+	 */
+	.p2align 4
+ENTRY(asm_ones32)
+	movl	4(%esp),%edx	/* %edx = buffer pointer */
+	movl	8(%esp),%ecx	/* %ecx = counter */
+	subl	%eax,%eax	/* %eax = checksum */
+	cmpl	$5,%ecx
+	jl	2f
+1:
+	subl	$5,%ecx
+	addl	(%edx),%eax
+	adcl	4(%edx),%eax
+	adcl	8(%edx),%eax
+	adcl	12(%edx),%eax
+	adcl	16(%edx),%eax
+	adcl	$0,%eax
+	addl	$20,%edx
+	cmpl	$5,%ecx
+	jge	1b
+2:
+	testl	%ecx,%ecx
+	je	4f
+3:
+	addl	(%edx),%eax
+	adcl	$0,%eax
+	addl	$4,%edx
+	decl	%ecx
+	jnz	3b
+4:
+	ret
diff --git a/sys/i386/i386/in_cksum.c b/sys/i386/i386/in_cksum.c
index 80a4e5516c..cf0003c5c5 100644
--- a/sys/i386/i386/in_cksum.c
+++ b/sys/i386/i386/in_cksum.c
@@ -1,5 +1,5 @@
-/*-
- * Copyright (c) 1990 The Regents of the University of California.
+/*
+ * Copyright (c) 2003 Matthew Dillon
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -10,18 +10,11 @@
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- *    must display the following acknowledgement:
- *	This product includes software developed by the University of
- *	California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
 *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
@@ -30,10 +23,7 @@
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
- * from tahoe: in_cksum.c 1.2 86/01/05
- * from: @(#)in_cksum.c 1.3 (Berkeley) 1/19/91
- * $FreeBSD: src/sys/i386/i386/in_cksum.c,v 1.17.2.3 2002/07/02 04:03:00 jdp Exp $
- * $DragonFly: src/sys/i386/i386/Attic/in_cksum.c,v 1.3 2003/07/26 19:07:47 rob Exp $
+ * $DragonFly: src/sys/i386/i386/Attic/in_cksum.c,v 1.4 2004/02/14 02:09:26 dillon Exp $
 */

 #include
@@ -44,566 +34,160 @@
 #include
 #include
+#include
 #include

 /*
- * Checksum routine for Internet Protocol family headers.
+ * Return the 16 bit 1's complement checksum in network byte order. Devolve
+ * the mbuf into 32 bit aligned segments that we can pass to assembly and
+ * do the rest manually. Even though we return a 16 bit unsigned value,
+ * we declare it as a 32 bit unsigned value to reduce unnecessary assembly
+ * conversions.
 *
- * This routine is very heavily used in the network
- * code and should be modified for each CPU to be as fast as possible.
+ * Byte ordering issues. Note two things. First, no secondary carry occurs,
+ * and second, a one's complement checksum is endian-independent. If we are
+ * given a data buffer in network byte order, our checksum will be in network
+ * byte order.
 *
- * This implementation is 386 version.
- */
-
-#undef ADDCARRY
-#define ADDCARRY(x) if ((x) > 0xffff) (x) -= 0xffff
-#define REDUCE {sum = (sum & 0xffff) + (sum >> 16); ADDCARRY(sum);}
-
-/*
- * These asm statements require __volatile because they pass information
- * via the condition codes. GCC does not currently provide a way to specify
- * the condition codes as an input or output operand.
+ * 0xffff + 0xffff = 0xfffe + C = 0xffff (so no second carry occurs).
 *
- * The LOAD macro below is effectively a prefetch into cache. GCC will
- * load the value into a register but will not use it. Since modern CPUs
- * reorder operations, this will generally take place in parallel with
- * other calculations.
- */
-#define ADD(n)	__asm __volatile \
-		("addl %1, %0" : "+r" (sum) : \
-		"g" (((const u_int32_t *)w)[n / 4]))
-#define ADDC(n)	__asm __volatile \
-		("adcl %1, %0" : "+r" (sum) : \
-		"g" (((const u_int32_t *)w)[n / 4]))
-#define LOAD(n)	__asm __volatile \
-		("" : : "r" (((const u_int32_t *)w)[n / 4]))
-#define MOP	__asm __volatile \
-		("adcl $0, %0" : "+r" (sum))
-
-int
-in_cksum(m, len)
-	struct mbuf *m;
-	int len;
-{
-	u_short *w;
-	unsigned sum = 0;
-	int mlen = 0;
-	int byte_swapped = 0;
-	union { char c[2]; u_short s; } su;
-
-	for (;m && len; m = m->m_next) {
-		if (m->m_len == 0)
-			continue;
-		w = mtod(m, u_short *);
-		if (mlen == -1) {
-			/*
-			 * The first byte of this mbuf is the continuation
-			 * of a word spanning between this mbuf and the
-			 * last mbuf.
-			 */
-
-			/* su.c[0] is already saved when scanning previous
-			 * mbuf. 
sum was REDUCEd when we found mlen == -1 - */ - su.c[1] = *(u_char *)w; - sum += su.s; - w = (u_short *)((char *)w + 1); - mlen = m->m_len - 1; - len--; - } else - mlen = m->m_len; - if (len < mlen) - mlen = len; - len -= mlen; - /* - * Force to long boundary so we do longword aligned - * memory operations - */ - if (3 & (int) w) { - REDUCE; - if ((1 & (int) w) && (mlen > 0)) { - sum <<= 8; - su.c[0] = *(char *)w; - w = (u_short *)((char *)w + 1); - mlen--; - byte_swapped = 1; - } - if ((2 & (int) w) && (mlen >= 2)) { - sum += *w++; - mlen -= 2; - } - } - /* - * Advance to a 486 cache line boundary. - */ - if (4 & (int) w && mlen >= 4) { - ADD(0); - MOP; - w += 2; - mlen -= 4; - } - if (8 & (int) w && mlen >= 8) { - ADD(0); - ADDC(4); - MOP; - w += 4; - mlen -= 8; - } - /* - * Do as much of the checksum as possible 32 bits at at time. - * In fact, this loop is unrolled to make overhead from - * branches &c small. - */ - mlen -= 1; - while ((mlen -= 32) >= 0) { - /* - * Add with carry 16 words and fold in the last - * carry by adding a 0 with carry. - * - * The early ADD(16) and the LOAD(32) are to load - * the next 2 cache lines in advance on 486's. The - * 486 has a penalty of 2 clock cycles for loading - * a cache line, plus whatever time the external - * memory takes to load the first word(s) addressed. - * These penalties are unavoidable. Subsequent - * accesses to a cache line being loaded (and to - * other external memory?) are delayed until the - * whole load finishes. These penalties are mostly - * avoided by not accessing external memory for - * 8 cycles after the ADD(16) and 12 cycles after - * the LOAD(32). The loop terminates when mlen - * is initially 33 (not 32) to guaranteed that - * the LOAD(32) is within bounds. - */ - ADD(16); - ADDC(0); - ADDC(4); - ADDC(8); - ADDC(12); - LOAD(32); - ADDC(20); - ADDC(24); - ADDC(28); - MOP; - w += 16; - } - mlen += 32 + 1; - if (mlen >= 32) { - ADD(16); - ADDC(0); - ADDC(4); - ADDC(8); - ADDC(12); - ADDC(20); - ADDC(24); - ADDC(28); - MOP; - w += 16; - mlen -= 32; - } - if (mlen >= 16) { - ADD(0); - ADDC(4); - ADDC(8); - ADDC(12); - MOP; - w += 8; - mlen -= 16; - } - if (mlen >= 8) { - ADD(0); - ADDC(4); - MOP; - w += 4; - mlen -= 8; - } - if (mlen == 0 && byte_swapped == 0) - continue; /* worth 1% maybe ?? */ - REDUCE; - while ((mlen -= 2) >= 0) { - sum += *w++; - } - if (byte_swapped) { - sum <<= 8; - byte_swapped = 0; - if (mlen == -1) { - su.c[1] = *(char *)w; - sum += su.s; - mlen = 0; - } else - mlen = -1; - } else if (mlen == -1) - /* - * This mbuf has odd number of bytes. - * There could be a word split betwen - * this mbuf and the next mbuf. - * Save the last byte (to prepend to next mbuf). - */ - su.c[0] = *(char *)w; - } - - if (len) - printf("%s: out of data by %d\n", __func__, len); - if (mlen == -1) { - /* The last mbuf has odd # of bytes. 
Follow the - standard (the odd byte is shifted left by 8 bits) */ - su.c[1] = 0; - sum += su.s; - } - REDUCE; - return (~sum & 0xffff); -} - -u_short -in_cksum_skip(m, len, skip) - struct mbuf *m; - int len; - int skip; -{ - u_short *w; - unsigned sum = 0; - int mlen = 0; - int byte_swapped = 0; - union { char c[2]; u_short s; } su; - - len -= skip; - for (; skip && m; m = m->m_next) { - if (m->m_len > skip) { - mlen = m->m_len - skip; - w = (u_short *)(mtod(m, u_char *) + skip); - goto skip_start; - } else { - skip -= m->m_len; - } - } - - for (;m && len; m = m->m_next) { - if (m->m_len == 0) - continue; - w = mtod(m, u_short *); - if (mlen == -1) { - /* - * The first byte of this mbuf is the continuation - * of a word spanning between this mbuf and the - * last mbuf. - */ - - /* su.c[0] is already saved when scanning previous - * mbuf. sum was REDUCEd when we found mlen == -1 - */ - su.c[1] = *(u_char *)w; - sum += su.s; - w = (u_short *)((char *)w + 1); - mlen = m->m_len - 1; - len--; - } else - mlen = m->m_len; -skip_start: - if (len < mlen) - mlen = len; - len -= mlen; - /* - * Force to long boundary so we do longword aligned - * memory operations - */ - if (3 & (int) w) { - REDUCE; - if ((1 & (int) w) && (mlen > 0)) { - sum <<= 8; - su.c[0] = *(char *)w; - w = (u_short *)((char *)w + 1); - mlen--; - byte_swapped = 1; - } - if ((2 & (int) w) && (mlen >= 2)) { - sum += *w++; - mlen -= 2; - } - } - /* - * Advance to a 486 cache line boundary. - */ - if (4 & (int) w && mlen >= 4) { - ADD(0); - MOP; - w += 2; - mlen -= 4; - } - if (8 & (int) w && mlen >= 8) { - ADD(0); - ADDC(4); - MOP; - w += 4; - mlen -= 8; - } - /* - * Do as much of the checksum as possible 32 bits at at time. - * In fact, this loop is unrolled to make overhead from - * branches &c small. - */ - mlen -= 1; - while ((mlen -= 32) >= 0) { - /* - * Add with carry 16 words and fold in the last - * carry by adding a 0 with carry. - * - * The early ADD(16) and the LOAD(32) are to load - * the next 2 cache lines in advance on 486's. The - * 486 has a penalty of 2 clock cycles for loading - * a cache line, plus whatever time the external - * memory takes to load the first word(s) addressed. - * These penalties are unavoidable. Subsequent - * accesses to a cache line being loaded (and to - * other external memory?) are delayed until the - * whole load finishes. These penalties are mostly - * avoided by not accessing external memory for - * 8 cycles after the ADD(16) and 12 cycles after - * the LOAD(32). The loop terminates when mlen - * is initially 33 (not 32) to guaranteed that - * the LOAD(32) is within bounds. - */ - ADD(16); - ADDC(0); - ADDC(4); - ADDC(8); - ADDC(12); - LOAD(32); - ADDC(20); - ADDC(24); - ADDC(28); - MOP; - w += 16; - } - mlen += 32 + 1; - if (mlen >= 32) { - ADD(16); - ADDC(0); - ADDC(4); - ADDC(8); - ADDC(12); - ADDC(20); - ADDC(24); - ADDC(28); - MOP; - w += 16; - mlen -= 32; - } - if (mlen >= 16) { - ADD(0); - ADDC(4); - ADDC(8); - ADDC(12); - MOP; - w += 8; - mlen -= 16; - } - if (mlen >= 8) { - ADD(0); - ADDC(4); - MOP; - w += 4; - mlen -= 8; - } - if (mlen == 0 && byte_swapped == 0) - continue; /* worth 1% maybe ?? */ - REDUCE; - while ((mlen -= 2) >= 0) { - sum += *w++; - } - if (byte_swapped) { - sum <<= 8; - byte_swapped = 0; - if (mlen == -1) { - su.c[1] = *(char *)w; - sum += su.s; - mlen = 0; - } else - mlen = -1; - } else if (mlen == -1) - /* - * This mbuf has odd number of bytes. - * There could be a word split betwen - * this mbuf and the next mbuf. 
- * Save the last byte (to prepend to next mbuf).
-			 */
-			su.c[0] = *(char *)w;
-	}
-
-	if (len)
-		printf("%s: out of data by %d\n", __func__, len);
-	if (mlen == -1) {
-		/* The last mbuf has odd # of bytes. Follow the
-		   standard (the odd byte is shifted left by 8 bits) */
-		su.c[1] = 0;
-		sum += su.s;
-	}
-	REDUCE;
-	return (~sum & 0xffff);
-}
-
-/*
- * This is the exact same algorithm as above with a few exceptions:
- * (1) it is designed to operate on buffers, not mbufs
- * (2) it returns an intermediate form of the sum which has to be
- *     explicitly finalized (but this can be delayed)
- * (3) it accepts an intermediate sum
+ * 0x8142 + 0x8243 = 0x0385 + C = 0x0386	(checksum is in same byte order
+ * 0x4281 + 0x4382 = 0x8603			 as the data regardless of arch)
 *
- * This is particularly useful when building packets quickly,
- * since one can compute the checksum of the pseudoheader ahead of
- * time and then use this function to complete the work. That way,
- * the pseudoheader never actually has to exist in the packet buffer,
- * which avoids needless duplication of work.
+ * This works with 16, 32, 64, etc... bits as long as we deal with the
+ * carry when collapsing it back down to 16 bits.
 */
-in_psum_t
-in_cksum_partial(psum, w, len)
-	in_psum_t psum;
-	const u_short *w;
-	int len;
+__uint32_t
+in_cksum_range(struct mbuf *m, int offset, int bytes)
 {
-	in_psum_t sum = psum;
-	int byte_swapped = 0;
-	union { char c[2]; u_short s; } su;
-
+	__uint8_t *ptr;
+	__uint32_t sum0;
+	__uint32_t sum1;
+	int n;
+	int flip;
+
+	/*
+	 * Skip fully engulfed mbufs. Branch-predict optimal.
+	 */
+	while (m && offset >= m->m_len) {
+		offset -= m->m_len;
+		m = m->m_next;
+	}
+
+	/*
+	 * Process the checksum for each segment. Note that the code below is
+	 * branch-predict optimal, so it's faster than you might otherwise
+	 * believe. When we are buffer-aligned but also odd-byte-aligned from
+	 * the point of view of the IP packet, we accumulate to sum1 instead of
+	 * sum0.
+	 *
+	 * Initial offsets do not pre-set flip (assert that offset is even?)
+	 */
+	sum0 = 0;
+	sum1 = 0;
+	flip = 0;
+	while (bytes > 0 && m) {
		/*
-		 * Force to long boundary so we do longword aligned
-		 * memory operations
+		 * Calculate pointer base and number of bytes to snarf, account
+		 * for snarfed bytes.
		 */
-		if (3 & (int) w) {
-			REDUCE;
-			if ((1 & (int) w) && (len > 0)) {
-				sum <<= 8;
-				su.c[0] = *(const char *)w;
-				w = (const u_short *)((const char *)w + 1);
-				len--;
-				byte_swapped = 1;
-			}
-			if ((2 & (int) w) && (len >= 2)) {
-				sum += *w++;
-				len -= 2;
-			}
-		}
+		ptr = mtod(m, __uint8_t *) + offset;
+		if ((n = m->m_len - offset) > bytes)
+			n = bytes;
+		bytes -= n;
+
		/*
-		 * Advance to a 486 cache line boundary.
+		 * First 16-bit-align our buffer by eating a byte if necessary,
+		 * then 32-bit-align our buffer by eating a word if necessary.
+		 *
+		 * We are endian-sensitive when chomping a byte. WARNING! Be
+		 * careful optimizing this! 16 and 32 bit words must be aligned
+		 * for this to be generic code. 
*/ - if (4 & (int) w && len >= 4) { - ADD(0); - MOP; - w += 2; - len -= 4; + if (((intptr_t)ptr & 1) && n) { +#if BYTE_ORDER == LITTLE_ENDIAN + if (flip) + sum1 += ptr[0]; + else + sum0 += ptr[0]; +#else + if (flip) + sum0 += ptr[0]; + else + sum1 += ptr[0]; +#endif + ++ptr; + --n; + flip = 1 - flip; } - if (8 & (int) w && len >= 8) { - ADD(0); - ADDC(4); - MOP; - w += 4; - len -= 8; + if (((intptr_t)ptr & 2) && n > 1) { + if (flip) + sum1 += *(__uint16_t *)ptr; + else + sum0 += *(__uint16_t *)ptr; + ptr += 2; + n -= 2; } + /* - * Do as much of the checksum as possible 32 bits at at time. - * In fact, this loop is unrolled to make overhead from - * branches &c small. + * Process a 32-bit aligned data buffer and accumulate the result + * in sum0 or sum1. Allow only one 16 bit overflow carry. */ - len -= 1; - while ((len -= 32) >= 0) { - /* - * Add with carry 16 words and fold in the last - * carry by adding a 0 with carry. - * - * The early ADD(16) and the LOAD(32) are to load - * the next 2 cache lines in advance on 486's. The - * 486 has a penalty of 2 clock cycles for loading - * a cache line, plus whatever time the external - * memory takes to load the first word(s) addressed. - * These penalties are unavoidable. Subsequent - * accesses to a cache line being loaded (and to - * other external memory?) are delayed until the - * whole load finishes. These penalties are mostly - * avoided by not accessing external memory for - * 8 cycles after the ADD(16) and 12 cycles after - * the LOAD(32). The loop terminates when len - * is initially 33 (not 32) to guaranteed that - * the LOAD(32) is within bounds. - */ - ADD(16); - ADDC(0); - ADDC(4); - ADDC(8); - ADDC(12); - LOAD(32); - ADDC(20); - ADDC(24); - ADDC(28); - MOP; - w += 16; + if (n >= 4) { + __uint32_t sum32; + + sum32 = asm_ones32((void *)ptr, n >> 2); + sum32 = (sum32 >> 16) + (sum32 & 0xffff); + if (flip) + sum1 += sum32; + else + sum0 += sum32; + ptr += n & ~3; + /* n &= 3; dontcare */ } - len += 32 + 1; - if (len >= 32) { - ADD(16); - ADDC(0); - ADDC(4); - ADDC(8); - ADDC(12); - ADDC(20); - ADDC(24); - ADDC(28); - MOP; - w += 16; - len -= 32; - } - if (len >= 16) { - ADD(0); - ADDC(4); - ADDC(8); - ADDC(12); - MOP; - w += 8; - len -= 16; - } - if (len >= 8) { - ADD(0); - ADDC(4); - MOP; - w += 4; - len -= 8; - } - if (len == 0 && byte_swapped == 0) - goto out; - REDUCE; - while ((len -= 2) >= 0) { - sum += *w++; - } - if (byte_swapped) { - sum <<= 8; - byte_swapped = 0; - if (len == -1) { - su.c[1] = *(const char *)w; - sum += su.s; - len = 0; - } else - len = -1; - } else if (len == -1) { - /* - * This buffer has odd number of bytes. - * There could be a word split betwen - * this buffer and the next. - */ - su.c[0] = *(const char *)w; + + /* + * Handle oddly-sized buffers. Handle word issues first while + * ptr is still aligned. + */ + if (n & 2) { + if (flip) + sum1 += *(__uint16_t *)ptr; + else + sum0 += *(__uint16_t *)ptr; + ptr += 2; + /* n -= 2; dontcare */ } -out: - if (len == -1) { - /* The last buffer has odd # of bytes. 
Follow the
-		   standard (the odd byte is shifted left by 8 bits) */
-		su.c[1] = 0;
-		sum += su.s;
+		if (n & 1) {
+#if BYTE_ORDER == LITTLE_ENDIAN
+			if (flip)
+				sum1 += ptr[0];
+			else
+				sum0 += ptr[0];
+#else
+			if (flip)
+				sum0 += ptr[0];
+			else
+				sum1 += ptr[0];
+#endif
+			/* ++ptr; dontcare */
+			/* --n; dontcare */
+			flip = 1 - flip;
		}
-	return sum;
+		m = m->m_next;
+		offset = 0;
+	}
+
+	/*
+	 * Due to byte-aligned or oddly-sized buffers we may have a checksum
+	 * in sum1 which needs to be shifted and added to our main sum. There
+	 * is a presumption here that no more than 255 overflows occurred,
+	 * which is 255/3 byte-aligned mbufs in the worst case.
+	 */
+	sum0 += sum1 << 8;
+	sum0 = (sum0 >> 16) + (sum0 & 0xffff);
+	if (sum0 > 0xffff)
+		++sum0;
+	return(~sum0 & 0xffff);
 }
-
-int
-in_cksum_finalize(psum)
-	in_psum_t psum;
-{
-	in_psum_t sum = psum;
-	REDUCE;
-	return (~sum & 0xffff);
-}
diff --git a/sys/i386/i386/in_cksum2.s b/sys/i386/i386/in_cksum2.s
new file mode 100644
index 0000000000..6682a055b8
--- /dev/null
+++ b/sys/i386/i386/in_cksum2.s
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2003 Matthew Dillon
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $DragonFly: src/sys/i386/i386/Attic/in_cksum2.s,v 1.1 2004/02/14 02:09:26 dillon Exp $
+ */
+
+#include <machine/asmacros.h>		/* miscellaneous asm macros */
+#include
+#include
+
+#include "assym.s"
+
+	.text
+
+	/*
+	 * asm_ones32(32bitalignedbuffer, numberof32bitwords)
+	 *
+	 * Returns the 32 bit one's complement partial checksum. This is
+	 * basically a 1's complement checksum without the inversion (~)
+	 * at the end. A 32 bit value is returned. If the caller is
+	 * calculating a 16 bit 1's complement checksum the caller must
+	 * collapse the 32 bit return value via:
+	 *
+	 *	result = (result >> 16) + (result & 0xFFFF)
+	 *	if (result > 0xFFFF)
+	 *		result -= 0xFFFF;	<<< same as (result + 1) & 0xFFFF
+	 *					    within the range of result.
+	 * Note that worst case 0xFFFFFFFF + 0xFFFFFFFF = 0xFFFFFFFE + CARRY,
+	 * so no double-carry ever occurs. 
+ */ + .p2align 4 +ENTRY(asm_ones32) + movl 4(%esp),%edx /* %edx = buffer pointer */ + movl 8(%esp),%ecx /* %ecx = counter */ + subl %eax,%eax /* %eax = checksum */ + cmpl $5,%ecx + jl 2f +1: + subl $5,%ecx + addl (%edx),%eax + adcl 4(%edx),%eax + adcl 8(%edx),%eax + adcl 12(%edx),%eax + adcl 16(%edx),%eax + adcl $0,%eax + addl $20,%edx + cmpl $5,%ecx + jge 1b +2: + testl %ecx,%ecx + je 4f +3: + addl (%edx),%eax + adcl $0,%eax + addl $4,%edx + decl %ecx + jnz 3b +4: + ret diff --git a/sys/i386/include/in_cksum.h b/sys/i386/include/in_cksum.h index bdf5779456..00ce5d90a3 100644 --- a/sys/i386/include/in_cksum.h +++ b/sys/i386/include/in_cksum.h @@ -34,97 +34,61 @@ * from: @(#)in_cksum.c 1.3 (Berkeley) 1/19/91 * from: Id: in_cksum.c,v 1.8 1995/12/03 18:35:19 bde Exp * $FreeBSD: src/sys/i386/include/in_cksum.h,v 1.7.2.2 2002/07/02 04:03:04 jdp Exp $ - * $DragonFly: src/sys/i386/include/Attic/in_cksum.h,v 1.4 2003/08/26 21:42:18 rob Exp $ + * $DragonFly: src/sys/i386/include/Attic/in_cksum.h,v 1.5 2004/02/14 02:09:27 dillon Exp $ */ #ifndef _MACHINE_IN_CKSUM_H_ -#define _MACHINE_IN_CKSUM_H_ 1 +#define _MACHINE_IN_CKSUM_H_ -#include +#ifdef _KERNEL +__uint32_t in_cksum_range(struct mbuf *m, int offset, int bytes); +__uint32_t asm_ones32(const void *buf, int count); /* in 32 bit words */ +#endif -/* - * It it useful to have an Internet checksum routine which is inlineable - * and optimized specifically for the task of computing IP header checksums - * in the normal case (where there are no options and the header length is - * therefore always exactly five 32-bit words. - */ -#ifdef __GNUC__ static __inline u_int -in_cksum_hdr(const struct ip *ip) +in_cksum(struct mbuf *m, int len) { - u_int sum = 0; - -/* __volatile is necessary here because the condition codes are used. */ -#define ADD(n) __asm __volatile ("addl %1, %0" : "+r" (sum) : \ - "g" (((const u_int32_t *)ip)[n / 4])) -#define ADDC(n) __asm __volatile ("adcl %1, %0" : "+r" (sum) : \ - "g" (((const u_int32_t *)ip)[n / 4])) -#define MOP __asm __volatile ("adcl $0, %0" : "+r" (sum)) - - ADD(0); - ADDC(4); - ADDC(8); - ADDC(12); - ADDC(16); - MOP; -#undef ADD -#undef ADDC -#undef MOP - sum = (sum & 0xffff) + (sum >> 16); - if (sum > 0xffff) - sum -= 0xffff; - - return ~sum & 0xffff; + return(in_cksum_range(m, 0, len)); } -static __inline void -in_cksum_update(struct ip *ip) +static __inline u_int +in_cksum_skip(struct mbuf *m, int len, int skip) { - int __tmpsum; - __tmpsum = (int)ntohs(ip->ip_sum) + 256; - ip->ip_sum = htons(__tmpsum + (__tmpsum >> 16)); + return(in_cksum_range(m, skip, len - skip)); +} + +static __inline u_int +in_cksum_hdr(const struct ip *ip) +{ + __uint32_t sum; + + sum = asm_ones32((const void *)ip, 5); /* 5x4 = 20 bytes */ + sum = (sum >> 16) + (sum & 0xFFFF); + if (sum > 0xFFFF) + ++sum; + return(~sum & 0xFFFF); } static __inline u_short in_addword(u_short sum, u_short b) { - /* __volatile is necessary because the condition codes are used. */ - __asm __volatile ("addw %1, %0" : "+r" (sum) : "r" (b)); - __asm __volatile ("adcw $0, %0" : "+r" (sum)); + /* __volatile is necessary because the condition codes are used. */ + __asm __volatile ("addw %1, %0; adcw $0,%0" : "+r" (sum) : "r" (b)); - return (sum); + return (sum); } static __inline u_short in_pseudo(u_int sum, u_int b, u_int c) { - /* __volatile is necessary because the condition codes are used. 
*/
-	__asm __volatile ("addl %1, %0" : "+r" (sum) : "g" (b));
-	__asm __volatile ("adcl %1, %0" : "+r" (sum) : "g" (c));
-	__asm __volatile ("adcl $0, %0" : "+r" (sum));
-
-	sum = (sum & 0xffff) + (sum >> 16);
-	if (sum > 0xffff)
-		sum -= 0xffff;
-	return (sum);
+	/* __volatile is necessary because the condition codes are used. */
+	__asm __volatile ("addl %1,%0; adcl %2,%0; adcl $0,%0"
+			: "+r" (sum)
+			: "g" (b), "g" (c));
+	sum = (sum & 0xffff) + (sum >> 16);
+	if (sum > 0xffff)
+		sum -= 0xffff;
+	return (sum);
 }
-#else
-u_int in_cksum_hdr (const struct ip *);
-#define in_cksum_update(ip) \
-	do { \
-		int __tmpsum; \
-		__tmpsum = (int)ntohs(ip->ip_sum) + 256; \
-		ip->ip_sum = htons(__tmpsum + (__tmpsum >> 16)); \
-	} while(0)
-
-#endif
-
-typedef unsigned in_psum_t;
-#ifdef _KERNEL
-u_short in_cksum_skip(struct mbuf *m, int len, int skip);
-in_psum_t in_cksum_partial(in_psum_t psum, const u_short *w, int len);
-int in_cksum_finalize(in_psum_t psum);
-#endif /* _KERNEL */
-
 #endif /* _MACHINE_IN_CKSUM_H_ */
diff --git a/sys/netinet/igmp.c b/sys/netinet/igmp.c
index 127f8652e0..dd47cb1f96 100644
--- a/sys/netinet/igmp.c
+++ b/sys/netinet/igmp.c
@@ -36,7 +36,7 @@
 *
 * @(#)igmp.c 8.1 (Berkeley) 7/19/93
 * $FreeBSD: src/sys/netinet/igmp.c,v 1.29.2.2 2003/01/23 21:06:44 sam Exp $
- * $DragonFly: src/sys/netinet/igmp.c,v 1.4 2003/08/23 11:18:00 rob Exp $
+ * $DragonFly: src/sys/netinet/igmp.c,v 1.5 2004/02/14 02:09:28 dillon Exp $
 */

/*
@@ -70,6 +70,8 @@
 #include
 #include

+#include <machine/in_cksum.h>
+
 static MALLOC_DEFINE(M_IGMP, "igmp", "igmp state");

 static struct router_info *
diff --git a/sys/netinet/in.h b/sys/netinet/in.h
index 8534d1d674..153e3bed98 100644
--- a/sys/netinet/in.h
+++ b/sys/netinet/in.h
@@ -32,7 +32,7 @@
 *
 * @(#)in.h 8.3 (Berkeley) 1/3/94
 * $FreeBSD: src/sys/netinet/in.h,v 1.48.2.10 2003/08/24 08:24:38 hsu Exp $
- * $DragonFly: src/sys/netinet/in.h,v 1.5 2003/08/24 23:07:07 hsu Exp $
+ * $DragonFly: src/sys/netinet/in.h,v 1.6 2004/02/14 02:09:28 dillon Exp $
 */

 #ifndef _NETINET_IN_H_
@@ -492,7 +492,6 @@ struct thread;

 int in_broadcast (struct in_addr, struct ifnet *);
 int in_canforward (struct in_addr);
-int in_cksum (struct mbuf *, int);
 int in_localaddr (struct in_addr);
 char *inet_ntoa (struct in_addr); /* in libkern */

diff --git a/sys/netinet/ip_icmp.c b/sys/netinet/ip_icmp.c
index 91b33c1492..c319bf2765 100644
--- a/sys/netinet/ip_icmp.c
+++ b/sys/netinet/ip_icmp.c
@@ -32,7 +32,7 @@
 *
 * @(#)ip_icmp.c 8.2 (Berkeley) 1/4/94
 * $FreeBSD: src/sys/netinet/ip_icmp.c,v 1.39.2.19 2003/01/24 05:11:34 sam Exp $
- * $DragonFly: src/sys/netinet/ip_icmp.c,v 1.5 2003/08/23 11:18:00 rob Exp $
+ * $DragonFly: src/sys/netinet/ip_icmp.c,v 1.6 2004/02/14 02:09:28 dillon Exp $
 */

 #include "opt_ipsec.h"
@@ -70,6 +70,8 @@
 #define IPSEC
 #endif

+#include <machine/in_cksum.h>
+
 /*
 * ICMP routines: error generation, receive packet processing, and
 * routines to turnaround packets back to the originator, and
diff --git a/sys/platform/pc32/i386/in_cksum.c b/sys/platform/pc32/i386/in_cksum.c
index b33415bdfb..da0050af9d 100644
--- a/sys/platform/pc32/i386/in_cksum.c
+++ b/sys/platform/pc32/i386/in_cksum.c
@@ -1,5 +1,5 @@
-/*-
- * Copyright (c) 1990 The Regents of the University of California.
+/*
+ * Copyright (c) 2003 Matthew Dillon
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
@@ -10,18 +10,11 @@
 * 2. 
Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- *    must display the following acknowledgement:
- *	This product includes software developed by the University of
- *	California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
 *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
@@ -30,10 +23,7 @@
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
- * from tahoe: in_cksum.c 1.2 86/01/05
- * from: @(#)in_cksum.c 1.3 (Berkeley) 1/19/91
- * $FreeBSD: src/sys/i386/i386/in_cksum.c,v 1.17.2.3 2002/07/02 04:03:00 jdp Exp $
- * $DragonFly: src/sys/platform/pc32/i386/Attic/in_cksum.c,v 1.3 2003/07/26 19:07:47 rob Exp $
+ * $DragonFly: src/sys/platform/pc32/i386/Attic/in_cksum.c,v 1.4 2004/02/14 02:09:26 dillon Exp $
 */

 #include
@@ -44,566 +34,160 @@
 #include
 #include
+#include
 #include

 /*
- * Checksum routine for Internet Protocol family headers.
+ * Return the 16 bit 1's complement checksum in network byte order. Devolve
+ * the mbuf into 32 bit aligned segments that we can pass to assembly and
+ * do the rest manually. Even though we return a 16 bit unsigned value,
+ * we declare it as a 32 bit unsigned value to reduce unnecessary assembly
+ * conversions.
 *
- * This routine is very heavily used in the network
- * code and should be modified for each CPU to be as fast as possible.
+ * Byte ordering issues. Note two things. First, no secondary carry occurs,
+ * and second, a one's complement checksum is endian-independent. If we are
+ * given a data buffer in network byte order, our checksum will be in network
+ * byte order.
 *
- * This implementation is 386 version.
- */
-
-#undef ADDCARRY
-#define ADDCARRY(x) if ((x) > 0xffff) (x) -= 0xffff
-#define REDUCE {sum = (sum & 0xffff) + (sum >> 16); ADDCARRY(sum);}
-
-/*
- * These asm statements require __volatile because they pass information
- * via the condition codes. GCC does not currently provide a way to specify
- * the condition codes as an input or output operand.
+ * 0xffff + 0xffff = 0xfffe + C = 0xffff (so no second carry occurs).
 *
- * The LOAD macro below is effectively a prefetch into cache. GCC will
- * load the value into a register but will not use it. Since modern CPUs
- * reorder operations, this will generally take place in parallel with
- * other calculations. 
- */ -#define ADD(n) __asm __volatile \ - ("addl %1, %0" : "+r" (sum) : \ - "g" (((const u_int32_t *)w)[n / 4])) -#define ADDC(n) __asm __volatile \ - ("adcl %1, %0" : "+r" (sum) : \ - "g" (((const u_int32_t *)w)[n / 4])) -#define LOAD(n) __asm __volatile \ - ("" : : "r" (((const u_int32_t *)w)[n / 4])) -#define MOP __asm __volatile \ - ("adcl $0, %0" : "+r" (sum)) - -int -in_cksum(m, len) - struct mbuf *m; - int len; -{ - u_short *w; - unsigned sum = 0; - int mlen = 0; - int byte_swapped = 0; - union { char c[2]; u_short s; } su; - - for (;m && len; m = m->m_next) { - if (m->m_len == 0) - continue; - w = mtod(m, u_short *); - if (mlen == -1) { - /* - * The first byte of this mbuf is the continuation - * of a word spanning between this mbuf and the - * last mbuf. - */ - - /* su.c[0] is already saved when scanning previous - * mbuf. sum was REDUCEd when we found mlen == -1 - */ - su.c[1] = *(u_char *)w; - sum += su.s; - w = (u_short *)((char *)w + 1); - mlen = m->m_len - 1; - len--; - } else - mlen = m->m_len; - if (len < mlen) - mlen = len; - len -= mlen; - /* - * Force to long boundary so we do longword aligned - * memory operations - */ - if (3 & (int) w) { - REDUCE; - if ((1 & (int) w) && (mlen > 0)) { - sum <<= 8; - su.c[0] = *(char *)w; - w = (u_short *)((char *)w + 1); - mlen--; - byte_swapped = 1; - } - if ((2 & (int) w) && (mlen >= 2)) { - sum += *w++; - mlen -= 2; - } - } - /* - * Advance to a 486 cache line boundary. - */ - if (4 & (int) w && mlen >= 4) { - ADD(0); - MOP; - w += 2; - mlen -= 4; - } - if (8 & (int) w && mlen >= 8) { - ADD(0); - ADDC(4); - MOP; - w += 4; - mlen -= 8; - } - /* - * Do as much of the checksum as possible 32 bits at at time. - * In fact, this loop is unrolled to make overhead from - * branches &c small. - */ - mlen -= 1; - while ((mlen -= 32) >= 0) { - /* - * Add with carry 16 words and fold in the last - * carry by adding a 0 with carry. - * - * The early ADD(16) and the LOAD(32) are to load - * the next 2 cache lines in advance on 486's. The - * 486 has a penalty of 2 clock cycles for loading - * a cache line, plus whatever time the external - * memory takes to load the first word(s) addressed. - * These penalties are unavoidable. Subsequent - * accesses to a cache line being loaded (and to - * other external memory?) are delayed until the - * whole load finishes. These penalties are mostly - * avoided by not accessing external memory for - * 8 cycles after the ADD(16) and 12 cycles after - * the LOAD(32). The loop terminates when mlen - * is initially 33 (not 32) to guaranteed that - * the LOAD(32) is within bounds. - */ - ADD(16); - ADDC(0); - ADDC(4); - ADDC(8); - ADDC(12); - LOAD(32); - ADDC(20); - ADDC(24); - ADDC(28); - MOP; - w += 16; - } - mlen += 32 + 1; - if (mlen >= 32) { - ADD(16); - ADDC(0); - ADDC(4); - ADDC(8); - ADDC(12); - ADDC(20); - ADDC(24); - ADDC(28); - MOP; - w += 16; - mlen -= 32; - } - if (mlen >= 16) { - ADD(0); - ADDC(4); - ADDC(8); - ADDC(12); - MOP; - w += 8; - mlen -= 16; - } - if (mlen >= 8) { - ADD(0); - ADDC(4); - MOP; - w += 4; - mlen -= 8; - } - if (mlen == 0 && byte_swapped == 0) - continue; /* worth 1% maybe ?? */ - REDUCE; - while ((mlen -= 2) >= 0) { - sum += *w++; - } - if (byte_swapped) { - sum <<= 8; - byte_swapped = 0; - if (mlen == -1) { - su.c[1] = *(char *)w; - sum += su.s; - mlen = 0; - } else - mlen = -1; - } else if (mlen == -1) - /* - * This mbuf has odd number of bytes. - * There could be a word split betwen - * this mbuf and the next mbuf. - * Save the last byte (to prepend to next mbuf). 
- */ - su.c[0] = *(char *)w; - } - - if (len) - printf("%s: out of data by %d\n", __func__, len); - if (mlen == -1) { - /* The last mbuf has odd # of bytes. Follow the - standard (the odd byte is shifted left by 8 bits) */ - su.c[1] = 0; - sum += su.s; - } - REDUCE; - return (~sum & 0xffff); -} - -u_short -in_cksum_skip(m, len, skip) - struct mbuf *m; - int len; - int skip; -{ - u_short *w; - unsigned sum = 0; - int mlen = 0; - int byte_swapped = 0; - union { char c[2]; u_short s; } su; - - len -= skip; - for (; skip && m; m = m->m_next) { - if (m->m_len > skip) { - mlen = m->m_len - skip; - w = (u_short *)(mtod(m, u_char *) + skip); - goto skip_start; - } else { - skip -= m->m_len; - } - } - - for (;m && len; m = m->m_next) { - if (m->m_len == 0) - continue; - w = mtod(m, u_short *); - if (mlen == -1) { - /* - * The first byte of this mbuf is the continuation - * of a word spanning between this mbuf and the - * last mbuf. - */ - - /* su.c[0] is already saved when scanning previous - * mbuf. sum was REDUCEd when we found mlen == -1 - */ - su.c[1] = *(u_char *)w; - sum += su.s; - w = (u_short *)((char *)w + 1); - mlen = m->m_len - 1; - len--; - } else - mlen = m->m_len; -skip_start: - if (len < mlen) - mlen = len; - len -= mlen; - /* - * Force to long boundary so we do longword aligned - * memory operations - */ - if (3 & (int) w) { - REDUCE; - if ((1 & (int) w) && (mlen > 0)) { - sum <<= 8; - su.c[0] = *(char *)w; - w = (u_short *)((char *)w + 1); - mlen--; - byte_swapped = 1; - } - if ((2 & (int) w) && (mlen >= 2)) { - sum += *w++; - mlen -= 2; - } - } - /* - * Advance to a 486 cache line boundary. - */ - if (4 & (int) w && mlen >= 4) { - ADD(0); - MOP; - w += 2; - mlen -= 4; - } - if (8 & (int) w && mlen >= 8) { - ADD(0); - ADDC(4); - MOP; - w += 4; - mlen -= 8; - } - /* - * Do as much of the checksum as possible 32 bits at at time. - * In fact, this loop is unrolled to make overhead from - * branches &c small. - */ - mlen -= 1; - while ((mlen -= 32) >= 0) { - /* - * Add with carry 16 words and fold in the last - * carry by adding a 0 with carry. - * - * The early ADD(16) and the LOAD(32) are to load - * the next 2 cache lines in advance on 486's. The - * 486 has a penalty of 2 clock cycles for loading - * a cache line, plus whatever time the external - * memory takes to load the first word(s) addressed. - * These penalties are unavoidable. Subsequent - * accesses to a cache line being loaded (and to - * other external memory?) are delayed until the - * whole load finishes. These penalties are mostly - * avoided by not accessing external memory for - * 8 cycles after the ADD(16) and 12 cycles after - * the LOAD(32). The loop terminates when mlen - * is initially 33 (not 32) to guaranteed that - * the LOAD(32) is within bounds. - */ - ADD(16); - ADDC(0); - ADDC(4); - ADDC(8); - ADDC(12); - LOAD(32); - ADDC(20); - ADDC(24); - ADDC(28); - MOP; - w += 16; - } - mlen += 32 + 1; - if (mlen >= 32) { - ADD(16); - ADDC(0); - ADDC(4); - ADDC(8); - ADDC(12); - ADDC(20); - ADDC(24); - ADDC(28); - MOP; - w += 16; - mlen -= 32; - } - if (mlen >= 16) { - ADD(0); - ADDC(4); - ADDC(8); - ADDC(12); - MOP; - w += 8; - mlen -= 16; - } - if (mlen >= 8) { - ADD(0); - ADDC(4); - MOP; - w += 4; - mlen -= 8; - } - if (mlen == 0 && byte_swapped == 0) - continue; /* worth 1% maybe ?? 
*/
-		REDUCE;
-		while ((mlen -= 2) >= 0) {
-			sum += *w++;
-		}
-		if (byte_swapped) {
-			sum <<= 8;
-			byte_swapped = 0;
-			if (mlen == -1) {
-				su.c[1] = *(char *)w;
-				sum += su.s;
-				mlen = 0;
-			} else
-				mlen = -1;
-		} else if (mlen == -1)
-			/*
-			 * This mbuf has odd number of bytes.
-			 * There could be a word split betwen
-			 * this mbuf and the next mbuf.
-			 * Save the last byte (to prepend to next mbuf).
-			 */
-			su.c[0] = *(char *)w;
-	}
-
-	if (len)
-		printf("%s: out of data by %d\n", __func__, len);
-	if (mlen == -1) {
-		/* The last mbuf has odd # of bytes. Follow the
-		   standard (the odd byte is shifted left by 8 bits) */
-		su.c[1] = 0;
-		sum += su.s;
-	}
-	REDUCE;
-	return (~sum & 0xffff);
-}
-
-/*
- * This is the exact same algorithm as above with a few exceptions:
- * (1) it is designed to operate on buffers, not mbufs
- * (2) it returns an intermediate form of the sum which has to be
- *     explicitly finalized (but this can be delayed)
- * (3) it accepts an intermediate sum
+ * 0x8142 + 0x8243 = 0x0385 + C = 0x0386	(checksum is in same byte order
+ * 0x4281 + 0x4382 = 0x8603			 as the data regardless of arch)
 *
- * This is particularly useful when building packets quickly,
- * since one can compute the checksum of the pseudoheader ahead of
- * time and then use this function to complete the work. That way,
- * the pseudoheader never actually has to exist in the packet buffer,
- * which avoids needless duplication of work.
+ * This works with 16, 32, 64, etc... bits as long as we deal with the
+ * carry when collapsing it back down to 16 bits.
 */
-in_psum_t
-in_cksum_partial(psum, w, len)
-	in_psum_t psum;
-	const u_short *w;
-	int len;
+__uint32_t
+in_cksum_range(struct mbuf *m, int offset, int bytes)
 {
-	in_psum_t sum = psum;
-	int byte_swapped = 0;
-	union { char c[2]; u_short s; } su;
-
+	__uint8_t *ptr;
+	__uint32_t sum0;
+	__uint32_t sum1;
+	int n;
+	int flip;
+
+	/*
+	 * Skip fully engulfed mbufs. Branch-predict optimal.
+	 */
+	while (m && offset >= m->m_len) {
+		offset -= m->m_len;
+		m = m->m_next;
+	}
+
+	/*
+	 * Process the checksum for each segment. Note that the code below is
+	 * branch-predict optimal, so it's faster than you might otherwise
+	 * believe. When we are buffer-aligned but also odd-byte-aligned from
+	 * the point of view of the IP packet, we accumulate to sum1 instead of
+	 * sum0.
+	 *
+	 * Initial offsets do not pre-set flip (assert that offset is even?)
+	 */
+	sum0 = 0;
+	sum1 = 0;
+	flip = 0;
+	while (bytes > 0 && m) {
		/*
-		 * Force to long boundary so we do longword aligned
-		 * memory operations
+		 * Calculate pointer base and number of bytes to snarf, account
+		 * for snarfed bytes.
		 */
-		if (3 & (int) w) {
-			REDUCE;
-			if ((1 & (int) w) && (len > 0)) {
-				sum <<= 8;
-				su.c[0] = *(const char *)w;
-				w = (const u_short *)((const char *)w + 1);
-				len--;
-				byte_swapped = 1;
-			}
-			if ((2 & (int) w) && (len >= 2)) {
-				sum += *w++;
-				len -= 2;
-			}
-		}
+		ptr = mtod(m, __uint8_t *) + offset;
+		if ((n = m->m_len - offset) > bytes)
+			n = bytes;
+		bytes -= n;
+
		/*
-		 * Advance to a 486 cache line boundary.
+		 * First 16-bit-align our buffer by eating a byte if necessary,
+		 * then 32-bit-align our buffer by eating a word if necessary.
+		 *
+		 * We are endian-sensitive when chomping a byte. WARNING! Be
+		 * careful optimizing this! 16 and 32 bit words must be aligned
+		 * for this to be generic code. 
*/ - if (4 & (int) w && len >= 4) { - ADD(0); - MOP; - w += 2; - len -= 4; + if (((intptr_t)ptr & 1) && n) { +#if BYTE_ORDER == LITTLE_ENDIAN + if (flip) + sum1 += ptr[0]; + else + sum0 += ptr[0]; +#else + if (flip) + sum0 += ptr[0]; + else + sum1 += ptr[0]; +#endif + ++ptr; + --n; + flip = 1 - flip; } - if (8 & (int) w && len >= 8) { - ADD(0); - ADDC(4); - MOP; - w += 4; - len -= 8; + if (((intptr_t)ptr & 2) && n > 1) { + if (flip) + sum1 += *(__uint16_t *)ptr; + else + sum0 += *(__uint16_t *)ptr; + ptr += 2; + n -= 2; } + /* - * Do as much of the checksum as possible 32 bits at at time. - * In fact, this loop is unrolled to make overhead from - * branches &c small. + * Process a 32-bit aligned data buffer and accumulate the result + * in sum0 or sum1. Allow only one 16 bit overflow carry. */ - len -= 1; - while ((len -= 32) >= 0) { - /* - * Add with carry 16 words and fold in the last - * carry by adding a 0 with carry. - * - * The early ADD(16) and the LOAD(32) are to load - * the next 2 cache lines in advance on 486's. The - * 486 has a penalty of 2 clock cycles for loading - * a cache line, plus whatever time the external - * memory takes to load the first word(s) addressed. - * These penalties are unavoidable. Subsequent - * accesses to a cache line being loaded (and to - * other external memory?) are delayed until the - * whole load finishes. These penalties are mostly - * avoided by not accessing external memory for - * 8 cycles after the ADD(16) and 12 cycles after - * the LOAD(32). The loop terminates when len - * is initially 33 (not 32) to guaranteed that - * the LOAD(32) is within bounds. - */ - ADD(16); - ADDC(0); - ADDC(4); - ADDC(8); - ADDC(12); - LOAD(32); - ADDC(20); - ADDC(24); - ADDC(28); - MOP; - w += 16; + if (n >= 4) { + __uint32_t sum32; + + sum32 = asm_ones32((void *)ptr, n >> 2); + sum32 = (sum32 >> 16) + (sum32 & 0xffff); + if (flip) + sum1 += sum32; + else + sum0 += sum32; + ptr += n & ~3; + /* n &= 3; dontcare */ } - len += 32 + 1; - if (len >= 32) { - ADD(16); - ADDC(0); - ADDC(4); - ADDC(8); - ADDC(12); - ADDC(20); - ADDC(24); - ADDC(28); - MOP; - w += 16; - len -= 32; - } - if (len >= 16) { - ADD(0); - ADDC(4); - ADDC(8); - ADDC(12); - MOP; - w += 8; - len -= 16; - } - if (len >= 8) { - ADD(0); - ADDC(4); - MOP; - w += 4; - len -= 8; - } - if (len == 0 && byte_swapped == 0) - goto out; - REDUCE; - while ((len -= 2) >= 0) { - sum += *w++; - } - if (byte_swapped) { - sum <<= 8; - byte_swapped = 0; - if (len == -1) { - su.c[1] = *(const char *)w; - sum += su.s; - len = 0; - } else - len = -1; - } else if (len == -1) { - /* - * This buffer has odd number of bytes. - * There could be a word split betwen - * this buffer and the next. - */ - su.c[0] = *(const char *)w; + + /* + * Handle oddly-sized buffers. Handle word issues first while + * ptr is still aligned. + */ + if (n & 2) { + if (flip) + sum1 += *(__uint16_t *)ptr; + else + sum0 += *(__uint16_t *)ptr; + ptr += 2; + /* n -= 2; dontcare */ } -out: - if (len == -1) { - /* The last buffer has odd # of bytes. 
Follow the
-		   standard (the odd byte is shifted left by 8 bits) */
-		su.c[1] = 0;
-		sum += su.s;
+		if (n & 1) {
+#if BYTE_ORDER == LITTLE_ENDIAN
+			if (flip)
+				sum1 += ptr[0];
+			else
+				sum0 += ptr[0];
+#else
+			if (flip)
+				sum0 += ptr[0];
+			else
+				sum1 += ptr[0];
+#endif
+			/* ++ptr; dontcare */
+			/* --n; dontcare */
+			flip = 1 - flip;
		}
-	return sum;
+		m = m->m_next;
+		offset = 0;
+	}
+
+	/*
+	 * Due to byte-aligned or oddly-sized buffers we may have a checksum
+	 * in sum1 which needs to be shifted and added to our main sum. There
+	 * is a presumption here that no more than 255 overflows occurred,
+	 * which is 255/3 byte-aligned mbufs in the worst case.
+	 */
+	sum0 += sum1 << 8;
+	sum0 = (sum0 >> 16) + (sum0 & 0xffff);
+	if (sum0 > 0xffff)
+		++sum0;
+	return(~sum0 & 0xffff);
 }
-
-int
-in_cksum_finalize(psum)
-	in_psum_t psum;
-{
-	in_psum_t sum = psum;
-	REDUCE;
-	return (~sum & 0xffff);
-}
diff --git a/sys/platform/pc32/include/in_cksum.h b/sys/platform/pc32/include/in_cksum.h
index ac3739e668..d4842ba270 100644
--- a/sys/platform/pc32/include/in_cksum.h
+++ b/sys/platform/pc32/include/in_cksum.h
@@ -34,97 +34,61 @@
 * from: @(#)in_cksum.c 1.3 (Berkeley) 1/19/91
 * from: Id: in_cksum.c,v 1.8 1995/12/03 18:35:19 bde Exp
 * $FreeBSD: src/sys/i386/include/in_cksum.h,v 1.7.2.2 2002/07/02 04:03:04 jdp Exp $
- * $DragonFly: src/sys/platform/pc32/include/Attic/in_cksum.h,v 1.4 2003/08/26 21:42:18 rob Exp $
+ * $DragonFly: src/sys/platform/pc32/include/Attic/in_cksum.h,v 1.5 2004/02/14 02:09:27 dillon Exp $
 */

 #ifndef _MACHINE_IN_CKSUM_H_
-#define _MACHINE_IN_CKSUM_H_ 1
+#define _MACHINE_IN_CKSUM_H_

-#include
+#ifdef _KERNEL
+__uint32_t in_cksum_range(struct mbuf *m, int offset, int bytes);
+__uint32_t asm_ones32(const void *buf, int count);	/* in 32 bit words */
+#endif

-/*
- * It it useful to have an Internet checksum routine which is inlineable
- * and optimized specifically for the task of computing IP header checksums
- * in the normal case (where there are no options and the header length is
- * therefore always exactly five 32-bit words.
- */
-#ifdef __GNUC__
 static __inline u_int
-in_cksum_hdr(const struct ip *ip)
+in_cksum(struct mbuf *m, int len)
 {
-	u_int sum = 0;
-
-/* __volatile is necessary here because the condition codes are used. */
-#define ADD(n)	__asm __volatile ("addl %1, %0" : "+r" (sum) : \
-	"g" (((const u_int32_t *)ip)[n / 4]))
-#define ADDC(n)	__asm __volatile ("adcl %1, %0" : "+r" (sum) : \
-	"g" (((const u_int32_t *)ip)[n / 4]))
-#define MOP	__asm __volatile ("adcl $0, %0" : "+r" (sum))
-
-	ADD(0);
-	ADDC(4);
-	ADDC(8);
-	ADDC(12);
-	ADDC(16);
-	MOP;
-#undef ADD
-#undef ADDC
-#undef MOP
-	sum = (sum & 0xffff) + (sum >> 16);
-	if (sum > 0xffff)
-		sum -= 0xffff;
-
-	return ~sum & 0xffff;
+	return(in_cksum_range(m, 0, len));
 }

-static __inline void
-in_cksum_update(struct ip *ip)
+static __inline u_int
+in_cksum_skip(struct mbuf *m, int len, int skip)
 {
-	int __tmpsum;
-	__tmpsum = (int)ntohs(ip->ip_sum) + 256;
-	ip->ip_sum = htons(__tmpsum + (__tmpsum >> 16));
+	return(in_cksum_range(m, skip, len - skip));
+}
+
+static __inline u_int
+in_cksum_hdr(const struct ip *ip)
+{
+	__uint32_t sum;
+
+	sum = asm_ones32((const void *)ip, 5);	/* 5x4 = 20 bytes */
+	sum = (sum >> 16) + (sum & 0xFFFF);
+	if (sum > 0xFFFF)
+		++sum;
+	return(~sum & 0xFFFF);
 }

 static __inline u_short
 in_addword(u_short sum, u_short b)
 {
-	/* __volatile is necessary because the condition codes are used. 
*/ - __asm __volatile ("addw %1, %0" : "+r" (sum) : "r" (b)); - __asm __volatile ("adcw $0, %0" : "+r" (sum)); + /* __volatile is necessary because the condition codes are used. */ + __asm __volatile ("addw %1, %0; adcw $0,%0" : "+r" (sum) : "r" (b)); - return (sum); + return (sum); } static __inline u_short in_pseudo(u_int sum, u_int b, u_int c) { - /* __volatile is necessary because the condition codes are used. */ - __asm __volatile ("addl %1, %0" : "+r" (sum) : "g" (b)); - __asm __volatile ("adcl %1, %0" : "+r" (sum) : "g" (c)); - __asm __volatile ("adcl $0, %0" : "+r" (sum)); - - sum = (sum & 0xffff) + (sum >> 16); - if (sum > 0xffff) - sum -= 0xffff; - return (sum); + /* __volatile is necessary because the condition codes are used. */ + __asm __volatile ("addl %1,%0; adcl %2,%0; adcl $0,%0" + : "+r" (sum) + : "g" (b), "g" (c)); + sum = (sum & 0xffff) + (sum >> 16); + if (sum > 0xffff) + sum -= 0xffff; + return (sum); } -#else -u_int in_cksum_hdr (const struct ip *); -#define in_cksum_update(ip) \ - do { \ - int __tmpsum; \ - __tmpsum = (int)ntohs(ip->ip_sum) + 256; \ - ip->ip_sum = htons(__tmpsum + (__tmpsum >> 16)); \ - } while(0) - -#endif - -typedef unsigned in_psum_t; -#ifdef _KERNEL -u_short in_cksum_skip(struct mbuf *m, int len, int skip); -in_psum_t in_cksum_partial(in_psum_t psum, const u_short *w, int len); -int in_cksum_finalize(in_psum_t psum); -#endif /* _KERNEL */ - #endif /* _MACHINE_IN_CKSUM_H_ */ -- 2.41.0
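
Reviewer notes: the sketches below are illustrative only and are not part
of the patch. They are small hosted-C checks of the arithmetic the new
comments rely on; every name in them is invented for illustration.

First, the 16 bit fold that the asm_ones32() comment asks callers to
perform can be exercised in isolation:

    #include <assert.h>
    #include <stdint.h>

    /* Collapse a 32 bit partial one's complement sum to 16 bits. */
    static uint16_t
    fold32(uint32_t sum)
    {
            sum = (sum >> 16) + (sum & 0xffff);     /* now at most 0x1fffe */
            if (sum > 0xffff)
                    sum -= 0xffff;  /* same as (sum + 1) & 0xffff here */
            return ((uint16_t)sum);
    }

    int
    main(void)
    {
            assert(fold32(0xffffffffu) == 0xffff);  /* worst case: no double-carry */
            assert(fold32(0x0001fffeu) == 0xffff);
            assert(fold32(0x00010000u) == 0x0001);
            return (0);
    }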
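The assembly loop itself computes a plain 32 bit one's complement sum
(add with end-around carry). A portable C model, which may be useful for
differential testing against asm_ones32(), can push the carries through a
wider accumulator instead of adc; the two agree once folded to 16 bits,
differing at most in the 0 versus 0xffffffff representative:

    #include <stddef.h>
    #include <stdint.h>

    /* Reference model of asm_ones32(); not the committed code. */
    static uint32_t
    ones32_model(const uint32_t *buf, size_t nwords)
    {
            uint64_t sum = 0;
            size_t i;

            for (i = 0; i < nwords; ++i)
                    sum += buf[i];
            while (sum >> 32)       /* fold 64 -> 32 with end-around carry */
                    sum = (sum >> 32) + (sum & 0xffffffffu);
            return ((uint32_t)sum);
    }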
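The byte order example in the new top comment also checks out
mechanically: 0x0386 and 0x8603 are byte swaps of each other, which is
what endian-independence means here; the sum comes out in the byte order
of the data:

    #include <assert.h>
    #include <stdint.h>

    int
    main(void)
    {
            uint32_t be = 0x8142 + 0x8243;  /* words read big-endian: 0x10385 */
            uint32_t le = 0x4281 + 0x4382;  /* same bytes read little-endian */

            be = (be >> 16) + (be & 0xffff);        /* end-around carry -> 0x0386 */
            le = (le >> 16) + (le & 0xffff);        /* no carry -> 0x8603 */
            assert(be == 0x0386 && le == 0x8603);
            assert((uint16_t)((be << 8) | (be >> 8)) == le);        /* byte swap */
            return (0);
    }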
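The final "sum0 += sum1 << 8" merge in in_cksum_range() leans on the
identity that, modulo 0xffff, shifting left by 8 equals a 16 bit byte
swap (since 2^16 is congruent to 1 mod 0xffff), so contributions that
accumulated in the wrong byte lane can be corrected wholesale. A
brute-force check over all 16 bit values:

    #include <assert.h>
    #include <stdint.h>

    int
    main(void)
    {
            uint32_t x;

            for (x = 0; x <= 0xffff; ++x) {
                    uint32_t swapped = ((x << 8) | (x >> 8)) & 0xffff;

                    /* congruent mod 0xffff -> same folded checksum */
                    assert((x << 8) % 0xffff == swapped % 0xffff);
            }
            return (0);
    }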
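Finally, a hypothetical kernel-side fragment showing how the retained
in_pseudo()/in_cksum_skip() pair composes, modeled on the BSD delayed
checksum pattern; the function name and the 'len' parameter are
assumptions for illustration, not code from this patch. The pseudo-header
partial sum is parked in uh_sum, so the pseudo-header never has to exist
in the packet buffer:

    #include <sys/param.h>
    #include <sys/mbuf.h>
    #include <netinet/in.h>
    #include <netinet/in_systm.h>
    #include <netinet/ip.h>
    #include <netinet/udp.h>
    #include <machine/in_cksum.h>

    /* Hypothetical sketch; 'len' is the UDP payload length in bytes. */
    static void
    udp_cksum_sketch(struct mbuf *m, struct ip *ip, struct udphdr *uh, int len)
    {
            int hlen = ip->ip_hl << 2;

            /* Fold src/dst/len/proto first and park the partial sum. */
            uh->uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
                htons((u_short)(len + sizeof(struct udphdr) + IPPROTO_UDP)));

            /* Checksum the UDP header + payload; the parked sum is included. */
            uh->uh_sum = in_cksum_skip(m,
                (int)(hlen + sizeof(struct udphdr) + len), hlen);
            if (uh->uh_sum == 0)
                    uh->uh_sum = 0xffff;    /* 0 means "no checksum" for UDP */
    }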