kernel tree reorganization stage 1: Major cvs repository work (not logged as
[dragonfly.git] / sys / i386 / i386 / in_cksum.c
... / ...
CommitLineData
1/*-
2 * Copyright (c) 1990 The Regents of the University of California.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 * must display the following acknowledgement:
15 * This product includes software developed by the University of
16 * California, Berkeley and its contributors.
17 * 4. Neither the name of the University nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 * from tahoe: in_cksum.c 1.2 86/01/05
34 * from: @(#)in_cksum.c 1.3 (Berkeley) 1/19/91
35 * $FreeBSD: src/sys/i386/i386/in_cksum.c,v 1.17.2.3 2002/07/02 04:03:00 jdp Exp $
36 * $DragonFly: src/sys/i386/i386/Attic/in_cksum.c,v 1.3 2003/07/26 19:07:47 rob Exp $
37 */
38
39#include <sys/param.h>
40#include <sys/systm.h>
41#include <sys/mbuf.h>
42
43#include <netinet/in.h>
44#include <netinet/in_systm.h>
45#include <netinet/ip.h>
46
47#include <machine/in_cksum.h>
48
49/*
50 * Checksum routine for Internet Protocol family headers.
51 *
52 * This routine is very heavily used in the network
53 * code and should be modified for each CPU to be as fast as possible.
54 *
55 * This implementation is the 386 version.
56 */
57
/*
 * Helpers for folding the 32-bit accumulator back into 16 bits.
 *
 * ADDCARRY(x)	if x has overflowed 16 bits, subtract 0xffff (end-around
 *		carry).  x is evaluated more than once, so pass a plain
 *		lvalue without side effects.
 * REDUCE	adds the upper and lower 16-bit halves of the caller's local
 *		variable `sum' and then applies ADDCARRY to it.
 *
 * Both are wrapped in do { } while (0) so that they behave as a single
 * statement: `if (c) REDUCE; else ...' parses as written, and an `else'
 * following ADDCARRY(x); cannot bind to the macro's internal `if'.
 */
#undef ADDCARRY
#define ADDCARRY(x)	do {						\
	if ((x) > 0xffff)						\
		(x) -= 0xffff;						\
} while (0)
#define REDUCE		do {						\
	sum = (sum & 0xffff) + (sum >> 16);				\
	ADDCARRY(sum);							\
} while (0)
61
62/*
63 * These asm statements require __volatile because they pass information
64 * via the condition codes. GCC does not currently provide a way to specify
65 * the condition codes as an input or output operand.
66 *
67 * The LOAD macro below is effectively a prefetch into cache. GCC will
68 * load the value into a register but will not use it. Since modern CPUs
69 * reorder operations, this will generally take place in parallel with
70 * other calculations.
 *
 * All four macros operate implicitly on the caller's local variables
 * `sum' (the running accumulator) and `w' (the current data pointer).
 * `n' is a byte offset from w; it is divided by 4 to index w as an array
 * of 32-bit words.
 *
 * ADD(n)	addl: sum += 32-bit word at byte offset n (starts a chain)
 * ADDC(n)	adcl: sum += 32-bit word at byte offset n + incoming carry
 * LOAD(n)	empty asm whose only effect is to require the word at byte
 *		offset n as a register input
 * MOP		adcl $0: sum += incoming carry (ends a chain)
 *
 * NOTE(review): the carry flag travels between separate asm statements.
 * GCC documents that consecutive asm statements are not guaranteed to stay
 * adjacent even when volatile, so this presumably relies on the compiler
 * not emitting a flag-clobbering instruction between them -- confirm
 * against the compiler in use, or merge each chain into one asm statement.
71 */
72#define ADD(n) __asm __volatile \
73 ("addl %1, %0" : "+r" (sum) : \
74 "g" (((const u_int32_t *)w)[n / 4]))
75#define ADDC(n) __asm __volatile \
76 ("adcl %1, %0" : "+r" (sum) : \
77 "g" (((const u_int32_t *)w)[n / 4]))
78#define LOAD(n) __asm __volatile \
79 ("" : : "r" (((const u_int32_t *)w)[n / 4]))
80#define MOP __asm __volatile \
81 ("adcl $0, %0" : "+r" (sum))
82
/*
 * in_cksum: compute the Internet checksum over the first `len' bytes of
 * the mbuf chain starting at `m'.
 *
 * m	head of the mbuf chain; it is followed through m_next, and mbufs
 *	with m_len == 0 are skipped
 * len	number of bytes to checksum
 *
 * Returns the ones-complement of the folded sum, masked to 16 bits.
 * If the chain ends before `len' bytes have been consumed, a diagnostic
 * is printed and the checksum of the bytes actually seen is returned.
 *
 * Local state:
 *	sum		accumulator; REDUCE folds it back to 16 bits
 *	mlen		bytes still to process in the current mbuf; the value
 *			-1 at the end of an iteration means a lone byte is
 *			parked in su.c[0] and still awaits its partner
 *	byte_swapped	set when an odd start address caused sum to be
 *			shifted left by 8; undone at the end of that mbuf
 *	su		assembles a 16-bit word from two separately read bytes
 */
83int
84in_cksum(m, len)
85 struct mbuf *m;
86 int len;
87{
88 u_short *w;
89 unsigned sum = 0;
90 int mlen = 0;
91 int byte_swapped = 0;
92 union { char c[2]; u_short s; } su;
93
94 for (;m && len; m = m->m_next) {
95 if (m->m_len == 0)
96 continue;
97 w = mtod(m, u_short *);
98 if (mlen == -1) {
99 /*
100 * The first byte of this mbuf is the continuation
101 * of a word spanning between this mbuf and the
102 * last mbuf.
103 */
104
105 /* su.c[0] is already saved when scanning previous
106 * mbuf. sum was REDUCEd when we found mlen == -1
107 */
108 su.c[1] = *(u_char *)w;
109 sum += su.s;
110 w = (u_short *)((char *)w + 1);
111 mlen = m->m_len - 1;
112 len--;
113 } else
114 mlen = m->m_len;
 /*
 * Never sum more than the caller asked for; the bytes about to be
 * processed are charged against len up front.
 */
115 if (len < mlen)
116 mlen = len;
117 len -= mlen;
118 /*
119 * Force to long boundary so we do longword aligned
120 * memory operations
121 */
122 if (3 & (int) w) { /* low address bits set: not 4-byte aligned */
123 REDUCE;
 /*
 * Odd address: shift sum left by 8 and park the first byte in
 * su.c[0].  The matching shift back, and the pairing of the
 * parked byte, happen after the word loops below.
 */
124 if ((1 & (int) w) && (mlen > 0)) {
125 sum <<= 8;
126 su.c[0] = *(char *)w;
127 w = (u_short *)((char *)w + 1);
128 mlen--;
129 byte_swapped = 1;
130 }
 /* 2-byte aligned only: consume one 16-bit word. */
131 if ((2 & (int) w) && (mlen >= 2)) {
132 sum += *w++;
133 mlen -= 2;
134 }
135 }
136 /*
137 * Advance to a 486 cache line boundary.
138 */
139 if (4 & (int) w && mlen >= 4) {
140 ADD(0);
141 MOP;
142 w += 2;
143 mlen -= 4;
144 }
145 if (8 & (int) w && mlen >= 8) {
146 ADD(0);
147 ADDC(4);
148 MOP;
149 w += 4;
150 mlen -= 8;
151 }
152 /*
153 * Do as much of the checksum as possible 32 bits at a time.
154 * In fact, this loop is unrolled to make overhead from
155 * branches &c small.
156 */
157 mlen -= 1; /* bias by one so the loop below needs at least 33 bytes */
158 while ((mlen -= 32) >= 0) {
159 /*
160 * Add with carry 16 words and fold in the last
161 * carry by adding a 0 with carry.
162 *
163 * The early ADD(16) and the LOAD(32) are to load
164 * the next 2 cache lines in advance on 486's. The
165 * 486 has a penalty of 2 clock cycles for loading
166 * a cache line, plus whatever time the external
167 * memory takes to load the first word(s) addressed.
168 * These penalties are unavoidable. Subsequent
169 * accesses to a cache line being loaded (and to
170 * other external memory?) are delayed until the
171 * whole load finishes. These penalties are mostly
172 * avoided by not accessing external memory for
173 * 8 cycles after the ADD(16) and 12 cycles after
174 * the LOAD(32). The loop terminates when mlen
175 * is initially 33 (not 32) to guarantee that
176 * the LOAD(32) is within bounds.
177 */
178 ADD(16);
179 ADDC(0);
180 ADDC(4);
181 ADDC(8);
182 ADDC(12);
183 LOAD(32);
184 ADDC(20);
185 ADDC(24);
186 ADDC(28);
187 MOP;
188 w += 16;
189 }
190 mlen += 32 + 1; /* undo the loop's final subtraction and the bias */
 /* At most one full 32-byte run is left; sum it without the LOAD(32). */
191 if (mlen >= 32) {
192 ADD(16);
193 ADDC(0);
194 ADDC(4);
195 ADDC(8);
196 ADDC(12);
197 ADDC(20);
198 ADDC(24);
199 ADDC(28);
200 MOP;
201 w += 16;
202 mlen -= 32;
203 }
204 if (mlen >= 16) {
205 ADD(0);
206 ADDC(4);
207 ADDC(8);
208 ADDC(12);
209 MOP;
210 w += 8;
211 mlen -= 16;
212 }
213 if (mlen >= 8) {
214 ADD(0);
215 ADDC(4);
216 MOP;
217 w += 4;
218 mlen -= 8;
219 }
220 if (mlen == 0 && byte_swapped == 0)
221 continue; /* worth 1% maybe ?? */
222 REDUCE;
 /* Fewer than 8 bytes remain: add them 16 bits at a time. */
223 while ((mlen -= 2) >= 0) {
224 sum += *w++;
225 }
226 if (byte_swapped) {
 /* Undo the shift applied at the odd start address. */
227 sum <<= 8;
228 byte_swapped = 0;
229 if (mlen == -1) {
 /* One trailing byte: it pairs with the parked byte. */
230 su.c[1] = *(char *)w;
231 sum += su.s;
232 mlen = 0;
233 } else
 /* Parked byte is still unpaired; carry it forward. */
234 mlen = -1;
235 } else if (mlen == -1)
236 /*
237 * This mbuf has odd number of bytes.
238 * There could be a word split between
239 * this mbuf and the next mbuf.
240 * Save the last byte (to prepend to next mbuf).
241 */
242 su.c[0] = *(char *)w;
243 }
244
 /* The chain ran out before len bytes were consumed. */
245 if (len)
246 printf("%s: out of data by %d\n", __func__, len);
247 if (mlen == -1) {
248 /* The last mbuf has odd # of bytes. Follow the
249 standard (the odd byte is shifted left by 8 bits) */
250 su.c[1] = 0;
251 sum += su.s;
252 }
253 REDUCE;
254 return (~sum & 0xffff);
255}
256
/*
 * in_cksum_skip: like in_cksum, but the first `skip' bytes of the chain
 * are excluded from the checksum.
 *
 * m	head of the mbuf chain; it is followed through m_next
 * len	length in bytes including the skipped prefix; `len - skip'
 *	bytes are actually summed
 * skip	number of leading bytes to step over
 *
 * Returns the ones-complement of the folded sum, masked to 16 bits.
 * If the chain ends before all requested bytes have been consumed, a
 * diagnostic is printed and the checksum of the bytes seen is returned.
 *
 * The roles of sum, mlen, byte_swapped and su are as in in_cksum:
 * mlen == -1 at the end of an iteration means a lone byte is parked in
 * su.c[0] awaiting its partner, and byte_swapped records that sum was
 * shifted left by 8 because of an odd start address.
 */
257u_short
258in_cksum_skip(m, len, skip)
259 struct mbuf *m;
260 int len;
261 int skip;
262{
263 u_short *w;
264 unsigned sum = 0;
265 int mlen = 0;
266 int byte_swapped = 0;
267 union { char c[2]; u_short s; } su;
268
269 len -= skip;
 /*
 * Step over whole mbufs that lie entirely inside the skipped prefix.
 * When the mbuf holding the first byte to be summed is found, set w
 * and mlen for its remainder and jump into the body of the main loop.
 */
270 for (; skip && m; m = m->m_next) {
271 if (m->m_len > skip) {
272 mlen = m->m_len - skip;
273 w = (u_short *)(mtod(m, u_char *) + skip);
274 goto skip_start;
275 } else {
276 skip -= m->m_len;
277 }
278 }
279
280 for (;m && len; m = m->m_next) {
281 if (m->m_len == 0)
282 continue;
283 w = mtod(m, u_short *);
284 if (mlen == -1) {
285 /*
286 * The first byte of this mbuf is the continuation
287 * of a word spanning between this mbuf and the
288 * last mbuf.
289 */
290
291 /* su.c[0] is already saved when scanning previous
292 * mbuf. sum was REDUCEd when we found mlen == -1
293 */
294 su.c[1] = *(u_char *)w;
295 sum += su.s;
296 w = (u_short *)((char *)w + 1);
297 mlen = m->m_len - 1;
298 len--;
299 } else
300 mlen = m->m_len;
301skip_start:
 /*
 * Never sum more than requested; the bytes about to be processed
 * are charged against len up front.
 */
302 if (len < mlen)
303 mlen = len;
304 len -= mlen;
305 /*
306 * Force to long boundary so we do longword aligned
307 * memory operations
308 */
309 if (3 & (int) w) { /* low address bits set: not 4-byte aligned */
310 REDUCE;
 /*
 * Odd address: shift sum left by 8 and park the first byte in
 * su.c[0].  The matching shift back, and the pairing of the
 * parked byte, happen after the word loops below.
 */
311 if ((1 & (int) w) && (mlen > 0)) {
312 sum <<= 8;
313 su.c[0] = *(char *)w;
314 w = (u_short *)((char *)w + 1);
315 mlen--;
316 byte_swapped = 1;
317 }
 /* 2-byte aligned only: consume one 16-bit word. */
318 if ((2 & (int) w) && (mlen >= 2)) {
319 sum += *w++;
320 mlen -= 2;
321 }
322 }
323 /*
324 * Advance to a 486 cache line boundary.
325 */
326 if (4 & (int) w && mlen >= 4) {
327 ADD(0);
328 MOP;
329 w += 2;
330 mlen -= 4;
331 }
332 if (8 & (int) w && mlen >= 8) {
333 ADD(0);
334 ADDC(4);
335 MOP;
336 w += 4;
337 mlen -= 8;
338 }
339 /*
340 * Do as much of the checksum as possible 32 bits at a time.
341 * In fact, this loop is unrolled to make overhead from
342 * branches &c small.
343 */
344 mlen -= 1; /* bias by one so the loop below needs at least 33 bytes */
345 while ((mlen -= 32) >= 0) {
346 /*
347 * Add with carry 16 words and fold in the last
348 * carry by adding a 0 with carry.
349 *
350 * The early ADD(16) and the LOAD(32) are to load
351 * the next 2 cache lines in advance on 486's. The
352 * 486 has a penalty of 2 clock cycles for loading
353 * a cache line, plus whatever time the external
354 * memory takes to load the first word(s) addressed.
355 * These penalties are unavoidable. Subsequent
356 * accesses to a cache line being loaded (and to
357 * other external memory?) are delayed until the
358 * whole load finishes. These penalties are mostly
359 * avoided by not accessing external memory for
360 * 8 cycles after the ADD(16) and 12 cycles after
361 * the LOAD(32). The loop terminates when mlen
362 * is initially 33 (not 32) to guarantee that
363 * the LOAD(32) is within bounds.
364 */
365 ADD(16);
366 ADDC(0);
367 ADDC(4);
368 ADDC(8);
369 ADDC(12);
370 LOAD(32);
371 ADDC(20);
372 ADDC(24);
373 ADDC(28);
374 MOP;
375 w += 16;
376 }
377 mlen += 32 + 1; /* undo the loop's final subtraction and the bias */
 /* At most one full 32-byte run is left; sum it without the LOAD(32). */
378 if (mlen >= 32) {
379 ADD(16);
380 ADDC(0);
381 ADDC(4);
382 ADDC(8);
383 ADDC(12);
384 ADDC(20);
385 ADDC(24);
386 ADDC(28);
387 MOP;
388 w += 16;
389 mlen -= 32;
390 }
391 if (mlen >= 16) {
392 ADD(0);
393 ADDC(4);
394 ADDC(8);
395 ADDC(12);
396 MOP;
397 w += 8;
398 mlen -= 16;
399 }
400 if (mlen >= 8) {
401 ADD(0);
402 ADDC(4);
403 MOP;
404 w += 4;
405 mlen -= 8;
406 }
407 if (mlen == 0 && byte_swapped == 0)
408 continue; /* worth 1% maybe ?? */
409 REDUCE;
 /* Fewer than 8 bytes remain: add them 16 bits at a time. */
410 while ((mlen -= 2) >= 0) {
411 sum += *w++;
412 }
413 if (byte_swapped) {
 /* Undo the shift applied at the odd start address. */
414 sum <<= 8;
415 byte_swapped = 0;
416 if (mlen == -1) {
 /* One trailing byte: it pairs with the parked byte. */
417 su.c[1] = *(char *)w;
418 sum += su.s;
419 mlen = 0;
420 } else
 /* Parked byte is still unpaired; carry it forward. */
421 mlen = -1;
422 } else if (mlen == -1)
423 /*
424 * This mbuf has odd number of bytes.
425 * There could be a word split between
426 * this mbuf and the next mbuf.
427 * Save the last byte (to prepend to next mbuf).
428 */
429 su.c[0] = *(char *)w;
430 }
431
 /* The chain ran out before all requested bytes were consumed. */
432 if (len)
433 printf("%s: out of data by %d\n", __func__, len);
434 if (mlen == -1) {
435 /* The last mbuf has odd # of bytes. Follow the
436 standard (the odd byte is shifted left by 8 bits) */
437 su.c[1] = 0;
438 sum += su.s;
439 }
440 REDUCE;
441 return (~sum & 0xffff);
442}
443
444/*
445 * This is the exact same algorithm as above with a few exceptions:
446 * (1) it is designed to operate on buffers, not mbufs
447 * (2) it returns an intermediate form of the sum which has to be
448 * explicitly finalized (but this can be delayed)
449 * (3) it accepts an intermediate sum
450 *
451 * This is particularly useful when building packets quickly,
452 * since one can compute the checksum of the pseudoheader ahead of
453 * time and then use this function to complete the work. That way,
454 * the pseudoheader never actually has to exist in the packet buffer,
455 * which avoids needless duplication of work.
 *
 * psum	intermediate sum to continue from
 * w	start of the buffer to add; any alignment is accepted, since
 *	misaligned leading bytes are consumed before the 32-bit adds
 * len	number of bytes in the buffer
 *
 * Returns the updated intermediate sum.  It is neither folded nor
 * complemented on return; in_cksum_finalize() performs that step.
456 */
457in_psum_t
458in_cksum_partial(psum, w, len)
459 in_psum_t psum;
460 const u_short *w;
461 int len;
462{
463 in_psum_t sum = psum;
464 int byte_swapped = 0;
465 union { char c[2]; u_short s; } su;
466
467 /*
468 * Force to long boundary so we do longword aligned
469 * memory operations
470 */
471 if (3 & (int) w) { /* low address bits set: not 4-byte aligned */
472 REDUCE;
 /*
 * Odd address: shift sum left by 8 and park the first byte in
 * su.c[0].  The matching shift back, and the pairing of the
 * parked byte, happen after the word loops below.
 */
473 if ((1 & (int) w) && (len > 0)) {
474 sum <<= 8;
475 su.c[0] = *(const char *)w;
476 w = (const u_short *)((const char *)w + 1);
477 len--;
478 byte_swapped = 1;
479 }
 /* 2-byte aligned only: consume one 16-bit word. */
480 if ((2 & (int) w) && (len >= 2)) {
481 sum += *w++;
482 len -= 2;
483 }
484 }
485 /*
486 * Advance to a 486 cache line boundary.
487 */
488 if (4 & (int) w && len >= 4) {
489 ADD(0);
490 MOP;
491 w += 2;
492 len -= 4;
493 }
494 if (8 & (int) w && len >= 8) {
495 ADD(0);
496 ADDC(4);
497 MOP;
498 w += 4;
499 len -= 8;
500 }
501 /*
502 * Do as much of the checksum as possible 32 bits at a time.
503 * In fact, this loop is unrolled to make overhead from
504 * branches &c small.
505 */
506 len -= 1; /* bias by one so the loop below needs at least 33 bytes */
507 while ((len -= 32) >= 0) {
508 /*
509 * Add with carry 16 words and fold in the last
510 * carry by adding a 0 with carry.
511 *
512 * The early ADD(16) and the LOAD(32) are to load
513 * the next 2 cache lines in advance on 486's. The
514 * 486 has a penalty of 2 clock cycles for loading
515 * a cache line, plus whatever time the external
516 * memory takes to load the first word(s) addressed.
517 * These penalties are unavoidable. Subsequent
518 * accesses to a cache line being loaded (and to
519 * other external memory?) are delayed until the
520 * whole load finishes. These penalties are mostly
521 * avoided by not accessing external memory for
522 * 8 cycles after the ADD(16) and 12 cycles after
523 * the LOAD(32). The loop terminates when len
524 * is initially 33 (not 32) to guarantee that
525 * the LOAD(32) is within bounds.
526 */
527 ADD(16);
528 ADDC(0);
529 ADDC(4);
530 ADDC(8);
531 ADDC(12);
532 LOAD(32);
533 ADDC(20);
534 ADDC(24);
535 ADDC(28);
536 MOP;
537 w += 16;
538 }
539 len += 32 + 1; /* undo the loop's final subtraction and the bias */
 /* At most one full 32-byte run is left; sum it without the LOAD(32). */
540 if (len >= 32) {
541 ADD(16);
542 ADDC(0);
543 ADDC(4);
544 ADDC(8);
545 ADDC(12);
546 ADDC(20);
547 ADDC(24);
548 ADDC(28);
549 MOP;
550 w += 16;
551 len -= 32;
552 }
553 if (len >= 16) {
554 ADD(0);
555 ADDC(4);
556 ADDC(8);
557 ADDC(12);
558 MOP;
559 w += 8;
560 len -= 16;
561 }
562 if (len >= 8) {
563 ADD(0);
564 ADDC(4);
565 MOP;
566 w += 4;
567 len -= 8;
568 }
 /* Nothing left over and no pending shift: skip the tail handling. */
569 if (len == 0 && byte_swapped == 0)
570 goto out;
571 REDUCE;
 /* Fewer than 8 bytes remain: add them 16 bits at a time. */
572 while ((len -= 2) >= 0) {
573 sum += *w++;
574 }
575 if (byte_swapped) {
 /* Undo the shift applied at the odd start address. */
576 sum <<= 8;
577 byte_swapped = 0;
578 if (len == -1) {
 /* One trailing byte: it pairs with the parked byte. */
579 su.c[1] = *(const char *)w;
580 sum += su.s;
581 len = 0;
582 } else
 /* Parked byte is still unpaired; finished off at `out'. */
583 len = -1;
584 } else if (len == -1) {
585 /*
586 * This buffer has odd number of bytes.
587 * There could be a word split between
588 * this buffer and the next.
589 */
590 su.c[0] = *(const char *)w;
591 }
592out:
593 if (len == -1) {
594 /* The last buffer has odd # of bytes. Follow the
595 standard (the odd byte is shifted left by 8 bits) */
596 su.c[1] = 0;
597 sum += su.s;
598 }
599 return sum;
600}
601
602int
603in_cksum_finalize(psum)
604 in_psum_t psum;
605{
606 in_psum_t sum = psum;
607 REDUCE;
608 return (~sum & 0xffff);
609}