462164ba3ed662856699704a31d5cdaa33a81fba
[dragonfly.git] / sys / dev / netif / mxge / mxge_lro.c
1 /******************************************************************************
2
3 Copyright (c) 2007-2008, Myricom Inc.
4 All rights reserved.
5
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11
12  2. Neither the name of the Myricom Inc, nor the names of its
13     contributors may be used to endorse or promote products derived from
14     this software without specific prior written permission.
15
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
27
28 ***************************************************************************/
29
30 #include <sys/cdefs.h>
31 /*__FBSDID("$FreeBSD: src/sys/dev/mxge/mxge_lro.c,v 1.8 2009/06/23 17:42:06 gallatin Exp $");*/
32
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/endian.h>
36 #include <sys/mbuf.h>
37 #include <sys/kernel.h>
38 #include <sys/socket.h>
39 #include <sys/sysctl.h>
40 #include <sys/bus.h>
41
42 #include <net/if.h>
43 #include <net/ethernet.h>
44 #include <net/if_media.h>
45
46 #include <netinet/in_systm.h>
47 #include <netinet/in.h>
48 #include <netinet/ip.h>
49 #include <netinet/tcp.h>
50
51 #include <machine/bus.h>
52 #include <machine/in_cksum.h>
53
54 #include <dev/netif/mxge/mxge_mcp.h>
55 #include <dev/netif/mxge/if_mxge_var.h>
56
57 #include "opt_inet.h"
58
59 #ifdef INET
60
/*
 * Sum a buffer as native 16-bit words and fold the carries back in,
 * yielding a 16-bit ones-complement style sum (not inverted).
 *
 * raw - first 16-bit word of the buffer
 * len - buffer length in bytes; must be a multiple of 4, because two
 *       words are consumed on every iteration
 */
static uint16_t
mxge_csum_generic(uint16_t *raw, int len)
{
        uint32_t sum = 0;
        int remaining;

        for (remaining = len; remaining > 0; remaining -= 4) {
                sum += raw[0];
                sum += raw[1];
                raw += 2;
        }

        /* two folds are enough to absorb every carry out of bit 15 */
        sum = (sum & 0xffff) + (sum >> 16);
        sum = (sum & 0xffff) + (sum >> 16);

        return (uint16_t)sum;
}
78
79
80 void
81 mxge_lro_flush(struct mxge_slice_state *ss, struct lro_entry *lro)
82 {
83         mxge_softc_t *mgp = ss->sc;
84         struct ifnet *ifp;
85         struct ip *ip;
86         struct tcphdr *tcp;
87         uint32_t *ts_ptr;
88         uint32_t tcplen, tcp_csum;
89
90         if (lro->append_cnt) {
91                 /* incorporate the new len into the ip header and
92                  * re-calculate the checksum */
93                 ip = lro->ip;
94                 ip->ip_len = htons(lro->len - ETHER_HDR_LEN);
95                 ip->ip_sum = 0;
96                 ip->ip_sum = 0xffff ^ 
97                         mxge_csum_generic((uint16_t*)ip,
98                                               sizeof (*ip));
99
100                 lro->m_head->m_pkthdr.csum_flags = CSUM_IP_CHECKED |
101                         CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
102                 lro->m_head->m_pkthdr.csum_data = 0xffff;
103                 lro->m_head->m_pkthdr.len = lro->len;
104
105                 /* incorporate the latest ack into the tcp header */
106                 tcp = (struct tcphdr *) (ip + 1);
107                 tcp->th_ack = lro->ack_seq;
108                 tcp->th_win = lro->window;
109                 /* incorporate latest timestamp into the tcp header */
110                 if (lro->timestamp) {
111                         ts_ptr = (uint32_t *)(tcp + 1);
112                         ts_ptr[1] = htonl(lro->tsval);
113                         ts_ptr[2] = lro->tsecr;
114                 }
115                 /* 
116                  * update checksum in tcp header by re-calculating the
117                  * tcp pseudoheader checksum, and adding it to the checksum
118                  * of the tcp payload data 
119                  */
120                 tcp->th_sum = 0;
121                 tcplen = lro->len - sizeof(*ip) - ETHER_HDR_LEN;
122                 tcp_csum = lro->data_csum;
123                 tcp_csum += in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
124                                       htons(tcplen + IPPROTO_TCP));
125                 tcp_csum += mxge_csum_generic((uint16_t*)tcp,
126                                                   tcp->th_off << 2);
127                 tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16);
128                 tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16);
129 #if 0
130                 IOLog("pseudo = 0x%x, generic = 0x%x, sum = %x\n", 
131                       in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
132                                 htons(tcplen + IPPROTO_TCP)),
133                       mxge_csum_generic((uint16_t*)tcp,
134                                             tcp->th_off << 2),
135                       htons(0xffff ^ tcp_csum));
136 #endif
137                 tcp->th_sum = 0xffff ^ tcp_csum;
138         }
139         ifp = mgp->ifp;
140         (*ifp->if_input)(mgp->ifp, lro->m_head);
141         ss->lro_queued += lro->append_cnt + 1;
142         ss->lro_flushed++;
143         lro->m_head = NULL;
144         lro->timestamp = 0;
145         lro->append_cnt = 0;
146         SLIST_INSERT_HEAD(&ss->lro_free, lro, next);
147 }
148
149 int
150 mxge_lro_rx(struct mxge_slice_state *ss, struct mbuf *m_head, uint32_t csum)
151 {
152         struct ether_header *eh;
153         struct ip *ip;
154         struct tcphdr *tcp;
155         uint32_t *ts_ptr;
156         struct mbuf *m_nxt, *m_tail;
157         struct lro_entry *lro;
158         int hlen, ip_len, tcp_hdr_len, tcp_data_len, tot_len;
159         int opt_bytes, trim;
160         uint32_t seq, tmp_csum, device_mtu;
161
162         eh = mtod(m_head, struct ether_header *);
163         if (eh->ether_type != htons(ETHERTYPE_IP))
164                 return 1;
165         ip = (struct ip *) (eh + 1);
166         if (ip->ip_p != IPPROTO_TCP)
167                 return 1;
168         
169         /* ensure there are no options */
170         if ((ip->ip_hl << 2) != sizeof (*ip))
171                 return -1;
172
173         /* .. and the packet is not fragmented */
174         if (ip->ip_off & htons(IP_MF|IP_OFFMASK))
175                 return -1;
176
177         /* verify that the IP header checksum is correct */
178         tmp_csum = mxge_csum_generic((uint16_t *)ip, sizeof (*ip));
179         if (__predict_false((tmp_csum ^ 0xffff) != 0)) {
180                 ss->lro_bad_csum++;
181                 return -1;
182         }
183
184         /* find the TCP header */
185         tcp = (struct tcphdr *) (ip + 1);
186
187         /* ensure no bits set besides ack or psh */
188         if ((tcp->th_flags & ~(TH_ACK | TH_PUSH)) != 0)
189                 return -1;
190
191         /* check for timestamps. Since the only option we handle are
192            timestamps, we only have to handle the simple case of
193            aligned timestamps */
194
195         opt_bytes = (tcp->th_off << 2) - sizeof (*tcp);
196         tcp_hdr_len =  sizeof (*tcp) + opt_bytes;
197         ts_ptr = (uint32_t *)(tcp + 1);
198         if (opt_bytes != 0) {
199                 if (__predict_false(opt_bytes != TCPOLEN_TSTAMP_APPA) ||
200                     (*ts_ptr !=  ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))
201                         return -1;
202         }
203
204         ip_len = ntohs(ip->ip_len);
205         tcp_data_len = ip_len - (tcp->th_off << 2) - sizeof (*ip);
206         
207
208         /* 
209          * If frame is padded beyond the end of the IP packet,
210          * then we must trim the extra bytes off the end.
211          */
212         tot_len = m_head->m_pkthdr.len;
213         trim = tot_len - (ip_len + ETHER_HDR_LEN);
214         if (trim != 0) {
215                 if (trim < 0) {
216                         /* truncated packet */
217                         return -1;
218                 }
219                 m_adj(m_head, -trim);
220                 tot_len = m_head->m_pkthdr.len;
221         }
222
223         m_nxt = m_head;
224         m_tail = NULL; /* -Wuninitialized */
225         while (m_nxt != NULL) {
226                 m_tail = m_nxt;
227                 m_nxt = m_tail->m_next;
228         }
229
230         hlen = ip_len + ETHER_HDR_LEN - tcp_data_len;
231         seq = ntohl(tcp->th_seq);
232
233         SLIST_FOREACH(lro, &ss->lro_active, next) {
234                 if (lro->source_port == tcp->th_sport && 
235                     lro->dest_port == tcp->th_dport &&
236                     lro->source_ip == ip->ip_src.s_addr && 
237                     lro->dest_ip == ip->ip_dst.s_addr) {
238                         /* Try to append it */
239
240                         if (__predict_false(seq != lro->next_seq)) {
241                                 /* out of order packet */
242                                 SLIST_REMOVE(&ss->lro_active, lro,
243                                              lro_entry, next);
244                                 mxge_lro_flush(ss, lro);
245                                 return -1;
246                         }
247
248                         if (opt_bytes) {
249                                 uint32_t tsval = ntohl(*(ts_ptr + 1));
250                                 /* make sure timestamp values are increasing */
251                                 if (__predict_false(lro->tsval > tsval || 
252                                              *(ts_ptr + 2) == 0)) {
253                                         return -1;
254                                 }
255                                 lro->tsval = tsval;
256                                 lro->tsecr = *(ts_ptr + 2);
257                         }
258
259                         lro->next_seq += tcp_data_len;
260                         lro->ack_seq = tcp->th_ack;
261                         lro->window = tcp->th_win;
262                         lro->append_cnt++;
263                         if (tcp_data_len == 0) {
264                                 m_freem(m_head);
265                                 return 0;
266                         }
267                         /* subtract off the checksum of the tcp header
268                          * from the hardware checksum, and add it to the
269                          * stored tcp data checksum.  Byteswap the checksum
270                          * if the total length so far is odd 
271                          */
272                         tmp_csum = mxge_csum_generic((uint16_t*)tcp,
273                                                          tcp_hdr_len);
274                         csum = csum + (tmp_csum ^ 0xffff);
275                         csum = (csum & 0xffff) + (csum >> 16);
276                         csum = (csum & 0xffff) + (csum >> 16);
277                         if (lro->len & 0x1) {
278                                 /* Odd number of bytes so far, flip bytes */
279                                 csum = ((csum << 8) | (csum >> 8)) & 0xffff;
280                         }
281                         csum = csum + lro->data_csum;
282                         csum = (csum & 0xffff) + (csum >> 16);
283                         csum = (csum & 0xffff) + (csum >> 16);
284                         lro->data_csum = csum;
285
286                         lro->len += tcp_data_len;
287
288                         /* adjust mbuf so that m->m_data points to
289                            the first byte of the payload */
290                         m_adj(m_head, hlen);
291                         /* append mbuf chain */
292                         lro->m_tail->m_next = m_head;
293                         /* advance the last pointer */
294                         lro->m_tail = m_tail;
295                         /* flush packet if required */
296                         device_mtu = ss->sc->ifp->if_mtu;
297                         if (lro->len > (65535 - device_mtu)) {
298                                 SLIST_REMOVE(&ss->lro_active, lro,
299                                              lro_entry, next);
300                                 mxge_lro_flush(ss, lro);
301                         }
302                         return 0;
303                 }
304         }
305
306         if (SLIST_EMPTY(&ss->lro_free))
307             return -1;
308
309         /* start a new chain */
310         lro = SLIST_FIRST(&ss->lro_free);
311         SLIST_REMOVE_HEAD(&ss->lro_free, next);
312         SLIST_INSERT_HEAD(&ss->lro_active, lro, next);
313         lro->source_port = tcp->th_sport;
314         lro->dest_port = tcp->th_dport;
315         lro->source_ip = ip->ip_src.s_addr;
316         lro->dest_ip = ip->ip_dst.s_addr;
317         lro->next_seq = seq + tcp_data_len;
318         lro->mss = tcp_data_len;
319         lro->ack_seq = tcp->th_ack;
320         lro->window = tcp->th_win;
321
322         /* save the checksum of just the TCP payload by
323          * subtracting off the checksum of the TCP header from
324          * the entire hardware checksum 
325          * Since IP header checksum is correct, checksum over
326          * the IP header is -0.  Substracting -0 is unnecessary.
327          */
328         tmp_csum = mxge_csum_generic((uint16_t*)tcp, tcp_hdr_len);
329         csum = csum + (tmp_csum ^ 0xffff);
330         csum = (csum & 0xffff) + (csum >> 16);
331         csum = (csum & 0xffff) + (csum >> 16);
332         lro->data_csum = csum;
333         
334         lro->ip = ip;
335         /* record timestamp if it is present */
336         if (opt_bytes) {
337                 lro->timestamp = 1;
338                 lro->tsval = ntohl(*(ts_ptr + 1));
339                 lro->tsecr = *(ts_ptr + 2);
340         }
341         lro->len = tot_len;
342         lro->m_head = m_head;
343         lro->m_tail = m_tail;
344         return 0;
345 }
346
347 #endif /* INET */
348 /*
349   This file uses Myri10GE driver indentation.
350
351   Local Variables:
352   c-file-style:"linux"
353   tab-width:8
354   End:
355 */