import mxge from FreeBSD as is
[dragonfly.git] / sys / dev / netif / mxge / if_mxge.c
CommitLineData
8892ea20
AE
1/******************************************************************************
2
3Copyright (c) 2006-2009, Myricom Inc.
4All rights reserved.
5
6Redistribution and use in source and binary forms, with or without
7modification, are permitted provided that the following conditions are met:
8
9 1. Redistributions of source code must retain the above copyright notice,
10 this list of conditions and the following disclaimer.
11
12 2. Neither the name of the Myricom Inc, nor the names of its
13 contributors may be used to endorse or promote products derived from
14 this software without specific prior written permission.
15
16THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26POSSIBILITY OF SUCH DAMAGE.
27
28***************************************************************************/
29
30#include <sys/cdefs.h>
31__FBSDID("$FreeBSD: src/sys/dev/mxge/if_mxge.c,v 1.63 2009/06/26 11:45:06 rwatson Exp $");
32
33#include <sys/param.h>
34#include <sys/systm.h>
35#include <sys/linker.h>
36#include <sys/firmware.h>
37#include <sys/endian.h>
38#include <sys/sockio.h>
39#include <sys/mbuf.h>
40#include <sys/malloc.h>
41#include <sys/kdb.h>
42#include <sys/kernel.h>
43#include <sys/lock.h>
44#include <sys/module.h>
45#include <sys/socket.h>
46#include <sys/sysctl.h>
47#include <sys/sx.h>
48
49/* count xmits ourselves, rather than via drbr */
50#define NO_SLOW_STATS
51#include <net/if.h>
52#include <net/if_arp.h>
53#include <net/ethernet.h>
54#include <net/if_dl.h>
55#include <net/if_media.h>
56
57#include <net/bpf.h>
58
59#include <net/if_types.h>
60#include <net/if_vlan_var.h>
61#include <net/zlib.h>
62
63#include <netinet/in_systm.h>
64#include <netinet/in.h>
65#include <netinet/ip.h>
66#include <netinet/tcp.h>
67
68#include <machine/bus.h>
69#include <machine/in_cksum.h>
70#include <machine/resource.h>
71#include <sys/bus.h>
72#include <sys/rman.h>
73#include <sys/smp.h>
74
75#include <dev/pci/pcireg.h>
76#include <dev/pci/pcivar.h>
77#include <dev/pci/pci_private.h> /* XXX for pci_cfg_restore */
78
79#include <vm/vm.h> /* for pmap_mapdev() */
80#include <vm/pmap.h>
81
82#if defined(__i386) || defined(__amd64)
83#include <machine/specialreg.h>
84#endif
85
86#include <dev/mxge/mxge_mcp.h>
87#include <dev/mxge/mcp_gen_header.h>
88/*#define MXGE_FAKE_IFP*/
89#include <dev/mxge/if_mxge_var.h>
90#ifdef IFNET_BUF_RING
91#include <sys/buf_ring.h>
92#endif
93
94#include "opt_inet.h"
95
/* tunable params */
static int mxge_nvidia_ecrc_enable = 1;	/* try to enable ECRC on nVidia bridges */
static int mxge_force_firmware = 0;	/* nonzero skips probe: 1 = aligned fw, else unaligned */
static int mxge_intr_coal_delay = 30;	/* interrupt coalescing delay — presumably usecs; confirm vs fw docs */
static int mxge_deassert_wait = 1;
static int mxge_flow_control = 1;
static int mxge_verbose = 0;		/* extra diagnostic printfs when set */
static int mxge_lro_cnt = 8;
static int mxge_ticks;
static int mxge_max_slices = 1;
static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
static int mxge_always_promisc = 0;	/* force promiscuous mode regardless of ifnet flags */
static int mxge_initial_mtu = ETHERMTU_JUMBO;
/* firmware image names; the "p" variants tolerate unaligned PCIe completions */
static char *mxge_fw_unaligned = "mxge_ethp_z8e";
static char *mxge_fw_aligned = "mxge_eth_z8e";
static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";

/* newbus device interface entry points */
static int mxge_probe(device_t dev);
static int mxge_attach(device_t dev);
static int mxge_detach(device_t dev);
static int mxge_shutdown(device_t dev);
static void mxge_intr(void *arg);

static device_method_t mxge_methods[] =
{
	/* Device interface */
	DEVMETHOD(device_probe, mxge_probe),
	DEVMETHOD(device_attach, mxge_attach),
	DEVMETHOD(device_detach, mxge_detach),
	DEVMETHOD(device_shutdown, mxge_shutdown),
	{0, 0}
};

static driver_t mxge_driver =
{
	"mxge",
	mxge_methods,
	sizeof(mxge_softc_t),
};

static devclass_t mxge_devclass;

/* Declare ourselves to be a child of the PCI bus.*/
DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
/* the driver cannot function without the firmware(9) and zlib modules */
MODULE_DEPEND(mxge, firmware, 1, 1, 1);
MODULE_DEPEND(mxge, zlib, 1, 1, 1);

/* forward declarations for routines used before their definitions */
static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
static int mxge_close(mxge_softc_t *sc);
static int mxge_open(mxge_softc_t *sc);
static void mxge_tick(void *arg);
149
150static int
151mxge_probe(device_t dev)
152{
153 int rev;
154
155
156 if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
157 ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
158 (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
159 rev = pci_get_revid(dev);
160 switch (rev) {
161 case MXGE_PCI_REV_Z8E:
162 device_set_desc(dev, "Myri10G-PCIE-8A");
163 break;
164 case MXGE_PCI_REV_Z8ES:
165 device_set_desc(dev, "Myri10G-PCIE-8B");
166 break;
167 default:
168 device_set_desc(dev, "Myri10G-PCIE-8??");
169 device_printf(dev, "Unrecognized rev %d NIC\n",
170 rev);
171 break;
172 }
173 return 0;
174 }
175 return ENXIO;
176}
177
/*
 * Enable write-combining on the mapped NIC SRAM (x86/amd64 only).
 * Sets sc->wc = 1 optimistically and clears it again if the PAT
 * attribute change fails; on other architectures this is a no-op.
 */
static void
mxge_enable_wc(mxge_softc_t *sc)
{
#if defined(__i386) || defined(__amd64)
	vm_offset_t len;
	int err;

	sc->wc = 1;
	len = rman_get_size(sc->mem_res);
	err = pmap_change_attr((vm_offset_t) sc->sram,
			       len, PAT_WRITE_COMBINING);
	if (err != 0) {
		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
			      err);
		sc->wc = 0;	/* fall back to uncached access */
	}
#endif
}
196
197
198/* callback to get our DMA address */
199static void
200mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
201 int error)
202{
203 if (error == 0) {
204 *(bus_addr_t *) arg = segs->ds_addr;
205 }
206}
207
/*
 * Allocate a coherent DMA-able buffer of `bytes` with the requested
 * alignment: create a tag, allocate + zero the memory, and load the map,
 * recording the bus address in dma->bus_addr via mxge_dmamap_callback.
 * On failure, already-acquired resources are rolled back in reverse
 * order.  Returns 0 or a bus_dma error code.
 */
static int
mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
	       bus_size_t alignment)
{
	int err;
	device_t dev = sc->dev;
	bus_size_t boundary, maxsegsize;

	/*
	 * A page-aligned buffer bigger than a page must not be split
	 * on a boundary (single segment covers it all); otherwise keep
	 * each segment within a 4KB boundary.
	 */
	if (bytes > 4096 && alignment == 4096) {
		boundary = 0;
		maxsegsize = bytes;
	} else {
		boundary = 4096;
		maxsegsize = 4096;
	}

	/* allocate DMAable memory tags */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 alignment,		/* alignment */
				 boundary,		/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 bytes,			/* maxsize */
				 1,			/* num segs */
				 maxsegsize,		/* maxsegsize */
				 BUS_DMA_COHERENT,	/* flags */
				 NULL, NULL,		/* lock */
				 &dma->dmat);		/* tag */
	if (err != 0) {
		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
		return err;
	}

	/* allocate DMAable memory & map */
	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
				| BUS_DMA_ZERO), &dma->map);
	if (err != 0) {
		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
		goto abort_with_dmat;
	}

	/* load the memory; callback fills in dma->bus_addr */
	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
			      mxge_dmamap_callback,
			      (void *)&dma->bus_addr, 0);
	if (err != 0) {
		device_printf(dev, "couldn't load map (err = %d)\n", err);
		goto abort_with_mem;
	}
	return 0;

abort_with_mem:
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
abort_with_dmat:
	(void)bus_dma_tag_destroy(dma->dmat);
	return err;
}
267
268
/*
 * Release a buffer obtained from mxge_dma_alloc(): unload the map,
 * free the memory, then destroy the tag — strictly the reverse of
 * the allocation order.
 */
static void
mxge_dma_free(mxge_dma_t *dma)
{
	bus_dmamap_unload(dma->dmat, dma->map);
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
	(void)bus_dma_tag_destroy(dma->dmat);
}
276
277/*
278 * The eeprom strings on the lanaiX have the format
279 * SN=x\0
280 * MAC=x:x:x:x:x:x\0
281 * PC=text\0
282 */
283
284static int
285mxge_parse_strings(mxge_softc_t *sc)
286{
287#define MXGE_NEXT_STRING(p) while(ptr < limit && *ptr++)
288
289 char *ptr, *limit;
290 int i, found_mac;
291
292 ptr = sc->eeprom_strings;
293 limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
294 found_mac = 0;
295 while (ptr < limit && *ptr != '\0') {
296 if (memcmp(ptr, "MAC=", 4) == 0) {
297 ptr += 1;
298 sc->mac_addr_string = ptr;
299 for (i = 0; i < 6; i++) {
300 ptr += 3;
301 if ((ptr + 2) > limit)
302 goto abort;
303 sc->mac_addr[i] = strtoul(ptr, NULL, 16);
304 found_mac = 1;
305 }
306 } else if (memcmp(ptr, "PC=", 3) == 0) {
307 ptr += 3;
308 strncpy(sc->product_code_string, ptr,
309 sizeof (sc->product_code_string) - 1);
310 } else if (memcmp(ptr, "SN=", 3) == 0) {
311 ptr += 3;
312 strncpy(sc->serial_number_string, ptr,
313 sizeof (sc->serial_number_string) - 1);
314 }
315 MXGE_NEXT_STRING(ptr);
316 }
317
318 if (found_mac)
319 return 0;
320
321 abort:
322 device_printf(sc->dev, "failed to parse eeprom_strings\n");
323
324 return ENXIO;
325}
326
#if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
/*
 * Enable ECRC generation on an upstream nVidia (vendor 0x10de) PCIe
 * bridge so that completions arriving at the NIC are 8-byte aligned.
 * Only the CK804 (0x005d, hard-wired config window at 0xe0000000) and
 * MCP55 (0x0374..0x0378, window address read from the chipset at
 * 0:0:0 reg 0x90) are handled.  Because extended (>0xff) config space
 * cannot be reached through normal pci_read_config() here, the
 * device's config space is mapped directly with pmap_mapdev() and the
 * ECRC bit (0x40 at offset 0x178) is set by a raw write.
 */
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	uint32_t val;
	unsigned long base, off;
	char *va, *cfgptr;
	device_t pdev, mcp55;
	uint16_t vendor_id, device_id, word;
	uintptr_t bus, slot, func, ivend, idev;
	uint32_t *ptr32;


	if (!mxge_nvidia_ecrc_enable)
		return;

	/* grandparent = the PCI bridge above the NIC's bus */
	pdev = device_get_parent(device_get_parent(sc->dev));
	if (pdev == NULL) {
		device_printf(sc->dev, "could not find parent?\n");
		return;
	}
	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);

	if (vendor_id != 0x10de)
		return;

	base = 0;

	if (device_id == 0x005d) {
		/* ck804, base address is magic */
		base = 0xe0000000UL;
	} else if (device_id >= 0x0374 && device_id <= 0x378) {
		/* mcp55, base address stored in chipset */
		mcp55 = pci_find_bsf(0, 0, 0);
		if (mcp55 &&
		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
			word = pci_read_config(mcp55, 0x90, 2);
			base = ((unsigned long)word & 0x7ffeU) << 25;
		}
	}
	if (!base)
		return;

	/* XXXX
	   Test below is commented because it is believed that doing
	   config read/write beyond 0xff will access the config space
	   for the next larger function. Uncomment this and remove
	   the hacky pmap_mapdev() way of accessing config space when
	   FreeBSD grows support for extended pcie config space access
	*/
#if 0
	/* See if we can, by some miracle, access the extended
	   config space */
	val = pci_read_config(pdev, 0x178, 4);
	if (val != 0xffffffff) {
		val |= 0x40;
		pci_write_config(pdev, 0x178, val, 4);
		return;
	}
#endif
	/* Rather than using normal pci config space writes, we must
	 * map the Nvidia config space ourselves. This is because on
	 * opteron/nvidia class machine the 0xe000000 mapping is
	 * handled by the nvidia chipset, that means the internal PCI
	 * device (the on-chip northbridge), or the amd-8131 bridge
	 * and things behind them are not visible by this method.
	 */

	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_BUS, &bus);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_SLOT, &slot);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_FUNCTION, &func);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_VENDOR, &ivend);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_DEVICE, &idev);

	/* physical address of this function's extended config space */
	off =  base
		+ 0x00100000UL * (unsigned long)bus
		+ 0x00001000UL * (unsigned long)(func
						 + 8 * slot);

	/* map it into the kernel */
	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);


	if (va == NULL) {
		device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
		return;
	}
	/* get a pointer to the config space mapped into the kernel */
	cfgptr = va + (off & PAGE_MASK);

	/* make sure that we can really access it by verifying that
	 * the IDs read through the mapping match the bridge's IDs */
	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
	if (! (vendor_id == ivend && device_id == idev)) {
		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
			      vendor_id, device_id);
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}

	ptr32 = (uint32_t*)(cfgptr + 0x178);
	val = *ptr32;

	if (val == 0xffffffff) {
		device_printf(sc->dev, "extended mapping failed\n");
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}
	/* set the ECRC generation enable bit */
	*ptr32 = val | 0x40;
	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
	if (mxge_verbose)
		device_printf(sc->dev,
			      "Enabled ECRC on upstream Nvidia bridge "
			      "at %d:%d:%d\n",
			      (int)bus, (int)slot, (int)func);
	return;
}
#else
/* non-x86 stub: the nVidia chipsets handled above are x86-only */
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	device_printf(sc->dev,
		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
	return;
}
#endif
460
461
/*
 * Run the firmware's DMA benchmark against the dmabench scratch buffer
 * and record read, write, and read/write throughput in the softc.
 * test_type is the firmware command (e.g. MXGEFW_CMD_UNALIGNED_TEST,
 * which additionally aborts on the first unaligned completion seen).
 * Returns 0 or the error from the first failing sub-test.
 */
static int
mxge_dma_test(mxge_softc_t *sc, int test_type)
{
	mxge_cmd_t cmd;
	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
	int status;
	uint32_t len;
	char *test = " ";


	/* Run a small DMA test.
	 * The magic multipliers to the length tell the firmware
	 * to do DMA read, write, or read+write tests.  The
	 * results are returned in cmd.data0.  The upper 16
	 * bits of the return is the number of transfers completed.
	 * The lower 16 bits is the time in 0.5us ticks that the
	 * transfers took to complete.
	 */

	len = sc->tx_boundary;

	/* read test: length * 0x10000 */
	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10000;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read";
		goto abort;
	}
	sc->read_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);
	/* write test: length * 0x1 */
	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x1;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "write";
		goto abort;
	}
	sc->write_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);

	/* combined read/write test: length * 0x10001 (twice the data) */
	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10001;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read/write";
		goto abort;
	}
	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
		(cmd.data0 & 0xffff);

abort:
	/* failure of the unaligned probe is expected, not an error */
	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
			      test, status);

	return status;
}
522
523/*
524 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
525 * when the PCI-E Completion packets are aligned on an 8-byte
526 * boundary. Some PCI-E chip sets always align Completion packets; on
527 * the ones that do not, the alignment can be enforced by enabling
528 * ECRC generation (if supported).
529 *
530 * When PCI-E Completion packets are not aligned, it is actually more
531 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
532 *
533 * If the driver can neither enable ECRC nor verify that it has
534 * already been enabled, then it must use a firmware image which works
535 * around unaligned completion packets (ethp_z8e.dat), and it should
536 * also ensure that it never gives the device a Read-DMA which is
537 * larger than 2KB by setting the tx_boundary to 2KB. If ECRC is
538 * enabled, then the driver should use the aligned (eth_z8e.dat)
539 * firmware image, and set tx_boundary to 4KB.
540 */
541
542static int
543mxge_firmware_probe(mxge_softc_t *sc)
544{
545 device_t dev = sc->dev;
546 int reg, status;
547 uint16_t pectl;
548
549 sc->tx_boundary = 4096;
550 /*
551 * Verify the max read request size was set to 4KB
552 * before trying the test with 4KB.
553 */
554 if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
555 pectl = pci_read_config(dev, reg + 0x8, 2);
556 if ((pectl & (5 << 12)) != (5 << 12)) {
557 device_printf(dev, "Max Read Req. size != 4k (0x%x\n",
558 pectl);
559 sc->tx_boundary = 2048;
560 }
561 }
562
563 /*
564 * load the optimized firmware (which assumes aligned PCIe
565 * completions) in order to see if it works on this host.
566 */
567 sc->fw_name = mxge_fw_aligned;
568 status = mxge_load_firmware(sc, 1);
569 if (status != 0) {
570 return status;
571 }
572
573 /*
574 * Enable ECRC if possible
575 */
576 mxge_enable_nvidia_ecrc(sc);
577
578 /*
579 * Run a DMA test which watches for unaligned completions and
580 * aborts on the first one seen.
581 */
582
583 status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
584 if (status == 0)
585 return 0; /* keep the aligned firmware */
586
587 if (status != E2BIG)
588 device_printf(dev, "DMA test failed: %d\n", status);
589 if (status == ENOSYS)
590 device_printf(dev, "Falling back to ethp! "
591 "Please install up to date fw\n");
592 return status;
593}
594
595static int
596mxge_select_firmware(mxge_softc_t *sc)
597{
598 int aligned = 0;
599
600
601 if (mxge_force_firmware != 0) {
602 if (mxge_force_firmware == 1)
603 aligned = 1;
604 else
605 aligned = 0;
606 if (mxge_verbose)
607 device_printf(sc->dev,
608 "Assuming %s completions (forced)\n",
609 aligned ? "aligned" : "unaligned");
610 goto abort;
611 }
612
613 /* if the PCIe link width is 4 or less, we can use the aligned
614 firmware and skip any checks */
615 if (sc->link_width != 0 && sc->link_width <= 4) {
616 device_printf(sc->dev,
617 "PCIe x%d Link, expect reduced performance\n",
618 sc->link_width);
619 aligned = 1;
620 goto abort;
621 }
622
623 if (0 == mxge_firmware_probe(sc))
624 return 0;
625
626abort:
627 if (aligned) {
628 sc->fw_name = mxge_fw_aligned;
629 sc->tx_boundary = 4096;
630 } else {
631 sc->fw_name = mxge_fw_unaligned;
632 sc->tx_boundary = 2048;
633 }
634 return (mxge_load_firmware(sc, 0));
635}
636
/*
 * Helper union for viewing the same string pointer with and without
 * const qualification (avoids casting away const directly).
 */
union qualhack
{
	const char *ro_char;
	char *rw_char;
};
642
643static int
644mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
645{
646
647
648 if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
649 device_printf(sc->dev, "Bad firmware type: 0x%x\n",
650 be32toh(hdr->mcp_type));
651 return EIO;
652 }
653
654 /* save firmware version for sysctl */
655 strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
656 if (mxge_verbose)
657 device_printf(sc->dev, "firmware id: %s\n", hdr->version);
658
659 sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
660 &sc->fw_ver_minor, &sc->fw_ver_tiny);
661
662 if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
663 && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
664 device_printf(sc->dev, "Found firmware version %s\n",
665 sc->fw_version);
666 device_printf(sc->dev, "Driver needs %d.%d\n",
667 MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
668 return EINVAL;
669 }
670 return 0;
671
672}
673
674static void *
675z_alloc(void *nil, u_int items, u_int size)
676{
677 void *ptr;
678
679 ptr = malloc(items * size, M_TEMP, M_NOWAIT);
680 return ptr;
681}
682
/* zlib free hook: release memory obtained from z_alloc() */
static void
z_free(void *nil, void *ptr)
{
	free(ptr, M_TEMP);
}
688
689
690static int
691mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
692{
693 z_stream zs;
694 char *inflate_buffer;
695 const struct firmware *fw;
696 const mcp_gen_header_t *hdr;
697 unsigned hdr_offset;
698 int status;
699 unsigned int i;
700 char dummy;
701 size_t fw_len;
702
703 fw = firmware_get(sc->fw_name);
704 if (fw == NULL) {
705 device_printf(sc->dev, "Could not find firmware image %s\n",
706 sc->fw_name);
707 return ENOENT;
708 }
709
710
711
712 /* setup zlib and decompress f/w */
713 bzero(&zs, sizeof (zs));
714 zs.zalloc = z_alloc;
715 zs.zfree = z_free;
716 status = inflateInit(&zs);
717 if (status != Z_OK) {
718 status = EIO;
719 goto abort_with_fw;
720 }
721
722 /* the uncompressed size is stored as the firmware version,
723 which would otherwise go unused */
724 fw_len = (size_t) fw->version;
725 inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
726 if (inflate_buffer == NULL)
727 goto abort_with_zs;
728 zs.avail_in = fw->datasize;
729 zs.next_in = __DECONST(char *, fw->data);
730 zs.avail_out = fw_len;
731 zs.next_out = inflate_buffer;
732 status = inflate(&zs, Z_FINISH);
733 if (status != Z_STREAM_END) {
734 device_printf(sc->dev, "zlib %d\n", status);
735 status = EIO;
736 goto abort_with_buffer;
737 }
738
739 /* check id */
740 hdr_offset = htobe32(*(const uint32_t *)
741 (inflate_buffer + MCP_HEADER_PTR_OFFSET));
742 if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
743 device_printf(sc->dev, "Bad firmware file");
744 status = EIO;
745 goto abort_with_buffer;
746 }
747 hdr = (const void*)(inflate_buffer + hdr_offset);
748
749 status = mxge_validate_firmware(sc, hdr);
750 if (status != 0)
751 goto abort_with_buffer;
752
753 /* Copy the inflated firmware to NIC SRAM. */
754 for (i = 0; i < fw_len; i += 256) {
755 mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
756 inflate_buffer + i,
757 min(256U, (unsigned)(fw_len - i)));
758 wmb();
759 dummy = *sc->sram;
760 wmb();
761 }
762
763 *limit = fw_len;
764 status = 0;
765abort_with_buffer:
766 free(inflate_buffer, M_TEMP);
767abort_with_zs:
768 inflateEnd(&zs);
769abort_with_fw:
770 firmware_put(fw, FIRMWARE_UNLOAD);
771 return status;
772}
773
774/*
775 * Enable or disable periodic RDMAs from the host to make certain
776 * chipsets resend dropped PCIe messages
777 */
778
/*
 * Enable or disable periodic RDMAs from the host to make certain
 * chipsets resend dropped PCIe messages
 */

static void
mxge_dummy_rdma(mxge_softc_t *sc, int enable)
{
	char buf_bytes[72];
	volatile uint32_t *confirm;
	volatile char *submit;
	uint32_t *buf, dma_low, dma_high;
	int i;

	/* 8-byte align buf inside buf_bytes */
	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();

	/* send an rdma command to the PCIe engine, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	   */

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);		/* confirm data */
	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
	buf[5] = htobe32(enable);		/* enable? */


	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);

	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	/* poll the confirmation word for up to ~20ms */
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000);
		i++;
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
			      (enable ? "enable" : "disable"), confirm,
			      *confirm);
	}
	return;
}
830
831static int
832mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
833{
834 mcp_cmd_t *buf;
835 char buf_bytes[sizeof(*buf) + 8];
836 volatile mcp_cmd_response_t *response = sc->cmd;
837 volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
838 uint32_t dma_low, dma_high;
839 int err, sleep_total = 0;
840
841 /* ensure buf is aligned to 8 bytes */
842 buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
843
844 buf->data0 = htobe32(data->data0);
845 buf->data1 = htobe32(data->data1);
846 buf->data2 = htobe32(data->data2);
847 buf->cmd = htobe32(cmd);
848 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
849 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
850
851 buf->response_addr.low = htobe32(dma_low);
852 buf->response_addr.high = htobe32(dma_high);
853 mtx_lock(&sc->cmd_mtx);
854 response->result = 0xffffffff;
855 wmb();
856 mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
857
858 /* wait up to 20ms */
859 err = EAGAIN;
860 for (sleep_total = 0; sleep_total < 20; sleep_total++) {
861 bus_dmamap_sync(sc->cmd_dma.dmat,
862 sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
863 wmb();
864 switch (be32toh(response->result)) {
865 case 0:
866 data->data0 = be32toh(response->data);
867 err = 0;
868 break;
869 case 0xffffffff:
870 DELAY(1000);
871 break;
872 case MXGEFW_CMD_UNKNOWN:
873 err = ENOSYS;
874 break;
875 case MXGEFW_CMD_ERROR_UNALIGNED:
876 err = E2BIG;
877 break;
878 case MXGEFW_CMD_ERROR_BUSY:
879 err = EBUSY;
880 break;
881 default:
882 device_printf(sc->dev,
883 "mxge: command %d "
884 "failed, result = %d\n",
885 cmd, be32toh(response->result));
886 err = ENXIO;
887 break;
888 }
889 if (err != EAGAIN)
890 break;
891 }
892 if (err == EAGAIN)
893 device_printf(sc->dev, "mxge: command %d timed out"
894 "result = %d\n",
895 cmd, be32toh(response->result));
896 mtx_unlock(&sc->cmd_mtx);
897 return err;
898}
899
/*
 * Validate the firmware already running on the NIC (used when loading
 * our own image failed): locate its header via the pointer stored at
 * MCP_HEADER_PTR_OFFSET in SRAM, copy the header to host memory, and
 * run mxge_validate_firmware() on it.  Also flags the known 1.4.4 -
 * 1.4.11 rx-filter bug so the driver can work around it.
 * Returns 0, EIO, ENOMEM, or a validation error.
 */
static int
mxge_adopt_running_firmware(mxge_softc_t *sc)
{
	struct mcp_gen_header *hdr;
	const size_t bytes = sizeof (struct mcp_gen_header);
	size_t hdr_offset;
	int status;

	/* find running firmware header */
	hdr_offset = htobe32(*(volatile uint32_t *)
			     (sc->sram + MCP_HEADER_PTR_OFFSET));

	/* sanity-check the offset before dereferencing into SRAM */
	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
		device_printf(sc->dev,
			      "Running firmware has bad header offset (%d)\n",
			      (int)hdr_offset);
		return EIO;
	}

	/* copy header of running firmware from SRAM to host memory to
	 * validate firmware */
	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
	if (hdr == NULL) {
		device_printf(sc->dev, "could not malloc firmware hdr\n");
		return ENOMEM;
	}
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
				rman_get_bushandle(sc->mem_res),
				hdr_offset, (char *)hdr, bytes);
	status = mxge_validate_firmware(sc, hdr);
	free(hdr, M_DEVBUF);

	/*
	 * check to see if adopted firmware has bug where adopting
	 * it will cause broadcasts to be filtered unless the NIC
	 * is kept in ALLMULTI mode
	 */
	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
		sc->adopted_rx_filter_bug = 1;
		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
			      "working around rx filter bug\n",
			      sc->fw_ver_major, sc->fw_ver_minor,
			      sc->fw_ver_tiny);
	}

	return status;
}
948
949
/*
 * Load our firmware image into NIC SRAM and hand control to it via
 * the bootstrap MCP.  If loading fails and `adopt` is set, fall back
 * to validating and running the firmware already on the NIC (forcing
 * the unaligned/2KB configuration).  Returns 0 or an errno.
 */
static int
mxge_load_firmware(mxge_softc_t *sc, int adopt)
{
	volatile uint32_t *confirm;
	volatile char *submit;
	char buf_bytes[72];
	uint32_t *buf, size, dma_low, dma_high;
	int status, i;

	/* 8-byte align buf inside buf_bytes */
	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	size = sc->sram_size;
	status = mxge_load_firmware_helper(sc, &size);
	if (status) {
		if (!adopt)
			return status;
		/* Try to use the currently running firmware, if
		   it is new enough */
		status = mxge_adopt_running_firmware(sc);
		if (status) {
			device_printf(sc->dev,
				      "failed to adopt running firmware\n");
			return status;
		}
		device_printf(sc->dev,
			      "Successfully adopted running firmware\n");
		if (sc->tx_boundary == 4096) {
			device_printf(sc->dev,
				      "Using firmware currently running on NIC"
				      ".  For optimal\n");
			device_printf(sc->dev,
				      "performance consider loading optimized "
				      "firmware\n");
		}
		/* adopted firmware must be treated as unaligned */
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
		return 0;
	}
	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();
	/* send a reload command to the bootstrap MCP, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	   */

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);	/* confirm data */

	/* FIX: All newest firmware should un-protect the bottom of
	   the sram before handoff. However, the very first interfaces
	   do not. Therefore the handoff copy must skip the first 8 bytes
	*/
	/* where the code starts*/
	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
	buf[4] = htobe32(size - 8); 	/* length of code */
	buf[5] = htobe32(8);		/* where to copy to */
	buf[6] = htobe32(0);		/* where to jump to */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	/* poll the confirmation word for up to ~200ms */
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000*10);
		i++;
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev,"handoff failed (%p = 0x%x)",
			confirm, *confirm);

		return ENXIO;
	}
	return 0;
}
1034
1035static int
1036mxge_update_mac_address(mxge_softc_t *sc)
1037{
1038 mxge_cmd_t cmd;
1039 uint8_t *addr = sc->mac_addr;
1040 int status;
1041
1042
1043 cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1044 | (addr[2] << 8) | addr[3]);
1045
1046 cmd.data1 = ((addr[4] << 8) | (addr[5]));
1047
1048 status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
1049 return status;
1050}
1051
1052static int
1053mxge_change_pause(mxge_softc_t *sc, int pause)
1054{
1055 mxge_cmd_t cmd;
1056 int status;
1057
1058 if (pause)
1059 status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1060 &cmd);
1061 else
1062 status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1063 &cmd);
1064
1065 if (status) {
1066 device_printf(sc->dev, "Failed to set flow control mode\n");
1067 return ENXIO;
1068 }
1069 sc->pause = pause;
1070 return 0;
1071}
1072
1073static void
1074mxge_change_promisc(mxge_softc_t *sc, int promisc)
1075{
1076 mxge_cmd_t cmd;
1077 int status;
1078
1079 if (mxge_always_promisc)
1080 promisc = 1;
1081
1082 if (promisc)
1083 status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1084 &cmd);
1085 else
1086 status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1087 &cmd);
1088
1089 if (status) {
1090 device_printf(sc->dev, "Failed to set promisc mode\n");
1091 }
1092}
1093
/*
 * Reprogram the firmware's multicast filter from the interface's
 * multicast address list.  Filtering stays disabled (ALLMULTI) while
 * the list is rebuilt, for IFF_ALLMULTI, and for adopted firmware
 * with the rx filter bug; on any command failure the NIC is left in
 * ALLMULTI so no traffic is lost.
 */
static void
mxge_set_multicast_list(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	struct ifmultiaddr *ifma;
	struct ifnet *ifp = sc->ifp;
	int err;

	/* This firmware is known to not support multicast */
	if (!sc->fw_multicast_support)
		return;

	/* Disable multicast filtering while we play with the lists*/
	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
			      " error status: %d\n", err);
		return;
	}

	/* buggy adopted firmware must stay in ALLMULTI (see
	 * mxge_adopt_running_firmware) */
	if (sc->adopted_rx_filter_bug)
		return;

	if (ifp->if_flags & IFF_ALLMULTI)
		/* request to disable multicast filtering, so quit here */
		return;

	/* Flush all the filters */

	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
	if (err != 0) {
		device_printf(sc->dev,
			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
			      ", error status: %d\n", err);
		return;
	}

	/* Walk the multicast list, and add each address */

	if_maddr_rlock(ifp);
	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
		if (ifma->ifma_addr->sa_family != AF_LINK)
			continue;
		/* split the 6-byte link-layer address across the two
		 * 32-bit command words, in network byte order */
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
		      &cmd.data0, 4);
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
		      &cmd.data1, 2);
		cmd.data0 = htonl(cmd.data0);
		cmd.data1 = htonl(cmd.data1);
		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
		if (err != 0) {
			device_printf(sc->dev, "Failed "
			       "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
			       "%d\t", err);
			/* abort, leaving multicast filtering off */
			if_maddr_runlock(ifp);
			return;
		}
	}
	if_maddr_runlock(ifp);
	/* Enable multicast filtering */
	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
			      ", error status: %d\n", err);
	}
}
1161
1162static int
1163mxge_max_mtu(mxge_softc_t *sc)
1164{
1165 mxge_cmd_t cmd;
1166 int status;
1167
1168 if (MJUMPAGESIZE - MXGEFW_PAD > MXGEFW_MAX_MTU)
1169 return MXGEFW_MAX_MTU - MXGEFW_PAD;
1170
1171 /* try to set nbufs to see if it we can
1172 use virtually contiguous jumbos */
1173 cmd.data0 = 0;
1174 status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1175 &cmd);
1176 if (status == 0)
1177 return MXGEFW_MAX_MTU - MXGEFW_PAD;
1178
1179 /* otherwise, we're limited to MJUMPAGESIZE */
1180 return MJUMPAGESIZE - MXGEFW_PAD;
1181}
1182
/*
 * Reset the NIC firmware and bring the driver/firmware shared state
 * back to a known-clean baseline: intr queue size and DMA addresses,
 * RSS slice count, interrupt coalescing/ack/deassert register
 * pointers, and all per-slice counters.  If interrupts_setup is
 * non-zero, the per-slice interrupt queue DMA addresses are
 * (re)programmed as well.  Returns 0 on success or an errno-style
 * status from the firmware command exchange.
 */
static int
mxge_reset(mxge_softc_t *sc, int interrupts_setup)
{
	struct mxge_slice_state *ss;
	mxge_rx_done_t *rx_done;
	volatile uint32_t *irq_claim;
	mxge_cmd_t cmd;
	int slice, status;

	/* try to send a reset command to the card to see if it
	   is alive */
	memset(&cmd, 0, sizeof (cmd));
	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
	if (status != 0) {
		device_printf(sc->dev, "failed reset\n");
		return ENXIO;
	}

	mxge_dummy_rdma(sc, 1);


	/* set the intrq size */
	cmd.data0 = sc->rx_ring_size;
	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);

	/*
	 * Even though we already know how many slices are supported
	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
	 * has magic side effects, and must be called after a reset.
	 * It must be called prior to calling any RSS related cmds,
	 * including assigning an interrupt queue for anything but
	 * slice 0.  It must also be called *after*
	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
	 * the firmware to compute offsets.
	 */

	if (sc->num_slices > 1) {
		/* ask the maximum number of slices it supports */
		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
				       &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to get number of slices\n");
			return status;
		}
		/*
		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
		 * to setting up the interrupt queue DMA
		 */
		cmd.data0 = sc->num_slices;
		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
#ifdef IFNET_BUF_RING
		cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
#endif
		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
				       &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to set number of slices\n");
			return status;
		}
	}


	if (interrupts_setup) {
		/* Now exchange information about interrupts  */
		for (slice = 0; slice < sc->num_slices; slice++) {
			/* tell the firmware where each slice's intr
			   queue lives; errors accumulate into status */
			rx_done = &sc->ss[slice].rx_done;
			memset(rx_done->entry, 0, sc->rx_ring_size);
			cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data2 = slice;
			status |= mxge_send_cmd(sc,
						MXGEFW_CMD_SET_INTRQ_DMA,
						&cmd);
		}
	}

	/* fetch SRAM offsets of the coalescing, ack, and deassert
	   registers; status is checked once after all three */
	status |= mxge_send_cmd(sc,
				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);


	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);


	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
				&cmd);
	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
	if (status != 0) {
		device_printf(sc->dev, "failed set interrupt parameters\n");
		return status;
	}


	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);


	/* run a DMA benchmark */
	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];

		/* each slice owns a pair of claim registers */
		ss->irq_claim = irq_claim + (2 * slice);
		/* reset mcp/driver shared state back to 0 */
		ss->rx_done.idx = 0;
		ss->rx_done.cnt = 0;
		ss->tx.req = 0;
		ss->tx.done = 0;
		ss->tx.pkt_done = 0;
		ss->tx.queue_active = 0;
		ss->tx.activate = 0;
		ss->tx.deactivate = 0;
		ss->tx.wake = 0;
		ss->tx.defrag = 0;
		ss->tx.stall = 0;
		ss->rx_big.cnt = 0;
		ss->rx_small.cnt = 0;
		ss->lro_bad_csum = 0;
		ss->lro_queued = 0;
		ss->lro_flushed = 0;
		if (ss->fw_stats != NULL) {
			ss->fw_stats->valid = 0;
			ss->fw_stats->send_done_count = 0;
		}
	}
	sc->rdma_tags_available = 15;
	status = mxge_update_mac_address(sc);
	/* restore the rx filter, pause, and multicast settings */
	mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
	mxge_change_pause(sc, sc->pause);
	mxge_set_multicast_list(sc);
	return status;
}
1319
1320static int
1321mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1322{
1323 mxge_softc_t *sc;
1324 unsigned int intr_coal_delay;
1325 int err;
1326
1327 sc = arg1;
1328 intr_coal_delay = sc->intr_coal_delay;
1329 err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1330 if (err != 0) {
1331 return err;
1332 }
1333 if (intr_coal_delay == sc->intr_coal_delay)
1334 return 0;
1335
1336 if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1337 return EINVAL;
1338
1339 mtx_lock(&sc->driver_mtx);
1340 *sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1341 sc->intr_coal_delay = intr_coal_delay;
1342
1343 mtx_unlock(&sc->driver_mtx);
1344 return err;
1345}
1346
1347static int
1348mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1349{
1350 mxge_softc_t *sc;
1351 unsigned int enabled;
1352 int err;
1353
1354 sc = arg1;
1355 enabled = sc->pause;
1356 err = sysctl_handle_int(oidp, &enabled, arg2, req);
1357 if (err != 0) {
1358 return err;
1359 }
1360 if (enabled == sc->pause)
1361 return 0;
1362
1363 mtx_lock(&sc->driver_mtx);
1364 err = mxge_change_pause(sc, enabled);
1365 mtx_unlock(&sc->driver_mtx);
1366 return err;
1367}
1368
1369static int
1370mxge_change_lro_locked(mxge_softc_t *sc, int lro_cnt)
1371{
1372 struct ifnet *ifp;
1373 int err = 0;
1374
1375 ifp = sc->ifp;
1376 if (lro_cnt == 0)
1377 ifp->if_capenable &= ~IFCAP_LRO;
1378 else
1379 ifp->if_capenable |= IFCAP_LRO;
1380 sc->lro_cnt = lro_cnt;
1381 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
1382 mxge_close(sc);
1383 err = mxge_open(sc);
1384 }
1385 return err;
1386}
1387
1388static int
1389mxge_change_lro(SYSCTL_HANDLER_ARGS)
1390{
1391 mxge_softc_t *sc;
1392 unsigned int lro_cnt;
1393 int err;
1394
1395 sc = arg1;
1396 lro_cnt = sc->lro_cnt;
1397 err = sysctl_handle_int(oidp, &lro_cnt, arg2, req);
1398 if (err != 0)
1399 return err;
1400
1401 if (lro_cnt == sc->lro_cnt)
1402 return 0;
1403
1404 if (lro_cnt > 128)
1405 return EINVAL;
1406
1407 mtx_lock(&sc->driver_mtx);
1408 err = mxge_change_lro_locked(sc, lro_cnt);
1409 mtx_unlock(&sc->driver_mtx);
1410 return err;
1411}
1412
1413static int
1414mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1415{
1416 int err;
1417
1418 if (arg1 == NULL)
1419 return EFAULT;
1420 arg2 = be32toh(*(int *)arg1);
1421 arg1 = NULL;
1422 err = sysctl_handle_int(oidp, arg1, arg2, req);
1423
1424 return err;
1425}
1426
1427static void
1428mxge_rem_sysctls(mxge_softc_t *sc)
1429{
1430 struct mxge_slice_state *ss;
1431 int slice;
1432
1433 if (sc->slice_sysctl_tree == NULL)
1434 return;
1435
1436 for (slice = 0; slice < sc->num_slices; slice++) {
1437 ss = &sc->ss[slice];
1438 if (ss == NULL || ss->sysctl_tree == NULL)
1439 continue;
1440 sysctl_ctx_free(&ss->sysctl_ctx);
1441 ss->sysctl_tree = NULL;
1442 }
1443 sysctl_ctx_free(&sc->slice_sysctl_ctx);
1444 sc->slice_sysctl_tree = NULL;
1445}
1446
1447static void
1448mxge_add_sysctls(mxge_softc_t *sc)
1449{
1450 struct sysctl_ctx_list *ctx;
1451 struct sysctl_oid_list *children;
1452 mcp_irq_data_t *fw;
1453 struct mxge_slice_state *ss;
1454 int slice;
1455 char slice_num[8];
1456
1457 ctx = device_get_sysctl_ctx(sc->dev);
1458 children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
1459 fw = sc->ss[0].fw_stats;
1460
1461 /* random information */
1462 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1463 "firmware_version",
1464 CTLFLAG_RD, &sc->fw_version,
1465 0, "firmware version");
1466 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1467 "serial_number",
1468 CTLFLAG_RD, &sc->serial_number_string,
1469 0, "serial number");
1470 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1471 "product_code",
1472 CTLFLAG_RD, &sc->product_code_string,
1473 0, "product_code");
1474 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1475 "pcie_link_width",
1476 CTLFLAG_RD, &sc->link_width,
1477 0, "tx_boundary");
1478 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1479 "tx_boundary",
1480 CTLFLAG_RD, &sc->tx_boundary,
1481 0, "tx_boundary");
1482 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1483 "write_combine",
1484 CTLFLAG_RD, &sc->wc,
1485 0, "write combining PIO?");
1486 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1487 "read_dma_MBs",
1488 CTLFLAG_RD, &sc->read_dma,
1489 0, "DMA Read speed in MB/s");
1490 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1491 "write_dma_MBs",
1492 CTLFLAG_RD, &sc->write_dma,
1493 0, "DMA Write speed in MB/s");
1494 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1495 "read_write_dma_MBs",
1496 CTLFLAG_RD, &sc->read_write_dma,
1497 0, "DMA concurrent Read/Write speed in MB/s");
1498
1499
1500 /* performance related tunables */
1501 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1502 "intr_coal_delay",
1503 CTLTYPE_INT|CTLFLAG_RW, sc,
1504 0, mxge_change_intr_coal,
1505 "I", "interrupt coalescing delay in usecs");
1506
1507 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1508 "flow_control_enabled",
1509 CTLTYPE_INT|CTLFLAG_RW, sc,
1510 0, mxge_change_flow_control,
1511 "I", "interrupt coalescing delay in usecs");
1512
1513 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1514 "deassert_wait",
1515 CTLFLAG_RW, &mxge_deassert_wait,
1516 0, "Wait for IRQ line to go low in ihandler");
1517
1518 /* stats block from firmware is in network byte order.
1519 Need to swap it */
1520 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1521 "link_up",
1522 CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1523 0, mxge_handle_be32,
1524 "I", "link up");
1525 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1526 "rdma_tags_available",
1527 CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1528 0, mxge_handle_be32,
1529 "I", "rdma_tags_available");
1530 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1531 "dropped_bad_crc32",
1532 CTLTYPE_INT|CTLFLAG_RD,
1533 &fw->dropped_bad_crc32,
1534 0, mxge_handle_be32,
1535 "I", "dropped_bad_crc32");
1536 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1537 "dropped_bad_phy",
1538 CTLTYPE_INT|CTLFLAG_RD,
1539 &fw->dropped_bad_phy,
1540 0, mxge_handle_be32,
1541 "I", "dropped_bad_phy");
1542 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1543 "dropped_link_error_or_filtered",
1544 CTLTYPE_INT|CTLFLAG_RD,
1545 &fw->dropped_link_error_or_filtered,
1546 0, mxge_handle_be32,
1547 "I", "dropped_link_error_or_filtered");
1548 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1549 "dropped_link_overflow",
1550 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1551 0, mxge_handle_be32,
1552 "I", "dropped_link_overflow");
1553 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1554 "dropped_multicast_filtered",
1555 CTLTYPE_INT|CTLFLAG_RD,
1556 &fw->dropped_multicast_filtered,
1557 0, mxge_handle_be32,
1558 "I", "dropped_multicast_filtered");
1559 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1560 "dropped_no_big_buffer",
1561 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1562 0, mxge_handle_be32,
1563 "I", "dropped_no_big_buffer");
1564 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1565 "dropped_no_small_buffer",
1566 CTLTYPE_INT|CTLFLAG_RD,
1567 &fw->dropped_no_small_buffer,
1568 0, mxge_handle_be32,
1569 "I", "dropped_no_small_buffer");
1570 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1571 "dropped_overrun",
1572 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1573 0, mxge_handle_be32,
1574 "I", "dropped_overrun");
1575 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1576 "dropped_pause",
1577 CTLTYPE_INT|CTLFLAG_RD,
1578 &fw->dropped_pause,
1579 0, mxge_handle_be32,
1580 "I", "dropped_pause");
1581 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1582 "dropped_runt",
1583 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1584 0, mxge_handle_be32,
1585 "I", "dropped_runt");
1586
1587 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1588 "dropped_unicast_filtered",
1589 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
1590 0, mxge_handle_be32,
1591 "I", "dropped_unicast_filtered");
1592
1593 /* verbose printing? */
1594 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1595 "verbose",
1596 CTLFLAG_RW, &mxge_verbose,
1597 0, "verbose printing");
1598
1599 /* lro */
1600 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1601 "lro_cnt",
1602 CTLTYPE_INT|CTLFLAG_RW, sc,
1603 0, mxge_change_lro,
1604 "I", "number of lro merge queues");
1605
1606
1607 /* add counters exported for debugging from all slices */
1608 sysctl_ctx_init(&sc->slice_sysctl_ctx);
1609 sc->slice_sysctl_tree =
1610 SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1611 "slice", CTLFLAG_RD, 0, "");
1612
1613 for (slice = 0; slice < sc->num_slices; slice++) {
1614 ss = &sc->ss[slice];
1615 sysctl_ctx_init(&ss->sysctl_ctx);
1616 ctx = &ss->sysctl_ctx;
1617 children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1618 sprintf(slice_num, "%d", slice);
1619 ss->sysctl_tree =
1620 SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1621 CTLFLAG_RD, 0, "");
1622 children = SYSCTL_CHILDREN(ss->sysctl_tree);
1623 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1624 "rx_small_cnt",
1625 CTLFLAG_RD, &ss->rx_small.cnt,
1626 0, "rx_small_cnt");
1627 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1628 "rx_big_cnt",
1629 CTLFLAG_RD, &ss->rx_big.cnt,
1630 0, "rx_small_cnt");
1631 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1632 "lro_flushed", CTLFLAG_RD, &ss->lro_flushed,
1633 0, "number of lro merge queues flushed");
1634
1635 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1636 "lro_queued", CTLFLAG_RD, &ss->lro_queued,
1637 0, "number of frames appended to lro merge"
1638 "queues");
1639
1640#ifndef IFNET_BUF_RING
1641 /* only transmit from slice 0 for now */
1642 if (slice > 0)
1643 continue;
1644#endif
1645 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1646 "tx_req",
1647 CTLFLAG_RD, &ss->tx.req,
1648 0, "tx_req");
1649
1650 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1651 "tx_done",
1652 CTLFLAG_RD, &ss->tx.done,
1653 0, "tx_done");
1654 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1655 "tx_pkt_done",
1656 CTLFLAG_RD, &ss->tx.pkt_done,
1657 0, "tx_done");
1658 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1659 "tx_stall",
1660 CTLFLAG_RD, &ss->tx.stall,
1661 0, "tx_stall");
1662 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1663 "tx_wake",
1664 CTLFLAG_RD, &ss->tx.wake,
1665 0, "tx_wake");
1666 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1667 "tx_defrag",
1668 CTLFLAG_RD, &ss->tx.defrag,
1669 0, "tx_defrag");
1670 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1671 "tx_queue_active",
1672 CTLFLAG_RD, &ss->tx.queue_active,
1673 0, "tx_queue_active");
1674 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1675 "tx_activate",
1676 CTLFLAG_RD, &ss->tx.activate,
1677 0, "tx_activate");
1678 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1679 "tx_deactivate",
1680 CTLFLAG_RD, &ss->tx.deactivate,
1681 0, "tx_deactivate");
1682 }
1683}
1684
/* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
   backwards one at a time and handle ring wraps.  Slot 0 of the
   chain is deliberately NOT written here; the caller (mxge_submit_req)
   writes it last so the firmware only sees a valid first descriptor
   once the rest of the chain is in place. */

static inline void
mxge_submit_req_backwards(mxge_tx_ring_t *tx,
			  mcp_kreq_ether_send_t *src, int cnt)
{
	int idx, starting_slot;
	starting_slot = tx->req;
	while (cnt > 1) {
		cnt--;
		/* mask handles wrap-around of the ring index */
		idx = (starting_slot + cnt) & tx->mask;
		mxge_pio_copy(&tx->lanai[idx],
			      &src[cnt], sizeof(*src));
		/* make each descriptor visible before writing the
		   one that precedes it */
		wmb();
	}
}
1702
1703/*
1704 * copy an array of mcp_kreq_ether_send_t's to the mcp. Copy
1705 * at most 32 bytes at a time, so as to avoid involving the software
1706 * pio handler in the nic. We re-write the first segment's flags
1707 * to mark them valid only after writing the entire chain
1708 */
1709
1710static inline void
1711mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
1712 int cnt)
1713{
1714 int idx, i;
1715 uint32_t *src_ints;
1716 volatile uint32_t *dst_ints;
1717 mcp_kreq_ether_send_t *srcp;
1718 volatile mcp_kreq_ether_send_t *dstp, *dst;
1719 uint8_t last_flags;
1720
1721 idx = tx->req & tx->mask;
1722
1723 last_flags = src->flags;
1724 src->flags = 0;
1725 wmb();
1726 dst = dstp = &tx->lanai[idx];
1727 srcp = src;
1728
1729 if ((idx + cnt) < tx->mask) {
1730 for (i = 0; i < (cnt - 1); i += 2) {
1731 mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1732 wmb(); /* force write every 32 bytes */
1733 srcp += 2;
1734 dstp += 2;
1735 }
1736 } else {
1737 /* submit all but the first request, and ensure
1738 that it is submitted below */
1739 mxge_submit_req_backwards(tx, src, cnt);
1740 i = 0;
1741 }
1742 if (i < cnt) {
1743 /* submit the first request */
1744 mxge_pio_copy(dstp, srcp, sizeof(*src));
1745 wmb(); /* barrier before setting valid flag */
1746 }
1747
1748 /* re-write the last 32-bits with the valid flags */
1749 src->flags = last_flags;
1750 src_ints = (uint32_t *)src;
1751 src_ints+=3;
1752 dst_ints = (volatile uint32_t *)dst;
1753 dst_ints+=3;
1754 *dst_ints = *src_ints;
1755 tx->req += cnt;
1756 wmb();
1757}
1758
1759#if IFCAP_TSO4
1760
/*
 * Enqueue a TSO frame whose mbuf chain has already been DMA-mapped
 * into busdma_seg_cnt segments in ss->tx.seg_list.  Builds one send
 * descriptor per chunk and marks "cuts" at mss-sized payload
 * boundaries so the firmware can segment the frame.  ip_off is the
 * offset of the IP header (past ethernet and any vlan encap).  On
 * descriptor-ring overflow the frame is dropped and ss->oerrors is
 * bumped.
 */
static void
mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
	       int busdma_seg_cnt, int ip_off)
{
	mxge_tx_ring_t *tx;
	mcp_kreq_ether_send_t *req;
	bus_dma_segment_t *seg;
	struct ip *ip;
	struct tcphdr *tcp;
	uint32_t low, high_swapped;
	int len, seglen, cum_len, cum_len_next;
	int next_is_first, chop, cnt, rdma_count, small;
	uint16_t pseudo_hdr_offset, cksum_offset, mss;
	uint8_t flags, flags_next;
	static int once;

	mss = m->m_pkthdr.tso_segsz;

	/* negative cum_len signifies to the
	 * send loop that we are still in the
	 * header portion of the TSO packet.
	 */

	/* ensure we have the ethernet, IP and TCP
	   header together in the first mbuf, copy
	   it to a scratch buffer if not */
	if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
		m_copydata(m, 0, ip_off + sizeof (*ip),
			   ss->scratch);
		ip = (struct ip *)(ss->scratch + ip_off);
	} else {
		ip = (struct ip *)(mtod(m, char *) + ip_off);
	}
	if (__predict_false(m->m_len < ip_off + (ip->ip_hl << 2)
			    + sizeof (*tcp))) {
		m_copydata(m, 0, ip_off + (ip->ip_hl << 2)
			   + sizeof (*tcp), ss->scratch);
		/* NOTE(review): ip is re-pointed at the mbuf here, not at
		   ss->scratch as in the branch above, even though the data
		   was just copied to scratch because the mbuf was too
		   short — looks suspicious; confirm against upstream. */
		ip = (struct ip *)(mtod(m, char *) + ip_off);
	}

	tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
	/* cum_len starts negative: counts up through the L2+L3+L4 header */
	cum_len = -(ip_off + ((ip->ip_hl + tcp->th_off) << 2));

	/* TSO implies checksum offload on this hardware */
	cksum_offset = ip_off + (ip->ip_hl << 2);
	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;


	/* for TSO, pseudo_hdr_offset holds mss.
	 * The firmware figures out where to put
	 * the checksum by parsing the header. */
	pseudo_hdr_offset = htobe16(mss);

	tx = &ss->tx;
	req = tx->req_list;
	seg = tx->seg_list;
	cnt = 0;
	rdma_count = 0;
	/* "rdma_count" is the number of RDMAs belonging to the
	 * current packet BEFORE the current send request. For
	 * non-TSO packets, this is equal to "count".
	 * For TSO packets, rdma_count needs to be reset
	 * to 0 after a segment cut.
	 *
	 * The rdma_count field of the send request is
	 * the number of RDMAs of the packet starting at
	 * that request. For TSO send requests with one ore more cuts
	 * in the middle, this is the number of RDMAs starting
	 * after the last cut in the request. All previous
	 * segments before the last cut implicitly have 1 RDMA.
	 *
	 * Since the number of RDMAs is not known beforehand,
	 * it must be filled-in retroactively - after each
	 * segmentation cut or at the end of the entire packet.
	 */

	while (busdma_seg_cnt) {
		/* Break the busdma segment up into pieces*/
		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
		high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
		len = seg->ds_len;

		while (len) {
			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
			seglen = len;
			cum_len_next = cum_len + seglen;
			/* retroactively patch the rdma_count of the
			   descriptor that started this RDMA run */
			(req-rdma_count)->rdma_count = rdma_count + 1;
			if (__predict_true(cum_len >= 0)) {
				/* payload */
				chop = (cum_len_next > mss);
				cum_len_next = cum_len_next % mss;
				next_is_first = (cum_len_next == 0);
				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
				flags_next |= next_is_first *
					MXGEFW_FLAGS_FIRST;
				rdma_count |= -(chop | next_is_first);
				rdma_count += chop & !next_is_first;
			} else if (cum_len_next >= 0) {
				/* header ends */
				rdma_count = -1;
				cum_len_next = 0;
				seglen = -cum_len;
				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
				flags_next = MXGEFW_FLAGS_TSO_PLD |
					MXGEFW_FLAGS_FIRST |
					(small * MXGEFW_FLAGS_SMALL);
			}

			req->addr_high = high_swapped;
			req->addr_low = htobe32(low);
			req->pseudo_hdr_offset = pseudo_hdr_offset;
			req->pad = 0;
			req->rdma_count = 1;
			req->length = htobe16(seglen);
			req->cksum_offset = cksum_offset;
			req->flags = flags | ((cum_len & 1) *
					      MXGEFW_FLAGS_ALIGN_ODD);
			low += seglen;
			len -= seglen;
			cum_len = cum_len_next;
			flags = flags_next;
			req++;
			cnt++;
			rdma_count++;
			/* checksum offset only applies while still
			   inside the headers */
			if (__predict_false(cksum_offset > seglen))
				cksum_offset -= seglen;
			else
				cksum_offset = 0;
			if (__predict_false(cnt > tx->max_desc))
				goto drop;
		}
		busdma_seg_cnt--;
		seg++;
	}
	(req-rdma_count)->rdma_count = rdma_count;

	/* mark trailing descriptors back to the last cut as TSO_LAST */
	do {
		req--;
		req->flags |= MXGEFW_FLAGS_TSO_LAST;
	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));

	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
	mxge_submit_req(tx, tx->req_list, cnt);
#ifdef IFNET_BUF_RING
	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
		/* tell the NIC to start polling this slice */
		*tx->send_go = 1;
		tx->queue_active = 1;
		tx->activate++;
		wmb();
	}
#endif
	return;

drop:
	/* descriptor ring overflow: unload, free, count the error */
	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
	m_freem(m);
	ss->oerrors++;
	if (!once) {
		printf("tx->max_desc exceeded via TSO!\n");
		printf("mss = %d, %ld, %d!\n", mss,
		       (long)seg - (long)tx->seg_list, tx->max_desc);
		once = 1;
	}
	return;

}
1928
1929#endif /* IFCAP_TSO4 */
1930
1931#ifdef MXGE_NEW_VLAN_API
1932/*
1933 * We reproduce the software vlan tag insertion from
1934 * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
1935 * vlan tag insertion. We need to advertise this in order to have the
1936 * vlan interface respect our csum offload flags.
1937 */
1938static struct mbuf *
1939mxge_vlan_tag_insert(struct mbuf *m)
1940{
1941 struct ether_vlan_header *evl;
1942
1943 M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_DONTWAIT);
1944 if (__predict_false(m == NULL))
1945 return NULL;
1946 if (m->m_len < sizeof(*evl)) {
1947 m = m_pullup(m, sizeof(*evl));
1948 if (__predict_false(m == NULL))
1949 return NULL;
1950 }
1951 /*
1952 * Transform the Ethernet header into an Ethernet header
1953 * with 802.1Q encapsulation.
1954 */
1955 evl = mtod(m, struct ether_vlan_header *);
1956 bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
1957 (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
1958 evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
1959 evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
1960 m->m_flags &= ~M_VLANTAG;
1961 return m;
1962}
1963#endif /* MXGE_NEW_VLAN_API */
1964
/*
 * Map a frame for DMA and hand it to the NIC: software-insert any
 * vlan tag, load the mbuf chain (defragmenting once on EFBIG), then
 * either dispatch to mxge_encap_tso() for TSO frames or build a
 * send-descriptor chain here, padding runts to 60 bytes with the
 * shared zero page.  On failure the mbuf is freed and ss->oerrors
 * is incremented.
 */
static void
mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
{
	mxge_softc_t *sc;
	mcp_kreq_ether_send_t *req;
	bus_dma_segment_t *seg;
	struct mbuf *m_tmp;
	struct ifnet *ifp;
	mxge_tx_ring_t *tx;
	struct ip *ip;
	int cnt, cum_len, err, i, idx, odd_flag, ip_off;
	uint16_t pseudo_hdr_offset;
	uint8_t flags, cksum_offset;


	sc = ss->sc;
	ifp = sc->ifp;
	tx = &ss->tx;

	ip_off = sizeof (struct ether_header);
#ifdef MXGE_NEW_VLAN_API
	if (m->m_flags & M_VLANTAG) {
		m = mxge_vlan_tag_insert(m);
		if (__predict_false(m == NULL))
			goto drop;
		/* IP header moved past the 802.1Q encapsulation */
		ip_off += ETHER_VLAN_ENCAP_LEN;
	}
#endif
	/* (try to) map the frame for DMA */
	idx = tx->req & tx->mask;
	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
				      m, tx->seg_list, &cnt,
				      BUS_DMA_NOWAIT);
	if (__predict_false(err == EFBIG)) {
		/* Too many segments in the chain.  Try
		   to defrag */
		m_tmp = m_defrag(m, M_NOWAIT);
		if (m_tmp == NULL) {
			goto drop;
		}
		ss->tx.defrag++;
		m = m_tmp;
		err = bus_dmamap_load_mbuf_sg(tx->dmat,
					      tx->info[idx].map,
					      m, tx->seg_list, &cnt,
					      BUS_DMA_NOWAIT);
	}
	if (__predict_false(err != 0)) {
		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
			      " packet len = %d\n", err, m->m_pkthdr.len);
		goto drop;
	}
	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
			BUS_DMASYNC_PREWRITE);
	tx->info[idx].m = m;

#if IFCAP_TSO4
	/* TSO is different enough, we handle it in another routine */
	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
		mxge_encap_tso(ss, m, cnt, ip_off);
		return;
	}
#endif

	req = tx->req_list;
	cksum_offset = 0;
	pseudo_hdr_offset = 0;
	flags = MXGEFW_FLAGS_NO_TSO;

	/* checksum offloading? */
	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
		/* ensure ip header is in first mbuf, copy
		   it to a scratch buffer if not */
		if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
			m_copydata(m, 0, ip_off + sizeof (*ip),
				   ss->scratch);
			ip = (struct ip *)(ss->scratch + ip_off);
		} else {
			ip = (struct ip *)(mtod(m, char *) + ip_off);
		}
		cksum_offset = ip_off + (ip->ip_hl << 2);
		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
		req->cksum_offset = cksum_offset;
		flags |= MXGEFW_FLAGS_CKSUM;
		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
	} else {
		odd_flag = 0;
	}
	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
		flags |= MXGEFW_FLAGS_SMALL;

	/* convert segments into a request list */
	cum_len = 0;
	seg = tx->seg_list;
	req->flags = MXGEFW_FLAGS_FIRST;
	for (i = 0; i < cnt; i++) {
		req->addr_low =
			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
		req->addr_high =
			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
		req->length = htobe16(seg->ds_len);
		req->cksum_offset = cksum_offset;
		/* checksum offset only applies to the segment that
		   still contains the headers */
		if (cksum_offset > seg->ds_len)
			cksum_offset -= seg->ds_len;
		else
			cksum_offset = 0;
		req->pseudo_hdr_offset = pseudo_hdr_offset;
		req->pad = 0; /* complete solid 16-byte block */
		req->rdma_count = 1;
		req->flags |= flags | ((cum_len & 1) * odd_flag);
		cum_len += seg->ds_len;
		seg++;
		req++;
		req->flags = 0;
	}
	req--;
	/* pad runts to 60 bytes */
	if (cum_len < 60) {
		/* extra descriptor pointing at the shared zero page */
		req++;
		req->addr_low =
			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
		req->addr_high =
			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
		req->length = htobe16(60 - cum_len);
		req->cksum_offset = 0;
		req->pseudo_hdr_offset = pseudo_hdr_offset;
		req->pad = 0; /* complete solid 16-byte block */
		req->rdma_count = 1;
		req->flags |= flags | ((cum_len & 1) * odd_flag);
		cnt++;
	}

	tx->req_list[0].rdma_count = cnt;
#if 0
	/* print what the firmware will see */
	for (i = 0; i < cnt; i++) {
		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
		    "cso:%d, flags:0x%x, rdma:%d\n",
		    i, (int)ntohl(tx->req_list[i].addr_high),
		    (int)ntohl(tx->req_list[i].addr_low),
		    (int)ntohs(tx->req_list[i].length),
		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
		    tx->req_list[i].rdma_count);
	}
	printf("--------------\n");
#endif
	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
	mxge_submit_req(tx, tx->req_list, cnt);
#ifdef IFNET_BUF_RING
	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
		/* tell the NIC to start polling this slice */
		*tx->send_go = 1;
		tx->queue_active = 1;
		tx->activate++;
		wmb();
	}
#endif
	return;

drop:
	m_freem(m);
	ss->oerrors++;
	return;
}
2131
2132#ifdef IFNET_BUF_RING
2133static void
2134mxge_qflush(struct ifnet *ifp)
2135{
2136 mxge_softc_t *sc = ifp->if_softc;
2137 mxge_tx_ring_t *tx;
2138 struct mbuf *m;
2139 int slice;
2140
2141 for (slice = 0; slice < sc->num_slices; slice++) {
2142 tx = &sc->ss[slice].tx;
2143 mtx_lock(&tx->mtx);
2144 while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2145 m_freem(m);
2146 mtx_unlock(&tx->mtx);
2147 }
2148 if_qflush(ifp);
2149}
2150
2151static inline void
2152mxge_start_locked(struct mxge_slice_state *ss)
2153{
2154 mxge_softc_t *sc;
2155 struct mbuf *m;
2156 struct ifnet *ifp;
2157 mxge_tx_ring_t *tx;
2158
2159 sc = ss->sc;
2160 ifp = sc->ifp;
2161 tx = &ss->tx;
2162
2163 while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2164 m = drbr_dequeue(ifp, tx->br);
2165 if (m == NULL) {
2166 return;
2167 }
2168 /* let BPF see it */
2169 BPF_MTAP(ifp, m);
2170
2171 /* give it to the nic */
2172 mxge_encap(ss, m);
2173 }
2174 /* ran out of transmit slots */
2175 if (((ss->if_drv_flags & IFF_DRV_OACTIVE) == 0)
2176 && (!drbr_empty(ifp, tx->br))) {
2177 ss->if_drv_flags |= IFF_DRV_OACTIVE;
2178 tx->stall++;
2179 }
2180}
2181
2182static int
2183mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2184{
2185 mxge_softc_t *sc;
2186 struct ifnet *ifp;
2187 mxge_tx_ring_t *tx;
2188 int err;
2189
2190 sc = ss->sc;
2191 ifp = sc->ifp;
2192 tx = &ss->tx;
2193
2194 if ((ss->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
2195 IFF_DRV_RUNNING) {
2196 err = drbr_enqueue(ifp, tx->br, m);
2197 return (err);
2198 }
2199
2200 if (drbr_empty(ifp, tx->br) &&
2201 ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2202 /* let BPF see it */
2203 BPF_MTAP(ifp, m);
2204 /* give it to the nic */
2205 mxge_encap(ss, m);
2206 } else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2207 return (err);
2208 }
2209 if (!drbr_empty(ifp, tx->br))
2210 mxge_start_locked(ss);
2211 return (0);
2212}
2213
2214static int
2215mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2216{
2217 mxge_softc_t *sc = ifp->if_softc;
2218 struct mxge_slice_state *ss;
2219 mxge_tx_ring_t *tx;
2220 int err = 0;
2221 int slice;
2222
2223 slice = m->m_pkthdr.flowid;
2224 slice &= (sc->num_slices - 1); /* num_slices always power of 2 */
2225
2226 ss = &sc->ss[slice];
2227 tx = &ss->tx;
2228
2229 if (mtx_trylock(&tx->mtx)) {
2230 err = mxge_transmit_locked(ss, m);
2231 mtx_unlock(&tx->mtx);
2232 } else {
2233 err = drbr_enqueue(ifp, tx->br, m);
2234 }
2235
2236 return (err);
2237}
2238
2239#else
2240
2241static inline void
2242mxge_start_locked(struct mxge_slice_state *ss)
2243{
2244 mxge_softc_t *sc;
2245 struct mbuf *m;
2246 struct ifnet *ifp;
2247 mxge_tx_ring_t *tx;
2248
2249 sc = ss->sc;
2250 ifp = sc->ifp;
2251 tx = &ss->tx;
2252 while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2253 IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
2254 if (m == NULL) {
2255 return;
2256 }
2257 /* let BPF see it */
2258 BPF_MTAP(ifp, m);
2259
2260 /* give it to the nic */
2261 mxge_encap(ss, m);
2262 }
2263 /* ran out of transmit slots */
2264 if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
2265 sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
2266 tx->stall++;
2267 }
2268}
2269#endif
2270static void
2271mxge_start(struct ifnet *ifp)
2272{
2273 mxge_softc_t *sc = ifp->if_softc;
2274 struct mxge_slice_state *ss;
2275
2276 /* only use the first slice for now */
2277 ss = &sc->ss[0];
2278 mtx_lock(&ss->tx.mtx);
2279 mxge_start_locked(ss);
2280 mtx_unlock(&ss->tx.mtx);
2281}
2282
2283/*
2284 * copy an array of mcp_kreq_ether_recv_t's to the mcp. Copy
2285 * at most 32 bytes at a time, so as to avoid involving the software
2286 * pio handler in the nic. We re-write the first segment's low
2287 * DMA address to mark it valid only after we write the entire chunk
2288 * in a burst
2289 */
/*
 * Post 8 receive descriptors to the NIC in two 32-byte PIO bursts
 * (see the block comment above).  The first descriptor's low DMA
 * address is temporarily poisoned to 0xffffffff so the firmware
 * ignores the chunk until the final write publishes the real
 * address; wmb() enforces the required write ordering.
 */
static inline void
mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
		mcp_kreq_ether_recv_t *src)
{
	uint32_t low;

	/* stash the real address and mark the chunk invalid */
	low = src->addr_low;
	src->addr_low = 0xffffffff;
	/* burst the 8 descriptors out, 32 bytes at a time */
	mxge_pio_copy(dst, src, 4 * sizeof (*src));
	wmb();
	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
	wmb();
	/* restore the address and make the chunk visible to the NIC */
	src->addr_low = low;
	dst->addr_low = low;
	wmb();
}
2306
/*
 * Replenish small-receive ring slot idx with a freshly allocated,
 * DMA-loaded MHLEN mbuf and record its bus address in the shadow
 * ring.  Descriptors reach the NIC in bursts of 8 (mxge_submit_8rx);
 * on allocation failure the slot simply keeps whatever buffer
 * address was there before.  Returns 0 or an errno.
 */
static int
mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
{
	bus_dma_segment_t seg;
	struct mbuf *m;
	mxge_rx_ring_t *rx = &ss->rx_small;
	int cnt, err;

	m = m_gethdr(M_DONTWAIT, MT_DATA);
	if (m == NULL) {
		rx->alloc_fail++;
		err = ENOBUFS;
		goto done;
	}
	m->m_len = MHLEN;
	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
				      &seg, &cnt, BUS_DMA_NOWAIT);
	if (err != 0) {
		m_free(m);
		goto done;
	}
	rx->info[idx].m = m;
	rx->shadow[idx].addr_low =
		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
	rx->shadow[idx].addr_high =
		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));

done:
	/* every 8th slot completes a burst; PIO it to the NIC even on
	   failure, so the ring keeps advancing with recycled buffers */
	if ((idx & 7) == 7)
		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
	return err;
}
2339
/*
 * Replenish big-receive ring slot idx with a cluster mbuf of
 * rx->cl_size bytes, loading it for DMA and recording the segment
 * address(es) in the shadow ring (multiple segments only when
 * MXGE_VIRT_JUMBOS is configured; otherwise the tag allows a single
 * segment).  As with the small ring, groups of 8 descriptors are
 * PIO'd to the NIC; on failure the slot keeps its previous buffer
 * address.  Returns 0 or an errno.
 */
static int
mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
{
	bus_dma_segment_t seg[3];
	struct mbuf *m;
	mxge_rx_ring_t *rx = &ss->rx_big;
	int cnt, err, i;

	if (rx->cl_size == MCLBYTES)
		m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
	else
		m = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
	if (m == NULL) {
		rx->alloc_fail++;
		err = ENOBUFS;
		goto done;
	}
	m->m_len = rx->mlen;
	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
				      seg, &cnt, BUS_DMA_NOWAIT);
	if (err != 0) {
		m_free(m);
		goto done;
	}
	rx->info[idx].m = m;
	rx->shadow[idx].addr_low =
		htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
	rx->shadow[idx].addr_high =
		htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));

#if MXGE_VIRT_JUMBOS
	/* one frame spans several segments; record each one in its
	   own consecutive shadow slot */
	for (i = 1; i < cnt; i++) {
		rx->shadow[idx + i].addr_low =
			htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
		rx->shadow[idx + i].addr_high =
			htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
	}
#endif

done:
	/* advance one slot per buffer the frame occupies, submitting
	   each completed group of 8 descriptors to the NIC */
	for (i = 0; i < rx->nbufs; i++) {
		if ((idx & 7) == 7) {
			mxge_submit_8rx(&rx->lanai[idx - 7],
					&rx->shadow[idx - 7]);
		}
		idx++;
	}
	return err;
}
2389
2390/*
2391 * Myri10GE hardware checksums are not valid if the sender
2392 * padded the frame with non-zero padding. This is because
2393 * the firmware just does a simple 16-bit 1s complement
2394 * checksum across the entire frame, excluding the first 14
2395 * bytes. It is best to simply to check the checksum and
2396 * tell the stack about it only if the checksum is good
2397 */
2398
/*
 * Verify the firmware's raw 1s-complement sum of the frame against
 * the TCP/UDP pseudo-header (see the block comment above).  Returns
 * 0 when the checksum verifies, non-zero when it fails or the frame
 * is not IPv4 TCP/UDP.
 */
static inline uint16_t
mxge_rx_csum(struct mbuf *m, int csum)
{
	struct ether_header *eh;
	struct ip *ip;
	uint16_t c;

	eh = mtod(m, struct ether_header *);

	/* only deal with IPv4 TCP & UDP for now */
	if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
		return 1;
	ip = (struct ip *)(eh + 1);
	if (__predict_false(ip->ip_p != IPPROTO_TCP &&
			    ip->ip_p != IPPROTO_UDP))
		return 1;
#ifdef INET
	/* fold the pseudo-header into the hardware sum; a good packet
	   yields 0xffff here, so c becomes 0 after the XOR below */
	c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
		      htonl(ntohs(csum) + ntohs(ip->ip_len) +
			    - (ip->ip_hl << 2) + ip->ip_p));
#else
	c = 1;
#endif
	c ^= 0xffff;
	return (c);
}
2425
/*
 * Strip an 802.1q header from a received frame: save the tag into
 * the mbuf packet header, repair the firmware's partial checksum
 * (which covered the 4 encapsulation bytes being removed), and
 * slide the Ethernet addresses over the VLAN header.
 */
static void
mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
{
	struct ether_vlan_header *evl;
	struct ether_header *eh;
	uint32_t partial;

	evl = mtod(m, struct ether_vlan_header *);
	eh = mtod(m, struct ether_header *);

	/*
	 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
	 * after what the firmware thought was the end of the ethernet
	 * header.
	 */

	/* put checksum into host byte order */
	*csum = ntohs(*csum);
	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
	/* 1s-complement subtraction of the 4 VLAN bytes, then fold
	   the carries back into the low 16 bits (twice, since the
	   first fold can itself produce a carry) */
	(*csum) += ~partial;
	(*csum) += ((*csum) < ~partial);
	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);

	/* restore checksum to network byte order;
	   later consumers expect this */
	*csum = htons(*csum);

	/* save the tag */
#ifdef MXGE_NEW_VLAN_API
	m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
#else
	{
		struct m_tag *mtag;
		mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
				   M_NOWAIT);
		if (mtag == NULL)
			return;
		VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
		m_tag_prepend(m, mtag);
	}

#endif
	m->m_flags |= M_VLANTAG;

	/*
	 * Remove the 802.1q header by copying the Ethernet
	 * addresses over it and adjusting the beginning of
	 * the data in the mbuf. The encapsulated Ethernet
	 * type field is already in place.
	 */
	bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
	m_adj(m, ETHER_VLAN_ENCAP_LEN);
}
2481
2482
/*
 * Deliver one completed big-ring receive: replace the ring buffer,
 * swap DMA maps, fix up the mbuf header, strip any VLAN tag, verify
 * the hardware checksum and try LRO before handing the frame to
 * if_input().  If a replacement buffer cannot be allocated the
 * frame is dropped and its old mbuf stays in the ring.
 */
static inline void
mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
{
	mxge_softc_t *sc;
	struct ifnet *ifp;
	struct mbuf *m;
	struct ether_header *eh;
	mxge_rx_ring_t *rx;
	bus_dmamap_t old_map;
	int idx;
	uint16_t tcpudp_csum;

	sc = ss->sc;
	ifp = sc->ifp;
	rx = &ss->rx_big;
	idx = rx->cnt & rx->mask;
	rx->cnt += rx->nbufs;	/* one frame may span several buffers */
	/* save a pointer to the received mbuf */
	m = rx->info[idx].m;
	/* try to replace the received mbuf */
	if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
		/* drop the frame -- the old mbuf is re-cycled */
		ifp->if_ierrors++;
		return;
	}

	/* unmap the received buffer */
	old_map = rx->info[idx].map;
	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
	bus_dmamap_unload(rx->dmat, old_map);

	/* swap the bus_dmamap_t's */
	rx->info[idx].map = rx->extra_map;
	rx->extra_map = old_map;

	/* mcp implicitly skips 1st 2 bytes so that packet is properly
	 * aligned */
	m->m_data += MXGEFW_PAD;

	m->m_pkthdr.rcvif = ifp;
	m->m_len = m->m_pkthdr.len = len;
	ss->ipackets++;
	eh = mtod(m, struct ether_header *);
	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
		mxge_vlan_tag_remove(m, &csum);
	}
	/* if the checksum is valid, mark it in the mbuf header */
	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
		/* LRO may consume the mbuf entirely */
		if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
			return;
		/* otherwise, it was a UDP frame, or a TCP frame which
		   we could not do LRO on. Tell the stack that the
		   checksum is good */
		m->m_pkthdr.csum_data = 0xffff;
		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
	}
	/* flowid only valid if RSS hashing is enabled */
	if (sc->num_slices > 1) {
		m->m_pkthdr.flowid = (ss - sc->ss);
		m->m_flags |= M_FLOWID;
	}
	/* pass the frame up the stack */
	(*ifp->if_input)(ifp, m);
}
2547
/*
 * Deliver one completed small-ring receive.  Mirrors
 * mxge_rx_done_big() except that a small frame always occupies
 * exactly one MHLEN buffer, so the ring counter advances by one.
 */
static inline void
mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
{
	mxge_softc_t *sc;
	struct ifnet *ifp;
	struct ether_header *eh;
	struct mbuf *m;
	mxge_rx_ring_t *rx;
	bus_dmamap_t old_map;
	int idx;
	uint16_t tcpudp_csum;

	sc = ss->sc;
	ifp = sc->ifp;
	rx = &ss->rx_small;
	idx = rx->cnt & rx->mask;
	rx->cnt++;
	/* save a pointer to the received mbuf */
	m = rx->info[idx].m;
	/* try to replace the received mbuf */
	if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
		/* drop the frame -- the old mbuf is re-cycled */
		ifp->if_ierrors++;
		return;
	}

	/* unmap the received buffer */
	old_map = rx->info[idx].map;
	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
	bus_dmamap_unload(rx->dmat, old_map);

	/* swap the bus_dmamap_t's */
	rx->info[idx].map = rx->extra_map;
	rx->extra_map = old_map;

	/* mcp implicitly skips 1st 2 bytes so that packet is properly
	 * aligned */
	m->m_data += MXGEFW_PAD;

	m->m_pkthdr.rcvif = ifp;
	m->m_len = m->m_pkthdr.len = len;
	ss->ipackets++;
	eh = mtod(m, struct ether_header *);
	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
		mxge_vlan_tag_remove(m, &csum);
	}
	/* if the checksum is valid, mark it in the mbuf header */
	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
		/* LRO may consume the mbuf entirely */
		if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
			return;
		/* otherwise, it was a UDP frame, or a TCP frame which
		   we could not do LRO on. Tell the stack that the
		   checksum is good */
		m->m_pkthdr.csum_data = 0xffff;
		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
	}
	/* flowid only valid if RSS hashing is enabled */
	if (sc->num_slices > 1) {
		m->m_pkthdr.flowid = (ss - sc->ss);
		m->m_flags |= M_FLOWID;
	}
	/* pass the frame up the stack */
	(*ifp->if_input)(ifp, m);
}
2612
2613static inline void
2614mxge_clean_rx_done(struct mxge_slice_state *ss)
2615{
2616 mxge_rx_done_t *rx_done = &ss->rx_done;
2617 int limit = 0;
2618 uint16_t length;
2619 uint16_t checksum;
2620
2621
2622 while (rx_done->entry[rx_done->idx].length != 0) {
2623 length = ntohs(rx_done->entry[rx_done->idx].length);
2624 rx_done->entry[rx_done->idx].length = 0;
2625 checksum = rx_done->entry[rx_done->idx].checksum;
2626 if (length <= (MHLEN - MXGEFW_PAD))
2627 mxge_rx_done_small(ss, length, checksum);
2628 else
2629 mxge_rx_done_big(ss, length, checksum);
2630 rx_done->cnt++;
2631 rx_done->idx = rx_done->cnt & rx_done->mask;
2632
2633 /* limit potential for livelock */
2634 if (__predict_false(++limit > rx_done->mask / 2))
2635 break;
2636 }
2637#ifdef INET
2638 while (!SLIST_EMPTY(&ss->lro_active)) {
2639 struct lro_entry *lro = SLIST_FIRST(&ss->lro_active);
2640 SLIST_REMOVE_HEAD(&ss->lro_active, next);
2641 mxge_lro_flush(ss, lro);
2642 }
2643#endif
2644}
2645
2646
2647static inline void
2648mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2649{
2650 struct ifnet *ifp;
2651 mxge_tx_ring_t *tx;
2652 struct mbuf *m;
2653 bus_dmamap_t map;
2654 int idx;
2655 int *flags;
2656
2657 tx = &ss->tx;
2658 ifp = ss->sc->ifp;
2659 while (tx->pkt_done != mcp_idx) {
2660 idx = tx->done & tx->mask;
2661 tx->done++;
2662 m = tx->info[idx].m;
2663 /* mbuf and DMA map only attached to the first
2664 segment per-mbuf */
2665 if (m != NULL) {
2666 ss->obytes += m->m_pkthdr.len;
2667 if (m->m_flags & M_MCAST)
2668 ss->omcasts++;
2669 ss->opackets++;
2670 tx->info[idx].m = NULL;
2671 map = tx->info[idx].map;
2672 bus_dmamap_unload(tx->dmat, map);
2673 m_freem(m);
2674 }
2675 if (tx->info[idx].flag) {
2676 tx->info[idx].flag = 0;
2677 tx->pkt_done++;
2678 }
2679 }
2680
2681 /* If we have space, clear IFF_OACTIVE to tell the stack that
2682 its OK to send packets */
2683#ifdef IFNET_BUF_RING
2684 flags = &ss->if_drv_flags;
2685#else
2686 flags = &ifp->if_drv_flags;
2687#endif
2688 mtx_lock(&ss->tx.mtx);
2689 if ((*flags) & IFF_DRV_OACTIVE &&
2690 tx->req - tx->done < (tx->mask + 1)/4) {
2691 *(flags) &= ~IFF_DRV_OACTIVE;
2692 ss->tx.wake++;
2693 mxge_start_locked(ss);
2694 }
2695#ifdef IFNET_BUF_RING
2696 if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2697 /* let the NIC stop polling this queue, since there
2698 * are no more transmits pending */
2699 if (tx->req == tx->done) {
2700 *tx->send_stop = 1;
2701 tx->queue_active = 0;
2702 tx->deactivate++;
2703 wmb();
2704 }
2705 }
2706#endif
2707 mtx_unlock(&ss->tx.mtx);
2708
2709}
2710
/* XFP 10GbE compliance-byte bits mapped to ifmedia types.  A zero
 * flag means FreeBSD has no corresponding media type.  Note the
 * first entry is matched as a full-byte value, not a bit test
 * (see mxge_media_probe). */
static struct mxge_media_type mxge_xfp_media_types[] =
{
	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
	{0,		(1 << 5),	"10GBASE-ER"},
	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
	{0,		(1 << 3),	"10GBASE-SW"},
	{0,		(1 << 2),	"10GBASE-LW"},
	{0,		(1 << 1),	"10GBASE-EW"},
	{0,		(1 << 0),	"Reserved"}
};
/* SFP+ compliance bits (read from module byte 3) mapped to ifmedia
 * types; zero flag means no matching FreeBSD media type. */
static struct mxge_media_type mxge_sfp_media_types[] =
{
	{0,		(1 << 7),	"Reserved"},
	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"}
};
2730
/*
 * Record a newly-detected media type: OR it into the cached media
 * flags, register it with ifmedia and make it the current selection.
 */
static void
mxge_set_media(mxge_softc_t *sc, int type)
{
	sc->media_flags |= type;
	ifmedia_add(&sc->media, sc->media_flags, 0, NULL);
	ifmedia_set(&sc->media, sc->media_flags);
}
2738
2739
2740/*
2741 * Determine the media type for a NIC. Some XFPs will identify
2742 * themselves only when their link is up, so this is initiated via a
2743 * link up interrupt. However, this can potentially take up to
2744 * several milliseconds, so it is run via the watchdog routine, rather
2745 * than in the interrupt handler itself. This need only be done
2746 * once, not each time the link is up.
2747 */
2748static void
2749mxge_media_probe(mxge_softc_t *sc)
2750{
2751 mxge_cmd_t cmd;
2752 char *cage_type;
2753 char *ptr;
2754 struct mxge_media_type *mxge_media_types = NULL;
2755 int i, err, ms, mxge_media_type_entries;
2756 uint32_t byte;
2757
2758 sc->need_media_probe = 0;
2759
2760 /* if we've already set a media type, we're done */
2761 if (sc->media_flags != (IFM_ETHER | IFM_AUTO))
2762 return;
2763
2764 /*
2765 * parse the product code to deterimine the interface type
2766 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2767 * after the 3rd dash in the driver's cached copy of the
2768 * EEPROM's product code string.
2769 */
2770 ptr = sc->product_code_string;
2771 if (ptr == NULL) {
2772 device_printf(sc->dev, "Missing product code\n");
2773 }
2774
2775 for (i = 0; i < 3; i++, ptr++) {
2776 ptr = index(ptr, '-');
2777 if (ptr == NULL) {
2778 device_printf(sc->dev,
2779 "only %d dashes in PC?!?\n", i);
2780 return;
2781 }
2782 }
2783 if (*ptr == 'C') {
2784 /* -C is CX4 */
2785 mxge_set_media(sc, IFM_10G_CX4);
2786 return;
2787 }
2788 else if (*ptr == 'Q') {
2789 /* -Q is Quad Ribbon Fiber */
2790 device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2791 /* FreeBSD has no media type for Quad ribbon fiber */
2792 return;
2793 }
2794
2795 if (*ptr == 'R') {
2796 /* -R is XFP */
2797 mxge_media_types = mxge_xfp_media_types;
2798 mxge_media_type_entries =
2799 sizeof (mxge_xfp_media_types) /
2800 sizeof (mxge_xfp_media_types[0]);
2801 byte = MXGE_XFP_COMPLIANCE_BYTE;
2802 cage_type = "XFP";
2803 }
2804
2805 if (*ptr == 'S' || *(ptr +1) == 'S') {
2806 /* -S or -2S is SFP+ */
2807 mxge_media_types = mxge_sfp_media_types;
2808 mxge_media_type_entries =
2809 sizeof (mxge_sfp_media_types) /
2810 sizeof (mxge_sfp_media_types[0]);
2811 cage_type = "SFP+";
2812 byte = 3;
2813 }
2814
2815 if (mxge_media_types == NULL) {
2816 device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2817 return;
2818 }
2819
2820 /*
2821 * At this point we know the NIC has an XFP cage, so now we
2822 * try to determine what is in the cage by using the
2823 * firmware's XFP I2C commands to read the XFP 10GbE compilance
2824 * register. We read just one byte, which may take over
2825 * a millisecond
2826 */
2827
2828 cmd.data0 = 0; /* just fetch 1 byte, not all 256 */
2829 cmd.data1 = byte;
2830 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2831 if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
2832 device_printf(sc->dev, "failed to read XFP\n");
2833 }
2834 if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
2835 device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
2836 }
2837 if (err != MXGEFW_CMD_OK) {
2838 return;
2839 }
2840
2841 /* now we wait for the data to be cached */
2842 cmd.data0 = byte;
2843 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2844 for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
2845 DELAY(1000);
2846 cmd.data0 = byte;
2847 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2848 }
2849 if (err != MXGEFW_CMD_OK) {
2850 device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
2851 cage_type, err, ms);
2852 return;
2853 }
2854
2855 if (cmd.data0 == mxge_media_types[0].bitmask) {
2856 if (mxge_verbose)
2857 device_printf(sc->dev, "%s:%s\n", cage_type,
2858 mxge_media_types[0].name);
2859 mxge_set_media(sc, IFM_10G_CX4);
2860 return;
2861 }
2862 for (i = 1; i < mxge_media_type_entries; i++) {
2863 if (cmd.data0 & mxge_media_types[i].bitmask) {
2864 if (mxge_verbose)
2865 device_printf(sc->dev, "%s:%s\n",
2866 cage_type,
2867 mxge_media_types[i].name);
2868
2869 mxge_set_media(sc, mxge_media_types[i].flag);
2870 return;
2871 }
2872 }
2873 device_printf(sc->dev, "%s media 0x%x unknown\n", cage_type,
2874 cmd.data0);
2875
2876 return;
2877}
2878
/*
 * Per-slice interrupt handler.  For MSI-X slices other than slice 0
 * (non-IFNET_BUF_RING build) the interrupt is implicitly valid and
 * only rx completions need processing.  Otherwise, reap tx and rx
 * work until the firmware's DMA'd stats block deasserts `valid`,
 * process link/error stats (slice 0 only), then write the irq claim
 * registers to re-arm the interrupt.
 */
static void
mxge_intr(void *arg)
{
	struct mxge_slice_state *ss = arg;
	mxge_softc_t *sc = ss->sc;
	mcp_irq_data_t *stats = ss->fw_stats;
	mxge_tx_ring_t *tx = &ss->tx;
	mxge_rx_done_t *rx_done = &ss->rx_done;
	uint32_t send_done_count;
	uint8_t valid;


#ifndef IFNET_BUF_RING
	/* an interrupt on a non-zero slice is implicitly valid
	   since MSI-X irqs are not shared */
	if (ss != sc->ss) {
		mxge_clean_rx_done(ss);
		*ss->irq_claim = be32toh(3);
		return;
	}
#endif

	/* make sure the DMA has finished */
	if (!stats->valid) {
		return;
	}
	valid = stats->valid;

	if (sc->legacy_irq) {
		/* lower legacy IRQ */
		*sc->irq_deassert = 0;
		if (!mxge_deassert_wait)
			/* don't wait for conf. that irq is low */
			stats->valid = 0;
	} else {
		stats->valid = 0;
	}

	/* loop while waiting for legacy irq deassertion */
	do {
		/* check for transmit completes and receives */
		send_done_count = be32toh(stats->send_done_count);
		while ((send_done_count != tx->pkt_done) ||
		       (rx_done->entry[rx_done->idx].length != 0)) {
			if (send_done_count != tx->pkt_done)
				mxge_tx_done(ss, (int)send_done_count);
			mxge_clean_rx_done(ss);
			send_done_count = be32toh(stats->send_done_count);
		}
		if (sc->legacy_irq && mxge_deassert_wait)
			wmb();
	} while (*((volatile uint8_t *) &stats->valid));

	/* fw link & error stats meaningful only on the first slice */
	if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
		if (sc->link_state != stats->link_up) {
			sc->link_state = stats->link_up;
			if (sc->link_state) {
				if_link_state_change(sc->ifp, LINK_STATE_UP);
				if (mxge_verbose)
					device_printf(sc->dev, "link up\n");
			} else {
				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
				if (mxge_verbose)
					device_printf(sc->dev, "link down\n");
			}
			/* the module may identify itself only once the
			   link is up; schedule a media probe */
			sc->need_media_probe = 1;
		}
		if (sc->rdma_tags_available !=
		    be32toh(stats->rdma_tags_available)) {
			sc->rdma_tags_available =
				be32toh(stats->rdma_tags_available);
			device_printf(sc->dev, "RDMA timed out! %d tags "
				      "left\n", sc->rdma_tags_available);
		}

		if (stats->link_down) {
			sc->down_cnt += stats->link_down;
			sc->link_state = 0;
			if_link_state_change(sc->ifp, LINK_STATE_DOWN);
		}
	}

	/* check to see if we have rx token to pass back */
	if (valid & 0x1)
	    *ss->irq_claim = be32toh(3);
	*(ss->irq_claim + 1) = be32toh(3);
}
2967
/*
 * if_init entry point.  Deliberately a no-op: interface bring-up is
 * handled elsewhere in this driver (see mxge_open()).
 */
static void
mxge_init(void *arg)
{
}
2972
2973
2974
2975static void
2976mxge_free_slice_mbufs(struct mxge_slice_state *ss)
2977{
2978 struct lro_entry *lro_entry;
2979 int i;
2980
2981 while (!SLIST_EMPTY(&ss->lro_free)) {
2982 lro_entry = SLIST_FIRST(&ss->lro_free);
2983 SLIST_REMOVE_HEAD(&ss->lro_free, next);
2984 free(lro_entry, M_DEVBUF);
2985 }
2986
2987 for (i = 0; i <= ss->rx_big.mask; i++) {
2988 if (ss->rx_big.info[i].m == NULL)
2989 continue;
2990 bus_dmamap_unload(ss->rx_big.dmat,
2991 ss->rx_big.info[i].map);
2992 m_freem(ss->rx_big.info[i].m);
2993 ss->rx_big.info[i].m = NULL;
2994 }
2995
2996 for (i = 0; i <= ss->rx_small.mask; i++) {
2997 if (ss->rx_small.info[i].m == NULL)
2998 continue;
2999 bus_dmamap_unload(ss->rx_small.dmat,
3000 ss->rx_small.info[i].map);
3001 m_freem(ss->rx_small.info[i].m);
3002 ss->rx_small.info[i].m = NULL;
3003 }
3004
3005 /* transmit ring used only on the first slice */
3006 if (ss->tx.info == NULL)
3007 return;
3008
3009 for (i = 0; i <= ss->tx.mask; i++) {
3010 ss->tx.info[i].flag = 0;
3011 if (ss->tx.info[i].m == NULL)
3012 continue;
3013 bus_dmamap_unload(ss->tx.dmat,
3014 ss->tx.info[i].map);
3015 m_freem(ss->tx.info[i].m);
3016 ss->tx.info[i].m = NULL;
3017 }
3018}
3019
3020static void
3021mxge_free_mbufs(mxge_softc_t *sc)
3022{
3023 int slice;
3024
3025 for (slice = 0; slice < sc->num_slices; slice++)
3026 mxge_free_slice_mbufs(&sc->ss[slice]);
3027}
3028
3029static void
3030mxge_free_slice_rings(struct mxge_slice_state *ss)
3031{
3032 int i;
3033
3034
3035 if (ss->rx_done.entry != NULL)
3036 mxge_dma_free(&ss->rx_done.dma);
3037 ss->rx_done.entry = NULL;
3038
3039 if (ss->tx.req_bytes != NULL)
3040 free(ss->tx.req_bytes, M_DEVBUF);
3041 ss->tx.req_bytes = NULL;
3042
3043 if (ss->tx.seg_list != NULL)
3044 free(ss->tx.seg_list, M_DEVBUF);
3045 ss->tx.seg_list = NULL;
3046
3047 if (ss->rx_small.shadow != NULL)
3048 free(ss->rx_small.shadow, M_DEVBUF);
3049 ss->rx_small.shadow = NULL;
3050
3051 if (ss->rx_big.shadow != NULL)
3052 free(ss->rx_big.shadow, M_DEVBUF);
3053 ss->rx_big.shadow = NULL;
3054
3055 if (ss->tx.info != NULL) {
3056 if (ss->tx.dmat != NULL) {
3057 for (i = 0; i <= ss->tx.mask; i++) {
3058 bus_dmamap_destroy(ss->tx.dmat,
3059 ss->tx.info[i].map);
3060 }
3061 bus_dma_tag_destroy(ss->tx.dmat);
3062 }
3063 free(ss->tx.info, M_DEVBUF);
3064 }
3065 ss->tx.info = NULL;
3066
3067 if (ss->rx_small.info != NULL) {
3068 if (ss->rx_small.dmat != NULL) {
3069 for (i = 0; i <= ss->rx_small.mask; i++) {
3070 bus_dmamap_destroy(ss->rx_small.dmat,
3071 ss->rx_small.info[i].map);
3072 }
3073 bus_dmamap_destroy(ss->rx_small.dmat,
3074 ss->rx_small.extra_map);
3075 bus_dma_tag_destroy(ss->rx_small.dmat);
3076 }
3077 free(ss->rx_small.info, M_DEVBUF);
3078 }
3079 ss->rx_small.info = NULL;
3080
3081 if (ss->rx_big.info != NULL) {
3082 if (ss->rx_big.dmat != NULL) {
3083 for (i = 0; i <= ss->rx_big.mask; i++) {
3084 bus_dmamap_destroy(ss->rx_big.dmat,
3085 ss->rx_big.info[i].map);
3086 }
3087 bus_dmamap_destroy(ss->rx_big.dmat,
3088 ss->rx_big.extra_map);
3089 bus_dma_tag_destroy(ss->rx_big.dmat);
3090 }
3091 free(ss->rx_big.info, M_DEVBUF);
3092 }
3093 ss->rx_big.info = NULL;
3094}
3095
3096static void
3097mxge_free_rings(mxge_softc_t *sc)
3098{
3099 int slice;
3100
3101 for (slice = 0; slice < sc->num_slices; slice++)
3102 mxge_free_slice_rings(&sc->ss[slice]);
3103}
3104
3105static int
3106mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3107 int tx_ring_entries)
3108{
3109 mxge_softc_t *sc = ss->sc;
3110 size_t bytes;
3111 int err, i;
3112
3113 err = ENOMEM;
3114
3115 /* allocate per-slice receive resources */
3116
3117 ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
3118 ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3119
3120 /* allocate the rx shadow rings */
3121 bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3122 ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3123 if (ss->rx_small.shadow == NULL)
3124 return err;;
3125
3126 bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3127 ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3128 if (ss->rx_big.shadow == NULL)
3129 return err;;
3130
3131 /* allocate the rx host info rings */
3132 bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3133 ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3134 if (ss->rx_small.info == NULL)
3135 return err;;
3136
3137 bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3138 ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3139 if (ss->rx_big.info == NULL)
3140 return err;;
3141
3142 /* allocate the rx busdma resources */
3143 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3144 1, /* alignment */
3145 4096, /* boundary */
3146 BUS_SPACE_MAXADDR, /* low */
3147 BUS_SPACE_MAXADDR, /* high */
3148 NULL, NULL, /* filter */
3149 MHLEN, /* maxsize */
3150 1, /* num segs */
3151 MHLEN, /* maxsegsize */
3152 BUS_DMA_ALLOCNOW, /* flags */
3153 NULL, NULL, /* lock */
3154 &ss->rx_small.dmat); /* tag */
3155 if (err != 0) {
3156 device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3157 err);
3158 return err;;
3159 }
3160
3161 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3162 1, /* alignment */
3163#if MXGE_VIRT_JUMBOS
3164 4096, /* boundary */
3165#else
3166 0, /* boundary */
3167#endif
3168 BUS_SPACE_MAXADDR, /* low */
3169 BUS_SPACE_MAXADDR, /* high */
3170 NULL, NULL, /* filter */
3171 3*4096, /* maxsize */
3172#if MXGE_VIRT_JUMBOS
3173 3, /* num segs */
3174 4096, /* maxsegsize*/
3175#else
3176 1, /* num segs */
3177 MJUM9BYTES, /* maxsegsize*/
3178#endif
3179 BUS_DMA_ALLOCNOW, /* flags */
3180 NULL, NULL, /* lock */
3181 &ss->rx_big.dmat); /* tag */
3182 if (err != 0) {
3183 device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3184 err);
3185 return err;;
3186 }
3187 for (i = 0; i <= ss->rx_small.mask; i++) {
3188 err = bus_dmamap_create(ss->rx_small.dmat, 0,
3189 &ss->rx_small.info[i].map);
3190 if (err != 0) {
3191 device_printf(sc->dev, "Err %d rx_small dmamap\n",
3192 err);
3193 return err;;
3194 }
3195 }
3196 err = bus_dmamap_create(ss->rx_small.dmat, 0,
3197 &ss->rx_small.extra_map);
3198 if (err != 0) {
3199 device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3200 err);
3201 return err;;
3202 }
3203
3204 for (i = 0; i <= ss->rx_big.mask; i++) {
3205 err = bus_dmamap_create(ss->rx_big.dmat, 0,
3206 &ss->rx_big.info[i].map);
3207 if (err != 0) {
3208 device_printf(sc->dev, "Err %d rx_big dmamap\n",
3209 err);
3210 return err;;
3211 }
3212 }
3213 err = bus_dmamap_create(ss->rx_big.dmat, 0,
3214 &ss->rx_big.extra_map);
3215 if (err != 0) {
3216 device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3217 err);
3218 return err;;
3219 }
3220
3221 /* now allocate TX resouces */
3222
3223#ifndef IFNET_BUF_RING
3224 /* only use a single TX ring for now */
3225 if (ss != ss->sc->ss)
3226 return 0;
3227#endif
3228
3229 ss->tx.mask = tx_ring_entries - 1;
3230 ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3231
3232
3233 /* allocate the tx request copy block */
3234 bytes = 8 +
3235 sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3236 ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
3237 if (ss->tx.req_bytes == NULL)
3238 return err;;
3239 /* ensure req_list entries are aligned to 8 bytes */
3240 ss->tx.req_list = (mcp_kreq_ether_send_t *)
3241 ((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
3242
3243 /* allocate the tx busdma segment list */
3244 bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3245 ss->tx.seg_list = (bus_dma_segment_t *)
3246 malloc(bytes, M_DEVBUF, M_WAITOK);
3247 if (ss->tx.seg_list == NULL)
3248 return err;;
3249
3250 /* allocate the tx host info ring */
3251 bytes = tx_ring_entries * sizeof (*ss->tx.info);
3252 ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3253 if (ss->tx.info == NULL)
3254 return err;;
3255
3256 /* allocate the tx busdma resources */
3257 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3258 1, /* alignment */
3259 sc->tx_boundary, /* boundary */
3260 BUS_SPACE_MAXADDR, /* low */
3261 BUS_SPACE_MAXADDR, /* high */
3262 NULL, NULL, /* filter */
3263 65536 + 256, /* maxsize */
3264 ss->tx.max_desc - 2, /* num segs */
3265 sc->tx_boundary, /* maxsegsz */
3266 BUS_DMA_ALLOCNOW, /* flags */
3267 NULL, NULL, /* lock */
3268 &ss->tx.dmat); /* tag */
3269
3270 if (err != 0) {
3271 device_printf(sc->dev, "Err %d allocating tx dmat\n",
3272 err);
3273 return err;;
3274 }
3275
3276 /* now use these tags to setup dmamaps for each slot
3277 in the ring */
3278 for (i = 0; i <= ss->tx.mask; i++) {
3279 err = bus_dmamap_create(ss->tx.dmat, 0,
3280 &ss->tx.info[i].map);
3281 if (err != 0) {
3282 device_printf(sc->dev, "Err %d tx dmamap\n",
3283 err);
3284 return err;;
3285 }
3286 }
3287 return 0;
3288
3289}
3290
3291static int
3292mxge_alloc_rings(mxge_softc_t *sc)
3293{
3294 mxge_cmd_t cmd;
3295 int tx_ring_size;
3296 int tx_ring_entries, rx_ring_entries;
3297 int err, slice;
3298
3299 /* get ring sizes */
3300 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3301 tx_ring_size = cmd.data0;
3302 if (err != 0) {
3303 device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3304 goto abort;
3305 }
3306
3307 tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3308 rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3309 IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
3310 sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
3311 IFQ_SET_READY(&sc->ifp->if_snd);
3312
3313 for (slice = 0; slice < sc->num_slices; slice++) {
3314 err = mxge_alloc_slice_rings(&sc->ss[slice],
3315 rx_ring_entries,
3316 tx_ring_entries);
3317 if (err != 0)
3318 goto abort;
3319 }
3320 return 0;
3321
3322abort:
3323 mxge_free_rings(sc);
3324 return err;
3325
3326}
3327
3328
/*
 * From the interface MTU, choose the receive cluster size, the big
 * buffer size advertised to the firmware, and how many buffers each
 * received frame consumes.  nbufs exceeds 1 only in the
 * MXGE_VIRT_JUMBOS configuration, where a jumbo frame is split over
 * virtually contiguous 4KB pieces of one cluster.
 */
static void
mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
{
	int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;

	if (bufsize < MCLBYTES) {
		/* easy, everything fits in a single buffer */
		*big_buf_size = MCLBYTES;
		*cl_size = MCLBYTES;
		*nbufs = 1;
		return;
	}

	if (bufsize < MJUMPAGESIZE) {
		/* still easy, everything still fits in a single buffer */
		*big_buf_size = MJUMPAGESIZE;
		*cl_size = MJUMPAGESIZE;
		*nbufs = 1;
		return;
	}
#if MXGE_VIRT_JUMBOS
	/* now we need to use virtually contiguous buffers */
	*cl_size = MJUM9BYTES;
	*big_buf_size = 4096;
	*nbufs = mtu / 4096 + 1;
	/* needs to be a power of two, so round up */
	if (*nbufs == 3)
		*nbufs = 4;
#else
	*cl_size = MJUM9BYTES;
	*big_buf_size = MJUM9BYTES;
	*nbufs = 1;
#endif
}
3363
/*
 * Per-slice half of mxge_open(): build the LRO entry pool, fetch
 * this slice's send and receive ring locations in NIC SRAM from the
 * firmware, and stock both receive rings with buffers.  Returns 0,
 * EIO (firmware command failure) or ENOMEM (buffer shortage).
 */
static int
mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
{
	mxge_softc_t *sc;
	mxge_cmd_t cmd;
	bus_dmamap_t map;
	struct lro_entry *lro_entry;
	int err, i, slice;


	sc = ss->sc;
	slice = ss - sc->ss;

	SLIST_INIT(&ss->lro_free);
	SLIST_INIT(&ss->lro_active);

	/* pre-allocate LRO bookkeeping entries; if memory runs short,
	   just shrink the configured count instead of failing */
	for (i = 0; i < sc->lro_cnt; i++) {
		lro_entry = (struct lro_entry *)
			malloc(sizeof (*lro_entry), M_DEVBUF,
			       M_NOWAIT | M_ZERO);
		if (lro_entry == NULL) {
			sc->lro_cnt = i;
			break;
		}
		SLIST_INSERT_HEAD(&ss->lro_free, lro_entry, next);
	}
	/* get the lanai pointers to the send and receive rings */

	err = 0;
#ifndef IFNET_BUF_RING
	/* We currently only send from the first slice */
	if (slice == 0) {
#endif
		cmd.data0 = slice;
		err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
		ss->tx.lanai =
			(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
		ss->tx.send_go = (volatile uint32_t *)
			(sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
		ss->tx.send_stop = (volatile uint32_t *)
			(sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
#ifndef IFNET_BUF_RING
	}
#endif
	cmd.data0 = slice;
	err |= mxge_send_cmd(sc,
			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
	ss->rx_small.lanai =
		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
	cmd.data0 = slice;
	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
	ss->rx_big.lanai =
		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);

	if (err != 0) {
		device_printf(sc->dev,
			      "failed to get ring sizes or locations\n");
		return EIO;
	}

	/* stock receive rings */
	for (i = 0; i <= ss->rx_small.mask; i++) {
		map = ss->rx_small.info[i].map;
		err = mxge_get_buf_small(ss, map, i);
		if (err) {
			device_printf(sc->dev, "alloced %d/%d smalls\n",
				      i, ss->rx_small.mask + 1);
			return ENOMEM;
		}
	}
	/* pre-fill the big shadow ring with the 0xffffffff invalid
	   marker (the same sentinel mxge_submit_8rx() uses) */
	for (i = 0; i <= ss->rx_big.mask; i++) {
		ss->rx_big.shadow[i].addr_low = 0xffffffff;
		ss->rx_big.shadow[i].addr_high = 0xffffffff;
	}
	ss->rx_big.nbufs = nbufs;
	ss->rx_big.cl_size = cl_size;
	ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
		ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
	/* step by nbufs: each frame occupies nbufs consecutive slots */
	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
		map = ss->rx_big.info[i].map;
		err = mxge_get_buf_big(ss, map, i);
		if (err) {
			device_printf(sc->dev, "alloced %d/%d bigs\n",
				      i, ss->rx_big.mask + 1);
			return ENOMEM;
		}
	}
	return 0;
}
3453
/*
 * Bring the interface fully up: reset the NIC, program the RSS
 * indirection table (multi-slice only), tell the firmware the MTU and
 * receive-buffer geometry, point it at the per-slice stats blocks,
 * open every slice, and finally start the firmware.
 *
 * Returns 0 on success or an errno; on failure past the param setup,
 * all mbufs allocated along the way are freed again.
 */
static int
mxge_open(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int err, big_bytes, nbufs, slice, cl_size, i;
	bus_addr_t bus;
	volatile uint8_t *itable;
	struct mxge_slice_state *ss;

	/* Copy the MAC address in case it was overridden */
	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);

	err = mxge_reset(sc, 1);
	if (err != 0) {
		device_printf(sc->dev, "failed to reset\n");
		return EIO;
	}

	if (sc->num_slices > 1) {
		/* setup the indirection table */
		cmd.data0 = sc->num_slices;
		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
				    &cmd);

		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
				     &cmd);
		if (err != 0) {
			device_printf(sc->dev,
				      "failed to setup rss tables\n");
			return err;
		}

		/* just enable an identity mapping */
		itable = sc->sram + cmd.data0;
		for (i = 0; i < sc->num_slices; i++)
			itable[i] = (uint8_t)i;

		/* enable RSS hashing with the configured hash type */
		cmd.data0 = 1;
		cmd.data1 = mxge_rss_hash_type;
		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
		if (err != 0) {
			device_printf(sc->dev, "failed to enable slices\n");
			return err;
		}
	}


	mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);

	cmd.data0 = nbufs;
	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
			    &cmd);
	/* error is only meaningful if we're trying to set
	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
	if (err && nbufs > 1) {
		device_printf(sc->dev,
			      "Failed to set alway-use-n to %d\n",
			      nbufs);
		return EIO;
	}
	/* Give the firmware the mtu and the big and small buffer
	   sizes.  The firmware wants the big buf size to be a power
	   of two. Luckily, FreeBSD's clusters are powers of two */
	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
	cmd.data0 = MHLEN - MXGEFW_PAD;
	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
			     &cmd);
	cmd.data0 = big_bytes;
	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);

	if (err != 0) {
		device_printf(sc->dev, "failed to setup params\n");
		goto abort;
	}

	/* Now give him the pointer to the stats block */
	for (slice = 0;
#ifdef IFNET_BUF_RING
	     slice < sc->num_slices;
#else
	     slice < 1;
#endif
	     slice++) {
		ss = &sc->ss[slice];
		cmd.data0 =
			MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
		cmd.data1 =
			MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
		cmd.data2 = sizeof(struct mcp_irq_data);
		cmd.data2 |= (slice << 16);
		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
	}

	if (err != 0) {
		/* Older firmware: fall back to the obsolete stats DMA
		   command, which only transfers send_done_count. */
		bus = sc->ss->fw_stats_dma.bus_addr;
		bus += offsetof(struct mcp_irq_data, send_done_count);
		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
		err = mxge_send_cmd(sc,
				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
				    &cmd);
		/* Firmware cannot support multicast without STATS_DMA_V2 */
		sc->fw_multicast_support = 0;
	} else {
		sc->fw_multicast_support = 1;
	}

	if (err != 0) {
		device_printf(sc->dev, "failed to setup params\n");
		goto abort;
	}

	for (slice = 0; slice < sc->num_slices; slice++) {
		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
		if (err != 0) {
			device_printf(sc->dev, "couldn't open slice %d\n",
				      slice);
			goto abort;
		}
	}

	/* Finally, start the firmware running */
	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
	if (err) {
		device_printf(sc->dev, "Couldn't bring up link\n");
		goto abort;
	}
#ifdef IFNET_BUF_RING
	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		ss->if_drv_flags |= IFF_DRV_RUNNING;
		ss->if_drv_flags &= ~IFF_DRV_OACTIVE;
	}
#endif
	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
	/* start the periodic stats/watchdog timer */
	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);

	return 0;


abort:
	mxge_free_mbufs(sc);

	return err;
}
3601
/*
 * Take the interface down: stop the tick callout, clear the running
 * flag(s), ask the firmware to bring the link down, and wait (by
 * watching sc->down_cnt, which the down interrupt advances) for the
 * firmware to acknowledge before freeing all ring mbufs.
 *
 * Always returns 0; a missing down interrupt is only logged.
 */
static int
mxge_close(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int err, old_down_cnt;
#ifdef IFNET_BUF_RING
	struct mxge_slice_state *ss;
	int slice;
#endif

	callout_stop(&sc->co_hdl);
#ifdef IFNET_BUF_RING
	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		ss->if_drv_flags &= ~IFF_DRV_RUNNING;
	}
#endif
	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
	/* snapshot down_cnt before issuing the command so we can tell
	   whether the down interrupt arrived */
	old_down_cnt = sc->down_cnt;
	wmb();
	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
	if (err) {
		device_printf(sc->dev, "Couldn't bring down link\n");
	}
	if (old_down_cnt == sc->down_cnt) {
		/* wait for down irq */
		DELAY(10 * sc->intr_coal_delay);
	}
	wmb();
	if (old_down_cnt == sc->down_cnt) {
		device_printf(sc->dev, "never got down irq\n");
	}

	mxge_free_mbufs(sc);

	return 0;
}
3639
/*
 * (Re)apply the PCI configuration this driver depends on.  Called at
 * attach time and again after a watchdog reset restores config space.
 */
static void
mxge_setup_cfg_space(mxge_softc_t *sc)
{
	device_t dev = sc->dev;
	int reg;
	uint16_t cmd, lnk, pectl;

	/* find the PCIe link width and set max read request to 4KB*/
	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
		/* PCIe Link Status register (cap + 0x12): negotiated
		   link width lives in bits 9:4 */
		lnk = pci_read_config(dev, reg + 0x12, 2);
		sc->link_width = (lnk >> 4) & 0x3f;

		/* PCIe Device Control register (cap + 0x8): set the
		   Max Read Request Size field (bits 14:12) to 5,
		   i.e. 4096 bytes */
		pectl = pci_read_config(dev, reg + 0x8, 2);
		pectl = (pectl & ~0x7000) | (5 << 12);
		pci_write_config(dev, reg + 0x8, pectl, 2);
	}

	/* Enable DMA and Memory space access */
	pci_enable_busmaster(dev);
	cmd = pci_read_config(dev, PCIR_COMMAND, 2);
	cmd |= PCIM_CMD_MEMEN;
	pci_write_config(dev, PCIR_COMMAND, cmd, 2);
}
3663
/*
 * Read the NIC's reboot-status register through the vendor-specific
 * PCI capability window (usable even when the chip has crashed).
 * Returns the status value, or (uint32_t)-1 if the vendor-specific
 * capability cannot be found.
 */
static uint32_t
mxge_read_reboot(mxge_softc_t *sc)
{
	device_t dev = sc->dev;
	uint32_t vs;

	/* find the vendor specific offset */
	if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
		device_printf(sc->dev,
			      "could not find vendor specific offset\n");
		return (uint32_t)-1;
	}
	/* enable read32 mode */
	pci_write_config(dev, vs + 0x10, 0x3, 1);
	/* tell NIC which register to read */
	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
	/* the data window at vs + 0x14 now holds the reboot status */
	return (pci_read_config(dev, vs + 0x14, 4));
}
3682
/*
 * Attempt to recover from a transmit hang.  If the NIC rebooted (PCI
 * config space wiped, so the busmaster enable bit reads as zero),
 * restore config space and reopen the interface; otherwise just log
 * the tx ring state for the stuck slice.  Returns 0 if the interface
 * was reopened successfully, otherwise an errno (ENXIO when no
 * recovery was attempted or the NIC vanished from the bus).
 */
static int
mxge_watchdog_reset(mxge_softc_t *sc, int slice)
{
	struct pci_devinfo *dinfo;
	mxge_tx_ring_t *tx;
	int err;
	uint32_t reboot;
	uint16_t cmd;

	err = ENXIO;

	device_printf(sc->dev, "Watchdog reset!\n");

	/*
	 * check to see if the NIC rebooted.  If it did, then all of
	 * PCI config space has been reset, and things like the
	 * busmaster bit will be zero.  If this is the case, then we
	 * must restore PCI config space before the NIC can be used
	 * again
	 */
	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
	if (cmd == 0xffff) {
		/*
		 * maybe the watchdog caught the NIC rebooting; wait
		 * up to 100ms for it to finish.  If it does not come
		 * back, then give up
		 */
		DELAY(1000*100);
		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
		if (cmd == 0xffff) {
			device_printf(sc->dev, "NIC disappeared!\n");
			return (err);
		}
	}
	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
		/* print the reboot status */
		reboot = mxge_read_reboot(sc);
		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
			      reboot);
		/* restore PCI configuration space */
		dinfo = device_get_ivars(sc->dev);
		pci_cfg_restore(sc->dev, dinfo);

		/* and redo any changes we made to our config space */
		mxge_setup_cfg_space(sc);

		/* reopen only if the interface was already running */
		if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING) {
			mxge_close(sc);
			err = mxge_open(sc);
		}
	} else {
		tx = &sc->ss[slice].tx;
		device_printf(sc->dev,
			      "NIC did not reboot, slice %d ring state:\n",
			      slice);
		device_printf(sc->dev,
			      "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
			      tx->req, tx->done, tx->queue_active);
		device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
			      tx->activate, tx->deactivate);
		device_printf(sc->dev, "pkt_done=%d fw=%d\n",
			      tx->pkt_done,
			      be32toh(sc->ss->fw_stats->send_done_count));
		device_printf(sc->dev, "not resetting\n");
	}
	return (err);
}
3750
/*
 * Periodic transmit-hang detector (run from mxge_tick).  A slice is
 * considered hung when it has pending transmits whose done count has
 * not advanced since the previous watchdog pass.  If the hang is not
 * explained by incoming pause frames, the NIC is reset.  Returns 0 or
 * the error from mxge_watchdog_reset().
 */
static int
mxge_watchdog(mxge_softc_t *sc)
{
	mxge_tx_ring_t *tx;
	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
	int i, err = 0;

	/* see if we have outstanding transmits, which
	   have been pending for more than mxge_ticks */
	for (i = 0;
#ifdef IFNET_BUF_RING
	     (i < sc->num_slices) && (err == 0);
#else
	     (i < 1) && (err == 0);
#endif
	     i++) {
		tx = &sc->ss[i].tx;
		if (tx->req != tx->done &&
		    tx->watchdog_req != tx->watchdog_done &&
		    tx->done == tx->watchdog_done) {
			/* check for pause blocking before resetting */
			if (tx->watchdog_rx_pause == rx_pause)
				err = mxge_watchdog_reset(sc, i);
			else
				device_printf(sc->dev, "Flow control blocking "
					      "xmits, check link partner\n");
		}

		/* remember this pass's state for the next comparison */
		tx->watchdog_req = tx->req;
		tx->watchdog_done = tx->done;
		tx->watchdog_rx_pause = rx_pause;
	}

	if (sc->need_media_probe)
		mxge_media_probe(sc);
	return (err);
}
3788
3789static void
3790mxge_update_stats(mxge_softc_t *sc)
3791{
3792 struct mxge_slice_state *ss;
3793 u_long ipackets = 0;
3794 u_long opackets = 0;
3795#ifdef IFNET_BUF_RING
3796 u_long obytes = 0;
3797 u_long omcasts = 0;
3798 u_long odrops = 0;
3799#endif
3800 u_long oerrors = 0;
3801 int slice;
3802
3803 for (slice = 0; slice < sc->num_slices; slice++) {
3804 ss = &sc->ss[slice];
3805 ipackets += ss->ipackets;
3806 opackets += ss->opackets;
3807#ifdef IFNET_BUF_RING
3808 obytes += ss->obytes;
3809 omcasts += ss->omcasts;
3810 odrops += ss->tx.br->br_drops;
3811#endif
3812 oerrors += ss->oerrors;
3813 }
3814 sc->ifp->if_ipackets = ipackets;
3815 sc->ifp->if_opackets = opackets;
3816#ifdef IFNET_BUF_RING
3817 sc->ifp->if_obytes = obytes;
3818 sc->ifp->if_omcasts = omcasts;
3819 sc->ifp->if_snd.ifq_drops = odrops;
3820#endif
3821 sc->ifp->if_oerrors = oerrors;
3822}
3823
3824static void
3825mxge_tick(void *arg)
3826{
3827 mxge_softc_t *sc = arg;
3828 int err = 0;
3829
3830 /* aggregate stats from different slices */
3831 mxge_update_stats(sc);
3832 if (!sc->watchdog_countdown) {
3833 err = mxge_watchdog(sc);
3834 sc->watchdog_countdown = 4;
3835 }
3836 sc->watchdog_countdown--;
3837 if (err == 0)
3838 callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3839
3840}
3841
/*
 * Media selection is fixed on this NIC; reject any change request.
 */
static int
mxge_media_change(struct ifnet *ifp)
{
	return (EINVAL);
}
3847
/*
 * Change the interface MTU.  Validates the on-the-wire frame size
 * against the firmware limit, then (if the interface is running)
 * restarts it with the new MTU; on failure the old MTU is restored
 * and the interface restarted again.  Returns 0 or an errno.
 */
static int
mxge_change_mtu(mxge_softc_t *sc, int mtu)
{
	struct ifnet *ifp = sc->ifp;
	int real_mtu, old_mtu;
	int err = 0;


	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
		return EINVAL;
	mtx_lock(&sc->driver_mtx);
	old_mtu = ifp->if_mtu;
	ifp->if_mtu = mtu;
	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
		mxge_close(sc);
		err = mxge_open(sc);
		if (err != 0) {
			/* roll back to the old MTU; best-effort reopen */
			ifp->if_mtu = old_mtu;
			mxge_close(sc);
			(void) mxge_open(sc);
		}
	}
	mtx_unlock(&sc->driver_mtx);
	return err;
}
3874
3875static void
3876mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3877{
3878 mxge_softc_t *sc = ifp->if_softc;
3879
3880
3881 if (sc == NULL)
3882 return;
3883 ifmr->ifm_status = IFM_AVALID;
3884 ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
3885 ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
3886 ifmr->ifm_active |= sc->link_state ? IFM_FDX : 0;
3887}
3888
/*
 * Interface ioctl handler.  Serializes configuration changes with the
 * driver mutex and dispatches on the command: address and MTU
 * changes, up/down and promisc/allmulti flag handling, multicast list
 * updates, capability toggles (tx/rx checksum, TSO, LRO, VLAN
 * hardware tagging) and media queries.
 */
static int
mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
{
	mxge_softc_t *sc = ifp->if_softc;
	struct ifreq *ifr = (struct ifreq *)data;
	int err, mask;

	err = 0;
	switch (command) {
	case SIOCSIFADDR:
	case SIOCGIFADDR:
		err = ether_ioctl(ifp, command, data);
		break;

	case SIOCSIFMTU:
		err = mxge_change_mtu(sc, ifr->ifr_mtu);
		break;

	case SIOCSIFFLAGS:
		mtx_lock(&sc->driver_mtx);
		/* refuse to touch a device that is being detached */
		if (sc->dying) {
			mtx_unlock(&sc->driver_mtx);
			return EINVAL;
		}
		if (ifp->if_flags & IFF_UP) {
			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
				err = mxge_open(sc);
			} else {
				/* take care of promisc and allmulti
				   flag changes */
				mxge_change_promisc(sc,
						    ifp->if_flags & IFF_PROMISC);
				mxge_set_multicast_list(sc);
			}
		} else {
			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
				mxge_close(sc);
			}
		}
		mtx_unlock(&sc->driver_mtx);
		break;

	case SIOCADDMULTI:
	case SIOCDELMULTI:
		mtx_lock(&sc->driver_mtx);
		mxge_set_multicast_list(sc);
		mtx_unlock(&sc->driver_mtx);
		break;

	case SIOCSIFCAP:
		mtx_lock(&sc->driver_mtx);
		/* mask holds the capability bits the caller toggled */
		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
		if (mask & IFCAP_TXCSUM) {
			if (IFCAP_TXCSUM & ifp->if_capenable) {
				/* TSO depends on tx csum: disable both */
				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
						      | CSUM_TSO);
			} else {
				ifp->if_capenable |= IFCAP_TXCSUM;
				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
			}
		} else if (mask & IFCAP_RXCSUM) {
			if (IFCAP_RXCSUM & ifp->if_capenable) {
				ifp->if_capenable &= ~IFCAP_RXCSUM;
				sc->csum_flag = 0;
			} else {
				ifp->if_capenable |= IFCAP_RXCSUM;
				sc->csum_flag = 1;
			}
		}
		if (mask & IFCAP_TSO4) {
			if (IFCAP_TSO4 & ifp->if_capenable) {
				ifp->if_capenable &= ~IFCAP_TSO4;
				ifp->if_hwassist &= ~CSUM_TSO;
			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
				ifp->if_capenable |= IFCAP_TSO4;
				ifp->if_hwassist |= CSUM_TSO;
			} else {
				printf("mxge requires tx checksum offload"
				       " be enabled to use TSO\n");
				err = EINVAL;
			}
		}
		if (mask & IFCAP_LRO) {
			if (IFCAP_LRO & ifp->if_capenable)
				err = mxge_change_lro_locked(sc, 0);
			else
				err = mxge_change_lro_locked(sc, mxge_lro_cnt);
		}
		if (mask & IFCAP_VLAN_HWTAGGING)
			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
		mtx_unlock(&sc->driver_mtx);
		VLAN_CAPABILITIES(ifp);

		break;

	case SIOCGIFMEDIA:
		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
				    &sc->media, command);
		break;

	default:
		err = ENOTTY;
	}
	return err;
}
3995
/*
 * Pull the hw.mxge.* loader tunables into the driver's globals (and
 * sc->lro_cnt), then clamp each value to a sane range.
 */
static void
mxge_fetch_tunables(mxge_softc_t *sc)
{

	TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
			  &mxge_flow_control);
	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
			  &mxge_intr_coal_delay);
	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
			  &mxge_nvidia_ecrc_enable);
	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
			  &mxge_force_firmware);
	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
			  &mxge_deassert_wait);
	TUNABLE_INT_FETCH("hw.mxge.verbose",
			  &mxge_verbose);
	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
	TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
	TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
	TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
	TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
	/* a non-zero per-device lro_cnt overrides the global default */
	if (sc->lro_cnt != 0)
		mxge_lro_cnt = sc->lro_cnt;

	if (bootverbose)
		mxge_verbose = 1;
	/* interrupt coalescing delay must lie in [0, 10ms] */
	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
		mxge_intr_coal_delay = 30;
	if (mxge_ticks == 0)
		mxge_ticks = hz / 2;
	sc->pause = mxge_flow_control;
	if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
	    || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
		mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
	}
	if (mxge_initial_mtu > ETHERMTU_JUMBO ||
	    mxge_initial_mtu < ETHER_MIN_LEN)
		mxge_initial_mtu = ETHERMTU_JUMBO;
}
4036
4037
4038static void
4039mxge_free_slices(mxge_softc_t *sc)
4040{
4041 struct mxge_slice_state *ss;
4042 int i;
4043
4044
4045 if (sc->ss == NULL)
4046 return;
4047
4048 for (i = 0; i < sc->num_slices; i++) {
4049 ss = &sc->ss[i];
4050 if (ss->fw_stats != NULL) {
4051 mxge_dma_free(&ss->fw_stats_dma);
4052 ss->fw_stats = NULL;
4053#ifdef IFNET_BUF_RING
4054 if (ss->tx.br != NULL) {
4055 drbr_free(ss->tx.br, M_DEVBUF);
4056 ss->tx.br = NULL;
4057 }
4058#endif
4059 mtx_destroy(&ss->tx.mtx);
4060 }
4061 if (ss->rx_done.entry != NULL) {
4062 mxge_dma_free(&ss->rx_done.dma);
4063 ss->rx_done.entry = NULL;
4064 }
4065 }
4066 free(sc->ss, M_DEVBUF);
4067 sc->ss = NULL;
4068}
4069
/*
 * Allocate the per-slice state array and, for each slice, its rx
 * completion queue, firmware stats block, tx mutex and (when
 * IFNET_BUF_RING is set) tx buf_ring.  On any failure everything
 * allocated so far is undone via mxge_free_slices().  Returns 0 or
 * an errno.
 */
static int
mxge_alloc_slices(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	struct mxge_slice_state *ss;
	size_t bytes;
	int err, i, max_intr_slots;

	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Cannot determine rx ring size\n");
		return err;
	}
	sc->rx_ring_size = cmd.data0;
	/* size the interrupt queue at two events per rx ring slot */
	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));

	bytes = sizeof (*sc->ss) * sc->num_slices;
	sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
	if (sc->ss == NULL)
		return (ENOMEM);
	for (i = 0; i < sc->num_slices; i++) {
		ss = &sc->ss[i];

		ss->sc = sc;

		/* allocate per-slice rx interrupt queues */

		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
		if (err != 0)
			goto abort;
		ss->rx_done.entry = ss->rx_done.dma.addr;
		bzero(ss->rx_done.entry, bytes);

		/*
		 * allocate the per-slice firmware stats; stats
		 * (including tx) are used only on the first
		 * slice for now
		 */
#ifndef IFNET_BUF_RING
		if (i > 0)
			continue;
#endif

		bytes = sizeof (*ss->fw_stats);
		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
				     sizeof (*ss->fw_stats), 64);
		if (err != 0)
			goto abort;
		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
		snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name),
			 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
		mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF);
#ifdef IFNET_BUF_RING
		ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
					   &ss->tx.mtx);
#endif
	}

	return (0);

abort:
	mxge_free_slices(sc);
	return (ENOMEM);
}
4135
4136static void
4137mxge_slice_probe(mxge_softc_t *sc)
4138{
4139 mxge_cmd_t cmd;
4140 char *old_fw;
4141 int msix_cnt, status, max_intr_slots;
4142
4143 sc->num_slices = 1;
4144 /*
4145 * don't enable multiple slices if they are not enabled,
4146 * or if this is not an SMP system
4147 */
4148
4149 if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2)
4150 return;
4151
4152 /* see how many MSI-X interrupts are available */
4153 msix_cnt = pci_msix_count(sc->dev);
4154 if (msix_cnt < 2)
4155 return;
4156
4157 /* now load the slice aware firmware see what it supports */
4158 old_fw = sc->fw_name;
4159 if (old_fw == mxge_fw_aligned)
4160 sc->fw_name = mxge_fw_rss_aligned;
4161 else
4162 sc->fw_name = mxge_fw_rss_unaligned;
4163 status = mxge_load_firmware(sc, 0);
4164 if (status != 0) {
4165 device_printf(sc->dev, "Falling back to a single slice\n");
4166 return;
4167 }
4168
4169 /* try to send a reset command to the card to see if it
4170 is alive */
4171 memset(&cmd, 0, sizeof (cmd));
4172 status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4173 if (status != 0) {
4174 device_printf(sc->dev, "failed reset\n");
4175 goto abort_with_fw;
4176 }