1 /******************************************************************************
3 Copyright (c) 2006-2009, Myricom Inc.
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
9 1. Redistributions of source code must retain the above copyright notice,
10 this list of conditions and the following disclaimer.
12 2. Neither the name of the Myricom Inc, nor the names of its
13 contributors may be used to endorse or promote products derived from
14 this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
28 ***************************************************************************/
30 #include <sys/cdefs.h>
31 /*__FBSDID("$FreeBSD: src/sys/dev/mxge/if_mxge.c,v 1.63 2009/06/26 11:45:06 rwatson Exp $");*/
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/linker.h>
36 #include <sys/firmware.h>
37 #include <sys/endian.h>
38 #include <sys/sockio.h>
40 #include <sys/malloc.h>
41 #include <sys/kernel.h>
43 #include <sys/module.h>
44 #include <sys/socket.h>
45 #include <sys/sysctl.h>
47 /* count xmits ourselves, rather than via drbr */
50 #include <net/if_arp.h>
51 #include <net/ethernet.h>
52 #include <net/if_dl.h>
53 #include <net/if_media.h>
57 #include <net/if_types.h>
58 #include <net/vlan/if_vlan_var.h>
61 #include <netinet/in_systm.h>
62 #include <netinet/in.h>
63 #include <netinet/ip.h>
64 #include <netinet/tcp.h>
66 #include <machine/resource.h>
70 #include <bus/pci/pcireg.h>
71 #include <bus/pci/pcivar.h>
72 #include <bus/pci/pci_private.h> /* XXX for pci_cfg_restore */
74 #include <vm/vm.h> /* for pmap_mapdev() */
77 #if defined(__i386) || defined(__amd64)
78 #include <machine/specialreg.h>
81 #include <dev/netif/mxge/mxge_mcp.h>
82 #include <dev/netif/mxge/mcp_gen_header.h>
83 /*#define MXGE_FAKE_IFP*/
84 #include <dev/netif/mxge/if_mxge_var.h>
86 #include <sys/buf_ring.h>
/*
 * Driver tunables (compiled-in defaults).
 * NOTE(review): this extract is missing lines; the usual TUNABLE/sysctl
 * plumbing that exposes these knobs is not visible here.
 */
92 static int mxge_nvidia_ecrc_enable = 1; /* try to enable ECRC on Nvidia bridges */
93 static int mxge_force_firmware = 0; /* 1 forces aligned firmware, else unaligned */
94 static int mxge_intr_coal_delay = 30; /* interrupt coalescing delay, usecs */
95 static int mxge_deassert_wait = 1; /* wait for IRQ line to go low in ihandler */
96 static int mxge_flow_control = 1; /* pause-frame flow control default */
97 static int mxge_verbose = 0;
98 static int mxge_lro_cnt = 8; /* default LRO segment count */
99 static int mxge_ticks;
100 static int mxge_max_slices = 1; /* rx/tx slices (queues) */
101 static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
102 static int mxge_always_promisc = 0;
103 static int mxge_initial_mtu = ETHERMTU_JUMBO;
/* firmware image names: "aligned" variants assume aligned PCIe completions */
104 static char *mxge_fw_unaligned = "mxge_ethp_z8e";
105 static char *mxge_fw_aligned = "mxge_eth_z8e";
106 static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
107 static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";

/* forward declarations of the newbus entry points */
109 static int mxge_probe(device_t dev);
110 static int mxge_attach(device_t dev);
111 static int mxge_detach(device_t dev);
112 static int mxge_shutdown(device_t dev);
113 static void mxge_intr(void *arg);
/*
 * newbus driver glue.
 * NOTE(review): the array/struct braces and terminators are missing
 * from this extract.
 */
115 static device_method_t mxge_methods[] =
117 /* Device interface */
118 DEVMETHOD(device_probe, mxge_probe),
119 DEVMETHOD(device_attach, mxge_attach),
120 DEVMETHOD(device_detach, mxge_detach),
121 DEVMETHOD(device_shutdown, mxge_shutdown),
125 static driver_t mxge_driver =
129 sizeof(mxge_softc_t),
132 static devclass_t mxge_devclass;
134 /* Declare ourselves to be a child of the PCI bus.*/
135 DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
/* firmware(9) provides the MCP image; zlib decompresses it */
136 MODULE_DEPEND(mxge, firmware, 1, 1, 1);
137 MODULE_DEPEND(mxge, zlib, 1, 1, 1);

/* forward declarations for routines referenced before their definitions */
139 static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
140 static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
141 static int mxge_close(mxge_softc_t *sc);
142 static int mxge_open(mxge_softc_t *sc);
143 static void mxge_tick(void *arg);
146 mxge_probe(device_t dev)
151 if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
152 ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
153 (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
154 rev = pci_get_revid(dev);
156 case MXGE_PCI_REV_Z8E:
157 device_set_desc(dev, "Myri10G-PCIE-8A");
159 case MXGE_PCI_REV_Z8ES:
160 device_set_desc(dev, "Myri10G-PCIE-8B");
163 device_set_desc(dev, "Myri10G-PCIE-8??");
164 device_printf(dev, "Unrecognized rev %d NIC\n",
/*
 * Enable write-combining on the mapped NIC SRAM PIO window where the
 * platform supports it (x86 PAT); on failure or unsupported platforms
 * sc->wc is left/forced to 0.
 * NOTE(review): the function header, braces and error-path lines are
 * missing from this extract.
 */
174 mxge_enable_wc(mxge_softc_t *sc)
177 #if defined(__i386) || defined(__amd64)
182 len = rman_get_size(sc->mem_res);
/* remap the SRAM window as write-combining via the PAT */
183 err = pmap_change_attr((vm_offset_t) sc->sram,
184 len, PAT_WRITE_COMBINING);
186 device_printf(sc->dev, "pmap_change_attr failed, %d\n",
192 sc->wc = 0; /* TBD: PAT support */
197 /* callback to get our DMA address */
199 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
203 *(bus_addr_t *) arg = segs->ds_addr;
/*
 * Allocate a DMA-able region of `bytes` with the given alignment:
 * create a tag, allocate+zero the memory, and load it to obtain the
 * bus address (recorded in dma->bus_addr via mxge_dmamap_callback).
 * Resources are unwound on failure.
 * NOTE(review): the lines computing `boundary`/`maxsegsize` and some
 * error labels/returns are missing from this extract.
 */
208 mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
209 bus_size_t alignment)
212 device_t dev = sc->dev;
213 bus_size_t boundary, maxsegsize;
215 if (bytes > 4096 && alignment == 4096) {
223 /* allocate DMAable memory tags */
224 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
225 alignment, /* alignment */
226 boundary, /* boundary */
227 BUS_SPACE_MAXADDR, /* low */
228 BUS_SPACE_MAXADDR, /* high */
229 NULL, NULL, /* filter */
232 maxsegsize, /* maxsegsize */
233 BUS_DMA_COHERENT, /* flags */
234 &dma->dmat); /* tag */
236 device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
240 /* allocate DMAable memory & map */
241 err = bus_dmamem_alloc(dma->dmat, &dma->addr,
242 (BUS_DMA_WAITOK | BUS_DMA_COHERENT
243 | BUS_DMA_ZERO), &dma->map);
245 device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
246 goto abort_with_dmat;
249 /* load the memory */
250 err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
251 mxge_dmamap_callback,
252 (void *)&dma->bus_addr, 0);
254 device_printf(dev, "couldn't load map (err = %d)\n", err);
/* unwind: free the memory, then destroy the tag */
260 bus_dmamem_free(dma->dmat, dma->addr, dma->map);
262 (void)bus_dma_tag_destroy(dma->dmat);
268 mxge_dma_free(mxge_dma_t *dma)
270 bus_dmamap_unload(dma->dmat, dma->map);
271 bus_dmamem_free(dma->dmat, dma->addr, dma->map);
272 (void)bus_dma_tag_destroy(dma->dmat);
276 * The eeprom strings on the lanaiX have the format
/*
 * Parse the NUL-separated EEPROM "key=value" strings into the softc:
 * MAC= (six hex byte pairs), PC= (product code) and SN= (serial
 * number).  MXGE_NEXT_STRING advances past the current string.
 * NOTE(review): loop braces, the pointer increments between MAC hex
 * pairs, and the final success/failure check are missing from this
 * extract.
 */
283 mxge_parse_strings(mxge_softc_t *sc)
285 #define MXGE_NEXT_STRING(p) while(ptr < limit && *ptr++)
290 ptr = sc->eeprom_strings;
291 limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
293 while (ptr < limit && *ptr != '\0') {
294 if (memcmp(ptr, "MAC=", 4) == 0) {
296 sc->mac_addr_string = ptr;
/* each MAC byte is two hex digits; guard against running off the end */
297 for (i = 0; i < 6; i++) {
299 if ((ptr + 2) > limit)
301 sc->mac_addr[i] = strtoul(ptr, NULL, 16);
304 } else if (memcmp(ptr, "PC=", 3) == 0) {
306 strncpy(sc->product_code_string, ptr,
307 sizeof (sc->product_code_string) - 1);
308 } else if (memcmp(ptr, "SN=", 3) == 0) {
310 strncpy(sc->serial_number_string, ptr,
311 sizeof (sc->serial_number_string) - 1);
313 MXGE_NEXT_STRING(ptr);
320 device_printf(sc->dev, "failed to parse eeprom_strings\n");
/*
 * Attempt to enable ECRC generation on an upstream Nvidia (CK804/MCP55)
 * PCIe bridge so that PCIe completions arrive 8-byte aligned.  Because
 * the extended config register (0x178) is beyond 0xff, the chipset's
 * memory-mapped config space is located and mapped directly with
 * pmap_mapdev() rather than using normal config accessors.
 * NOTE(review): many lines (returns, brace closures, the `off`
 * computation head, the read-modify-write of the 0x178 register) are
 * missing from this extract.
 */
325 #if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
327 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
330 unsigned long base, off;
332 device_t pdev, mcp55;
333 uint16_t vendor_id, device_id, word;
334 uintptr_t bus, slot, func, ivend, idev;
338 if (!mxge_nvidia_ecrc_enable)
341 pdev = device_get_parent(device_get_parent(sc->dev));
343 device_printf(sc->dev, "could not find parent?\n");
346 vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
347 device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
/* only Nvidia bridges (vendor 0x10de) need this workaround */
349 if (vendor_id != 0x10de)
354 if (device_id == 0x005d) {
355 /* ck804, base address is magic */
357 } else if (device_id >= 0x0374 && device_id <= 0x378) {
358 /* mcp55, base address stored in chipset */
359 mcp55 = pci_find_bsf(0, 0, 0);
361 0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
362 0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
363 word = pci_read_config(mcp55, 0x90, 2);
364 base = ((unsigned long)word & 0x7ffeU) << 25;
371 Test below is commented because it is believed that doing
372 config read/write beyond 0xff will access the config space
373 for the next larger function. Uncomment this and remove
374 the hacky pmap_mapdev() way of accessing config space when
375 FreeBSD grows support for extended pcie config space access
378 /* See if we can, by some miracle, access the extended
380 val = pci_read_config(pdev, 0x178, 4);
381 if (val != 0xffffffff) {
383 pci_write_config(pdev, 0x178, val, 4);
387 /* Rather than using normal pci config space writes, we must
388 * map the Nvidia config space ourselves. This is because on
389 * opteron/nvidia class machine the 0xe000000 mapping is
390 * handled by the nvidia chipset, that means the internal PCI
391 * device (the on-chip northbridge), or the amd-8131 bridge
392 * and things behind them are not visible by this method.
395 BUS_READ_IVAR(device_get_parent(pdev), pdev,
397 BUS_READ_IVAR(device_get_parent(pdev), pdev,
398 PCI_IVAR_SLOT, &slot);
399 BUS_READ_IVAR(device_get_parent(pdev), pdev,
400 PCI_IVAR_FUNCTION, &func);
401 BUS_READ_IVAR(device_get_parent(pdev), pdev,
402 PCI_IVAR_VENDOR, &ivend);
403 BUS_READ_IVAR(device_get_parent(pdev), pdev,
404 PCI_IVAR_DEVICE, &idev);
/* compute the config-space offset for this bus/slot/function */
407 + 0x00100000UL * (unsigned long)bus
408 + 0x00001000UL * (unsigned long)(func
411 /* map it into the kernel */
412 va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
416 device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
419 /* get a pointer to the config space mapped into the kernel */
420 cfgptr = va + (off & PAGE_MASK);
422 /* make sure that we can really access it */
423 vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
424 device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
425 if (! (vendor_id == ivend && device_id == idev)) {
426 device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
427 vendor_id, device_id);
428 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
432 ptr32 = (uint32_t*)(cfgptr + 0x178);
435 if (val == 0xffffffff) {
436 device_printf(sc->dev, "extended mapping failed\n");
437 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
441 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
443 device_printf(sc->dev,
444 "Enabled ECRC on upstream Nvidia bridge "
446 (int)bus, (int)slot, (int)func);
451 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
453 device_printf(sc->dev,
454 "Nforce 4 chipset on non-x86/amd64!?!?!\n");
/*
 * Ask the firmware to benchmark DMA read, write, and read+write against
 * the host dmabench buffer, storing the resulting MB/s figures in
 * sc->read_dma / sc->write_dma / sc->read_write_dma.  With
 * MXGEFW_CMD_UNALIGNED_TEST the firmware instead aborts on the first
 * unaligned completion (used by mxge_firmware_probe).
 * NOTE(review): variable declarations, error gotos and the final return
 * are missing from this extract.
 */
461 mxge_dma_test(mxge_softc_t *sc, int test_type)
464 bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
470 /* Run a small DMA test.
471 * The magic multipliers to the length tell the firmware
472 * to do DMA read, write, or read+write tests. The
473 * results are returned in cmd.data0. The upper 16
474 * bits of the return is the number of transfers completed.
475 * The lower 16 bits is the time in 0.5us ticks that the
476 * transfers took to complete.
479 len = sc->tx_boundary;
481 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
482 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
483 cmd.data2 = len * 0x10000;
484 status = mxge_send_cmd(sc, test_type, &cmd);
/* transfers * bytes * 2 clocks / 0.5us-ticks => MB/s */
489 sc->read_dma = ((cmd.data0>>16) * len * 2) /
490 (cmd.data0 & 0xffff);
491 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
492 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
493 cmd.data2 = len * 0x1;
494 status = mxge_send_cmd(sc, test_type, &cmd);
499 sc->write_dma = ((cmd.data0>>16) * len * 2) /
500 (cmd.data0 & 0xffff);
502 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
503 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
504 cmd.data2 = len * 0x10001;
505 status = mxge_send_cmd(sc, test_type, &cmd);
510 sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
511 (cmd.data0 & 0xffff);
/* unaligned-test failures are expected; only report real benchmarks */
514 if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
515 device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
522 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
523 * when the PCI-E Completion packets are aligned on an 8-byte
524 * boundary. Some PCI-E chip sets always align Completion packets; on
525 * the ones that do not, the alignment can be enforced by enabling
526 * ECRC generation (if supported).
528 * When PCI-E Completion packets are not aligned, it is actually more
529 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
531 * If the driver can neither enable ECRC nor verify that it has
532 * already been enabled, then it must use a firmware image which works
533 * around unaligned completion packets (ethp_z8e.dat), and it should
534 * also ensure that it never gives the device a Read-DMA which is
535 * larger than 2KB by setting the tx_boundary to 2KB. If ECRC is
536 * enabled, then the driver should use the aligned (eth_z8e.dat)
537 * firmware image, and set tx_boundary to 4KB.
541 mxge_firmware_probe(mxge_softc_t *sc)
543 device_t dev = sc->dev;
547 sc->tx_boundary = 4096;
549 * Verify the max read request size was set to 4KB
550 * before trying the test with 4KB.
552 if (pci_find_extcap(dev, PCIY_EXPRESS, ®) == 0) {
553 pectl = pci_read_config(dev, reg + 0x8, 2);
554 if ((pectl & (5 << 12)) != (5 << 12)) {
555 device_printf(dev, "Max Read Req. size != 4k (0x%x\n",
557 sc->tx_boundary = 2048;
562 * load the optimized firmware (which assumes aligned PCIe
563 * completions) in order to see if it works on this host.
565 sc->fw_name = mxge_fw_aligned;
566 status = mxge_load_firmware(sc, 1);
572 * Enable ECRC if possible
574 mxge_enable_nvidia_ecrc(sc);
577 * Run a DMA test which watches for unaligned completions and
578 * aborts on the first one seen.
581 status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
583 return 0; /* keep the aligned firmware */
586 device_printf(dev, "DMA test failed: %d\n", status);
587 if (status == ENOSYS)
588 device_printf(dev, "Falling back to ethp! "
589 "Please install up to date fw\n");
/*
 * Pick the firmware image (aligned vs. unaligned) and tx_boundary:
 * honour the mxge_force_firmware override, take the aligned path for
 * narrow (<= x4) links, otherwise probe with mxge_firmware_probe(),
 * then load the chosen image.
 * NOTE(review): the `aligned` variable handling, branch bodies and
 * labels between the visible lines are missing from this extract.
 */
594 mxge_select_firmware(mxge_softc_t *sc)
599 if (mxge_force_firmware != 0) {
600 if (mxge_force_firmware == 1)
605 device_printf(sc->dev,
606 "Assuming %s completions (forced)\n",
607 aligned ? "aligned" : "unaligned");
611 /* if the PCIe link width is 4 or less, we can use the aligned
612 firmware and skip any checks */
613 if (sc->link_width != 0 && sc->link_width <= 4) {
614 device_printf(sc->dev,
615 "PCIe x%d Link, expect reduced performance\n",
621 if (0 == mxge_firmware_probe(sc))
626 sc->fw_name = mxge_fw_aligned;
627 sc->tx_boundary = 4096;
629 sc->fw_name = mxge_fw_unaligned;
630 sc->tx_boundary = 2048;
632 return (mxge_load_firmware(sc, 0));
/*
 * Sanity-check a firmware header: verify the MCP type, stash the
 * version string for sysctl, parse it into major/minor/tiny, and
 * reject versions whose major/minor do not match what the driver was
 * built against.
 * NOTE(review): returns and closing braces are missing from this
 * extract.
 */
642 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
646 if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
647 device_printf(sc->dev, "Bad firmware type: 0x%x\n",
648 be32toh(hdr->mcp_type));
652 /* save firmware version for sysctl */
653 strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
655 device_printf(sc->dev, "firmware id: %s\n", hdr->version);
657 ksscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
658 &sc->fw_ver_minor, &sc->fw_ver_tiny);
660 if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
661 && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
662 device_printf(sc->dev, "Found firmware version %s\n",
664 device_printf(sc->dev, "Driver needs %d.%d\n",
665 MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
673 z_alloc(void *nil, u_int items, u_int size)
677 ptr = kmalloc(items * size, M_TEMP, M_NOWAIT);
682 z_free(void *nil, void *ptr)
/*
 * Locate the firmware(9) image named sc->fw_name, inflate it with zlib
 * (the uncompressed size is smuggled in fw->version), validate its MCP
 * header, and copy it into NIC SRAM at MXGE_FW_OFFSET in 256-byte PIO
 * bursts.  *limit receives the firmware size on success.
 * NOTE(review): declarations, zs.zalloc/zfree setup, several error
 * labels and the trailing cleanup/returns are missing from this
 * extract; `fw->fw_image` below presumably aliases the inflate buffer —
 * TODO confirm against the full source.
 */
689 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
692 const mcp_gen_header_t *hdr;
699 fw = firmware_image_load(sc->fw_name, NULL);
701 device_printf(sc->dev, "Could not find firmware image %s\n",
706 /* setup zlib and decompress f/w */
707 bzero(&zs, sizeof (zs));
710 status = inflateInit(&zs);
711 if (status != Z_OK) {
716 /* the uncompressed size is stored as the firmware version,
717 which would otherwise go unused */
718 fw_len = (size_t) fw->version;
719 inflate_buffer = kmalloc(fw_len, M_TEMP, M_NOWAIT);
720 if (inflate_buffer == NULL)
722 zs.avail_in = fw->datasize;
723 zs.next_in = __DECONST(char *, fw->data);
724 zs.avail_out = fw_len;
725 zs.next_out = inflate_buffer;
726 status = inflate(&zs, Z_FINISH);
727 if (status != Z_STREAM_END) {
728 device_printf(sc->dev, "zlib %d\n", status);
730 goto abort_with_buffer;
734 hdr_offset = htobe32(*(const uint32_t *)
735 (fw->fw_image + MCP_HEADER_PTR_OFFSET));
/* header must be 4-byte aligned and lie entirely within the image */
736 if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
737 device_printf(sc->dev, "Bad firmware file");
741 hdr = (const void*)(fw->fw_image + hdr_offset);
743 status = mxge_validate_firmware(sc, hdr);
747 /* Copy the inflated firmware to NIC SRAM. */
748 for (i = 0; i < fw_len; i += 256) {
749 mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
751 min(256U, (unsigned)(fw_len - i)));
761 kfree(inflate_buffer, M_TEMP);
766 firmware_image_unload(fw);
771 * Enable or disable periodic RDMAs from the host to make certain
772 * chipsets resend dropped PCIe messages
/*
 * Enable or disable the firmware's periodic dummy RDMA reads (a
 * workaround that makes some chipsets resend dropped PCIe messages).
 * Builds an 8-byte-aligned command block and PIO-copies it to the
 * MXGEFW_BOOT_DUMMY_RDMA mailbox, then polls *confirm for the
 * firmware's 0xffffffff acknowledgement.
 * NOTE(review): buf_bytes declaration, *confirm clearing, the DELAY in
 * the poll loop and closing braces are missing from this extract.
 */
776 mxge_dummy_rdma(mxge_softc_t *sc, int enable)
779 volatile uint32_t *confirm;
780 volatile char *submit;
781 uint32_t *buf, dma_low, dma_high;
/* round buf_bytes up so buf is 8-byte aligned */
784 buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
786 /* clear confirmation addr */
787 confirm = (volatile uint32_t *)sc->cmd;
791 /* send an rdma command to the PCIe engine, and wait for the
792 response in the confirmation address. The firmware should
793 write a -1 there to indicate it is alive and well
796 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
797 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
798 buf[0] = htobe32(dma_high); /* confirm addr MSW */
799 buf[1] = htobe32(dma_low); /* confirm addr LSW */
800 buf[2] = htobe32(0xffffffff); /* confirm data */
801 dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
802 dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
803 buf[3] = htobe32(dma_high); /* dummy addr MSW */
804 buf[4] = htobe32(dma_low); /* dummy addr LSW */
805 buf[5] = htobe32(enable); /* enable? */
808 submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
810 mxge_pio_copy(submit, buf, 64);
/* poll (bounded) for the firmware's acknowledgement */
815 while (*confirm != 0xffffffff && i < 20) {
819 if (*confirm != 0xffffffff) {
820 device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
821 (enable ? "enable" : "disable"), confirm,
/*
 * Synchronously issue a command to the running firmware via the
 * MXGEFW_ETH_CMD mailbox and wait up to ~20ms for the DMA'ed response,
 * mapping the (big-endian) firmware status to an errno-style result.
 * Serialized by sc->cmd_lock.
 * NOTE(review): the `buf` declaration, the per-iteration delay, the
 * case bodies / error-code assignments and the success-path unlock are
 * missing from this extract.
 */
828 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
831 char buf_bytes[sizeof(*buf) + 8];
832 volatile mcp_cmd_response_t *response = sc->cmd;
833 volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
834 uint32_t dma_low, dma_high;
835 int err, sleep_total = 0;
837 /* ensure buf is aligned to 8 bytes */
838 buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
840 buf->data0 = htobe32(data->data0);
841 buf->data1 = htobe32(data->data1);
842 buf->data2 = htobe32(data->data2);
843 buf->cmd = htobe32(cmd);
844 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
845 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
847 buf->response_addr.low = htobe32(dma_low);
848 buf->response_addr.high = htobe32(dma_high);
/* one mailbox command at a time */
849 lockmgr(&sc->cmd_lock, LK_EXCLUSIVE);
850 response->result = 0xffffffff;
852 mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
854 /* wait up to 20ms */
856 for (sleep_total = 0; sleep_total < 20; sleep_total++) {
857 bus_dmamap_sync(sc->cmd_dma.dmat,
858 sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
860 switch (be32toh(response->result)) {
862 data->data0 = be32toh(response->data);
868 case MXGEFW_CMD_UNKNOWN:
871 case MXGEFW_CMD_ERROR_UNALIGNED:
874 case MXGEFW_CMD_ERROR_BUSY:
878 device_printf(sc->dev,
880 "failed, result = %d\n",
881 cmd, be32toh(response->result));
889 device_printf(sc->dev, "mxge: command %d timed out"
891 cmd, be32toh(response->result));
892 lockmgr(&sc->cmd_lock, LK_RELEASE);
/*
 * Validate the firmware already running on the NIC (e.g. left by the
 * eeprom boot loader) so it can be adopted without a reload: locate its
 * header in SRAM, copy it to host memory, validate it, and flag the
 * known 1.4.4–1.4.11 rx-filter bug so the NIC is kept in ALLMULTI.
 * NOTE(review): declarations, returns and closing braces are missing
 * from this extract.
 */
897 mxge_adopt_running_firmware(mxge_softc_t *sc)
899 struct mcp_gen_header *hdr;
900 const size_t bytes = sizeof (struct mcp_gen_header);
904 /* find running firmware header */
905 hdr_offset = htobe32(*(volatile uint32_t *)
906 (sc->sram + MCP_HEADER_PTR_OFFSET));
908 if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
909 device_printf(sc->dev,
910 "Running firmware has bad header offset (%d)\n",
915 /* copy header of running firmware from SRAM to host memory to
916 * validate firmware */
917 hdr = kmalloc(bytes, M_DEVBUF, M_NOWAIT);
919 device_printf(sc->dev, "could not kmalloc firmware hdr\n");
922 bus_space_read_region_1(rman_get_bustag(sc->mem_res),
923 rman_get_bushandle(sc->mem_res),
924 hdr_offset, (char *)hdr, bytes);
925 status = mxge_validate_firmware(sc, hdr);
926 kfree(hdr, M_DEVBUF);
929 * check to see if adopted firmware has bug where adopting
930 * it will cause broadcasts to be filtered unless the NIC
931 * is kept in ALLMULTI mode
933 if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
934 sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
935 sc->adopted_rx_filter_bug = 1;
936 device_printf(sc->dev, "Adopting fw %d.%d.%d: "
937 "working around rx filter bug\n",
938 sc->fw_ver_major, sc->fw_ver_minor,
/*
 * Load firmware into the NIC.  If the image cannot be found/inflated
 * and `adopt` is set, fall back to adopting the firmware already
 * running on the NIC (forcing the unaligned/2KB settings).  Otherwise
 * hand the freshly copied image off to the bootstrap MCP via the
 * MXGEFW_BOOT_HANDOFF mailbox and poll for its 0xffffffff ack.
 * NOTE(review): buf_bytes declaration, several returns/braces and the
 * poll-loop delay are missing from this extract.
 */
947 mxge_load_firmware(mxge_softc_t *sc, int adopt)
949 volatile uint32_t *confirm;
950 volatile char *submit;
952 uint32_t *buf, size, dma_low, dma_high;
/* 8-byte align the handoff command block */
955 buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
957 size = sc->sram_size;
958 status = mxge_load_firmware_helper(sc, &size);
962 /* Try to use the currently running firmware, if
964 status = mxge_adopt_running_firmware(sc);
966 device_printf(sc->dev,
967 "failed to adopt running firmware\n");
970 device_printf(sc->dev,
971 "Successfully adopted running firmware\n");
972 if (sc->tx_boundary == 4096) {
973 device_printf(sc->dev,
974 "Using firmware currently running on NIC"
976 device_printf(sc->dev,
977 "performance consider loading optimized "
/* adopted firmware: be conservative, use unaligned settings */
980 sc->fw_name = mxge_fw_unaligned;
981 sc->tx_boundary = 2048;
984 /* clear confirmation addr */
985 confirm = (volatile uint32_t *)sc->cmd;
988 /* send a reload command to the bootstrap MCP, and wait for the
989 response in the confirmation address. The firmware should
990 write a -1 there to indicate it is alive and well
993 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
994 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
996 buf[0] = htobe32(dma_high); /* confirm addr MSW */
997 buf[1] = htobe32(dma_low); /* confirm addr LSW */
998 buf[2] = htobe32(0xffffffff); /* confirm data */
1000 /* FIX: All newest firmware should un-protect the bottom of
1001 the sram before handoff. However, the very first interfaces
1002 do not. Therefore the handoff copy must skip the first 8 bytes
1004 /* where the code starts*/
1005 buf[3] = htobe32(MXGE_FW_OFFSET + 8);
1006 buf[4] = htobe32(size - 8); /* length of code */
1007 buf[5] = htobe32(8); /* where to copy to */
1008 buf[6] = htobe32(0); /* where to jump to */
1010 submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
1011 mxge_pio_copy(submit, buf, 64);
/* poll (bounded) for the bootstrap MCP's acknowledgement */
1016 while (*confirm != 0xffffffff && i < 20) {
1019 bus_dmamap_sync(sc->cmd_dma.dmat,
1020 sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
1022 if (*confirm != 0xffffffff) {
1023 device_printf(sc->dev,"handoff failed (%p = 0x%x)",
1032 mxge_update_mac_address(mxge_softc_t *sc)
1035 uint8_t *addr = sc->mac_addr;
1039 cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1040 | (addr[2] << 8) | addr[3]);
1042 cmd.data1 = ((addr[4] << 8) | (addr[5]));
1044 status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
1049 mxge_change_pause(mxge_softc_t *sc, int pause)
1055 status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1058 status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1062 device_printf(sc->dev, "Failed to set flow control mode\n");
1070 mxge_change_promisc(mxge_softc_t *sc, int promisc)
1075 if (mxge_always_promisc)
1079 status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1082 status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1086 device_printf(sc->dev, "Failed to set promisc mode\n");
/*
 * Push the interface's multicast filter to the firmware: temporarily
 * enable ALLMULTI, flush the firmware's filter list, join each AF_LINK
 * group address, then re-enable filtering.  Stays in ALLMULTI when the
 * interface requests it, when the adopted-firmware rx-filter bug is
 * present, or when any firmware command fails.
 * NOTE(review): several returns/continues and closing braces are
 * missing from this extract.
 */
1091 mxge_set_multicast_list(mxge_softc_t *sc)
1094 struct ifmultiaddr *ifma;
1095 struct ifnet *ifp = sc->ifp;
1098 /* This firmware is known to not support multicast */
1099 if (!sc->fw_multicast_support)
1102 /* Disable multicast filtering while we play with the lists*/
1103 err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1105 device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
1106 " error status: %d\n", err);
/* buggy adopted firmware must stay in ALLMULTI (see adopt path) */
1110 if (sc->adopted_rx_filter_bug)
1113 if (ifp->if_flags & IFF_ALLMULTI)
1114 /* request to disable multicast filtering, so quit here */
1117 /* Flush all the filters */
1119 err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1121 device_printf(sc->dev,
1122 "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
1123 ", error status: %d\n", err);
1127 /* Walk the multicast list, and add each address */
1129 if_maddr_rlock(ifp);
1130 TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1131 if (ifma->ifma_addr->sa_family != AF_LINK)
/* pack the 6-byte lladdr into cmd.data0/data1, network order */
1133 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1135 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
1137 cmd.data0 = htonl(cmd.data0);
1138 cmd.data1 = htonl(cmd.data1);
1139 err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1141 device_printf(sc->dev, "Failed "
1142 "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
1144 /* abort, leaving multicast filtering off */
1145 if_maddr_runlock(ifp);
1149 if_maddr_runlock(ifp);
1150 /* Enable multicast filtering */
1151 err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1153 device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
1154 ", error status: %d\n", err);
1159 mxge_max_mtu(mxge_softc_t *sc)
1164 if (MJUMPAGESIZE - MXGEFW_PAD > MXGEFW_MAX_MTU)
1165 return MXGEFW_MAX_MTU - MXGEFW_PAD;
1167 /* try to set nbufs to see if it we can
1168 use virtually contiguous jumbos */
1170 status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1173 return MXGEFW_MAX_MTU - MXGEFW_PAD;
1175 /* otherwise, we're limited to MJUMPAGESIZE */
1176 return MJUMPAGESIZE - MXGEFW_PAD;
/*
 * Reset the NIC and re-establish all firmware/driver shared state:
 * issue MXGEFW_CMD_RESET, re-arm the dummy-RDMA workaround, size the
 * interrupt queues, (re)enable RSS slices, exchange interrupt queue DMA
 * addresses and SRAM offsets, run the DMA benchmark, zero per-slice
 * counters, and reapply MAC/promisc/pause/multicast settings.
 * NOTE(review): declarations, error returns and closing braces between
 * the visible lines are missing from this extract.
 */
1180 mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1182 struct mxge_slice_state *ss;
1183 mxge_rx_done_t *rx_done;
1184 volatile uint32_t *irq_claim;
1188 /* try to send a reset command to the card to see if it
1190 memset(&cmd, 0, sizeof (cmd));
1191 status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1193 device_printf(sc->dev, "failed reset\n");
/* restart the chipset dummy-RDMA workaround after reset */
1197 mxge_dummy_rdma(sc, 1);
1200 /* set the intrq size */
1201 cmd.data0 = sc->rx_ring_size;
1202 status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1205 * Even though we already know how many slices are supported
1206 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
1207 * has magic side effects, and must be called after a reset.
1208 * It must be called prior to calling any RSS related cmds,
1209 * including assigning an interrupt queue for anything but
1210 * slice 0. It must also be called *after*
1211 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1212 * the firmware to compute offsets.
1215 if (sc->num_slices > 1) {
1216 /* ask the maximum number of slices it supports */
1217 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
1220 device_printf(sc->dev,
1221 "failed to get number of slices\n");
1225 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1226 * to setting up the interrupt queue DMA
1228 cmd.data0 = sc->num_slices;
1229 cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
1230 #ifdef IFNET_BUF_RING
1231 cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1233 status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
1236 device_printf(sc->dev,
1237 "failed to set number of slices\n");
1243 if (interrupts_setup) {
1244 /* Now exchange information about interrupts */
1245 for (slice = 0; slice < sc->num_slices; slice++) {
1246 rx_done = &sc->ss[slice].rx_done;
1247 memset(rx_done->entry, 0, sc->rx_ring_size);
1248 cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
1249 cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
1251 status |= mxge_send_cmd(sc,
1252 MXGEFW_CMD_SET_INTRQ_DMA,
/* fetch the SRAM offsets used to ack/deassert interrupts */
1257 status |= mxge_send_cmd(sc,
1258 MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
1261 sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1263 status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1264 irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1267 status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
1269 sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1271 device_printf(sc->dev, "failed set interrupt parameters\n");
1276 *sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1279 /* run a DMA benchmark */
1280 (void) mxge_dma_test(sc, MXGEFW_DMA_TEST);
1282 for (slice = 0; slice < sc->num_slices; slice++) {
1283 ss = &sc->ss[slice];
1285 ss->irq_claim = irq_claim + (2 * slice);
1286 /* reset mcp/driver shared state back to 0 */
1287 ss->rx_done.idx = 0;
1288 ss->rx_done.cnt = 0;
1291 ss->tx.pkt_done = 0;
1292 ss->tx.queue_active = 0;
1293 ss->tx.activate = 0;
1294 ss->tx.deactivate = 0;
1299 ss->rx_small.cnt = 0;
1300 ss->lro_bad_csum = 0;
1302 ss->lro_flushed = 0;
1303 if (ss->fw_stats != NULL) {
1304 ss->fw_stats->valid = 0;
1305 ss->fw_stats->send_done_count = 0;
1308 sc->rdma_tags_available = 15;
/* reapply the host-visible configuration lost by the reset */
1309 status = mxge_update_mac_address(sc);
1310 mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
1311 mxge_change_pause(sc, sc->pause);
1312 mxge_set_multicast_list(sc);
/*
 * Sysctl handler for the interrupt coalescing delay: validate the new
 * value (1..1000000 usecs), write it to the firmware's SRAM pointer and
 * cache it in the softc, under sc->driver_lock.
 * NOTE(review): the softc retrieval from arg1, error checks and returns
 * are missing from this extract.
 */
1317 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1320 unsigned int intr_coal_delay;
1324 intr_coal_delay = sc->intr_coal_delay;
1325 err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1329 if (intr_coal_delay == sc->intr_coal_delay)
1332 if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1335 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
1336 *sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1337 sc->intr_coal_delay = intr_coal_delay;
1339 lockmgr(&sc->driver_lock, LK_RELEASE);
/*
 * Sysctl handler for flow control: on a changed value, forward it to
 * mxge_change_pause() under sc->driver_lock.
 * NOTE(review): the softc retrieval from arg1 and the returns are
 * missing from this extract.
 */
1344 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1347 unsigned int enabled;
1351 enabled = sc->pause;
1352 err = sysctl_handle_int(oidp, &enabled, arg2, req);
1356 if (enabled == sc->pause)
1359 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
1360 err = mxge_change_pause(sc, enabled);
1361 lockmgr(&sc->driver_lock, LK_RELEASE);
/*
 * Apply a new LRO segment count (0 disables LRO) and, if the interface
 * is running, bounce it so the change takes effect.  Caller holds
 * sc->driver_lock.
 * NOTE(review): the ifp declaration, the close call and the returns are
 * missing from this extract.
 */
1366 mxge_change_lro_locked(mxge_softc_t *sc, int lro_cnt)
1373 ifp->if_capenable &= ~IFCAP_LRO;
1375 ifp->if_capenable |= IFCAP_LRO;
1376 sc->lro_cnt = lro_cnt;
1377 if (ifp->if_flags & IFF_RUNNING) {
1379 err = mxge_open(sc);
/*
 * Sysctl wrapper around mxge_change_lro_locked(): read the new count,
 * skip no-ops, and apply it under sc->driver_lock.
 * NOTE(review): the softc retrieval from arg1, a range check and the
 * returns are missing from this extract.
 */
1385 mxge_change_lro(SYSCTL_HANDLER_ARGS)
1388 unsigned int lro_cnt;
1392 lro_cnt = sc->lro_cnt;
1393 err = sysctl_handle_int(oidp, &lro_cnt, arg2, req);
1397 if (lro_cnt == sc->lro_cnt)
1403 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
1404 err = mxge_change_lro_locked(sc, lro_cnt);
1405 lockmgr(&sc->driver_lock, LK_RELEASE);
1410 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1416 arg2 = be32toh(*(int *)arg1);
1418 err = sysctl_handle_int(oidp, arg1, arg2, req);
/*
 * Tear down the per-slice sysctl subtrees and then the slice parent
 * node; a no-op if the slice tree was never created.
 * NOTE(review): the loop's continue/break and closing braces are
 * missing from this extract.
 */
1424 mxge_rem_sysctls(mxge_softc_t *sc)
1426 struct mxge_slice_state *ss;
1429 if (sc->slice_sysctl_tree == NULL)
1432 for (slice = 0; slice < sc->num_slices; slice++) {
1433 ss = &sc->ss[slice];
1434 if (ss == NULL || ss->sysctl_tree == NULL)
1436 sysctl_ctx_free(&ss->sysctl_ctx);
1437 ss->sysctl_tree = NULL;
1439 sysctl_ctx_free(&sc->slice_sysctl_ctx);
1440 sc->slice_sysctl_tree = NULL;
/*
 * Create the hw.mxgeN sysctl tree: static device info (firmware version,
 * serial number, PCIe link width, DMA benchmark results), tunables
 * (interrupt coalescing, flow control, LRO), the big-endian firmware drop
 * counters (exported via mxge_handle_be32), and a per-slice "slice.N"
 * subtree of ring/LRO/tx debug counters.
 *
 * FIX(review): the "flow_control_enabled" sysctl description was a
 * copy-paste of the intr_coal_delay one ("interrupt coalescing delay in
 * usecs"); corrected to describe flow control.
 */
1444 mxge_add_sysctls(mxge_softc_t *sc)
1446 struct sysctl_ctx_list *ctx;
1447 struct sysctl_oid_list *children;
1449 struct mxge_slice_state *ss;
1453 ctx = &sc->sysctl_ctx;
1454 sysctl_ctx_init(ctx);
1455 sc->sysctl_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_STATIC_CHILDREN(_hw),
1457 device_get_nameunit(sc->dev),
1459 if (sc->sysctl_tree == NULL) {
1460 device_printf(sc->dev, "can't add sysctl node\n");
1464 children = SYSCTL_CHILDREN(sc->sysctl_tree);
1465 fw = sc->ss[0].fw_stats;
1467 /* random information */
1468 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1470 CTLFLAG_RD, &sc->fw_version,
1471 0, "firmware version");
1472 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1474 CTLFLAG_RD, &sc->serial_number_string,
1475 0, "serial number");
1476 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1478 CTLFLAG_RD, &sc->product_code_string,
1480 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1482 CTLFLAG_RD, &sc->link_width,
1484 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1486 CTLFLAG_RD, &sc->tx_boundary,
1488 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1490 CTLFLAG_RD, &sc->wc,
1491 0, "write combining PIO?");
1492 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1494 CTLFLAG_RD, &sc->read_dma,
1495 0, "DMA Read speed in MB/s");
1496 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1498 CTLFLAG_RD, &sc->write_dma,
1499 0, "DMA Write speed in MB/s");
1500 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1501 "read_write_dma_MBs",
1502 CTLFLAG_RD, &sc->read_write_dma,
1503 0, "DMA concurrent Read/Write speed in MB/s");
1506 /* performance related tunables */
1507 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1509 CTLTYPE_INT|CTLFLAG_RW, sc,
1510 0, mxge_change_intr_coal,
1511 "I", "interrupt coalescing delay in usecs");
1513 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1514 "flow_control_enabled",
1515 CTLTYPE_INT|CTLFLAG_RW, sc,
1516 0, mxge_change_flow_control,
1517 "I", "enable/disable flow control");
1519 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1521 CTLFLAG_RW, &mxge_deassert_wait,
1522 0, "Wait for IRQ line to go low in ihandler");
1524 /* stats block from firmware is in network byte order.
1526 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1528 CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1529 0, mxge_handle_be32,
1531 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1532 "rdma_tags_available",
1533 CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1534 0, mxge_handle_be32,
1535 "I", "rdma_tags_available");
1536 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1537 "dropped_bad_crc32",
1538 CTLTYPE_INT|CTLFLAG_RD,
1539 &fw->dropped_bad_crc32,
1540 0, mxge_handle_be32,
1541 "I", "dropped_bad_crc32");
1542 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1544 CTLTYPE_INT|CTLFLAG_RD,
1545 &fw->dropped_bad_phy,
1546 0, mxge_handle_be32,
1547 "I", "dropped_bad_phy");
1548 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1549 "dropped_link_error_or_filtered",
1550 CTLTYPE_INT|CTLFLAG_RD,
1551 &fw->dropped_link_error_or_filtered,
1552 0, mxge_handle_be32,
1553 "I", "dropped_link_error_or_filtered");
1554 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1555 "dropped_link_overflow",
1556 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1557 0, mxge_handle_be32,
1558 "I", "dropped_link_overflow");
1559 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1560 "dropped_multicast_filtered",
1561 CTLTYPE_INT|CTLFLAG_RD,
1562 &fw->dropped_multicast_filtered,
1563 0, mxge_handle_be32,
1564 "I", "dropped_multicast_filtered");
1565 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1566 "dropped_no_big_buffer",
1567 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1568 0, mxge_handle_be32,
1569 "I", "dropped_no_big_buffer");
1570 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1571 "dropped_no_small_buffer",
1572 CTLTYPE_INT|CTLFLAG_RD,
1573 &fw->dropped_no_small_buffer,
1574 0, mxge_handle_be32,
1575 "I", "dropped_no_small_buffer");
1576 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1578 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1579 0, mxge_handle_be32,
1580 "I", "dropped_overrun");
1581 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1583 CTLTYPE_INT|CTLFLAG_RD,
1585 0, mxge_handle_be32,
1586 "I", "dropped_pause");
1587 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1589 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1590 0, mxge_handle_be32,
1591 "I", "dropped_runt");
1593 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1594 "dropped_unicast_filtered",
1595 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
1596 0, mxge_handle_be32,
1597 "I", "dropped_unicast_filtered");
1599 /* verbose printing? */
1600 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1602 CTLFLAG_RW, &mxge_verbose,
1603 0, "verbose printing");
1606 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1608 CTLTYPE_INT|CTLFLAG_RW, sc,
1610 "I", "number of lro merge queues");
1613 /* add counters exported for debugging from all slices */
1614 sysctl_ctx_init(&sc->slice_sysctl_ctx);
1615 sc->slice_sysctl_tree =
1616 SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1617 "slice", CTLFLAG_RD, 0, "");
1619 for (slice = 0; slice < sc->num_slices; slice++) {
1620 ss = &sc->ss[slice];
1621 sysctl_ctx_init(&ss->sysctl_ctx);
1622 ctx = &ss->sysctl_ctx;
1623 children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1624 ksprintf(slice_num, "%d", slice);
1626 SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1628 children = SYSCTL_CHILDREN(ss->sysctl_tree);
1629 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1631 CTLFLAG_RD, &ss->rx_small.cnt,
1633 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1635 CTLFLAG_RD, &ss->rx_big.cnt,
1637 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1638 "lro_flushed", CTLFLAG_RD, &ss->lro_flushed,
1639 0, "number of lro merge queues flushed");
1641 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1642 "lro_queued", CTLFLAG_RD, &ss->lro_queued,
1643 0, "number of frames appended to lro merge"
1646 #ifndef IFNET_BUF_RING
1647 /* only transmit from slice 0 for now */
1651 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1653 CTLFLAG_RD, &ss->tx.req,
1656 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1658 CTLFLAG_RD, &ss->tx.done,
1660 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1662 CTLFLAG_RD, &ss->tx.pkt_done,
1664 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1666 CTLFLAG_RD, &ss->tx.stall,
1668 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1670 CTLFLAG_RD, &ss->tx.wake,
1672 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1674 CTLFLAG_RD, &ss->tx.defrag,
1676 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1678 CTLFLAG_RD, &ss->tx.queue_active,
1679 0, "tx_queue_active");
1680 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1682 CTLFLAG_RD, &ss->tx.activate,
1684 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1686 CTLFLAG_RD, &ss->tx.deactivate,
1687 0, "tx_deactivate");
1691 /* copy an array of mcp_kreq_ether_send_t's to the mcp. Copy
1692 backwards one at a time and handle ring wraps */
/*
 * Slow path used by mxge_submit_req() when the request list wraps the
 * ring: copy requests one descriptor at a time from the tail toward the
 * head, masking the slot index each step so the wrap is handled.
 * NOTE(review): the decrementing loop over cnt is elided in this extract.
 */
1695 mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1696 mcp_kreq_ether_send_t *src, int cnt)
1698 int idx, starting_slot;
1699 starting_slot = tx->req;
1702 idx = (starting_slot + cnt) & tx->mask;
1703 mxge_pio_copy(&tx->lanai[idx],
1704 &src[cnt], sizeof(*src));
1710 * copy an array of mcp_kreq_ether_send_t's to the mcp. Copy
1711 * at most 32 bytes at a time, so as to avoid involving the software
1712 * pio handler in the nic. We re-write the first segment's flags
1713 * to mark them valid only after writing the entire chain
1717 mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
1722 volatile uint32_t *dst_ints;
1723 mcp_kreq_ether_send_t *srcp;
1724 volatile mcp_kreq_ether_send_t *dstp, *dst;
1727 idx = tx->req & tx->mask;
/* remember the valid flags; they are restored and written last */
1729 last_flags = src->flags;
1732 dst = dstp = &tx->lanai[idx];
/* fast path: the chain fits without wrapping the ring */
1735 if ((idx + cnt) < tx->mask) {
1736 for (i = 0; i < (cnt - 1); i += 2) {
1737 mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1738 wmb(); /* force write every 32 bytes */
1743 /* submit all but the first request, and ensure
1744 that it is submitted below */
1745 mxge_submit_req_backwards(tx, src, cnt);
1749 /* submit the first request */
1750 mxge_pio_copy(dstp, srcp, sizeof(*src));
1751 wmb(); /* barrier before setting valid flag */
1754 /* re-write the last 32-bits with the valid flags */
1755 src->flags = last_flags;
1756 src_ints = (uint32_t *)src;
1758 dst_ints = (volatile uint32_t *)dst;
1760 *dst_ints = *src_ints;
/*
 * Build and submit the NIC send-request chain for a TSO packet that has
 * already been DMA-mapped (busdma_seg_cnt segments in tx->seg_list).
 * Locates the IP/TCP headers (copying them to ss->scratch if they are not
 * contiguous in the first mbuf), then walks the DMA segments emitting one
 * request per piece, inserting TSO_CHOP cuts at mss boundaries and fixing
 * up each request's rdma_count retroactively.  ip_off is the offset of
 * the IP header (Ethernet header, plus VLAN encap if inserted).
 */
1768 mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1769 int busdma_seg_cnt, int ip_off)
1772 mcp_kreq_ether_send_t *req;
1773 bus_dma_segment_t *seg;
1776 uint32_t low, high_swapped;
1777 int len, seglen, cum_len, cum_len_next;
1778 int next_is_first, chop, cnt, rdma_count, small;
1779 uint16_t pseudo_hdr_offset, cksum_offset, mss;
1780 uint8_t flags, flags_next;
1783 mss = m->m_pkthdr.tso_segsz;
1785 /* negative cum_len signifies to the
1786 * send loop that we are still in the
1787 * header portion of the TSO packet.
1790 /* ensure we have the ethernet, IP and TCP
1791 header together in the first mbuf, copy
1792 it to a scratch buffer if not */
1793 if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
1794 m_copydata(m, 0, ip_off + sizeof (*ip),
1796 ip = (struct ip *)(ss->scratch + ip_off);
1798 ip = (struct ip *)(mtod(m, char *) + ip_off);
1800 if (__predict_false(m->m_len < ip_off + (ip->ip_hl << 2)
1802 m_copydata(m, 0, ip_off + (ip->ip_hl << 2)
1803 + sizeof (*tcp), ss->scratch);
1804 ip = (struct ip *)(mtod(m, char *) + ip_off);
1807 tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
1808 cum_len = -(ip_off + ((ip->ip_hl + tcp->th_off) << 2));
1810 /* TSO implies checksum offload on this hardware */
1811 cksum_offset = ip_off + (ip->ip_hl << 2);
1812 flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1815 /* for TSO, pseudo_hdr_offset holds mss.
1816 * The firmware figures out where to put
1817 * the checksum by parsing the header. */
1818 pseudo_hdr_offset = htobe16(mss);
1825 /* "rdma_count" is the number of RDMAs belonging to the
1826 * current packet BEFORE the current send request. For
1827 * non-TSO packets, this is equal to "count".
1828 * For TSO packets, rdma_count needs to be reset
1829 * to 0 after a segment cut.
1831 * The rdma_count field of the send request is
1832 * the number of RDMAs of the packet starting at
1833 * that request. For TSO send requests with one or more cuts
1834 * in the middle, this is the number of RDMAs starting
1835 * after the last cut in the request. All previous
1836 * segments before the last cut implicitly have 1 RDMA.
1838 * Since the number of RDMAs is not known beforehand,
1839 * it must be filled-in retroactively - after each
1840 * segmentation cut or at the end of the entire packet.
1843 while (busdma_seg_cnt) {
1844 /* Break the busdma segment up into pieces*/
1845 low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1846 high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1850 flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1852 cum_len_next = cum_len + seglen;
/* retroactively patch the rdma_count of the request after the last cut */
1853 (req-rdma_count)->rdma_count = rdma_count + 1;
1854 if (__predict_true(cum_len >= 0)) {
/* payload region: chop when this piece crosses an mss boundary */
1856 chop = (cum_len_next > mss);
1857 cum_len_next = cum_len_next % mss;
1858 next_is_first = (cum_len_next == 0);
1859 flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1860 flags_next |= next_is_first *
/* branchless update: reset rdma_count on a cut, bump on a mid-cut chop */
1862 rdma_count |= -(chop | next_is_first);
1863 rdma_count += chop & !next_is_first;
1864 } else if (cum_len_next >= 0) {
/* transition from header to payload within this segment */
1869 small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1870 flags_next = MXGEFW_FLAGS_TSO_PLD |
1871 MXGEFW_FLAGS_FIRST |
1872 (small * MXGEFW_FLAGS_SMALL);
1875 req->addr_high = high_swapped;
1876 req->addr_low = htobe32(low);
1877 req->pseudo_hdr_offset = pseudo_hdr_offset;
1879 req->rdma_count = 1;
1880 req->length = htobe16(seglen);
1881 req->cksum_offset = cksum_offset;
1882 req->flags = flags | ((cum_len & 1) *
1883 MXGEFW_FLAGS_ALIGN_ODD);
1886 cum_len = cum_len_next;
1891 if (__predict_false(cksum_offset > seglen))
1892 cksum_offset -= seglen;
/* too many descriptors for one packet: bail to the error path below */
1895 if (__predict_false(cnt > tx->max_desc))
1901 (req-rdma_count)->rdma_count = rdma_count;
/* mark the trailing requests of the final segment as TSO_LAST */
1905 req->flags |= MXGEFW_FLAGS_TSO_LAST;
1906 } while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
/* flag the last slot so mxge_tx_done knows where the packet ends */
1908 tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1909 mxge_submit_req(tx, tx->req_list, cnt);
1910 #ifdef IFNET_BUF_RING
1911 if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1912 /* tell the NIC to start polling this slice */
1914 tx->queue_active = 1;
1922 bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1926 kprintf("tx->max_desc exceeded via TSO!\n");
1927 kprintf("mss = %d, %ld, %d!\n", mss,
1928 (long)seg - (long)tx->seg_list, tx->max_desc);
1935 #endif /* IFCAP_TSO4 */
1937 #ifdef MXGE_NEW_VLAN_API
1939 * We reproduce the software vlan tag insertion from
1940 * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
1941 * vlan tag insertion. We need to advertise this in order to have the
1942 * vlan interface respect our csum offload flags.
1944 static struct mbuf *
1945 mxge_vlan_tag_insert(struct mbuf *m)
1947 struct ether_vlan_header *evl;
/* grow the mbuf by 4 bytes at the front for the 802.1Q encap header */
1949 M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, MB_DONTWAIT);
1950 if (__predict_false(m == NULL))
1952 if (m->m_len < sizeof(*evl)) {
1953 m = m_pullup(m, sizeof(*evl));
1954 if (__predict_false(m == NULL))
1958 * Transform the Ethernet header into an Ethernet header
1959 * with 802.1Q encapsulation.
1961 evl = mtod(m, struct ether_vlan_header *);
/* slide the dst/src MAC addresses down into the prepended space */
1962 bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
1963 (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
1964 evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
1965 evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
/* tag is now in the frame itself; clear the out-of-band flag */
1966 m->m_flags &= ~M_VLANTAG;
1969 #endif /* MXGE_NEW_VLAN_API */
/*
 * Map a single outgoing mbuf chain for DMA and build its NIC send-request
 * list.  Handles software VLAN tag insertion, EFBIG by defragmenting once,
 * TX checksum offload setup, and padding of runt frames to 60 bytes using
 * the driver's zero page.  TSO packets are handed off to mxge_encap_tso().
 */
1972 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
1975 mcp_kreq_ether_send_t *req;
1976 bus_dma_segment_t *seg;
1981 int cnt, cum_len, err, i, idx, odd_flag, ip_off;
1982 uint16_t pseudo_hdr_offset;
1983 uint8_t flags, cksum_offset;
1990 ip_off = sizeof (struct ether_header);
1991 #ifdef MXGE_NEW_VLAN_API
1992 if (m->m_flags & M_VLANTAG) {
1993 m = mxge_vlan_tag_insert(m);
1994 if (__predict_false(m == NULL))
1996 ip_off += ETHER_VLAN_ENCAP_LEN;
1999 /* (try to) map the frame for DMA */
2000 idx = tx->req & tx->mask;
2001 err = bus_dmamap_load_mbuf_segment(tx->dmat, tx->info[idx].map,
2002 m, tx->seg_list, 1, &cnt,
2004 if (__predict_false(err == EFBIG)) {
2005 /* Too many segments in the chain. Try
2007 m_tmp = m_defrag(m, M_NOWAIT);
2008 if (m_tmp == NULL) {
/* retry the DMA load on the defragmented chain */
2013 err = bus_dmamap_load_mbuf_segment(tx->dmat,
2015 m, tx->seg_list, 1, &cnt,
2018 if (__predict_false(err != 0)) {
2019 device_printf(sc->dev, "bus_dmamap_load_mbuf_segment returned %d"
2020 " packet len = %d\n", err, m->m_pkthdr.len);
2023 bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2024 BUS_DMASYNC_PREWRITE);
2025 tx->info[idx].m = m;
2028 /* TSO is different enough, we handle it in another routine */
2029 if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2030 mxge_encap_tso(ss, m, cnt, ip_off);
2037 pseudo_hdr_offset = 0;
2038 flags = MXGEFW_FLAGS_NO_TSO;
2040 /* checksum offloading? */
2041 if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
2042 /* ensure ip header is in first mbuf, copy
2043 it to a scratch buffer if not */
2044 if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
2045 m_copydata(m, 0, ip_off + sizeof (*ip),
2047 ip = (struct ip *)(ss->scratch + ip_off);
2049 ip = (struct ip *)(mtod(m, char *) + ip_off);
2051 cksum_offset = ip_off + (ip->ip_hl << 2);
2052 pseudo_hdr_offset = cksum_offset + m->m_pkthdr.csum_data;
2053 pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2054 req->cksum_offset = cksum_offset;
2055 flags |= MXGEFW_FLAGS_CKSUM;
2056 odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2060 if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2061 flags |= MXGEFW_FLAGS_SMALL;
2063 /* convert segments into a request list */
2066 req->flags = MXGEFW_FLAGS_FIRST;
2067 for (i = 0; i < cnt; i++) {
2069 htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2071 htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2072 req->length = htobe16(seg->ds_len);
2073 req->cksum_offset = cksum_offset;
2074 if (cksum_offset > seg->ds_len)
2075 cksum_offset -= seg->ds_len;
2078 req->pseudo_hdr_offset = pseudo_hdr_offset;
2079 req->pad = 0; /* complete solid 16-byte block */
2080 req->rdma_count = 1;
2081 req->flags |= flags | ((cum_len & 1) * odd_flag);
2082 cum_len += seg->ds_len;
2088 /* pad runts to 60 bytes */
2092 htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2094 htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2095 req->length = htobe16(60 - cum_len);
2096 req->cksum_offset = 0;
2097 req->pseudo_hdr_offset = pseudo_hdr_offset;
2098 req->pad = 0; /* complete solid 16-byte block */
2099 req->rdma_count = 1;
2100 req->flags |= flags | ((cum_len & 1) * odd_flag);
2104 tx->req_list[0].rdma_count = cnt;
2106 /* print what the firmware will see */
2107 for (i = 0; i < cnt; i++) {
2108 kprintf("%d: addr: 0x%x 0x%x len:%d pso%d,"
2109 "cso:%d, flags:0x%x, rdma:%d\n",
2110 i, (int)ntohl(tx->req_list[i].addr_high),
2111 (int)ntohl(tx->req_list[i].addr_low),
2112 (int)ntohs(tx->req_list[i].length),
2113 (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2114 tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2115 tx->req_list[i].rdma_count);
2117 kprintf("--------------\n");
/* flag the last slot so mxge_tx_done knows where the packet ends */
2119 tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2120 mxge_submit_req(tx, tx->req_list, cnt);
2121 #ifdef IFNET_BUF_RING
2122 if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2123 /* tell the NIC to start polling this slice */
2125 tx->queue_active = 1;
2138 #ifdef IFNET_BUF_RING
/*
 * if_qflush method (multi-queue build): drain and free every mbuf queued
 * in each slice's buf_ring, taking each tx lock in turn.
 */
2140 mxge_qflush(struct ifnet *ifp)
2142 mxge_softc_t *sc = ifp->if_softc;
2147 for (slice = 0; slice < sc->num_slices; slice++) {
2148 tx = &sc->ss[slice].tx;
2149 lockmgr(&tx->lock, LK_EXCLUSIVE);
2150 while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2152 lockmgr(&tx->lock, LK_RELEASE);
/*
 * Buf-ring transmit pump (IFNET_BUF_RING build); caller holds tx->lock.
 * Dequeues from the slice's drbr and encapsulates frames while the ring
 * has more than max_desc free slots; sets IFF_OACTIVE if it stops with
 * frames still queued.
 */
2158 mxge_start_locked(struct mxge_slice_state *ss)
2169 while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2170 m = drbr_dequeue(ifp, tx->br);
2174 /* let BPF see it */
2177 /* give it to the nic */
2180 /* ran out of transmit slots */
2181 if (((ss->if_flags & IFF_OACTIVE) == 0)
2182 && (!drbr_empty(ifp, tx->br))) {
2183 ss->if_flags |= IFF_OACTIVE;
/*
 * Queue or directly transmit one mbuf on a slice; caller holds tx->lock.
 * If the interface isn't RUNNING (or is OACTIVE) the frame is only
 * enqueued; if the drbr is empty and the ring has room, the frame is sent
 * directly, otherwise it is enqueued and the pump is kicked.
 */
2189 mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2200 if ((ss->if_flags & (IFF_RUNNING|IFF_OACTIVE)) !=
2202 err = drbr_enqueue(ifp, tx->br, m);
2206 if (drbr_empty(ifp, tx->br) &&
2207 ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2208 /* let BPF see it */
2210 /* give it to the nic */
2212 } else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2215 if (!drbr_empty(ifp, tx->br))
2216 mxge_start_locked(ss);
/*
 * if_transmit method: pick the tx slice from the mbuf's flowid (num_slices
 * is a power of two, so a mask suffices) and hand the frame to
 * mxge_transmit_locked if the slice lock can be taken without blocking;
 * otherwise just enqueue it on that slice's drbr.
 */
2221 mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2223 mxge_softc_t *sc = ifp->if_softc;
2224 struct mxge_slice_state *ss;
2229 slice = m->m_pkthdr.flowid;
2230 slice &= (sc->num_slices - 1); /* num_slices always power of 2 */
2232 ss = &sc->ss[slice];
2235 if (lockmgr(&tx->lock, LK_EXCLUSIVE|LK_NOWAIT)) {
2236 err = mxge_transmit_locked(ss, m);
2237 lockmgr(&tx->lock, LK_RELEASE);
2239 err = drbr_enqueue(ifp, tx->br, m);
/*
 * Legacy (non-IFNET_BUF_RING) transmit pump; caller holds the tx lock.
 * Dequeues from the interface send queue while the ring has room, and
 * sets IFF_OACTIVE when it runs out of transmit slots.
 * NOTE(review): the #else separating this from the buf-ring variant of
 * the same name is in lines elided from this extract.
 */
2248 mxge_start_locked(struct mxge_slice_state *ss)
2258 while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2259 IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
2263 /* let BPF see it */
2266 /* give it to the nic */
2269 /* ran out of transmit slots */
2270 if ((sc->ifp->if_flags & IFF_OACTIVE) == 0) {
2271 sc->ifp->if_flags |= IFF_OACTIVE;
/*
 * if_start method: take slice 0's tx lock and run the transmit pump.
 */
2277 mxge_start(struct ifnet *ifp)
2279 mxge_softc_t *sc = ifp->if_softc;
2280 struct mxge_slice_state *ss;
2282 /* only use the first slice for now */
2284 lockmgr(&ss->tx.lock, LK_EXCLUSIVE);
2285 mxge_start_locked(ss);
2286 lockmgr(&ss->tx.lock, LK_RELEASE);
2290 * copy an array of mcp_kreq_ether_recv_t's to the mcp. Copy
2291 * at most 32 bytes at a time, so as to avoid involving the software
2292 * pio handler in the nic. We re-write the first segment's low
2293 * DMA address to mark it valid only after we write the entire chunk
2297 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2298 mcp_kreq_ether_recv_t *src)
/* poison the first low address so the NIC ignores the chunk until done */
2302 low = src->addr_low;
2303 src->addr_low = 0xffffffff;
2304 mxge_pio_copy(dst, src, 4 * sizeof (*src));
2306 mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
/* restore and publish the real address last to validate the 8 entries */
2308 src->addr_low = low;
2309 dst->addr_low = low;
/*
 * Allocate and DMA-map a replacement mbuf for slot idx of the small rx
 * ring, recording its bus address in the shadow ring.  Entries are pushed
 * to the NIC eight at a time via mxge_submit_8rx when a group of 8 slots
 * completes.
 */
2314 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2316 bus_dma_segment_t seg;
2318 mxge_rx_ring_t *rx = &ss->rx_small;
2321 m = m_gethdr(MB_DONTWAIT, MT_DATA);
2328 err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2329 &seg, 1, &cnt, BUS_DMA_NOWAIT);
2334 rx->info[idx].m = m;
2335 rx->shadow[idx].addr_low =
2336 htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2337 rx->shadow[idx].addr_high =
2338 htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2342 mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
/*
 * Allocate and DMA-map a replacement cluster for slot idx of the big rx
 * ring.  Uses a standard cluster when cl_size == MCLBYTES, otherwise a
 * jumbo cluster; with MXGE_VIRT_JUMBOS a jumbo buffer may span several
 * DMA segments, each taking a shadow-ring slot.  Completed groups of 8
 * slots are pushed to the NIC via mxge_submit_8rx.
 */
2347 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2349 bus_dma_segment_t seg[3];
2351 mxge_rx_ring_t *rx = &ss->rx_big;
2354 if (rx->cl_size == MCLBYTES)
2355 m = m_getcl(MB_DONTWAIT, MT_DATA, M_PKTHDR);
2357 m = m_getjcl(MB_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2363 m->m_len = rx->mlen;
2364 err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2365 seg, 1, &cnt, BUS_DMA_NOWAIT);
2370 rx->info[idx].m = m;
2371 rx->shadow[idx].addr_low =
2372 htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2373 rx->shadow[idx].addr_high =
2374 htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2376 #if MXGE_VIRT_JUMBOS
2377 for (i = 1; i < cnt; i++) {
2378 rx->shadow[idx + i].addr_low =
2379 htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2380 rx->shadow[idx + i].addr_high =
2381 htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2386 for (i = 0; i < rx->nbufs; i++) {
2387 if ((idx & 7) == 7) {
2388 mxge_submit_8rx(&rx->lanai[idx - 7],
2389 &rx->shadow[idx - 7]);
2397 * Myri10GE hardware checksums are not valid if the sender
2398 * padded the frame with non-zero padding. This is because
2399 * the firmware just does a simple 16-bit 1s complement
2400 * checksum across the entire frame, excluding the first 14
2401 * bytes. It is best to simply to check the checksum and
2402 * tell the stack about it only if the checksum is good
2405 static inline uint16_t
2406 mxge_rx_csum(struct mbuf *m, int csum)
2408 struct ether_header *eh;
2412 eh = mtod(m, struct ether_header *);
2414 /* only deal with IPv4 TCP & UDP for now */
2415 if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2417 ip = (struct ip *)(eh + 1);
2418 if (__predict_false(ip->ip_p != IPPROTO_TCP &&
2419 ip->ip_p != IPPROTO_UDP))
/* fold the pseudo-header into the firmware's whole-frame checksum;
 * a zero result means the TCP/UDP checksum verified good */
2422 c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2423 htonl(ntohs(csum) + ntohs(ip->ip_len) +
2424 - (ip->ip_hl << 2) + ip->ip_p));
/*
 * Strip the 802.1Q header from a received frame, transferring the tag to
 * the mbuf packet header (ether_vtag + M_VLANTAG, or an m_tag on the old
 * API), and adjust *csum so the firmware's whole-frame checksum still
 * matches the shortened frame.
 */
2433 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2435 struct ether_vlan_header *evl;
2436 struct ether_header *eh;
2439 evl = mtod(m, struct ether_vlan_header *);
2440 eh = mtod(m, struct ether_header *);
2443 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
2444 * after what the firmware thought was the end of the ethernet
2448 /* put checksum into host byte order */
2449 *csum = ntohs(*csum);
2450 partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
/* one's-complement subtraction of the removed 4 bytes, with end-around
 * carry folding */
2451 (*csum) += ~partial;
2452 (*csum) += ((*csum) < ~partial);
2453 (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2454 (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2456 /* restore checksum to network byte order;
2457 later consumers expect this */
2458 *csum = htons(*csum);
2461 #ifdef MXGE_NEW_VLAN_API
2462 m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
2466 mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2470 VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2471 m_tag_prepend(m, mtag);
2475 m->m_flags |= M_VLANTAG;
2478 * Remove the 802.1q header by copying the Ethernet
2479 * addresses over it and adjusting the beginning of
2480 * the data in the mbuf. The encapsulated Ethernet
2481 * type field is already in place.
2483 bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2484 ETHER_HDR_LEN - ETHER_TYPE_LEN);
2485 m_adj(m, ETHER_VLAN_ENCAP_LEN);
/*
 * Deliver one completed frame from the big rx ring.  Replaces the
 * consumed cluster (dropping the frame and recycling the old buffer on
 * allocation failure), unmaps the received buffer, strips any VLAN tag,
 * validates the hardware checksum (attempting LRO first), stamps the
 * flowid when RSS is active, and passes the mbuf to if_input.
 */
2490 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2495 struct ether_header *eh;
2497 bus_dmamap_t old_map;
2499 uint16_t tcpudp_csum;
2504 idx = rx->cnt & rx->mask;
2505 rx->cnt += rx->nbufs;
2506 /* save a pointer to the received mbuf */
2507 m = rx->info[idx].m;
2508 /* try to replace the received mbuf */
2509 if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2510 /* drop the frame -- the old mbuf is re-cycled */
2515 /* unmap the received buffer */
2516 old_map = rx->info[idx].map;
2517 bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2518 bus_dmamap_unload(rx->dmat, old_map);
2520 /* swap the bus_dmamap_t's */
2521 rx->info[idx].map = rx->extra_map;
2522 rx->extra_map = old_map;
2524 /* mcp implicitly skips 1st 2 bytes so that packet is properly
2526 m->m_data += MXGEFW_PAD;
2528 m->m_pkthdr.rcvif = ifp;
2529 m->m_len = m->m_pkthdr.len = len;
2531 eh = mtod(m, struct ether_header *);
2532 if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2533 mxge_vlan_tag_remove(m, &csum);
2535 /* if the checksum is valid, mark it in the mbuf header */
2536 if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2537 if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2539 /* otherwise, it was a UDP frame, or a TCP frame which
2540 we could not do LRO on. Tell the stack that the
2542 m->m_pkthdr.csum_data = 0xffff;
2543 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2545 /* flowid only valid if RSS hashing is enabled */
2546 if (sc->num_slices > 1) {
2547 m->m_pkthdr.flowid = (ss - sc->ss);
2548 m->m_flags |= M_FLOWID;
2550 /* pass the frame up the stack */
2551 (*ifp->if_input)(ifp, m);
/*
 * Deliver one completed frame from the small rx ring.  Mirrors
 * mxge_rx_done_big: replace the consumed mbuf (drop and recycle on
 * allocation failure), unmap, strip VLAN, verify hardware checksum (LRO
 * first), stamp flowid under RSS, and hand the mbuf to if_input.
 */
2555 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2559 struct ether_header *eh;
2562 bus_dmamap_t old_map;
2564 uint16_t tcpudp_csum;
2569 idx = rx->cnt & rx->mask;
2571 /* save a pointer to the received mbuf */
2572 m = rx->info[idx].m;
2573 /* try to replace the received mbuf */
2574 if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2575 /* drop the frame -- the old mbuf is re-cycled */
2580 /* unmap the received buffer */
2581 old_map = rx->info[idx].map;
2582 bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2583 bus_dmamap_unload(rx->dmat, old_map);
2585 /* swap the bus_dmamap_t's */
2586 rx->info[idx].map = rx->extra_map;
2587 rx->extra_map = old_map;
2589 /* mcp implicitly skips 1st 2 bytes so that packet is properly
2591 m->m_data += MXGEFW_PAD;
2593 m->m_pkthdr.rcvif = ifp;
2594 m->m_len = m->m_pkthdr.len = len;
2596 eh = mtod(m, struct ether_header *);
2597 if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2598 mxge_vlan_tag_remove(m, &csum);
2600 /* if the checksum is valid, mark it in the mbuf header */
2601 if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2602 if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2604 /* otherwise, it was a UDP frame, or a TCP frame which
2605 we could not do LRO on. Tell the stack that the
2607 m->m_pkthdr.csum_data = 0xffff;
2608 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2610 /* flowid only valid if RSS hashing is enabled */
2611 if (sc->num_slices > 1) {
2612 m->m_pkthdr.flowid = (ss - sc->ss);
2613 m->m_flags |= M_FLOWID;
2615 /* pass the frame up the stack */
2616 (*ifp->if_input)(ifp, m);
/*
 * Drain a slice's rx completion ring: dispatch each entry to the small or
 * big handler based on length, with a livelock limit of half the ring,
 * then flush any LRO sessions still active after the batch.
 */
2620 mxge_clean_rx_done(struct mxge_slice_state *ss)
2622 mxge_rx_done_t *rx_done = &ss->rx_done;
/* a zero length marks an entry not yet written by the firmware */
2628 while (rx_done->entry[rx_done->idx].length != 0) {
2629 length = ntohs(rx_done->entry[rx_done->idx].length);
2630 rx_done->entry[rx_done->idx].length = 0;
2631 checksum = rx_done->entry[rx_done->idx].checksum;
2632 if (length <= (MHLEN - MXGEFW_PAD))
2633 mxge_rx_done_small(ss, length, checksum);
2635 mxge_rx_done_big(ss, length, checksum);
2637 rx_done->idx = rx_done->cnt & rx_done->mask;
2639 /* limit potential for livelock */
2640 if (__predict_false(++limit > rx_done->mask / 2))
2644 while (!SLIST_EMPTY(&ss->lro_active)) {
2645 struct lro_entry *lro = SLIST_FIRST(&ss->lro_active);
2646 SLIST_REMOVE_HEAD(&ss->lro_active, next);
2647 mxge_lro_flush(ss, lro);
/*
 * Reclaim transmit descriptors up to the firmware's completion index
 * mcp_idx: free mbufs and unload DMA maps for the first slot of each
 * packet (marked by info[].flag), update byte/multicast counters, then
 * clear IFF_OACTIVE and restart the pump once at least 1/4 of the ring is
 * free.  In the multi-queue build, also deactivate NIC polling of an
 * idle queue.
 */
2654 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2665 while (tx->pkt_done != mcp_idx) {
2666 idx = tx->done & tx->mask;
2668 m = tx->info[idx].m;
2669 /* mbuf and DMA map only attached to the first
2672 ss->obytes += m->m_pkthdr.len;
2673 if (m->m_flags & M_MCAST)
2676 tx->info[idx].m = NULL;
2677 map = tx->info[idx].map;
2678 bus_dmamap_unload(tx->dmat, map);
2681 if (tx->info[idx].flag) {
2682 tx->info[idx].flag = 0;
2687 /* If we have space, clear IFF_OACTIVE to tell the stack that
2688 its OK to send packets */
2689 #ifdef IFNET_BUF_RING
2690 flags = &ss->if_flags;
2692 flags = &ifp->if_flags;
2694 lockmgr(&ss->tx.lock, LK_EXCLUSIVE);
2695 if ((*flags) & IFF_OACTIVE &&
2696 tx->req - tx->done < (tx->mask + 1)/4) {
2697 *(flags) &= ~IFF_OACTIVE;
2699 mxge_start_locked(ss);
2701 #ifdef IFNET_BUF_RING
2702 if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2703 /* let the NIC stop polling this queue, since there
2704 * are no more transmits pending */
/* NOTE(review): inner check repeats the outer condition — looks
 * redundant, but intermediate lines are elided here; verify against
 * the full source before simplifying */
2705 if (tx->req == tx->done) {
2707 tx->queue_active = 0;
2713 lockmgr(&ss->tx.lock, LK_RELEASE);
/*
 * Module-compliance bitmask tables used by mxge_media_probe() to map the
 * XFP/SFP+ I2C compliance byte to an ifmedia type.  A zero flag means the
 * bit has no corresponding FreeBSD/DragonFly media type.
 */
2717 static struct mxge_media_type mxge_xfp_media_types[] =
2719 {IFM_10G_CX4, 0x7f, "10GBASE-CX4 (module)"},
2720 {IFM_10G_SR, (1 << 7), "10GBASE-SR"},
2721 {IFM_10G_LR, (1 << 6), "10GBASE-LR"},
2722 {0, (1 << 5), "10GBASE-ER"},
2723 {IFM_10G_LRM, (1 << 4), "10GBASE-LRM"},
2724 {0, (1 << 3), "10GBASE-SW"},
2725 {0, (1 << 2), "10GBASE-LW"},
2726 {0, (1 << 1), "10GBASE-EW"},
2727 {0, (1 << 0), "Reserved"}
2729 static struct mxge_media_type mxge_sfp_media_types[] =
2731 {0, (1 << 7), "Reserved"},
2732 {IFM_10G_LRM, (1 << 6), "10GBASE-LRM"},
2733 {IFM_10G_LR, (1 << 5), "10GBASE-LR"},
2734 {IFM_10G_SR, (1 << 4), "10GBASE-SR"}
/*
 * Record a detected media type: OR it into media_flags, register that
 * combination with ifmedia, and make it the current selection.
 */
2738 mxge_set_media(mxge_softc_t *sc, int type)
2740 sc->media_flags |= type;
2741 ifmedia_add(&sc->media, sc->media_flags, 0, NULL);
2742 ifmedia_set(&sc->media, sc->media_flags);
2747 * Determine the media type for a NIC. Some XFPs will identify
2748 * themselves only when their link is up, so this is initiated via a
2749 * link up interrupt. However, this can potentially take up to
2750 * several milliseconds, so it is run via the watchdog routine, rather
2751 * than in the interrupt handler itself. This need only be done
2752 * once, not each time the link is up.
2755 mxge_media_probe(mxge_softc_t *sc)
2760 struct mxge_media_type *mxge_media_types = NULL;
2761 int i, err, ms, mxge_media_type_entries;
2764 sc->need_media_probe = 0;
2766 /* if we've already set a media type, we're done */
2767 if (sc->media_flags != (IFM_ETHER | IFM_AUTO))
2771 * parse the product code to determine the interface type
2772 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2773 * after the 3rd dash in the driver's cached copy of the
2774 * EEPROM's product code string.
2776 ptr = sc->product_code_string;
2778 device_printf(sc->dev, "Missing product code\n");
2781 for (i = 0; i < 3; i++, ptr++) {
2782 ptr = index(ptr, '-');
2784 device_printf(sc->dev,
2785 "only %d dashes in PC?!?\n", i);
/* -C: CX4 copper module, no I2C probing needed */
2791 mxge_set_media(sc, IFM_10G_CX4);
2794 else if (*ptr == 'Q') {
2795 /* -Q is Quad Ribbon Fiber */
2796 device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2797 /* FreeBSD has no media type for Quad ribbon fiber */
/* XFP cage: consult the XFP compliance table */
2803 mxge_media_types = mxge_xfp_media_types;
2804 mxge_media_type_entries =
2805 sizeof (mxge_xfp_media_types) /
2806 sizeof (mxge_xfp_media_types[0]);
2807 byte = MXGE_XFP_COMPLIANCE_BYTE;
2811 if (*ptr == 'S' || *(ptr +1) == 'S') {
2812 /* -S or -2S is SFP+ */
2813 mxge_media_types = mxge_sfp_media_types;
2814 mxge_media_type_entries =
2815 sizeof (mxge_sfp_media_types) /
2816 sizeof (mxge_sfp_media_types[0]);
2821 if (mxge_media_types == NULL) {
2822 device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2827 * At this point we know the NIC has an XFP cage, so now we
2828 * try to determine what is in the cage by using the
2829 * firmware's XFP I2C commands to read the XFP 10GbE compliance
2830 * register. We read just one byte, which may take over
2834 cmd.data0 = 0; /* just fetch 1 byte, not all 256 */
2836 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2837 if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
2838 device_printf(sc->dev, "failed to read XFP\n");
2840 if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
2841 device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
2843 if (err != MXGEFW_CMD_OK) {
2847 /* now we wait for the data to be cached */
2849 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
/* poll up to ~50 iterations while the firmware caches the byte */
2850 for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
2853 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2855 if (err != MXGEFW_CMD_OK) {
2856 device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
2857 cage_type, err, ms);
/* first table entry is matched by equality (CX4 mask 0x7f),
 * the rest by individual bit tests */
2861 if (cmd.data0 == mxge_media_types[0].bitmask) {
2863 device_printf(sc->dev, "%s:%s\n", cage_type,
2864 mxge_media_types[0].name);
2865 mxge_set_media(sc, IFM_10G_CX4);
2868 for (i = 1; i < mxge_media_type_entries; i++) {
2869 if (cmd.data0 & mxge_media_types[i].bitmask) {
2871 device_printf(sc->dev, "%s:%s\n",
2873 mxge_media_types[i].name);
2875 mxge_set_media(sc, mxge_media_types[i].flag);
2879 device_printf(sc->dev, "%s media 0x%x unknown\n", cage_type,
/*
 * Per-slice interrupt handler.
 * Drains tx completions and the rx done ring, then returns the irq
 * token to the NIC.  Firmware link-state and error statistics are only
 * acted upon for the first slice (ss == sc->ss).
 * NOTE(review): this extract is elided; comments cover visible lines only.
 */
2886 mxge_intr(void *arg)
2888	struct mxge_slice_state *ss = arg;
2889	mxge_softc_t *sc = ss->sc;
2890	mcp_irq_data_t *stats = ss->fw_stats;
2891	mxge_tx_ring_t *tx = &ss->tx;
2892	mxge_rx_done_t *rx_done = &ss->rx_done;
2893	uint32_t send_done_count;
2897 #ifndef IFNET_BUF_RING
2898	/* an interrupt on a non-zero slice is implicitly valid
2899	   since MSI-X irqs are not shared */
2901		mxge_clean_rx_done(ss);
2902		*ss->irq_claim = be32toh(3);
2907	/* make sure the DMA has finished */
2908	if (!stats->valid) {
2911	valid = stats->valid;
2913	if (sc->legacy_irq) {
2914		/* lower legacy IRQ */
2915		*sc->irq_deassert = 0;
2916		if (!mxge_deassert_wait)
2917			/* don't wait for conf. that irq is low */
2923	/* loop while waiting for legacy irq deassertion */
2925		/* check for transmit completes and receives */
2926		send_done_count = be32toh(stats->send_done_count);
2927		while ((send_done_count != tx->pkt_done) ||
2928		       (rx_done->entry[rx_done->idx].length != 0)) {
2929			if (send_done_count != tx->pkt_done)
2930				mxge_tx_done(ss, (int)send_done_count);
2931			mxge_clean_rx_done(ss);
2932			send_done_count = be32toh(stats->send_done_count);
2934		if (sc->legacy_irq && mxge_deassert_wait)
	/* re-read valid through a volatile access so the compiler cannot
	   cache the DMA-updated flag across loop iterations */
2936	} while (*((volatile uint8_t *) &stats->valid));
2938	/* fw link & error stats meaningful only on the first slice */
2939	if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
2940		if (sc->link_state != stats->link_up) {
2941			sc->link_state = stats->link_up;
2942			if (sc->link_state) {
2943				sc->ifp->if_link_state = LINK_STATE_UP;
2944				if_link_state_change(sc->ifp);
2946					device_printf(sc->dev, "link up\n");
2948				sc->ifp->if_link_state = LINK_STATE_DOWN;
2949				if_link_state_change(sc->ifp);
2951					device_printf(sc->dev, "link down\n");
	/* any link transition triggers a media re-probe from mxge_tick() */
2953			sc->need_media_probe = 1;
2955		if (sc->rdma_tags_available !=
2956		    be32toh(stats->rdma_tags_available)) {
2957			sc->rdma_tags_available =
2958				be32toh(stats->rdma_tags_available);
2959			device_printf(sc->dev, "RDMA timed out! %d tags "
2960				      "left\n", sc->rdma_tags_available);
2963		if (stats->link_down) {
2964			sc->down_cnt += stats->link_down;
2966			sc->ifp->if_link_state = LINK_STATE_DOWN;
2967			if_link_state_change(sc->ifp);
2971	/* check to see if we have rx token to pass back */
2973		*ss->irq_claim = be32toh(3);
2974	*(ss->irq_claim + 1) = be32toh(3);
2978 mxge_init(void *arg)
/*
 * Release every mbuf still held by a slice: the LRO free list, the big
 * and small receive rings, and (first slice only) the transmit ring.
 * DMA maps are unloaded before the mbufs are freed.
 */
2985 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
2987	struct lro_entry *lro_entry;
2990	while (!SLIST_EMPTY(&ss->lro_free)) {
2991		lro_entry = SLIST_FIRST(&ss->lro_free);
2992		SLIST_REMOVE_HEAD(&ss->lro_free, next);
2993		kfree(lro_entry, M_DEVBUF);
2996	for (i = 0; i <= ss->rx_big.mask; i++) {
2997		if (ss->rx_big.info[i].m == NULL)
2999		bus_dmamap_unload(ss->rx_big.dmat,
3000				  ss->rx_big.info[i].map);
3001		m_freem(ss->rx_big.info[i].m);
3002		ss->rx_big.info[i].m = NULL;
3005	for (i = 0; i <= ss->rx_small.mask; i++) {
3006		if (ss->rx_small.info[i].m == NULL)
3008		bus_dmamap_unload(ss->rx_small.dmat,
3009				  ss->rx_small.info[i].map);
3010		m_freem(ss->rx_small.info[i].m);
3011		ss->rx_small.info[i].m = NULL;
3014	/* transmit ring used only on the first slice */
3015	if (ss->tx.info == NULL)
3018	for (i = 0; i <= ss->tx.mask; i++) {
3019		ss->tx.info[i].flag = 0;
3020		if (ss->tx.info[i].m == NULL)
3022		bus_dmamap_unload(ss->tx.dmat,
3023				  ss->tx.info[i].map);
3024		m_freem(ss->tx.info[i].m);
3025		ss->tx.info[i].m = NULL;
/* Free all per-slice mbufs for the whole device. */
3030 mxge_free_mbufs(mxge_softc_t *sc)
3034	for (slice = 0; slice < sc->num_slices; slice++)
3035		mxge_free_slice_mbufs(&sc->ss[slice]);
3039 mxge_free_slice_rings(struct mxge_slice_state *ss)
3044 if (ss->rx_done.entry != NULL)
3045 mxge_dma_free(&ss->rx_done.dma);
3046 ss->rx_done.entry = NULL;
3048 if (ss->tx.req_bytes != NULL)
3049 kfree(ss->tx.req_bytes, M_DEVBUF);
3050 ss->tx.req_bytes = NULL;
3052 if (ss->tx.seg_list != NULL)
3053 kfree(ss->tx.seg_list, M_DEVBUF);
3054 ss->tx.seg_list = NULL;
3056 if (ss->rx_small.shadow != NULL)
3057 kfree(ss->rx_small.shadow, M_DEVBUF);
3058 ss->rx_small.shadow = NULL;
3060 if (ss->rx_big.shadow != NULL)
3061 kfree(ss->rx_big.shadow, M_DEVBUF);
3062 ss->rx_big.shadow = NULL;
3064 if (ss->tx.info != NULL) {
3065 if (ss->tx.dmat != NULL) {
3066 for (i = 0; i <= ss->tx.mask; i++) {
3067 bus_dmamap_destroy(ss->tx.dmat,
3068 ss->tx.info[i].map);
3070 bus_dma_tag_destroy(ss->tx.dmat);
3072 kfree(ss->tx.info, M_DEVBUF);
3076 if (ss->rx_small.info != NULL) {
3077 if (ss->rx_small.dmat != NULL) {
3078 for (i = 0; i <= ss->rx_small.mask; i++) {
3079 bus_dmamap_destroy(ss->rx_small.dmat,
3080 ss->rx_small.info[i].map);
3082 bus_dmamap_destroy(ss->rx_small.dmat,
3083 ss->rx_small.extra_map);
3084 bus_dma_tag_destroy(ss->rx_small.dmat);
3086 kfree(ss->rx_small.info, M_DEVBUF);
3088 ss->rx_small.info = NULL;
3090 if (ss->rx_big.info != NULL) {
3091 if (ss->rx_big.dmat != NULL) {
3092 for (i = 0; i <= ss->rx_big.mask; i++) {
3093 bus_dmamap_destroy(ss->rx_big.dmat,
3094 ss->rx_big.info[i].map);
3096 bus_dmamap_destroy(ss->rx_big.dmat,
3097 ss->rx_big.extra_map);
3098 bus_dma_tag_destroy(ss->rx_big.dmat);
3100 kfree(ss->rx_big.info, M_DEVBUF);
3102 ss->rx_big.info = NULL;
/* Free ring resources of every slice. */
3106 mxge_free_rings(mxge_softc_t *sc)
3110	for (slice = 0; slice < sc->num_slices; slice++)
3111		mxge_free_slice_rings(&sc->ss[slice]);
/*
 * Allocate all per-slice ring state: rx shadow/info arrays, rx busdma
 * tags and per-entry maps (small and big rings), and — on the first
 * slice only, unless IFNET_BUF_RING is defined — the tx request block,
 * segment list, info array, tag and maps.
 * Ring sizes must be powers of two; the masks are derived from them.
 */
3115 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3116		       int tx_ring_entries)
3118	mxge_softc_t *sc = ss->sc;
3124	/* allocate per-slice receive resources */
3126	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
	/* rx done ring is twice the rx ring size (small + big completions) */
3127	ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3129	/* allocate the rx shadow rings */
3130	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3131	ss->rx_small.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3132	if (ss->rx_small.shadow == NULL)
3135	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3136	ss->rx_big.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3137	if (ss->rx_big.shadow == NULL)
3140	/* allocate the rx host info rings */
3141	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3142	ss->rx_small.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3143	if (ss->rx_small.info == NULL)
3146	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3147	ss->rx_big.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3148	if (ss->rx_big.info == NULL)
3151	/* allocate the rx busdma resources */
3152	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3154				 4096,			/* boundary */
3155				 BUS_SPACE_MAXADDR,	/* low */
3156				 BUS_SPACE_MAXADDR,	/* high */
3157				 NULL, NULL,		/* filter */
3158				 MHLEN,			/* maxsize */
3160				 MHLEN,			/* maxsegsize */
3161				 BUS_DMA_ALLOCNOW,	/* flags */
3162				 &ss->rx_small.dmat);	/* tag */
3164		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3169	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3171 #if MXGE_VIRT_JUMBOS
3172				 4096,			/* boundary */
3176				 BUS_SPACE_MAXADDR,	/* low */
3177				 BUS_SPACE_MAXADDR,	/* high */
3178				 NULL, NULL,		/* filter */
3179				 3*4096,		/* maxsize */
3180 #if MXGE_VIRT_JUMBOS
3182				 4096,			/* maxsegsize*/
3185				 MJUM9BYTES,		/* maxsegsize*/
3187				 BUS_DMA_ALLOCNOW,	/* flags */
3188				 &ss->rx_big.dmat);	/* tag */
3190		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3194	for (i = 0; i <= ss->rx_small.mask; i++) {
3195		err = bus_dmamap_create(ss->rx_small.dmat, 0,
3196					&ss->rx_small.info[i].map);
3198			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
	/* extra map used while swapping a freshly-loaded buffer in */
3203	err = bus_dmamap_create(ss->rx_small.dmat, 0,
3204				&ss->rx_small.extra_map);
3206		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3211	for (i = 0; i <= ss->rx_big.mask; i++) {
3212		err = bus_dmamap_create(ss->rx_big.dmat, 0,
3213					&ss->rx_big.info[i].map);
3215			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
3220	err = bus_dmamap_create(ss->rx_big.dmat, 0,
3221				&ss->rx_big.extra_map);
3223		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3228	/* now allocate TX resouces */
3230 #ifndef IFNET_BUF_RING
3231	/* only use a single TX ring for now */
3232	if (ss != ss->sc->ss)
3236	ss->tx.mask = tx_ring_entries - 1;
3237	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3240	/* allocate the tx request copy block */
3242		sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3243	ss->tx.req_bytes = kmalloc(bytes, M_DEVBUF, M_WAITOK);
3244	if (ss->tx.req_bytes == NULL)
3246	/* ensure req_list entries are aligned to 8 bytes */
3247	ss->tx.req_list = (mcp_kreq_ether_send_t *)
3248		((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
3250	/* allocate the tx busdma segment list */
3251	bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3252	ss->tx.seg_list = (bus_dma_segment_t *)
3253		kmalloc(bytes, M_DEVBUF, M_WAITOK);
3254	if (ss->tx.seg_list == NULL)
3257	/* allocate the tx host info ring */
3258	bytes = tx_ring_entries * sizeof (*ss->tx.info);
3259	ss->tx.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3260	if (ss->tx.info == NULL)
3263	/* allocate the tx busdma resources */
3264	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3266				 sc->tx_boundary,	/* boundary */
3267				 BUS_SPACE_MAXADDR,	/* low */
3268				 BUS_SPACE_MAXADDR,	/* high */
3269				 NULL, NULL,		/* filter */
3270				 65536 + 256,		/* maxsize */
3271				 ss->tx.max_desc - 2,	/* num segs */
3272				 sc->tx_boundary,	/* maxsegsz */
3273				 BUS_DMA_ALLOCNOW,	/* flags */
3274				 &ss->tx.dmat);		/* tag */
3277		device_printf(sc->dev, "Err %d allocating tx dmat\n",
3282	/* now use these tags to setup dmamaps for each slot
3284	for (i = 0; i <= ss->tx.mask; i++) {
3285		err = bus_dmamap_create(ss->tx.dmat, 0,
3286					&ss->tx.info[i].map);
3288			device_printf(sc->dev, "Err %d  tx dmamap\n",
/*
 * Query firmware for the send/receive ring sizes, size the ifnet send
 * queue accordingly, then allocate the rings on each slice.  Frees any
 * partially-allocated rings on failure.
 */
3298 mxge_alloc_rings(mxge_softc_t *sc)
3302	int tx_ring_entries, rx_ring_entries;
3305	/* get ring sizes */
3306	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3307	tx_ring_size = cmd.data0;
3309		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3313	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3314	rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3315	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
3316	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
3317	IFQ_SET_READY(&sc->ifp->if_snd);
3319	for (slice = 0; slice < sc->num_slices; slice++) {
3320		err = mxge_alloc_slice_rings(&sc->ss[slice],
3329	mxge_free_rings(sc);
/*
 * Pick big-buffer parameters for a given MTU: the firmware-visible big
 * buffer size, the mbuf cluster size to allocate, and how many buffers
 * each received frame spans.  Escalates MCLBYTES -> MJUMPAGESIZE ->
 * MJUM9BYTES; with MXGE_VIRT_JUMBOS a 9k cluster is carved into 4k
 * virtually-contiguous pieces instead.
 */
3336 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3338	int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3340	if (bufsize < MCLBYTES) {
3341		/* easy, everything fits in a single buffer */
3342		*big_buf_size = MCLBYTES;
3343		*cl_size = MCLBYTES;
3348	if (bufsize < MJUMPAGESIZE) {
3349		/* still easy, everything still fits in a single buffer */
3350		*big_buf_size = MJUMPAGESIZE;
3351		*cl_size = MJUMPAGESIZE;
3355 #if MXGE_VIRT_JUMBOS
3356	/* now we need to use virtually contiguous buffers */
3357	*cl_size = MJUM9BYTES;
3358	*big_buf_size = 4096;
3359	*nbufs = mtu / 4096 + 1;
3360	/* needs to be a power of two, so round up */
3364	*cl_size = MJUM9BYTES;
3365	*big_buf_size = MJUM9BYTES;
/*
 * Bring one slice up: allocate its LRO entry pool, fetch the lanai
 * (NIC SRAM) pointers for the send and receive rings from firmware,
 * and stock the small and big receive rings with mbufs.
 */
3371 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3376	struct lro_entry *lro_entry;
3381	slice = ss - sc->ss;
3383	SLIST_INIT(&ss->lro_free);
3384	SLIST_INIT(&ss->lro_active);
3386	for (i = 0; i < sc->lro_cnt; i++) {
3387		lro_entry = (struct lro_entry *)
3388			kmalloc(sizeof (*lro_entry), M_DEVBUF,
3390		if (lro_entry == NULL) {
3394		SLIST_INSERT_HEAD(&ss->lro_free, lro_entry, next);
3396	/* get the lanai pointers to the send and receive rings */
3399 #ifndef IFNET_BUF_RING
3400	/* We currently only send from the first slice */
3404	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3406		(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
	/* per-slice doorbells live at 64-byte strides in NIC SRAM */
3407	ss->tx.send_go = (volatile uint32_t *)
3408		(sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3409	ss->tx.send_stop = (volatile uint32_t *)
3410	(sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3411 #ifndef IFNET_BUF_RING
3415	err |= mxge_send_cmd(sc,
3416			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3417	ss->rx_small.lanai =
3418		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3420	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3422		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3425		device_printf(sc->dev,
3426			      "failed to get ring sizes or locations\n");
3430	/* stock receive rings */
3431	for (i = 0; i <= ss->rx_small.mask; i++) {
3432		map = ss->rx_small.info[i].map;
3433		err = mxge_get_buf_small(ss, map, i);
3435			device_printf(sc->dev, "alloced %d/%d smalls\n",
3436				      i, ss->rx_small.mask + 1);
	/* poison the big shadow ring so unstocked slots are obvious */
3440	for (i = 0; i <= ss->rx_big.mask; i++) {
3441		ss->rx_big.shadow[i].addr_low = 0xffffffff;
3442		ss->rx_big.shadow[i].addr_high = 0xffffffff;
3444	ss->rx_big.nbufs = nbufs;
3445	ss->rx_big.cl_size = cl_size;
3446	ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3447		ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3448	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3449		map = ss->rx_big.info[i].map;
3450		err = mxge_get_buf_big(ss, map, i);
3452			device_printf(sc->dev, "alloced %d/%d bigs\n",
3453				      i, ss->rx_big.mask + 1);
/*
 * Bring the interface up: reset the NIC, program the RSS indirection
 * table (multi-slice only), tell firmware the MTU and buffer sizes,
 * register the per-slice stats DMA blocks, open each slice, then issue
 * ETHERNET_UP and start the tick callout.  Frees mbufs on failure.
 */
3461 mxge_open(mxge_softc_t *sc)
3464	int err, big_bytes, nbufs, slice, cl_size, i;
3466	volatile uint8_t *itable;
3467	struct mxge_slice_state *ss;
3469	/* Copy the MAC address in case it was overridden */
3470	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3472	err = mxge_reset(sc, 1);
3474		device_printf(sc->dev, "failed to reset\n");
3478	if (sc->num_slices > 1) {
3479		/* setup the indirection table */
3480		cmd.data0 = sc->num_slices;
3481		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3484		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3487			device_printf(sc->dev,
3488				      "failed to setup rss tables\n");
3492		/* just enable an identity mapping */
3493		itable = sc->sram + cmd.data0;
3494		for (i = 0; i < sc->num_slices; i++)
3495			itable[i] = (uint8_t)i;
3498		cmd.data1 = mxge_rss_hash_type;
3499		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3501			device_printf(sc->dev, "failed to enable slices\n");
3507	mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3510	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3512	/* error is only meaningful if we're trying to set
3513	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3514	if (err && nbufs > 1) {
3515		device_printf(sc->dev,
3516			      "Failed to set alway-use-n to %d\n",
3520	/* Give the firmware the mtu and the big and small buffer
3521	   sizes.  The firmware wants the big buf size to be a power
3522	   of two. Luckily, FreeBSD's clusters are powers of two */
3523	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3524	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3525	cmd.data0 = MHLEN - MXGEFW_PAD;
3526	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3528	cmd.data0 = big_bytes;
3529	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3532		device_printf(sc->dev, "failed to setup params\n");
3536	/* Now give him the pointer to the stats block */
3538 #ifdef IFNET_BUF_RING
3539	     slice < sc->num_slices;
3544		ss = &sc->ss[slice];
3546			MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3548			MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3549		cmd.data2 = sizeof(struct mcp_irq_data);
	/* the slice number is encoded in the upper 16 bits of data2 */
3550		cmd.data2 |= (slice << 16);
3551		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3555		bus = sc->ss->fw_stats_dma.bus_addr;
3556		bus += offsetof(struct mcp_irq_data, send_done_count);
3557		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3558		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3559		err = mxge_send_cmd(sc,
3560				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3562		/* Firmware cannot support multicast without STATS_DMA_V2 */
3563		sc->fw_multicast_support = 0;
3565		sc->fw_multicast_support = 1;
3569		device_printf(sc->dev, "failed to setup params\n");
3573	for (slice = 0; slice < sc->num_slices; slice++) {
3574		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3576			device_printf(sc->dev, "couldn't open slice %d\n",
3582	/* Finally, start the firmware running */
3583	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3585		device_printf(sc->dev, "Couldn't bring up link\n");
3588 #ifdef IFNET_BUF_RING
3589	for (slice = 0; slice < sc->num_slices; slice++) {
3590		ss = &sc->ss[slice];
3591		ss->if_flags |= IFF_RUNNING;
3592		ss->if_flags &= ~IFF_OACTIVE;
3595	sc->ifp->if_flags |= IFF_RUNNING;
3596	sc->ifp->if_flags &= ~IFF_OACTIVE;
3597	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3603	mxge_free_mbufs(sc);
/*
 * Bring the interface down: stop the tick callout, clear IFF_RUNNING,
 * issue ETHERNET_DOWN, wait for the "down" interrupt (observed via
 * sc->down_cnt changing), then free all mbufs.
 */
3609 mxge_close(mxge_softc_t *sc)
3612	int err, old_down_cnt;
3613 #ifdef IFNET_BUF_RING
3614	struct mxge_slice_state *ss;
3618	callout_stop(&sc->co_hdl);
3619 #ifdef IFNET_BUF_RING
3620	for (slice = 0; slice < sc->num_slices; slice++) {
3621		ss = &sc->ss[slice];
3622		ss->if_flags &= ~IFF_RUNNING;
3625	sc->ifp->if_flags &= ~IFF_RUNNING;
3626	old_down_cnt = sc->down_cnt;
3628	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3630		device_printf(sc->dev, "Couldn't bring down link\n");
3632	if (old_down_cnt == sc->down_cnt) {
3633		/* wait for down irq */
3634		DELAY(10 * sc->intr_coal_delay);
3637	if (old_down_cnt == sc->down_cnt) {
3638		device_printf(sc->dev, "never got down irq\n");
3641	mxge_free_mbufs(sc);
/*
 * Program PCI config space: record the negotiated PCIe link width,
 * raise the max read request size (5 << 12 in the device control
 * register selects 4KB), and enable bus mastering and memory space.
 * Also called after a NIC reboot to redo our config-space changes.
 */
3647 mxge_setup_cfg_space(mxge_softc_t *sc)
3649	device_t dev = sc->dev;
3651	uint16_t cmd, lnk, pectl;
3653	/* find the PCIe link width and set max read request to 4KB*/
3654	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
3655		lnk = pci_read_config(dev, reg + 0x12, 2);
3656		sc->link_width = (lnk >> 4) & 0x3f;
3658		pectl = pci_read_config(dev, reg + 0x8, 2);
3659		pectl = (pectl & ~0x7000) | (5 << 12);
3660		pci_write_config(dev, reg + 0x8, pectl, 2);
3663	/* Enable DMA and Memory space access */
3664	pci_enable_busmaster(dev);
3665	cmd = pci_read_config(dev, PCIR_COMMAND, 2);
3666	cmd |= PCIM_CMD_MEMEN;
3667	pci_write_config(dev, PCIR_COMMAND, cmd, 2);
/*
 * Read the NIC's reboot status register through the vendor-specific
 * PCI capability (indirect read32 window).  Returns (uint32_t)-1 if
 * the capability cannot be found.
 */
3671 mxge_read_reboot(mxge_softc_t *sc)
3673	device_t dev = sc->dev;
3676	/* find the vendor specific offset */
3677	if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
3678		device_printf(sc->dev,
3679			      "could not find vendor specific offset\n");
3680		return (uint32_t)-1;
3682	/* enable read32 mode */
3683	pci_write_config(dev, vs + 0x10, 0x3, 1);
3684	/* tell NIC which register to read */
3685	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3686	return (pci_read_config(dev, vs + 0x14, 4));
/*
 * Recover from a stuck transmit ring.  Detects whether the NIC
 * rebooted (all-ones config reads / lost busmaster bit); if so,
 * restores PCI config space and reopens the interface.  Otherwise it
 * only dumps the tx ring state for the offending slice and leaves the
 * NIC alone.
 */
3690 mxge_watchdog_reset(mxge_softc_t *sc, int slice)
3692	struct pci_devinfo *dinfo;
3700	device_printf(sc->dev, "Watchdog reset!\n");
3703	 * check to see if the NIC rebooted.  If it did, then all of
3704	 * PCI config space has been reset, and things like the
3705	 * busmaster bit will be zero.  If this is the case, then we
3706	 * must restore PCI config space before the NIC can be used
3709	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3710	if (cmd == 0xffff) {
3712		 * maybe the watchdog caught the NIC rebooting; wait
3713		 * up to 100ms for it to finish.  If it does not come
3714		 * back, then give up
3717		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3718		if (cmd == 0xffff) {
3719			device_printf(sc->dev, "NIC disappeared!\n");
3723	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3724		/* print the reboot status */
3725		reboot = mxge_read_reboot(sc);
3726		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3728		/* restore PCI configuration space */
3729		dinfo = device_get_ivars(sc->dev);
3730		pci_cfg_restore(sc->dev, dinfo);
3732		/* and redo any changes we made to our config space */
3733		mxge_setup_cfg_space(sc);
3735		if (sc->ifp->if_flags & IFF_RUNNING) {
3737			err = mxge_open(sc);
3740		tx = &sc->ss[slice].tx;
3741		device_printf(sc->dev,
3742			      "NIC did not reboot, slice %d ring state:\n",
3744		device_printf(sc->dev,
3745			      "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3746			      tx->req, tx->done, tx->queue_active);
3747		device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3748			      tx->activate, tx->deactivate);
3749		device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3751			      be32toh(sc->ss->fw_stats->send_done_count));
3752		device_printf(sc->dev, "not resetting\n");
/*
 * Periodic (from mxge_tick) check for stalled transmits.  A slice is
 * considered stuck when requests are outstanding and neither req nor
 * done advanced since the previous check; a reset is skipped if the
 * stall coincides with incoming pause frames (link-partner flow
 * control).  Also triggers a deferred media re-probe when requested.
 */
3758 mxge_watchdog(mxge_softc_t *sc)
3761	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3764	/* see if we have outstanding transmits, which
3765	   have been pending for more than mxge_ticks */
3767 #ifdef IFNET_BUF_RING
3768	     (i < sc->num_slices) && (err == 0);
3770	     (i < 1) && (err == 0);
3774		if (tx->req != tx->done &&
3775		    tx->watchdog_req != tx->watchdog_done &&
3776		    tx->done == tx->watchdog_done) {
3777			/* check for pause blocking before resetting */
3778			if (tx->watchdog_rx_pause == rx_pause)
3779				err = mxge_watchdog_reset(sc, i);
3781				device_printf(sc->dev, "Flow control blocking "
3782					      "xmits, check link partner\n");
	/* snapshot counters for the next watchdog comparison */
3785		tx->watchdog_req = tx->req;
3786		tx->watchdog_done = tx->done;
3787		tx->watchdog_rx_pause = rx_pause;
3790	if (sc->need_media_probe)
3791		mxge_media_probe(sc);
/*
 * Aggregate per-slice packet counters into the ifnet statistics.
 * With IFNET_BUF_RING, byte/multicast/drop counters are summed too.
 */
3796 mxge_update_stats(mxge_softc_t *sc)
3798	struct mxge_slice_state *ss;
3799	u_long ipackets = 0;
3800	u_long opackets = 0;
3801 #ifdef IFNET_BUF_RING
3809	for (slice = 0; slice < sc->num_slices; slice++) {
3810		ss = &sc->ss[slice];
3811		ipackets += ss->ipackets;
3812		opackets += ss->opackets;
3813 #ifdef IFNET_BUF_RING
3814		obytes += ss->obytes;
3815		omcasts += ss->omcasts;
3816		odrops += ss->tx.br->br_drops;
3818		oerrors += ss->oerrors;
3820	sc->ifp->if_ipackets = ipackets;
3821	sc->ifp->if_opackets = opackets;
3822 #ifdef IFNET_BUF_RING
3823	sc->ifp->if_obytes = obytes;
3824	sc->ifp->if_omcasts = omcasts;
3825	sc->ifp->if_snd.ifq_drops = odrops;
3827	sc->ifp->if_oerrors = oerrors;
/*
 * Periodic callout: refresh aggregated stats and run the transmit
 * watchdog every 5th tick, then re-arm itself.  Runs under the
 * driver lock.
 */
3831 mxge_tick(void *arg)
3833	mxge_softc_t *sc = arg;
3836	lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
3837	/* aggregate stats from different slices */
3838	mxge_update_stats(sc);
3839	if (!sc->watchdog_countdown) {
3840		err = mxge_watchdog(sc);
3841		sc->watchdog_countdown = 4;
3843	sc->watchdog_countdown--;
3845		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3846	lockmgr(&sc->driver_lock, LK_RELEASE);
3850 mxge_media_change(struct ifnet *ifp)
/*
 * Validate and apply a new MTU (bounded by sc->max_mtu and a 60-byte
 * floor, including Ethernet + VLAN overhead).  If the interface is
 * running it is reopened; on reopen failure the old MTU is restored
 * and a best-effort reopen is attempted.
 */
3856 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3858	struct ifnet *ifp = sc->ifp;
3859	int real_mtu, old_mtu;
3863	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3864	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
3866	lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
3867	old_mtu = ifp->if_mtu;
3869	if (ifp->if_flags & IFF_RUNNING) {
3871		err = mxge_open(sc);
3873			ifp->if_mtu = old_mtu;
3875			(void) mxge_open(sc);
3878	lockmgr(&sc->driver_lock, LK_RELEASE);
/*
 * ifmedia status handler: reports the cached link state.  The media
 * word is always IFM_AUTO|IFM_ETHER (plus IFM_FDX when link is up);
 * the actual media type is not reflected here.
 */
3883 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3885	mxge_softc_t *sc = ifp->if_softc;
3890	ifmr->ifm_status = IFM_AVALID;
3891	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
3892	ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
3893	ifmr->ifm_active |= sc->link_state ? IFM_FDX : 0;
/*
 * ifnet ioctl handler.  Dispatches (elided switch) on: address setup
 * via ether_ioctl, SIOCSIFMTU, SIOCSIFFLAGS (open/close + promisc/
 * multicast), SIOCADDMULTI/SIOCDELMULTI, SIOCSIFCAP (csum/TSO/LRO/
 * VLAN capability toggles) and media ioctls.  State changes run under
 * the driver lock.
 */
3897 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data, struct ucred *cr)
3899	mxge_softc_t *sc = ifp->if_softc;
3900	struct ifreq *ifr = (struct ifreq *)data;
3908		err = ether_ioctl(ifp, command, data);
3912		err = mxge_change_mtu(sc, ifr->ifr_mtu);
3916		lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
3918			lockmgr(&sc->driver_lock, LK_RELEASE);
3921		if (ifp->if_flags & IFF_UP) {
3922			if (!(ifp->if_flags & IFF_RUNNING)) {
3923				err = mxge_open(sc);
3925				/* take care of promisc and allmulti
3927				mxge_change_promisc(sc,
3928						    ifp->if_flags & IFF_PROMISC);
3929				mxge_set_multicast_list(sc);
3932			if (ifp->if_flags & IFF_RUNNING) {
3936		lockmgr(&sc->driver_lock, LK_RELEASE);
3941		lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
3942		mxge_set_multicast_list(sc);
3943		lockmgr(&sc->driver_lock, LK_RELEASE);
3947		lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
3948		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
3949		if (mask & IFCAP_TXCSUM) {
3950			if (IFCAP_TXCSUM & ifp->if_capenable) {
	/* disabling tx csum also forces TSO4 off, since TSO needs it */
3951				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
3952				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
3955				ifp->if_capenable |= IFCAP_TXCSUM;
3956				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
3958		} else if (mask & IFCAP_RXCSUM) {
3959			if (IFCAP_RXCSUM & ifp->if_capenable) {
3960				ifp->if_capenable &= ~IFCAP_RXCSUM;
3963				ifp->if_capenable |= IFCAP_RXCSUM;
3967		if (mask & IFCAP_TSO4) {
3968			if (IFCAP_TSO4 & ifp->if_capenable) {
3969				ifp->if_capenable &= ~IFCAP_TSO4;
3970				ifp->if_hwassist &= ~CSUM_TSO;
3971			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
3972				ifp->if_capenable |= IFCAP_TSO4;
3973				ifp->if_hwassist |= CSUM_TSO;
3975				kprintf("mxge requires tx checksum offload"
3976				       " be enabled to use TSO\n");
3980		if (mask & IFCAP_LRO) {
3981			if (IFCAP_LRO & ifp->if_capenable)
3982				err = mxge_change_lro_locked(sc, 0);
3984				err = mxge_change_lro_locked(sc, mxge_lro_cnt);
3986		if (mask & IFCAP_VLAN_HWTAGGING)
3987			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
3988		lockmgr(&sc->driver_lock, LK_RELEASE);
3989		VLAN_CAPABILITIES(ifp);
3994		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
3995				    &sc->media, command);
4005 mxge_fetch_tunables(mxge_softc_t *sc)
4008 TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4009 TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4010 &mxge_flow_control);
4011 TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4012 &mxge_intr_coal_delay);
4013 TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4014 &mxge_nvidia_ecrc_enable);
4015 TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4016 &mxge_force_firmware);
4017 TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4018 &mxge_deassert_wait);
4019 TUNABLE_INT_FETCH("hw.mxge.verbose",
4021 TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4022 TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
4023 TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
4024 TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4025 TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4026 if (sc->lro_cnt != 0)
4027 mxge_lro_cnt = sc->lro_cnt;
4031 if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4032 mxge_intr_coal_delay = 30;
4033 if (mxge_ticks == 0)
4034 mxge_ticks = hz / 2;
4035 sc->pause = mxge_flow_control;
4036 if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4037 || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4038 mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
4040 if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4041 mxge_initial_mtu < ETHER_MIN_LEN)
4042 mxge_initial_mtu = ETHERMTU_JUMBO;
/*
 * Free per-slice firmware-stats DMA blocks, buf rings (IFNET_BUF_RING),
 * tx locks and rx done queues, then release the slice array itself.
 */
4047 mxge_free_slices(mxge_softc_t *sc)
4049	struct mxge_slice_state *ss;
4056	for (i = 0; i < sc->num_slices; i++) {
4058		if (ss->fw_stats != NULL) {
4059			mxge_dma_free(&ss->fw_stats_dma);
4060			ss->fw_stats = NULL;
4061 #ifdef IFNET_BUF_RING
4062			if (ss->tx.br != NULL) {
4063				drbr_free(ss->tx.br, M_DEVBUF);
4067			lockuninit(&ss->tx.lock);
4069		if (ss->rx_done.entry != NULL) {
4070			mxge_dma_free(&ss->rx_done.dma);
4071			ss->rx_done.entry = NULL;
4074	kfree(sc->ss, M_DEVBUF);
/*
 * Allocate the slice array and, per slice, the rx interrupt (done)
 * queue DMA block, firmware stats DMA block, tx lock and — with
 * IFNET_BUF_RING — a 2048-entry transmit buf ring.  On any failure
 * all slices allocated so far are freed.
 */
4079 mxge_alloc_slices(mxge_softc_t *sc)
4082	struct mxge_slice_state *ss;
4084	int err, i, max_intr_slots;
4086	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4088		device_printf(sc->dev, "Cannot determine rx ring size\n");
4091	sc->rx_ring_size = cmd.data0;
4092	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4094	bytes = sizeof (*sc->ss) * sc->num_slices;
4095	sc->ss = kmalloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4098	for (i = 0; i < sc->num_slices; i++) {
4103		/* allocate per-slice rx interrupt queues */
4105		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4106		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4109		ss->rx_done.entry = ss->rx_done.dma.addr;
4110		bzero(ss->rx_done.entry, bytes);
4113		 * allocate the per-slice firmware stats; stats
4114		 * (including tx) are used used only on the first
4117 #ifndef IFNET_BUF_RING
4122		bytes = sizeof (*ss->fw_stats);
4123		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4124				     sizeof (*ss->fw_stats), 64);
4127		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4128		ksnprintf(ss->tx.lock_name, sizeof(ss->tx.lock_name),
4129			 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
4130		lockinit(&ss->tx.lock, ss->tx.lock_name, 0, LK_CANRECURSE);
4131 #ifdef IFNET_BUF_RING
4132		ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4140	mxge_free_slices(sc);
/*
 * Decide how many slices (RSS queues) to use.  Requires multi-slice
 * to be enabled by tunable, an SMP system, MSI-X vectors, and the RSS
 * firmware to load and answer the probe commands.  The result is
 * capped by MSI-X count, CPU count (or the tunable) and rounded down
 * to a power of two.  On any failure the original firmware is
 * reloaded and the driver falls back to a single slice.
 */
4145 mxge_slice_probe(mxge_softc_t *sc)
4149	int msix_cnt, status, max_intr_slots;
4153	 * don't enable multiple slices if they are not enabled,
4154	 * or if this is not an SMP system
4157	if (mxge_max_slices == 0 || mxge_max_slices == 1 || ncpus < 2)
4160	/* see how many MSI-X interrupts are available */
4161	msix_cnt = pci_msix_count(sc->dev);
4165	/* now load the slice aware firmware see what it supports */
4166	old_fw = sc->fw_name;
4167	if (old_fw == mxge_fw_aligned)
4168		sc->fw_name = mxge_fw_rss_aligned;
4170		sc->fw_name = mxge_fw_rss_unaligned;
4171	status = mxge_load_firmware(sc, 0);
4173		device_printf(sc->dev, "Falling back to a single slice\n");
4177	/* try to send a reset command to the card to see if it
4179	memset(&cmd, 0, sizeof (cmd));
4180	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4182		device_printf(sc->dev, "failed reset\n");
4186	/* get rx ring size */
4187	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4189		device_printf(sc->dev, "Cannot determine rx ring size\n");
4192	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4194	/* tell it the size of the interrupt queues */
4195	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4196	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4198		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4202	/* ask the maximum number of slices it supports */
4203	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4205		device_printf(sc->dev,
4206			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4209	sc->num_slices = cmd.data0;
4210	if (sc->num_slices > msix_cnt)
4211		sc->num_slices = msix_cnt;
4213	if (mxge_max_slices == -1) {
4214		/* cap to number of CPUs in system */
4215		if (sc->num_slices > ncpus)
4216			sc->num_slices = ncpus;
4218		if (sc->num_slices > mxge_max_slices)
4219			sc->num_slices = mxge_max_slices;
4221	/* make sure it is a power of two */
4222	while (sc->num_slices & (sc->num_slices - 1))
4226		device_printf(sc->dev, "using %d slices\n",
	/* fall back to the original (non-RSS) firmware */
4232	sc->fw_name = old_fw;
4233	(void) mxge_load_firmware(sc, 0);
/*
 * Allocate and wire up one MSI-X vector per slice: map the MSI-X
 * table BAR, allocate the vectors, then per-slice IRQ resources and
 * interrupt handlers (mxge_intr).  Unwinds everything on failure via
 * the abort_with_* labels.
 * XXX(review): the bus_setup_intr() calls still carry a literal
 * "XXX serializer" placeholder from the DragonFly port — this file
 * will not compile until a real serializer argument is supplied.
 */
4237 mxge_add_msix_irqs(mxge_softc_t *sc)
4240	int count, err, i, rid;
4243	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4246	if (sc->msix_table_res == NULL) {
4247		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4251	count = sc->num_slices;
4252	err = pci_alloc_msix(sc->dev, &count);
4254		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d"
4255			      "err = %d \n", sc->num_slices, err);
4256		goto abort_with_msix_table;
4258	if (count < sc->num_slices) {
4259		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4260			      count, sc->num_slices);
4261		device_printf(sc->dev,
4262			      "Try setting hw.mxge.max_slices to %d\n",
4265		goto abort_with_msix;
4267	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4268	sc->msix_irq_res = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4269	if (sc->msix_irq_res == NULL) {
4271		goto abort_with_msix;
4274	for (i = 0; i < sc->num_slices; i++) {
4276		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4279		if (sc->msix_irq_res[i] == NULL) {
4280			device_printf(sc->dev, "couldn't allocate IRQ res"
4281				      " for message %d\n", i);
4283			goto abort_with_res;
4287	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4288	sc->msix_ih = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4290	for (i = 0; i < sc->num_slices; i++) {
4291		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4293				     mxge_intr, &sc->ss[i], &sc->msix_ih[i],
4294				     XXX /* serializer */);
4296			device_printf(sc->dev, "couldn't setup intr for "
4298			goto abort_with_intr;
4303		device_printf(sc->dev, "using %d msix IRQs:",
4305		for (i = 0; i < sc->num_slices; i++)
4306			kprintf(" %ld",  rman_get_start(sc->msix_irq_res[i]));
4312	for (i = 0; i < sc->num_slices; i++) {
4313		if (sc->msix_ih[i] != NULL) {
4314			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4316			sc->msix_ih[i] = NULL;
4319	kfree(sc->msix_ih, M_DEVBUF);
4323	for (i = 0; i < sc->num_slices; i++) {
4325		if (sc->msix_irq_res[i] != NULL)
4326			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4327					     sc->msix_irq_res[i]);
4328		sc->msix_irq_res[i] = NULL;
4330	kfree(sc->msix_irq_res, M_DEVBUF);
4334	pci_release_msi(sc->dev);
4336 abort_with_msix_table:
4337	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4338			     sc->msix_table_res);
/*
 * mxge_add_single_irq: single-interrupt fallback for when MSI-X is not
 * used.  Tries to allocate one MSI message; otherwise falls back to
 * legacy INTx.  Allocates the IRQ resource, logs which mode is in use,
 * and hooks mxge_intr with slice 0 (&sc->ss[0]) as its argument.
 *
 * NOTE(review): extraction artifact -- lines carry fused original line
 * numbers and interior lines (rid setup, legacy_irq assignment, braces,
 * returns) are missing; do not compile as-is.
 */
4344 mxge_add_single_irq(mxge_softc_t *sc)
4346 int count, err, rid;
/* Use MSI only when exactly one message is available and allocation
 * succeeds; the (not visible here) else path presumably sets
 * sc->legacy_irq for INTx -- confirm on the full file. */
4348 count = pci_msi_count(sc->dev);
4349 if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4355 sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0,
4356 1, RF_SHAREABLE | RF_ACTIVE);
4357 if (sc->irq_res == NULL) {
4358 device_printf(sc->dev, "could not alloc interrupt\n");
4362 device_printf(sc->dev, "using %s irq %ld\n",
4363 sc->legacy_irq ? "INTx" : "MSI",
4364 rman_get_start(sc->irq_res));
4365 err = bus_setup_intr(sc->dev, sc->irq_res,
4367 mxge_intr, &sc->ss[0], &sc->ih,
/* NOTE(review): literal "XXX" porting placeholder for the DragonFly
 * bus_setup_intr() serializer argument -- will not compile as-is. */
4368 XXX /* serializer */);
/* Failure unwind: rid 0 is the legacy INTx resource, rid 1 the MSI
 * one; MSI additionally needs pci_release_msi(). */
4370 bus_release_resource(sc->dev, SYS_RES_IRQ,
4371 sc->legacy_irq ? 0 : 1, sc->irq_res);
4372 if (!sc->legacy_irq)
4373 pci_release_msi(sc->dev);
/*
 * mxge_rem_msix_irqs: undo mxge_add_msix_irqs in reverse order --
 * tear down each established handler, free the cookie array, release
 * each per-slice IRQ resource, free that array, release the mapped
 * MSI-X table BAR, and release the MSI-X vectors.  Mirrors the error
 * unwind inside mxge_add_msix_irqs.
 *
 * NOTE(review): extraction artifact -- fused line numbers, missing
 * braces/declarations (e.g. the loop variables and the rid setup are
 * not visible); do not compile as-is.
 */
4379 mxge_rem_msix_irqs(mxge_softc_t *sc)
4383 for (i = 0; i < sc->num_slices; i++) {
4384 if (sc->msix_ih[i] != NULL) {
4385 bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
/* NULL the cookie so a second teardown pass is harmless. */
4387 sc->msix_ih[i] = NULL;
4390 kfree(sc->msix_ih, M_DEVBUF);
4392 for (i = 0; i < sc->num_slices; i++) {
4394 if (sc->msix_irq_res[i] != NULL)
4395 bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4396 sc->msix_irq_res[i]);
4397 sc->msix_irq_res[i] = NULL;
4399 kfree(sc->msix_irq_res, M_DEVBUF);
4401 bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4402 sc->msix_table_res);
4404 pci_release_msi(sc->dev);
/*
 * mxge_rem_single_irq: undo mxge_add_single_irq -- tear down the single
 * interrupt handler, release the IRQ resource (rid 0 for legacy INTx,
 * rid 1 for MSI), and release the MSI message when one was allocated.
 *
 * NOTE(review): extraction artifact -- the function's braces are on
 * missing lines; code text is otherwise as extracted.
 */
4409 mxge_rem_single_irq(mxge_softc_t *sc)
4411 bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4412 bus_release_resource(sc->dev, SYS_RES_IRQ,
4413 sc->legacy_irq ? 0 : 1, sc->irq_res);
4414 if (!sc->legacy_irq)
4415 pci_release_msi(sc->dev);
/*
 * mxge_rem_irq: dispatch to the matching teardown path -- MSI-X when
 * more than one slice is active, the single-IRQ path otherwise.
 *
 * NOTE(review): extraction artifact -- the braces and the `else`
 * keyword are on missing lines.
 */
4419 mxge_rem_irq(mxge_softc_t *sc)
4421 if (sc->num_slices > 1)
4422 mxge_rem_msix_irqs(sc);
4424 mxge_rem_single_irq(sc);
/*
 * mxge_add_irq: dispatch interrupt setup -- MSI-X per-slice vectors when
 * more than one slice is configured, a single MSI/INTx otherwise.
 *
 * NOTE(review): extraction artifact -- braces, the `else`, and the
 * return statement are on missing lines.
 */
4428 mxge_add_irq(mxge_softc_t *sc)
4432 if (sc->num_slices > 1)
4433 err = mxge_add_msix_irqs(sc);
4435 err = mxge_add_single_irq(sc);
/* NOTE(review): `0 &&` makes this retry branch dead code; it looks like
 * a deliberately disabled debug/rebind path (tear down and re-add the
 * MSI-X IRQs after a successful setup) -- confirm intent before removal. */
4437 if (0 && err == 0 && sc->num_slices > 1) {
4438 mxge_rem_msix_irqs(sc);
4439 err = mxge_add_msix_irqs(sc);
/*
 * mxge_attach: newbus attach method.  Brings the NIC up far enough to
 * register a network interface: creates the parent DMA tag, initializes
 * locks and the tick callout, maps the board's SRAM BAR, copies out and
 * parses the EEPROM strings (MAC address etc.), allocates command /
 * zeropad / dmabench DMA regions, selects and loads firmware, probes
 * slices, allocates slices and rings, hooks interrupts, fills in the
 * ifnet capabilities/handlers, and ether_ifattach()es the interface.
 * Failures unwind through the abort_with_* label chain in reverse order
 * of acquisition.
 *
 * NOTE(review): extraction artifact -- every line carries a fused
 * original line number and a large number of interior lines (err/rid
 * declarations, if-conditions, braces, several abort labels, the return)
 * are missing; the text below must not be compiled as-is.
 */
4446 mxge_attach(device_t dev)
4448 mxge_softc_t *sc = device_get_softc(dev);
4449 struct ifnet *ifp = &sc->arpcom.ac_if;
4453 * avoid rewriting half the lines in this file to use
4454 * &sc->arpcom.ac_if instead
4458 mxge_fetch_tunables(sc);
/* Parent DMA tag bounding all of the driver's later allocations:
 * 64KB+256 max transfer, MXGE_MAX_SEND_DESC segments of up to 64KB. */
4460 err = bus_dma_tag_create(NULL, /* parent */
4463 BUS_SPACE_MAXADDR, /* low */
4464 BUS_SPACE_MAXADDR, /* high */
4465 NULL, NULL, /* filter */
4466 65536 + 256, /* maxsize */
4467 MXGE_MAX_SEND_DESC, /* num segs */
4468 65536, /* maxsegsize */
4470 &sc->parent_dmat); /* tag */
4473 device_printf(sc->dev, "Err %d allocating parent dmat\n",
4475 goto abort_with_nothing;
4479 if_initname(ifp, device_get_name(dev), device_get_unit(dev));
/* Named locks: cmd_lock serializes firmware commands, driver_lock the
 * driver as a whole (see mxge_detach). */
4481 ksnprintf(sc->cmd_lock_name, sizeof(sc->cmd_lock_name), "%s:cmd",
4482 device_get_nameunit(dev));
4483 lockinit(&sc->cmd_lock, sc->cmd_lock_name, 0, LK_CANRECURSE);
4484 ksnprintf(sc->driver_lock_name, sizeof(sc->driver_lock_name),
4485 "%s:drv", device_get_nameunit(dev));
4486 lockinit(&sc->driver_lock, sc->driver_lock_name,
4489 callout_init(&sc->co_hdl);
4491 mxge_setup_cfg_space(sc);
4493 /* Map the board into the kernel */
4495 sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
4497 if (sc->mem_res == NULL) {
4498 device_printf(dev, "could not map memory\n");
4500 goto abort_with_lock;
4502 sc->sram = rman_get_virtual(sc->mem_res);
/* 2MB BAR minus firmware-reserved regions minus a 0x100 guard; sanity
 * check it against the size the bus actually granted. */
4503 sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4504 if (sc->sram_size > rman_get_size(sc->mem_res)) {
4505 device_printf(dev, "impossible memory region size %ld\n",
4506 rman_get_size(sc->mem_res));
4508 goto abort_with_mem_res;
4511 /* make NULL terminated copy of the EEPROM strings section of
4513 bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
/* The strings live at the very end of SRAM; the -2 keeps the buffer's
 * final bytes as the NUL terminators established by the bzero above. */
4514 bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4515 rman_get_bushandle(sc->mem_res),
4516 sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4518 MXGE_EEPROM_STRINGS_SIZE - 2);
4519 err = mxge_parse_strings(sc);
4521 goto abort_with_mem_res;
4523 /* Enable write combining for efficient use of PCIe bus */
4526 /* Allocate the out of band dma memory */
4527 err = mxge_dma_alloc(sc, &sc->cmd_dma,
4528 sizeof (mxge_cmd_t), 64);
4530 goto abort_with_mem_res;
4531 sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
/* 64-byte zero pad used to round out short transmits. */
4532 err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4534 goto abort_with_cmd_dma;
/* 4KB scratch page for the firmware's DMA benchmark. */
4536 err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4538 goto abort_with_zeropad_dma;
4540 /* select & load the firmware */
4541 err = mxge_select_firmware(sc);
4543 goto abort_with_dmabench;
4544 sc->intr_coal_delay = mxge_intr_coal_delay;
4546 mxge_slice_probe(sc);
4547 err = mxge_alloc_slices(sc);
4549 goto abort_with_dmabench;
4551 err = mxge_reset(sc, 0);
4553 goto abort_with_slices;
4555 err = mxge_alloc_rings(sc);
/* NOTE(review): on ring-allocation failure this jumps to
 * abort_with_dmabench, apparently leaking the slices allocated just
 * above (abort_with_slices would free them) -- confirm against the
 * full file whether a missing line frees the slices first. */
4557 device_printf(sc->dev, "failed to allocate rings\n");
4558 goto abort_with_dmabench;
4561 err = mxge_add_irq(sc);
4563 device_printf(sc->dev, "failed to add irq\n");
4564 goto abort_with_rings;
/* Advertise hardware offloads; jumbo support depends on firmware. */
4567 ifp->if_baudrate = IF_Gbps(10UL);
4568 ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4571 ifp->if_capabilities |= IFCAP_LRO;
4574 #ifdef MXGE_NEW_VLAN_API
4575 ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4578 sc->max_mtu = mxge_max_mtu(sc);
4579 if (sc->max_mtu >= 9000)
4580 ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4582 device_printf(dev, "MTU limited to %d. Install "
4583 "latest firmware for 9000 byte jumbo support\n",
4584 sc->max_mtu - ETHER_HDR_LEN);
4585 ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4586 ifp->if_capenable = ifp->if_capabilities;
4587 if (sc->lro_cnt == 0)
4588 ifp->if_capenable &= ~IFCAP_LRO;
/* Wire up the ifnet entry points. */
4590 ifp->if_init = mxge_init;
4592 ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4593 ifp->if_ioctl = mxge_ioctl;
4594 ifp->if_start = mxge_start;
4595 /* Initialise the ifmedia structure */
4596 ifmedia_init(&sc->media, 0, mxge_media_change,
4598 mxge_set_media(sc, IFM_ETHER | IFM_AUTO);
4599 mxge_media_probe(sc);
4601 ether_ifattach(ifp, sc->mac_addr);
4602 /* ether_ifattach sets mtu to ETHERMTU */
4603 if (mxge_initial_mtu != ETHERMTU)
4604 mxge_change_mtu(sc, mxge_initial_mtu);
4606 mxge_add_sysctls(sc);
4607 #ifdef IFNET_BUF_RING
4608 ifp->if_transmit = mxge_transmit;
4609 ifp->if_qflush = mxge_qflush;
/* Error unwind chain: each label releases one acquisition and falls
 * through to the next (several label lines are missing from this view). */
4614 mxge_free_rings(sc);
4616 mxge_free_slices(sc);
4617 abort_with_dmabench:
4618 mxge_dma_free(&sc->dmabench_dma);
4619 abort_with_zeropad_dma:
4620 mxge_dma_free(&sc->zeropad_dma);
4622 mxge_dma_free(&sc->cmd_dma);
4624 bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4626 pci_disable_busmaster(dev);
4627 lockuninit(&sc->cmd_lock);
4628 lockuninit(&sc->driver_lock);
4630 abort_with_parent_dmat:
4631 bus_dma_tag_destroy(sc->parent_dmat);
/*
 * mxge_detach: newbus detach method -- mirror image of mxge_attach.
 * Refuses to detach while VLANs are attached, stops the running
 * interface under driver_lock, detaches from the network stack, drains
 * the tick callout, then releases media, sysctls, rings, slices, DMA
 * regions, the SRAM BAR, busmastering, locks, and the parent DMA tag.
 *
 * NOTE(review): extraction artifact -- fused line numbers, and interior
 * lines (braces, the mxge_close() call implied by the IFF_RUNNING test,
 * IRQ/firmware teardown, the return) are missing; do not compile as-is.
 */
4638 mxge_detach(device_t dev)
4640 mxge_softc_t *sc = device_get_softc(dev);
/* Detaching with VLANs still configured would leave dangling ifnets. */
4642 if (mxge_vlans_active(sc)) {
4643 device_printf(sc->dev,
4644 "Detach vlans before removing module\n");
/* Stop the interface under the driver lock before stack detach. */
4647 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
4649 if (sc->ifp->if_flags & IFF_RUNNING)
4651 lockmgr(&sc->driver_lock, LK_RELEASE);
4652 ether_ifdetach(sc->ifp);
/* Drain after ifdetach so no new ticks can be scheduled. */
4653 callout_drain(&sc->co_hdl);
4654 ifmedia_removeall(&sc->media);
/* Tell the firmware to stop its dummy RDMA (0 = disable). */
4655 mxge_dummy_rdma(sc, 0);
4656 mxge_rem_sysctls(sc);
4658 mxge_free_rings(sc);
4659 mxge_free_slices(sc);
4660 mxge_dma_free(&sc->dmabench_dma);
4661 mxge_dma_free(&sc->zeropad_dma);
4662 mxge_dma_free(&sc->cmd_dma);
4663 bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4664 pci_disable_busmaster(dev);
4665 lockuninit(&sc->cmd_lock);
4666 lockuninit(&sc->driver_lock);
4668 bus_dma_tag_destroy(sc->parent_dmat);
4673 mxge_shutdown(device_t dev)
4679 This file uses Myri10GE driver indentation.
4682 c-file-style:"linux"