1 /******************************************************************************
3 Copyright (c) 2006-2009, Myricom Inc.
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
9 1. Redistributions of source code must retain the above copyright notice,
10 this list of conditions and the following disclaimer.
12 2. Neither the name of the Myricom Inc, nor the names of its
13 contributors may be used to endorse or promote products derived from
14 this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
28 ***************************************************************************/
30 #include <sys/cdefs.h>
31 /*__FBSDID("$FreeBSD: src/sys/dev/mxge/if_mxge.c,v 1.63 2009/06/26 11:45:06 rwatson Exp $");*/
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/linker.h>
36 #include <sys/firmware.h>
37 #include <sys/endian.h>
38 #include <sys/sockio.h>
40 #include <sys/malloc.h>
41 #include <sys/kernel.h>
43 #include <sys/module.h>
44 #include <sys/socket.h>
45 #include <sys/sysctl.h>
47 /* count xmits ourselves, rather than via drbr */
50 #include <net/if_arp.h>
51 #include <net/ifq_var.h>
52 #include <net/ethernet.h>
53 #include <net/if_dl.h>
54 #include <net/if_media.h>
58 #include <net/if_types.h>
59 #include <net/vlan/if_vlan_var.h>
62 #include <netinet/in_systm.h>
63 #include <netinet/in.h>
64 #include <netinet/ip.h>
65 #include <netinet/tcp.h>
67 #include <machine/resource.h>
71 #include <bus/pci/pcireg.h>
72 #include <bus/pci/pcivar.h>
73 #include <bus/pci/pci_private.h> /* XXX for pci_cfg_restore */
75 #include <vm/vm.h> /* for pmap_mapdev() */
78 #if defined(__i386) || defined(__amd64)
79 #include <machine/specialreg.h>
82 #include <dev/netif/mxge/mxge_mcp.h>
83 #include <dev/netif/mxge/mcp_gen_header.h>
84 /*#define MXGE_FAKE_IFP*/
85 #include <dev/netif/mxge/if_mxge_var.h>
87 #include <sys/buf_ring.h>
/*
 * Driver-wide tunables and firmware image names.
 * NOTE(review): this listing appears to be a sampled copy of if_mxge.c;
 * interior lines are missing throughout the file.
 */
93 static int mxge_nvidia_ecrc_enable = 1;	/* try to enable ECRC on nVidia bridges */
94 static int mxge_force_firmware = 0;	/* 0 = autodetect; nonzero forces aligned/unaligned fw */
95 static int mxge_intr_coal_delay = 30;	/* interrupt coalescing delay, usecs */
96 static int mxge_deassert_wait = 1;	/* wait for IRQ line to drop in handler */
97 static int mxge_flow_control = 1;	/* pause-frame flow control on by default */
98 static int mxge_verbose = 0;
99 static int mxge_lro_cnt = 8;
100 static int mxge_ticks;
101 static int mxge_max_slices = 1;
102 static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
103 static int mxge_always_promisc = 0;
104 static int mxge_initial_mtu = ETHERMTU_JUMBO;
/* firmware images: "ethp" variants tolerate unaligned PCIe completions */
105 static char *mxge_fw_unaligned = "mxge_ethp_z8e";
106 static char *mxge_fw_aligned = "mxge_eth_z8e";
107 static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
108 static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
110 static int mxge_probe(device_t dev);
111 static int mxge_attach(device_t dev);
112 static int mxge_detach(device_t dev);
113 static int mxge_shutdown(device_t dev);
114 static void mxge_intr(void *arg);
/*
 * newbus glue: device method table, driver declaration, and module
 * dependencies (firmware(9) for image loading, zlib for decompression).
 * NOTE(review): initializer braces are missing from this capture.
 */
116 static device_method_t mxge_methods[] =
118 /* Device interface */
119 DEVMETHOD(device_probe, mxge_probe),
120 DEVMETHOD(device_attach, mxge_attach),
121 DEVMETHOD(device_detach, mxge_detach),
122 DEVMETHOD(device_shutdown, mxge_shutdown),
126 static driver_t mxge_driver =
130 sizeof(mxge_softc_t),
133 static devclass_t mxge_devclass;
135 /* Declare ourselves to be a child of the PCI bus.*/
136 DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
137 MODULE_DEPEND(mxge, firmware, 1, 1, 1);
138 MODULE_DEPEND(mxge, zlib, 1, 1, 1);
140 static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
141 static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
142 static int mxge_close(mxge_softc_t *sc);
143 static int mxge_open(mxge_softc_t *sc);
144 static void mxge_tick(void *arg);
/*
 * Probe: match Myricom Z8E / Z8E_9 PCI IDs and set a description based
 * on the PCI revision id.  NOTE(review): the switch braces, default-case
 * structure and return statements are missing from this capture.
 */
147 mxge_probe(device_t dev)
152 if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
153 ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
154 (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
155 rev = pci_get_revid(dev);
157 case MXGE_PCI_REV_Z8E:
158 device_set_desc(dev, "Myri10G-PCIE-8A");
160 case MXGE_PCI_REV_Z8ES:
161 device_set_desc(dev, "Myri10G-PCIE-8B");
164 device_set_desc(dev, "Myri10G-PCIE-8??");
165 device_printf(dev, "Unrecognized rev %d NIC\n",
/*
 * Enable write-combining on the NIC SRAM mapping (x86/amd64 only) via
 * pmap_change_attr(); on other arches sc->wc stays 0.
 */
175 mxge_enable_wc(mxge_softc_t *sc)
178 #if defined(__i386) || defined(__amd64)
183 len = rman_get_size(sc->mem_res);
184 err = pmap_change_attr((vm_offset_t) sc->sram,
185 len, PAT_WRITE_COMBINING);
187 device_printf(sc->dev, "pmap_change_attr failed, %d\n",
193 sc->wc = 0; /* TBD: PAT support */
198 /* callback to get our DMA address */
/* bus_dmamap_load() callback: stores the single segment's bus address
 * into the caller-supplied bus_addr_t (arg). */
200 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
204 *(bus_addr_t *) arg = segs->ds_addr;
/*
 * Allocate a coherent DMA region: create a tag, allocate zeroed memory,
 * and load the map so dma->bus_addr holds the device-visible address.
 * On failure unwinds via the abort_with_* labels.
 * NOTE(review): several declarations/braces are missing from this capture.
 */
209 mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
210 bus_size_t alignment)
213 device_t dev = sc->dev;
214 bus_size_t boundary, maxsegsize;
216 if (bytes > 4096 && alignment == 4096) {
224 /* allocate DMAable memory tags */
225 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
226 alignment, /* alignment */
227 boundary, /* boundary */
228 BUS_SPACE_MAXADDR, /* low */
229 BUS_SPACE_MAXADDR, /* high */
230 NULL, NULL, /* filter */
233 maxsegsize, /* maxsegsize */
234 BUS_DMA_COHERENT, /* flags */
235 &dma->dmat); /* tag */
237 device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
241 /* allocate DMAable memory & map */
242 err = bus_dmamem_alloc(dma->dmat, &dma->addr,
243 (BUS_DMA_WAITOK | BUS_DMA_COHERENT
244 | BUS_DMA_ZERO), &dma->map);
246 device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
247 goto abort_with_dmat;
250 /* load the memory */
251 err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
252 mxge_dmamap_callback,
253 (void *)&dma->bus_addr, 0);
255 device_printf(dev, "couldn't load map (err = %d)\n", err);
/* error unwind: free memory, then destroy the tag */
261 bus_dmamem_free(dma->dmat, dma->addr, dma->map);
263 (void)bus_dma_tag_destroy(dma->dmat);
/* Release a region allocated by mxge_dma_alloc(): unload, free, destroy tag. */
269 mxge_dma_free(mxge_dma_t *dma)
271 bus_dmamap_unload(dma->dmat, dma->map);
272 bus_dmamem_free(dma->dmat, dma->addr, dma->map);
273 (void)bus_dma_tag_destroy(dma->dmat);
277 * The eeprom strings on the lanaiX have the format
/*
 * Parse the NUL-separated EEPROM strings: "MAC=" (six hex octets into
 * sc->mac_addr), "PC=" (product code), "SN=" (serial number).
 * NOTE(review): strncpy() here does not guarantee NUL termination if the
 * source fills the buffer; the destination buffers are presumably
 * pre-zeroed in the softc — confirm against attach.
 */
284 mxge_parse_strings(mxge_softc_t *sc)
286 #define MXGE_NEXT_STRING(p) while(ptr < limit && *ptr++)
291 ptr = sc->eeprom_strings;
292 limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
294 while (ptr < limit && *ptr != '\0') {
295 if (memcmp(ptr, "MAC=", 4) == 0) {
297 sc->mac_addr_string = ptr;
298 for (i = 0; i < 6; i++) {
300 if ((ptr + 2) > limit)
302 sc->mac_addr[i] = strtoul(ptr, NULL, 16);
305 } else if (memcmp(ptr, "PC=", 3) == 0) {
307 strncpy(sc->product_code_string, ptr,
308 sizeof (sc->product_code_string) - 1);
309 } else if (memcmp(ptr, "SN=", 3) == 0) {
311 strncpy(sc->serial_number_string, ptr,
312 sizeof (sc->serial_number_string) - 1);
314 MXGE_NEXT_STRING(ptr);
321 device_printf(sc->dev, "failed to parse eeprom_strings\n");
326 #if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
/*
 * Enable ECRC generation on an upstream nVidia (vendor 0x10de) PCIe
 * bridge so the NIC sees aligned completions.  Because the OS cannot
 * reach extended (>0xff) config space here, the bridge's config window
 * is mapped directly with pmap_mapdev() and register 0x178 is written
 * through that mapping.  NOTE(review): base-address computation lines
 * and several writes are missing from this capture.
 */
328 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
331 unsigned long base, off;
333 device_t pdev, mcp55;
334 uint16_t vendor_id, device_id, word;
335 uintptr_t bus, slot, func, ivend, idev;
339 if (!mxge_nvidia_ecrc_enable)
342 pdev = device_get_parent(device_get_parent(sc->dev));
344 device_printf(sc->dev, "could not find parent?\n");
347 vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
348 device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
350 if (vendor_id != 0x10de)
355 if (device_id == 0x005d) {
356 /* ck804, base address is magic */
358 } else if (device_id >= 0x0374 && device_id <= 0x378) {
359 /* mcp55, base address stored in chipset */
360 mcp55 = pci_find_bsf(0, 0, 0);
362 0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
363 0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
364 word = pci_read_config(mcp55, 0x90, 2);
365 base = ((unsigned long)word & 0x7ffeU) << 25;
372 Test below is commented because it is believed that doing
373 config read/write beyond 0xff will access the config space
374 for the next larger function. Uncomment this and remove
375 the hacky pmap_mapdev() way of accessing config space when
376 FreeBSD grows support for extended pcie config space access
379 /* See if we can, by some miracle, access the extended
381 val = pci_read_config(pdev, 0x178, 4);
382 if (val != 0xffffffff) {
384 pci_write_config(pdev, 0x178, val, 4);
388 /* Rather than using normal pci config space writes, we must
389 * map the Nvidia config space ourselves. This is because on
390 * opteron/nvidia class machine the 0xe000000 mapping is
391 * handled by the nvidia chipset, that means the internal PCI
392 * device (the on-chip northbridge), or the amd-8131 bridge
393 * and things behind them are not visible by this method.
396 BUS_READ_IVAR(device_get_parent(pdev), pdev,
398 BUS_READ_IVAR(device_get_parent(pdev), pdev,
399 PCI_IVAR_SLOT, &slot);
400 BUS_READ_IVAR(device_get_parent(pdev), pdev,
401 PCI_IVAR_FUNCTION, &func);
402 BUS_READ_IVAR(device_get_parent(pdev), pdev,
403 PCI_IVAR_VENDOR, &ivend);
404 BUS_READ_IVAR(device_get_parent(pdev), pdev,
405 PCI_IVAR_DEVICE, &idev);
/* compute the config-space offset for this bus/slot/function */
408 + 0x00100000UL * (unsigned long)bus
409 + 0x00001000UL * (unsigned long)(func
412 /* map it into the kernel */
413 va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
417 device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
420 /* get a pointer to the config space mapped into the kernel */
421 cfgptr = va + (off & PAGE_MASK);
423 /* make sure that we can really access it */
424 vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
425 device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
426 if (! (vendor_id == ivend && device_id == idev)) {
427 device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
428 vendor_id, device_id);
429 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
433 ptr32 = (uint32_t*)(cfgptr + 0x178);
436 if (val == 0xffffffff) {
437 device_printf(sc->dev, "extended mapping failed\n");
438 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
442 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
444 device_printf(sc->dev,
445 "Enabled ECRC on upstream Nvidia bridge "
447 (int)bus, (int)slot, (int)func);
/* Non-x86 stub: ECRC enabling is only implemented for i386/amd64. */
452 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
454 device_printf(sc->dev,
455 "Nforce 4 chipset on non-x86/amd64!?!?!\n");
/*
 * Run the firmware DMA benchmark against the dmabench buffer.
 * data2's multiplier selects read (0x10000), write (0x1) or
 * read+write (0x10001); firmware returns transfers<<16 | ticks(0.5us),
 * from which read_dma / write_dma / read_write_dma MB/s are derived.
 */
462 mxge_dma_test(mxge_softc_t *sc, int test_type)
465 bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
471 /* Run a small DMA test.
472 * The magic multipliers to the length tell the firmware
473 * to do DMA read, write, or read+write tests. The
474 * results are returned in cmd.data0. The upper 16
475 * bits of the return is the number of transfers completed.
476 * The lower 16 bits is the time in 0.5us ticks that the
477 * transfers took to complete.
480 len = sc->tx_boundary;
482 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
483 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
484 cmd.data2 = len * 0x10000;
485 status = mxge_send_cmd(sc, test_type, &cmd);
490 sc->read_dma = ((cmd.data0>>16) * len * 2) /
491 (cmd.data0 & 0xffff);
492 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
493 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
494 cmd.data2 = len * 0x1;
495 status = mxge_send_cmd(sc, test_type, &cmd);
500 sc->write_dma = ((cmd.data0>>16) * len * 2) /
501 (cmd.data0 & 0xffff);
503 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
504 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
505 cmd.data2 = len * 0x10001;
506 status = mxge_send_cmd(sc, test_type, &cmd);
511 sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
512 (cmd.data0 & 0xffff);
/* unaligned-test failures are expected on some chipsets, so only
 * report failures for the regular benchmark */
515 if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
516 device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
523 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
524 * when the PCI-E Completion packets are aligned on an 8-byte
525 * boundary. Some PCI-E chip sets always align Completion packets; on
526 * the ones that do not, the alignment can be enforced by enabling
527 * ECRC generation (if supported).
529 * When PCI-E Completion packets are not aligned, it is actually more
530 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
532 * If the driver can neither enable ECRC nor verify that it has
533 * already been enabled, then it must use a firmware image which works
534 * around unaligned completion packets (ethp_z8e.dat), and it should
535 * also ensure that it never gives the device a Read-DMA which is
536 * larger than 2KB by setting the tx_boundary to 2KB. If ECRC is
537 * enabled, then the driver should use the aligned (eth_z8e.dat)
538 * firmware image, and set tx_boundary to 4KB.
/*
 * Decide whether the aligned firmware works on this host: verify the
 * PCIe Max Read Request size is 4KB, load the aligned image, try to
 * enable ECRC, then run the unaligned-completion DMA test.  Returns 0
 * if the aligned firmware may be kept.
 */
542 mxge_firmware_probe(mxge_softc_t *sc)
544 device_t dev = sc->dev;
548 sc->tx_boundary = 4096;
550 * Verify the max read request size was set to 4KB
551 * before trying the test with 4KB.
553 if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
554 pectl = pci_read_config(dev, reg + 0x8, 2);
555 if ((pectl & (5 << 12)) != (5 << 12)) {
556 device_printf(dev, "Max Read Req. size != 4k (0x%x\n",
558 sc->tx_boundary = 2048;
563 * load the optimized firmware (which assumes aligned PCIe
564 * completions) in order to see if it works on this host.
566 sc->fw_name = mxge_fw_aligned;
567 status = mxge_load_firmware(sc, 1);
573 * Enable ECRC if possible
575 mxge_enable_nvidia_ecrc(sc);
578 * Run a DMA test which watches for unaligned completions and
579 * aborts on the first one seen.
582 status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
584 return 0; /* keep the aligned firmware */
587 device_printf(dev, "DMA test failed: %d\n", status);
588 if (status == ENOSYS)
589 device_printf(dev, "Falling back to ethp! "
590 "Please install up to date fw\n");
/*
 * Choose aligned vs unaligned firmware: honor mxge_force_firmware,
 * accept aligned on narrow (<=x4) links, otherwise probe; then set
 * fw_name / tx_boundary accordingly and load the chosen image.
 */
595 mxge_select_firmware(mxge_softc_t *sc)
600 if (mxge_force_firmware != 0) {
601 if (mxge_force_firmware == 1)
606 device_printf(sc->dev,
607 "Assuming %s completions (forced)\n",
608 aligned ? "aligned" : "unaligned");
612 /* if the PCIe link width is 4 or less, we can use the aligned
613 firmware and skip any checks */
614 if (sc->link_width != 0 && sc->link_width <= 4) {
615 device_printf(sc->dev,
616 "PCIe x%d Link, expect reduced performance\n",
622 if (0 == mxge_firmware_probe(sc))
/* aligned path: 4KB read-DMA boundary; unaligned: limit to 2KB */
627 sc->fw_name = mxge_fw_aligned;
628 sc->tx_boundary = 4096;
630 sc->fw_name = mxge_fw_unaligned;
631 sc->tx_boundary = 2048;
633 return (mxge_load_firmware(sc, 0));
/*
 * Validate a firmware header: check MCP type, record the version string
 * for sysctl, parse major/minor/tiny, and warn when the major.minor
 * does not match what the driver was built against.
 */
643 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
647 if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
648 device_printf(sc->dev, "Bad firmware type: 0x%x\n",
649 be32toh(hdr->mcp_type));
653 /* save firmware version for sysctl */
654 strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
656 device_printf(sc->dev, "firmware id: %s\n", hdr->version);
658 ksscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
659 &sc->fw_ver_minor, &sc->fw_ver_tiny);
661 if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
662 && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
663 device_printf(sc->dev, "Found firmware version %s\n",
665 device_printf(sc->dev, "Driver needs %d.%d\n",
666 MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
/* zlib allocator hooks backed by kmalloc/kfree(M_TEMP).
 * NOTE(review): items*size could overflow u_int in theory; inputs come
 * from zlib's own bounded requests. */
674 z_alloc(void *nil, u_int items, u_int size)
678 ptr = kmalloc(items * size, M_TEMP, M_NOWAIT);
683 z_free(void *nil, void *ptr)
/*
 * Locate the firmware(9) image named sc->fw_name, inflate it with zlib
 * (the uncompressed size is stashed in fw->version), validate its MCP
 * header, and PIO-copy it into NIC SRAM at MXGE_FW_OFFSET in 256-byte
 * chunks.  Unwinds via abort_with_* labels on error.
 */
690 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
693 const mcp_gen_header_t *hdr;
700 fw = firmware_image_load(sc->fw_name, NULL);
702 device_printf(sc->dev, "Could not find firmware image %s\n",
707 /* setup zlib and decompress f/w */
708 bzero(&zs, sizeof (zs));
711 status = inflateInit(&zs);
712 if (status != Z_OK) {
717 /* the uncompressed size is stored as the firmware version,
718 which would otherwise go unused */
719 fw_len = (size_t) fw->version;
720 inflate_buffer = kmalloc(fw_len, M_TEMP, M_NOWAIT);
721 if (inflate_buffer == NULL)
723 zs.avail_in = fw->datasize;
724 zs.next_in = __DECONST(char *, fw->data);
725 zs.avail_out = fw_len;
726 zs.next_out = inflate_buffer;
727 status = inflate(&zs, Z_FINISH);
728 if (status != Z_STREAM_END) {
729 device_printf(sc->dev, "zlib %d\n", status);
731 goto abort_with_buffer;
/* check the firmware header at the pointer stored in the image */
735 hdr_offset = htobe32(*(const uint32_t *)
736 (fw->fw_image + MCP_HEADER_PTR_OFFSET));
737 if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
738 device_printf(sc->dev, "Bad firmware file");
742 hdr = (const void*)(fw->fw_image + hdr_offset);
744 status = mxge_validate_firmware(sc, hdr);
748 /* Copy the inflated firmware to NIC SRAM. */
749 for (i = 0; i < fw_len; i += 256) {
750 mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
752 min(256U, (unsigned)(fw_len - i)));
/* cleanup: free the inflate buffer and release the firmware image */
762 kfree(inflate_buffer, M_TEMP);
767 firmware_image_unload(fw);
772 * Enable or disable periodic RDMAs from the host to make certain
773 * chipsets resend dropped PCIe messages
/*
 * Build an 8-byte-aligned command buffer on the stack, PIO-copy it to
 * the boot dummy-RDMA mailbox, and poll the confirmation word (written
 * by firmware as 0xffffffff) for up to ~20 iterations.
 */
777 mxge_dummy_rdma(mxge_softc_t *sc, int enable)
780 volatile uint32_t *confirm;
781 volatile char *submit;
782 uint32_t *buf, dma_low, dma_high;
785 buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
787 /* clear confirmation addr */
788 confirm = (volatile uint32_t *)sc->cmd;
792 /* send an rdma command to the PCIe engine, and wait for the
793 response in the confirmation address. The firmware should
794 write a -1 there to indicate it is alive and well
797 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
798 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
799 buf[0] = htobe32(dma_high); /* confirm addr MSW */
800 buf[1] = htobe32(dma_low); /* confirm addr LSW */
801 buf[2] = htobe32(0xffffffff); /* confirm data */
802 dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
803 dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
804 buf[3] = htobe32(dma_high); /* dummy addr MSW */
805 buf[4] = htobe32(dma_low); /* dummy addr LSW */
806 buf[5] = htobe32(enable); /* enable? */
809 submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
811 mxge_pio_copy(submit, buf, 64);
/* poll for firmware's -1 confirmation */
816 while (*confirm != 0xffffffff && i < 20) {
820 if (*confirm != 0xffffffff) {
821 device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
822 (enable ? "enable" : "disable"), confirm,
/*
 * Send one command to the running firmware: marshal data0-2 and the
 * response DMA address big-endian into an aligned mcp_cmd_t, PIO-copy
 * it to the MXGEFW_ETH_CMD mailbox under cmd_lock, then poll the
 * DMA'd response for up to ~20ms, translating firmware result codes.
 */
829 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
832 char buf_bytes[sizeof(*buf) + 8];
833 volatile mcp_cmd_response_t *response = sc->cmd;
834 volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
835 uint32_t dma_low, dma_high;
836 int err, sleep_total = 0;
838 /* ensure buf is aligned to 8 bytes */
839 buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
841 buf->data0 = htobe32(data->data0);
842 buf->data1 = htobe32(data->data1);
843 buf->data2 = htobe32(data->data2);
844 buf->cmd = htobe32(cmd);
845 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
846 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
848 buf->response_addr.low = htobe32(dma_low);
849 buf->response_addr.high = htobe32(dma_high);
850 lockmgr(&sc->cmd_lock, LK_EXCLUSIVE);
851 response->result = 0xffffffff;
853 mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
855 /* wait up to 20ms */
857 for (sleep_total = 0; sleep_total < 20; sleep_total++) {
858 bus_dmamap_sync(sc->cmd_dma.dmat,
859 sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
861 switch (be32toh(response->result)) {
863 data->data0 = be32toh(response->data);
/* map firmware error codes to errno-style returns */
869 case MXGEFW_CMD_UNKNOWN:
872 case MXGEFW_CMD_ERROR_UNALIGNED:
875 case MXGEFW_CMD_ERROR_BUSY:
879 device_printf(sc->dev,
881 "failed, result = %d\n",
882 cmd, be32toh(response->result));
/* timed out waiting for a response */
890 device_printf(sc->dev, "mxge: command %d timed out"
892 cmd, be32toh(response->result));
893 lockmgr(&sc->cmd_lock, LK_RELEASE);
/*
 * Adopt the firmware already running on the NIC (e.g. loaded by a prior
 * driver): read its header pointer from SRAM, copy the header to host
 * memory, validate it, and flag fw 1.4.4-1.4.11 for the rx-filter bug
 * workaround (those versions filter broadcasts unless in ALLMULTI).
 */
898 mxge_adopt_running_firmware(mxge_softc_t *sc)
900 struct mcp_gen_header *hdr;
901 const size_t bytes = sizeof (struct mcp_gen_header);
905 /* find running firmware header */
906 hdr_offset = htobe32(*(volatile uint32_t *)
907 (sc->sram + MCP_HEADER_PTR_OFFSET));
909 if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
910 device_printf(sc->dev,
911 "Running firmware has bad header offset (%d)\n",
916 /* copy header of running firmware from SRAM to host memory to
917 * validate firmware */
918 hdr = kmalloc(bytes, M_DEVBUF, M_NOWAIT);
920 device_printf(sc->dev, "could not kmalloc firmware hdr\n");
923 bus_space_read_region_1(rman_get_bustag(sc->mem_res),
924 rman_get_bushandle(sc->mem_res),
925 hdr_offset, (char *)hdr, bytes);
926 status = mxge_validate_firmware(sc, hdr);
927 kfree(hdr, M_DEVBUF);
930 * check to see if adopted firmware has bug where adopting
931 * it will cause broadcasts to be filtered unless the NIC
932 * is kept in ALLMULTI mode
934 if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
935 sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
936 sc->adopted_rx_filter_bug = 1;
937 device_printf(sc->dev, "Adopting fw %d.%d.%d: "
938 "working around rx filter bug\n",
939 sc->fw_ver_major, sc->fw_ver_minor,
/*
 * Load firmware into the NIC: inflate/copy via the helper; if that
 * fails and `adopt` is set, fall back to the firmware already running
 * (downgrading tx_boundary to 2048 when it was adopted).  Then hand off
 * execution via the MXGEFW_BOOT_HANDOFF mailbox, skipping the first
 * 8 bytes of the image (old parts protect the bottom of SRAM), and
 * poll the confirmation word for the firmware's -1.
 */
948 mxge_load_firmware(mxge_softc_t *sc, int adopt)
950 volatile uint32_t *confirm;
951 volatile char *submit;
953 uint32_t *buf, size, dma_low, dma_high;
/* align the stack buffer to 8 bytes, as for mxge_send_cmd() */
956 buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
958 size = sc->sram_size;
959 status = mxge_load_firmware_helper(sc, &size);
963 /* Try to use the currently running firmware, if
965 status = mxge_adopt_running_firmware(sc);
967 device_printf(sc->dev,
968 "failed to adopt running firmware\n");
971 device_printf(sc->dev,
972 "Successfully adopted running firmware\n");
973 if (sc->tx_boundary == 4096) {
974 device_printf(sc->dev,
975 "Using firmware currently running on NIC"
977 device_printf(sc->dev,
978 "performance consider loading optimized "
981 sc->fw_name = mxge_fw_unaligned;
982 sc->tx_boundary = 2048;
985 /* clear confirmation addr */
986 confirm = (volatile uint32_t *)sc->cmd;
989 /* send a reload command to the bootstrap MCP, and wait for the
990 response in the confirmation address. The firmware should
991 write a -1 there to indicate it is alive and well
994 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
995 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
997 buf[0] = htobe32(dma_high); /* confirm addr MSW */
998 buf[1] = htobe32(dma_low); /* confirm addr LSW */
999 buf[2] = htobe32(0xffffffff); /* confirm data */
1001 /* FIX: All newest firmware should un-protect the bottom of
1002 the sram before handoff. However, the very first interfaces
1003 do not. Therefore the handoff copy must skip the first 8 bytes
1005 /* where the code starts*/
1006 buf[3] = htobe32(MXGE_FW_OFFSET + 8);
1007 buf[4] = htobe32(size - 8); /* length of code */
1008 buf[5] = htobe32(8); /* where to copy to */
1009 buf[6] = htobe32(0); /* where to jump to */
1011 submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
1012 mxge_pio_copy(submit, buf, 64);
/* poll for firmware's -1 confirmation */
1017 while (*confirm != 0xffffffff && i < 20) {
1020 bus_dmamap_sync(sc->cmd_dma.dmat,
1021 sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
1023 if (*confirm != 0xffffffff) {
1024 device_printf(sc->dev,"handoff failed (%p = 0x%x)",
/* Push sc->mac_addr to firmware: bytes 0-3 packed big-endian-style into
 * data0, bytes 4-5 into data1. */
1033 mxge_update_mac_address(mxge_softc_t *sc)
1036 uint8_t *addr = sc->mac_addr;
1040 cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1041 | (addr[2] << 8) | addr[3]);
1043 cmd.data1 = ((addr[4] << 8) | (addr[5]));
1045 status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
/* Enable or disable pause-frame flow control in firmware. */
1050 mxge_change_pause(mxge_softc_t *sc, int pause)
1056 status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1059 status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1063 device_printf(sc->dev, "Failed to set flow control mode\n");
/* Set firmware promiscuous mode; mxge_always_promisc tunable forces it on. */
1071 mxge_change_promisc(mxge_softc_t *sc, int promisc)
1076 if (mxge_always_promisc)
1080 status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1083 status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1087 device_printf(sc->dev, "Failed to set promisc mode\n");
/*
 * Program the firmware multicast filter from the interface's multicast
 * list.  Sequence: enable ALLMULTI (filtering off) while updating; bail
 * out there if IFF_ALLMULTI is requested or the adopted-fw rx-filter
 * bug forces ALLMULTI; otherwise flush all groups, join each AF_LINK
 * address, then re-enable filtering with DISABLE_ALLMULTI.
 */
1092 mxge_set_multicast_list(mxge_softc_t *sc)
1095 struct ifmultiaddr *ifma;
1096 struct ifnet *ifp = sc->ifp;
1099 /* This firmware is known to not support multicast */
1100 if (!sc->fw_multicast_support)
1103 /* Disable multicast filtering while we play with the lists*/
1104 err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1106 device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
1107 " error status: %d\n", err);
1111 if (sc->adopted_rx_filter_bug)
1114 if (ifp->if_flags & IFF_ALLMULTI)
1115 /* request to disable multicast filtering, so quit here */
1118 /* Flush all the filters */
1120 err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1122 device_printf(sc->dev,
1123 "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
1124 ", error status: %d\n", err);
1128 /* Walk the multicast list, and add each address */
1130 if_maddr_rlock(ifp);
1131 LIST_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1132 if (ifma->ifma_addr->sa_family != AF_LINK)
1134 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1136 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
1138 cmd.data0 = htonl(cmd.data0);
1139 cmd.data1 = htonl(cmd.data1);
1140 err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1142 device_printf(sc->dev, "Failed "
1143 "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
1145 /* abort, leaving multicast filtering off */
1146 if_maddr_runlock(ifp);
1150 if_maddr_runlock(ifp);
1151 /* Enable multicast filtering */
1152 err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1154 device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
1155 ", error status: %d\n", err);
/*
 * Return the largest usable MTU: the firmware max if page-size jumbo
 * clusters can hold it, or if the firmware accepts the
 * ALWAYS_USE_N_BIG_BUFFERS command (virtually contiguous jumbos);
 * otherwise limited by MJUMPAGESIZE.
 */
1160 mxge_max_mtu(mxge_softc_t *sc)
1165 if (MJUMPAGESIZE - MXGEFW_PAD > MXGEFW_MAX_MTU)
1166 return MXGEFW_MAX_MTU - MXGEFW_PAD;
1168 /* try to set nbufs to see if it we can
1169 use virtually contiguous jumbos */
1171 status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1174 return MXGEFW_MAX_MTU - MXGEFW_PAD;
1176 /* otherwise, we're limited to MJUMPAGESIZE */
1177 return MJUMPAGESIZE - MXGEFW_PAD;
/*
 * Reset firmware/driver shared state: issue CMD_RESET, restart dummy
 * RDMAs, set the interrupt-queue size, (re)negotiate RSS slices, wire
 * up per-slice interrupt queues and IRQ claim/deassert offsets, run the
 * DMA benchmark, zero all per-slice counters, then reapply MAC address,
 * promisc, pause, and multicast settings.
 */
1181 mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1183 struct mxge_slice_state *ss;
1184 mxge_rx_done_t *rx_done;
1185 volatile uint32_t *irq_claim;
1189 /* try to send a reset command to the card to see if it
1191 memset(&cmd, 0, sizeof (cmd));
1192 status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1194 device_printf(sc->dev, "failed reset\n");
1198 mxge_dummy_rdma(sc, 1);
1201 /* set the intrq size */
1202 cmd.data0 = sc->rx_ring_size;
1203 status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1206 * Even though we already know how many slices are supported
1207 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
1208 * has magic side effects, and must be called after a reset.
1209 * It must be called prior to calling any RSS related cmds,
1210 * including assigning an interrupt queue for anything but
1211 * slice 0. It must also be called *after*
1212 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1213 * the firmware to compute offsets.
1216 if (sc->num_slices > 1) {
1217 /* ask the maximum number of slices it supports */
1218 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
1221 device_printf(sc->dev,
1222 "failed to get number of slices\n");
1226 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1227 * to setting up the interrupt queue DMA
1229 cmd.data0 = sc->num_slices;
1230 cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
1231 #ifdef IFNET_BUF_RING
1232 cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1234 status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
1237 device_printf(sc->dev,
1238 "failed to set number of slices\n");
1244 if (interrupts_setup) {
1245 /* Now exchange information about interrupts */
1246 for (slice = 0; slice < sc->num_slices; slice++) {
1247 rx_done = &sc->ss[slice].rx_done;
1248 memset(rx_done->entry, 0, sc->rx_ring_size);
1249 cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
1250 cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
1252 status |= mxge_send_cmd(sc,
1253 MXGEFW_CMD_SET_INTRQ_DMA,
/* fetch SRAM offsets for coalescing delay, IRQ ack/claim, deassert */
1258 status |= mxge_send_cmd(sc,
1259 MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
1262 sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1264 status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1265 irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1268 status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
1270 sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1272 device_printf(sc->dev, "failed set interrupt parameters\n");
1277 *sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1280 /* run a DMA benchmark */
1281 (void) mxge_dma_test(sc, MXGEFW_DMA_TEST);
/* reset per-slice mcp/driver shared counters */
1283 for (slice = 0; slice < sc->num_slices; slice++) {
1284 ss = &sc->ss[slice];
1286 ss->irq_claim = irq_claim + (2 * slice);
1287 /* reset mcp/driver shared state back to 0 */
1288 ss->rx_done.idx = 0;
1289 ss->rx_done.cnt = 0;
1292 ss->tx.pkt_done = 0;
1293 ss->tx.queue_active = 0;
1294 ss->tx.activate = 0;
1295 ss->tx.deactivate = 0;
1300 ss->rx_small.cnt = 0;
1301 ss->lro_bad_csum = 0;
1303 ss->lro_flushed = 0;
1304 if (ss->fw_stats != NULL) {
1305 ss->fw_stats->valid = 0;
1306 ss->fw_stats->send_done_count = 0;
/* reapply addressing/filter state lost across the reset */
1309 sc->rdma_tags_available = 15;
1310 status = mxge_update_mac_address(sc);
1311 mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
1312 mxge_change_pause(sc, sc->pause);
1313 mxge_set_multicast_list(sc);
/*
 * Sysctl handler for the interrupt coalescing delay: validate the new
 * value (1..1000000 usecs) and write it big-endian to the firmware's
 * coalescing register under driver_lock.
 */
1318 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1321 unsigned int intr_coal_delay;
1325 intr_coal_delay = sc->intr_coal_delay;
1326 err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1330 if (intr_coal_delay == sc->intr_coal_delay)
1333 if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1336 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
1337 *sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1338 sc->intr_coal_delay = intr_coal_delay;
1340 lockmgr(&sc->driver_lock, LK_RELEASE);
/* Sysctl handler toggling pause-frame flow control via mxge_change_pause(). */
1345 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1348 unsigned int enabled;
1352 enabled = sc->pause;
1353 err = sysctl_handle_int(oidp, &enabled, arg2, req);
1357 if (enabled == sc->pause)
1360 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
1361 err = mxge_change_pause(sc, enabled);
1362 lockmgr(&sc->driver_lock, LK_RELEASE);
/* Apply a new LRO count (0 disables IFCAP_LRO); if the interface is
 * running, reopen it so the change takes effect.  Caller holds driver_lock. */
1367 mxge_change_lro_locked(mxge_softc_t *sc, int lro_cnt)
1374 ifp->if_capenable &= ~IFCAP_LRO;
1376 ifp->if_capenable |= IFCAP_LRO;
1377 sc->lro_cnt = lro_cnt;
1378 if (ifp->if_flags & IFF_RUNNING) {
1380 err = mxge_open(sc);
/* Sysctl handler for lro_cnt: takes driver_lock and delegates to
 * mxge_change_lro_locked(). */
1386 mxge_change_lro(SYSCTL_HANDLER_ARGS)
1389 unsigned int lro_cnt;
1393 lro_cnt = sc->lro_cnt;
1394 err = sysctl_handle_int(oidp, &lro_cnt, arg2, req);
1398 if (lro_cnt == sc->lro_cnt)
1404 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
1405 err = mxge_change_lro_locked(sc, lro_cnt);
1406 lockmgr(&sc->driver_lock, LK_RELEASE);
/* Sysctl handler exposing a big-endian firmware counter (arg1) as a
 * host-order int. */
1411 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1417 arg2 = be32toh(*(int *)arg1);
1419 err = sysctl_handle_int(oidp, arg1, arg2, req);
/* Tear down per-slice sysctl contexts, then the slice parent node.
 * Safe to call when the tree was never created. */
1425 mxge_rem_sysctls(mxge_softc_t *sc)
1427 struct mxge_slice_state *ss;
1430 if (sc->slice_sysctl_tree == NULL)
1433 for (slice = 0; slice < sc->num_slices; slice++) {
1434 ss = &sc->ss[slice];
1435 if (ss == NULL || ss->sysctl_tree == NULL)
1437 sysctl_ctx_free(&ss->sysctl_ctx);
1438 ss->sysctl_tree = NULL;
1440 sysctl_ctx_free(&sc->slice_sysctl_ctx);
1441 sc->slice_sysctl_tree = NULL;
/*
 * Register the driver's sysctl tree under hw.<nameunit>: identity
 * strings (firmware version, serial number, product code), PCIe/DMA
 * benchmark results, tunables (interrupt coalescing, flow control,
 * LRO count), the firmware's big-endian drop counters (exposed via
 * mxge_handle_be32), and a per-slice subtree of rx/tx debug counters.
 * Torn down by mxge_rem_sysctls().
 * NOTE(review): this listing elides intermediate source lines (OID
 * name arguments, return statements, brace lines); compare with the
 * full if_mxge.c before further edits.
 */
1445 mxge_add_sysctls(mxge_softc_t *sc)
1447 struct sysctl_ctx_list *ctx;
1448 struct sysctl_oid_list *children;
1450 struct mxge_slice_state *ss;
1454 ctx = &sc->sysctl_ctx;
1455 sysctl_ctx_init(ctx);
1456 sc->sysctl_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_STATIC_CHILDREN(_hw),
1458 device_get_nameunit(sc->dev),
1460 if (sc->sysctl_tree == NULL) {
1461 device_printf(sc->dev, "can't add sysctl node\n");
1465 children = SYSCTL_CHILDREN(sc->sysctl_tree);
1466 fw = sc->ss[0].fw_stats;
1468 /* random information */
1469 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1471 CTLFLAG_RD, &sc->fw_version,
1472 0, "firmware version");
1473 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1475 CTLFLAG_RD, &sc->serial_number_string,
1476 0, "serial number");
1477 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1479 CTLFLAG_RD, &sc->product_code_string,
1481 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1483 CTLFLAG_RD, &sc->link_width,
1485 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1487 CTLFLAG_RD, &sc->tx_boundary,
1489 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1491 CTLFLAG_RD, &sc->wc,
1492 0, "write combining PIO?");
1493 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1495 CTLFLAG_RD, &sc->read_dma,
1496 0, "DMA Read speed in MB/s");
1497 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1499 CTLFLAG_RD, &sc->write_dma,
1500 0, "DMA Write speed in MB/s");
1501 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1502 "read_write_dma_MBs",
1503 CTLFLAG_RD, &sc->read_write_dma,
1504 0, "DMA concurrent Read/Write speed in MB/s");
1507 /* performance related tunables */
1508 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1510 CTLTYPE_INT|CTLFLAG_RW, sc,
1511 0, mxge_change_intr_coal,
1512 "I", "interrupt coalescing delay in usecs");
1514 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1515 "flow_control_enabled",
1516 CTLTYPE_INT|CTLFLAG_RW, sc,
1517 0, mxge_change_flow_control,
/* BUGFIX: description was copy-pasted from the intr_coal sysctl
 * above ("interrupt coalescing delay in usecs"); describe the
 * flow-control knob instead. */
1518 "I", "enable flow control");
1520 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1522 CTLFLAG_RW, &mxge_deassert_wait,
1523 0, "Wait for IRQ line to go low in ihandler");
1525 /* stats block from firmware is in network byte order.
1527 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1529 CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1530 0, mxge_handle_be32,
1532 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1533 "rdma_tags_available",
1534 CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1535 0, mxge_handle_be32,
1536 "I", "rdma_tags_available");
1537 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1538 "dropped_bad_crc32",
1539 CTLTYPE_INT|CTLFLAG_RD,
1540 &fw->dropped_bad_crc32,
1541 0, mxge_handle_be32,
1542 "I", "dropped_bad_crc32");
1543 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1545 CTLTYPE_INT|CTLFLAG_RD,
1546 &fw->dropped_bad_phy,
1547 0, mxge_handle_be32,
1548 "I", "dropped_bad_phy");
1549 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1550 "dropped_link_error_or_filtered",
1551 CTLTYPE_INT|CTLFLAG_RD,
1552 &fw->dropped_link_error_or_filtered,
1553 0, mxge_handle_be32,
1554 "I", "dropped_link_error_or_filtered");
1555 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1556 "dropped_link_overflow",
1557 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1558 0, mxge_handle_be32,
1559 "I", "dropped_link_overflow");
1560 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1561 "dropped_multicast_filtered",
1562 CTLTYPE_INT|CTLFLAG_RD,
1563 &fw->dropped_multicast_filtered,
1564 0, mxge_handle_be32,
1565 "I", "dropped_multicast_filtered");
1566 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1567 "dropped_no_big_buffer",
1568 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1569 0, mxge_handle_be32,
1570 "I", "dropped_no_big_buffer");
1571 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1572 "dropped_no_small_buffer",
1573 CTLTYPE_INT|CTLFLAG_RD,
1574 &fw->dropped_no_small_buffer,
1575 0, mxge_handle_be32,
1576 "I", "dropped_no_small_buffer");
1577 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1579 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1580 0, mxge_handle_be32,
1581 "I", "dropped_overrun");
1582 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1584 CTLTYPE_INT|CTLFLAG_RD,
1586 0, mxge_handle_be32,
1587 "I", "dropped_pause");
1588 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1590 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1591 0, mxge_handle_be32,
1592 "I", "dropped_runt");
1594 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1595 "dropped_unicast_filtered",
1596 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
1597 0, mxge_handle_be32,
1598 "I", "dropped_unicast_filtered");
1600 /* verbose printing? */
1601 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1603 CTLFLAG_RW, &mxge_verbose,
1604 0, "verbose printing");
1607 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1609 CTLTYPE_INT|CTLFLAG_RW, sc,
1611 "I", "number of lro merge queues");
1614 /* add counters exported for debugging from all slices */
1615 sysctl_ctx_init(&sc->slice_sysctl_ctx);
1616 sc->slice_sysctl_tree =
1617 SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1618 "slice", CTLFLAG_RD, 0, "");
1620 for (slice = 0; slice < sc->num_slices; slice++) {
1621 ss = &sc->ss[slice];
1622 sysctl_ctx_init(&ss->sysctl_ctx);
1623 ctx = &ss->sysctl_ctx;
1624 children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1625 ksprintf(slice_num, "%d", slice);
1627 SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1629 children = SYSCTL_CHILDREN(ss->sysctl_tree);
1630 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1632 CTLFLAG_RD, &ss->rx_small.cnt,
1634 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1636 CTLFLAG_RD, &ss->rx_big.cnt,
1638 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1639 "lro_flushed", CTLFLAG_RD, &ss->lro_flushed,
1640 0, "number of lro merge queues flushed");
1642 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1643 "lro_queued", CTLFLAG_RD, &ss->lro_queued,
1644 0, "number of frames appended to lro merge"
1647 #ifndef IFNET_BUF_RING
1648 /* only transmit from slice 0 for now */
1652 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1654 CTLFLAG_RD, &ss->tx.req,
1657 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1659 CTLFLAG_RD, &ss->tx.done,
1661 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1663 CTLFLAG_RD, &ss->tx.pkt_done,
1665 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1667 CTLFLAG_RD, &ss->tx.stall,
1669 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1671 CTLFLAG_RD, &ss->tx.wake,
1673 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1675 CTLFLAG_RD, &ss->tx.defrag,
1677 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1679 CTLFLAG_RD, &ss->tx.queue_active,
1680 0, "tx_queue_active");
1681 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1683 CTLFLAG_RD, &ss->tx.activate,
1685 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1687 CTLFLAG_RD, &ss->tx.deactivate,
1688 0, "tx_deactivate");
1692 /* copy an array of mcp_kreq_ether_send_t's to the mcp. Copy
1693 backwards one at a time and handle ring wraps */
/*
 * Slow path used when the request list would wrap the send ring:
 * copy one descriptor at a time from highest index down, masking the
 * ring index on each step so the wrap is handled per-descriptor.
 */
1696 mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1697 mcp_kreq_ether_send_t *src, int cnt)
1699 int idx, starting_slot;
1700 starting_slot = tx->req;
1703 idx = (starting_slot + cnt) & tx->mask;
1704 mxge_pio_copy(&tx->lanai[idx],
1705 &src[cnt], sizeof(*src));
1711 * copy an array of mcp_kreq_ether_send_t's to the mcp. Copy
1712 * at most 32 bytes at a time, so as to avoid involving the software
1713 * pio handler in the nic. We re-write the first segment's flags
1714 * to mark them valid only after writing the entire chain
/*
 * Fast-path submit: if the chain fits without wrapping, PIO-copy two
 * descriptors (32 bytes) at a time with a write barrier between
 * bursts; otherwise fall back to mxge_submit_req_backwards().  The
 * first descriptor's flags are held back and written last so the NIC
 * never sees a partially-written chain as valid.
 */
1718 mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
1723 volatile uint32_t *dst_ints;
1724 mcp_kreq_ether_send_t *srcp;
1725 volatile mcp_kreq_ether_send_t *dstp, *dst;
1728 idx = tx->req & tx->mask;
/* stash the valid flags; the copy below writes them cleared first */
1730 last_flags = src->flags;
1733 dst = dstp = &tx->lanai[idx];
1736 if ((idx + cnt) < tx->mask) {
1737 for (i = 0; i < (cnt - 1); i += 2) {
1738 mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1739 wmb(); /* force write every 32 bytes */
1744 /* submit all but the first request, and ensure
1745 that it is submitted below */
1746 mxge_submit_req_backwards(tx, src, cnt);
1750 /* submit the first request */
1751 mxge_pio_copy(dstp, srcp, sizeof(*src));
1752 wmb(); /* barrier before setting valid flag */
1755 /* re-write the last 32-bits with the valid flags */
1756 src->flags = last_flags;
1757 src_ints = (uint32_t *)src;
1759 dst_ints = (volatile uint32_t *)dst;
1761 *dst_ints = *src_ints;
/*
 * Build and submit the send-descriptor chain for a TSO packet.
 * Parses the IP/TCP headers (copying them to ss->scratch if they are
 * not contiguous in the first mbuf), then walks the busdma segment
 * list, chopping segments at MSS boundaries and maintaining the
 * firmware's per-request rdma_count bookkeeping.  pseudo_hdr_offset
 * carries the MSS for TSO requests (firmware derives the checksum
 * location by parsing the headers itself).
 * NOTE(review): listing is elided; loop/brace structure and the
 * error path near the end are only partially visible.
 */
1769 mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1770 int busdma_seg_cnt, int ip_off)
1773 mcp_kreq_ether_send_t *req;
1774 bus_dma_segment_t *seg;
1777 uint32_t low, high_swapped;
1778 int len, seglen, cum_len, cum_len_next;
1779 int next_is_first, chop, cnt, rdma_count, small;
1780 uint16_t pseudo_hdr_offset, cksum_offset, mss;
1781 uint8_t flags, flags_next;
1784 mss = m->m_pkthdr.tso_segsz;
1786 /* negative cum_len signifies to the
1787 * send loop that we are still in the
1788 * header portion of the TSO packet.
1791 /* ensure we have the ethernet, IP and TCP
1792 header together in the first mbuf, copy
1793 it to a scratch buffer if not */
1794 if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
1795 m_copydata(m, 0, ip_off + sizeof (*ip),
1797 ip = (struct ip *)(ss->scratch + ip_off);
1799 ip = (struct ip *)(mtod(m, char *) + ip_off);
1801 if (__predict_false(m->m_len < ip_off + (ip->ip_hl << 2)
1803 m_copydata(m, 0, ip_off + (ip->ip_hl << 2)
1804 + sizeof (*tcp), ss->scratch);
1805 ip = (struct ip *)(mtod(m, char *) + ip_off);
1808 tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
1809 cum_len = -(ip_off + ((ip->ip_hl + tcp->th_off) << 2));
1811 /* TSO implies checksum offload on this hardware */
1812 cksum_offset = ip_off + (ip->ip_hl << 2);
1813 flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1816 /* for TSO, pseudo_hdr_offset holds mss.
1817 * The firmware figures out where to put
1818 * the checksum by parsing the header. */
1819 pseudo_hdr_offset = htobe16(mss);
1826 /* "rdma_count" is the number of RDMAs belonging to the
1827 * current packet BEFORE the current send request. For
1828 * non-TSO packets, this is equal to "count".
1829 * For TSO packets, rdma_count needs to be reset
1830 * to 0 after a segment cut.
1832 * The rdma_count field of the send request is
1833 * the number of RDMAs of the packet starting at
1834 * that request. For TSO send requests with one or more cuts
1835 * in the middle, this is the number of RDMAs starting
1836 * after the last cut in the request. All previous
1837 * segments before the last cut implicitly have 1 RDMA.
1839 * Since the number of RDMAs is not known beforehand,
1840 * it must be filled-in retroactively - after each
1841 * segmentation cut or at the end of the entire packet.
1844 while (busdma_seg_cnt) {
1845 /* Break the busdma segment up into pieces*/
1846 low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1847 high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1851 flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1853 cum_len_next = cum_len + seglen;
/* retroactively patch the rdma_count of the request that
 * started the current run of RDMAs */
1854 (req-rdma_count)->rdma_count = rdma_count + 1;
1855 if (__predict_true(cum_len >= 0)) {
1857 chop = (cum_len_next > mss);
1858 cum_len_next = cum_len_next % mss;
1859 next_is_first = (cum_len_next == 0);
/* branch-free flag math: chop/next_is_first are 0 or 1 */
1860 flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1861 flags_next |= next_is_first *
1863 rdma_count |= -(chop | next_is_first);
1864 rdma_count += chop & !next_is_first;
1865 } else if (cum_len_next >= 0) {
1870 small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1871 flags_next = MXGEFW_FLAGS_TSO_PLD |
1872 MXGEFW_FLAGS_FIRST |
1873 (small * MXGEFW_FLAGS_SMALL);
1876 req->addr_high = high_swapped;
1877 req->addr_low = htobe32(low);
1878 req->pseudo_hdr_offset = pseudo_hdr_offset;
1880 req->rdma_count = 1;
1881 req->length = htobe16(seglen);
1882 req->cksum_offset = cksum_offset;
1883 req->flags = flags | ((cum_len & 1) *
1884 MXGEFW_FLAGS_ALIGN_ODD);
1887 cum_len = cum_len_next;
1892 if (__predict_false(cksum_offset > seglen))
1893 cksum_offset -= seglen;
1896 if (__predict_false(cnt > tx->max_desc))
1902 (req-rdma_count)->rdma_count = rdma_count;
/* walk back marking TSO_LAST until the start of the final segment */
1906 req->flags |= MXGEFW_FLAGS_TSO_LAST;
1907 } while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1909 tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1910 mxge_submit_req(tx, tx->req_list, cnt);
1911 #ifdef IFNET_BUF_RING
1912 if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1913 /* tell the NIC to start polling this slice */
1915 tx->queue_active = 1;
/* error path: too many descriptors -- unload the DMA map and drop */
1923 bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1927 kprintf("tx->max_desc exceeded via TSO!\n");
1928 kprintf("mss = %d, %ld, %d!\n", mss,
1929 (long)seg - (long)tx->seg_list, tx->max_desc);
1936 #endif /* IFCAP_TSO4 */
1938 #ifdef MXGE_NEW_VLAN_API
1940 * We reproduce the software vlan tag insertion from
1941 * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
1942 * vlan tag insertion. We need to advertise this in order to have the
1943 * vlan interface respect our csum offload flags.
/*
 * Prepend room for an 802.1Q header, shift the Ethernet addresses
 * forward, and fill in the encapsulation type and tag from
 * m_pkthdr.ether_vlantag.  Clears M_VLANTAG since the tag is now
 * in-band.  Returns NULL (after M_PREPEND/m_pullup failure) when the
 * mbuf could not be grown -- per the __predict_false checks below.
 */
1945 static struct mbuf *
1946 mxge_vlan_tag_insert(struct mbuf *m)
1948 struct ether_vlan_header *evl;
1950 M_PREPEND(m, EVL_ENCAPLEN, MB_DONTWAIT);
1951 if (__predict_false(m == NULL))
1953 if (m->m_len < sizeof(*evl)) {
1954 m = m_pullup(m, sizeof(*evl));
1955 if (__predict_false(m == NULL))
1959 * Transform the Ethernet header into an Ethernet header
1960 * with 802.1Q encapsulation.
1962 evl = mtod(m, struct ether_vlan_header *);
1963 bcopy((char *)evl + EVL_ENCAPLEN,
1964 (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
1965 evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
1966 evl->evl_tag = htons(m->m_pkthdr.ether_vlantag);
1967 m->m_flags &= ~M_VLANTAG;
1970 #endif /* MXGE_NEW_VLAN_API */
/*
 * Encapsulate one outgoing mbuf chain into firmware send requests and
 * submit it.  Handles software VLAN tag insertion, DMA mapping (with
 * one m_defrag retry on EFBIG), dispatch to mxge_encap_tso() for TSO
 * frames, optional TX checksum offload setup, runt padding to 60
 * bytes via the zeropad DMA buffer, and final submission through
 * mxge_submit_req().
 * NOTE(review): listing is elided; some declarations, returns and
 * brace lines are missing from this extract.
 */
1973 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
1976 mcp_kreq_ether_send_t *req;
1977 bus_dma_segment_t *seg;
1982 int cnt, cum_len, err, i, idx, odd_flag, ip_off;
1983 uint16_t pseudo_hdr_offset;
1984 uint8_t flags, cksum_offset;
1991 ip_off = sizeof (struct ether_header);
1992 #ifdef MXGE_NEW_VLAN_API
1993 if (m->m_flags & M_VLANTAG) {
1994 m = mxge_vlan_tag_insert(m);
1995 if (__predict_false(m == NULL))
1997 ip_off += EVL_ENCAPLEN;
2000 /* (try to) map the frame for DMA */
2001 idx = tx->req & tx->mask;
2002 err = bus_dmamap_load_mbuf_segment(tx->dmat, tx->info[idx].map,
2003 m, tx->seg_list, 1, &cnt,
2005 if (__predict_false(err == EFBIG)) {
2006 /* Too many segments in the chain. Try
2008 m_tmp = m_defrag(m, M_NOWAIT);
2009 if (m_tmp == NULL) {
2014 err = bus_dmamap_load_mbuf_segment(tx->dmat,
2016 m, tx->seg_list, 1, &cnt,
2019 if (__predict_false(err != 0)) {
2020 device_printf(sc->dev, "bus_dmamap_load_mbuf_segment returned %d"
2021 " packet len = %d\n", err, m->m_pkthdr.len);
2024 bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2025 BUS_DMASYNC_PREWRITE);
2026 tx->info[idx].m = m;
2029 /* TSO is different enough, we handle it in another routine */
2030 if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2031 mxge_encap_tso(ss, m, cnt, ip_off);
2038 pseudo_hdr_offset = 0;
2039 flags = MXGEFW_FLAGS_NO_TSO;
2041 /* checksum offloading? */
2042 if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
2043 /* ensure ip header is in first mbuf, copy
2044 it to a scratch buffer if not */
2045 if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
2046 m_copydata(m, 0, ip_off + sizeof (*ip),
2048 ip = (struct ip *)(ss->scratch + ip_off);
2050 ip = (struct ip *)(mtod(m, char *) + ip_off);
2052 cksum_offset = ip_off + (ip->ip_hl << 2);
2053 pseudo_hdr_offset = cksum_offset + m->m_pkthdr.csum_data;
2054 pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2055 req->cksum_offset = cksum_offset;
2056 flags |= MXGEFW_FLAGS_CKSUM;
2057 odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2061 if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2062 flags |= MXGEFW_FLAGS_SMALL;
2064 /* convert segments into a request list */
2067 req->flags = MXGEFW_FLAGS_FIRST;
2068 for (i = 0; i < cnt; i++) {
2070 htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2072 htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2073 req->length = htobe16(seg->ds_len);
2074 req->cksum_offset = cksum_offset;
2075 if (cksum_offset > seg->ds_len)
2076 cksum_offset -= seg->ds_len;
2079 req->pseudo_hdr_offset = pseudo_hdr_offset;
2080 req->pad = 0; /* complete solid 16-byte block */
2081 req->rdma_count = 1;
2082 req->flags |= flags | ((cum_len & 1) * odd_flag);
2083 cum_len += seg->ds_len;
2089 /* pad runts to 60 bytes */
/* extra descriptor pointing at the shared zero-filled buffer */
2093 htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2095 htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2096 req->length = htobe16(60 - cum_len);
2097 req->cksum_offset = 0;
2098 req->pseudo_hdr_offset = pseudo_hdr_offset;
2099 req->pad = 0; /* complete solid 16-byte block */
2100 req->rdma_count = 1;
2101 req->flags |= flags | ((cum_len & 1) * odd_flag);
2105 tx->req_list[0].rdma_count = cnt;
2107 /* print what the firmware will see */
2108 for (i = 0; i < cnt; i++) {
2109 kprintf("%d: addr: 0x%x 0x%x len:%d pso%d,"
2110 "cso:%d, flags:0x%x, rdma:%d\n",
2111 i, (int)ntohl(tx->req_list[i].addr_high),
2112 (int)ntohl(tx->req_list[i].addr_low),
2113 (int)ntohs(tx->req_list[i].length),
2114 (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2115 tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2116 tx->req_list[i].rdma_count);
2118 kprintf("--------------\n");
2120 tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2121 mxge_submit_req(tx, tx->req_list, cnt);
2122 #ifdef IFNET_BUF_RING
2123 if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2124 /* tell the NIC to start polling this slice */
2126 tx->queue_active = 1;
2139 #ifdef IFNET_BUF_RING
/*
 * if_qflush implementation for the multi-queue (buf_ring) build:
 * drain and free every queued mbuf on each slice's transmit buf_ring,
 * taking the per-ring lock around the drain.
 */
2141 mxge_qflush(struct ifnet *ifp)
2143 mxge_softc_t *sc = ifp->if_softc;
2148 for (slice = 0; slice < sc->num_slices; slice++) {
2149 tx = &sc->ss[slice].tx;
2150 lockmgr(&tx->lock, LK_EXCLUSIVE);
2151 while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2153 lockmgr(&tx->lock, LK_RELEASE);
/*
 * Multi-queue (buf_ring) transmit pump: while the send ring has room
 * (more than max_desc free slots), dequeue from the slice's drbr and
 * encapsulate.  If the ring fills while frames remain queued, set
 * IFF_OACTIVE on the slice to throttle the stack.  Caller holds
 * tx->lock (per the _locked suffix and call sites below).
 */
2159 mxge_start_locked(struct mxge_slice_state *ss)
2170 while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2171 m = drbr_dequeue(ifp, tx->br);
2175 /* let BPF see it */
2178 /* give it to the nic */
2181 /* ran out of transmit slots */
2182 if (((ss->if_flags & IFF_OACTIVE) == 0)
2183 && (!drbr_empty(ifp, tx->br))) {
2184 ss->if_flags |= IFF_OACTIVE;
/*
 * Enqueue-or-send with tx->lock held: if the slice is not up and
 * active, just queue on the drbr; if the drbr is empty and the send
 * ring has room, transmit directly; otherwise queue and then run the
 * start loop to drain whatever fits.
 */
2190 mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2201 if ((ss->if_flags & (IFF_RUNNING|IFF_OACTIVE)) !=
2203 err = drbr_enqueue(ifp, tx->br, m);
2207 if (drbr_empty(ifp, tx->br) &&
2208 ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2209 /* let BPF see it */
2211 /* give it to the nic */
2213 } else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2216 if (!drbr_empty(ifp, tx->br))
2217 mxge_start_locked(ss);
/*
 * if_transmit entry point: pick a slice from the packet's flowid
 * (num_slices is a power of 2, so masking suffices), then either run
 * the locked transmit path if the ring lock is immediately available
 * or fall back to queueing on the drbr without blocking.
 */
2222 mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2224 mxge_softc_t *sc = ifp->if_softc;
2225 struct mxge_slice_state *ss;
2230 slice = m->m_pkthdr.flowid;
2231 slice &= (sc->num_slices - 1); /* num_slices always power of 2 */
2233 ss = &sc->ss[slice];
2236 if (lockmgr(&tx->lock, LK_EXCLUSIVE|LK_NOWAIT)) {
2237 err = mxge_transmit_locked(ss, m);
2238 lockmgr(&tx->lock, LK_RELEASE);
2240 err = drbr_enqueue(ifp, tx->br, m);
/*
 * Legacy single-queue transmit pump (non-IFNET_BUF_RING build):
 * dequeue from the interface send queue while the ring has room;
 * set IFF_OACTIVE on the ifnet when slots run out.
 */
2249 mxge_start_locked(struct mxge_slice_state *ss)
2259 while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2260 m = ifq_dequeue(&ifp->if_snd, NULL);
2264 /* let BPF see it */
2267 /* give it to the nic */
2270 /* ran out of transmit slots */
2271 if ((sc->ifp->if_flags & IFF_OACTIVE) == 0) {
2272 sc->ifp->if_flags |= IFF_OACTIVE;
/*
 * if_start entry point for the legacy build: serialize on slice 0's
 * tx lock and run the transmit pump there.
 */
2278 mxge_start(struct ifnet *ifp)
2280 mxge_softc_t *sc = ifp->if_softc;
2281 struct mxge_slice_state *ss;
2283 /* only use the first slice for now */
2285 lockmgr(&ss->tx.lock, LK_EXCLUSIVE);
2286 mxge_start_locked(ss);
2287 lockmgr(&ss->tx.lock, LK_RELEASE);
2291 * copy an array of mcp_kreq_ether_recv_t's to the mcp. Copy
2292 * at most 32 bytes at a time, so as to avoid involving the software
2293 * pio handler in the nic. We re-write the first segment's low
2294 * DMA address to mark it valid only after we write the entire chunk
/*
 * Push 8 receive descriptors to the NIC in two 32-byte PIO bursts.
 * The first descriptor's addr_low is temporarily poisoned to
 * 0xffffffff during the copy and only written with the real address
 * last, so the NIC cannot consume a half-written group.
 */
2298 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2299 mcp_kreq_ether_recv_t *src)
2303 low = src->addr_low;
2304 src->addr_low = 0xffffffff;
2305 mxge_pio_copy(dst, src, 4 * sizeof (*src));
2307 mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2309 src->addr_low = low;
2310 dst->addr_low = low;
/*
 * Allocate and DMA-map a small (header-sized) receive mbuf for ring
 * slot idx, record its bus address in the shadow ring, and -- once 8
 * consecutive slots are filled (idx & 7 == 7, checked in the elided
 * lines) -- batch-submit them to the NIC via mxge_submit_8rx().
 */
2315 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2317 bus_dma_segment_t seg;
2319 mxge_rx_ring_t *rx = &ss->rx_small;
2322 m = m_gethdr(MB_DONTWAIT, MT_DATA);
2329 err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2330 &seg, 1, &cnt, BUS_DMA_NOWAIT);
2335 rx->info[idx].m = m;
2336 rx->shadow[idx].addr_low =
2337 htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2338 rx->shadow[idx].addr_high =
2339 htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2343 mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
/*
 * Allocate and map a large receive buffer (cluster or jumbo cluster,
 * chosen by rx->cl_size) for ring slot idx.  With MXGE_VIRT_JUMBOS a
 * jumbo frame may span several segments, each recorded in its own
 * shadow slot; every 8th filled slot triggers a batched submit.
 */
2348 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2350 bus_dma_segment_t seg[3];
2352 mxge_rx_ring_t *rx = &ss->rx_big;
2355 if (rx->cl_size == MCLBYTES)
2356 m = m_getcl(MB_DONTWAIT, MT_DATA, M_PKTHDR);
2358 m = m_getjcl(MB_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2364 m->m_len = rx->mlen;
2365 err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2366 seg, 1, &cnt, BUS_DMA_NOWAIT);
2371 rx->info[idx].m = m;
2372 rx->shadow[idx].addr_low =
2373 htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2374 rx->shadow[idx].addr_high =
2375 htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2377 #if MXGE_VIRT_JUMBOS
2378 for (i = 1; i < cnt; i++) {
2379 rx->shadow[idx + i].addr_low =
2380 htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2381 rx->shadow[idx + i].addr_high =
2382 htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2387 for (i = 0; i < rx->nbufs; i++) {
2388 if ((idx & 7) == 7) {
2389 mxge_submit_8rx(&rx->lanai[idx - 7],
2390 &rx->shadow[idx - 7]);
2398 * Myri10GE hardware checksums are not valid if the sender
2399 * padded the frame with non-zero padding. This is because
2400 * the firmware just does a simple 16-bit 1s complement
2401 * checksum across the entire frame, excluding the first 14
2402 * bytes. It is best to simply to check the checksum and
2403 * tell the stack about it only if the checksum is good
/*
 * Validate the firmware's raw frame checksum for an IPv4 TCP/UDP
 * packet by folding in the pseudo-header; non-IPv4 and non-TCP/UDP
 * frames are rejected early (return value in the elided lines --
 * presumably nonzero meaning "invalid"; confirm against the full
 * source).
 */
2406 static inline uint16_t
2407 mxge_rx_csum(struct mbuf *m, int csum)
2409 struct ether_header *eh;
2413 eh = mtod(m, struct ether_header *);
2415 /* only deal with IPv4 TCP & UDP for now */
2416 if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2418 ip = (struct ip *)(eh + 1);
2419 if (__predict_false(ip->ip_p != IPPROTO_TCP &&
2420 ip->ip_p != IPPROTO_UDP))
2423 c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2424 htonl(ntohs(csum) + ntohs(ip->ip_len) +
2425 - (ip->ip_hl << 2) + ip->ip_p));
/*
 * Strip an in-band 802.1Q header from a received frame: adjust the
 * firmware checksum by subtracting the 4 encapsulation bytes (1's
 * complement arithmetic below), record the tag in the mbuf header
 * (new VLAN API or legacy m_tag), then slide the Ethernet addresses
 * over the encapsulation and trim EVL_ENCAPLEN bytes.
 */
2434 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2436 struct ether_vlan_header *evl;
2437 struct ether_header *eh;
2440 evl = mtod(m, struct ether_vlan_header *);
2441 eh = mtod(m, struct ether_header *);
2444 * fix checksum by subtracting EVL_ENCAPLEN bytes
2445 * after what the firmware thought was the end of the ethernet
2449 /* put checksum into host byte order */
2450 *csum = ntohs(*csum);
2451 partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
/* 1's complement subtraction of the removed 4 bytes, then fold
 * the carries back into 16 bits (twice, to absorb the last carry) */
2452 (*csum) += ~partial;
2453 (*csum) += ((*csum) < ~partial);
2454 (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2455 (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2457 /* restore checksum to network byte order;
2458 later consumers expect this */
2459 *csum = htons(*csum);
2462 #ifdef MXGE_NEW_VLAN_API
2463 m->m_pkthdr.ether_vlantag = ntohs(evl->evl_tag);
2467 mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2471 VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2472 m_tag_prepend(m, mtag);
2476 m->m_flags |= M_VLANTAG;
2479 * Remove the 802.1q header by copying the Ethernet
2480 * addresses over it and adjusting the beginning of
2481 * the data in the mbuf. The encapsulated Ethernet
2482 * type field is already in place.
2484 bcopy((char *)evl, (char *)evl + EVL_ENCAPLEN,
2485 ETHER_HDR_LEN - ETHER_TYPE_LEN);
2486 m_adj(m, EVL_ENCAPLEN);
/*
 * Process one completed large-buffer receive: replace the ring mbuf
 * (dropping the frame if replacement fails so the ring never
 * starves), unmap/swap the DMA maps, strip the MXGEFW_PAD alignment
 * bytes and any VLAN header, validate the checksum (handing TCP
 * frames to LRO when enabled), set the RSS flowid, and pass the
 * frame to if_input.
 */
2491 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2496 struct ether_header *eh;
2498 bus_dmamap_t old_map;
2500 uint16_t tcpudp_csum;
2505 idx = rx->cnt & rx->mask;
2506 rx->cnt += rx->nbufs;
2507 /* save a pointer to the received mbuf */
2508 m = rx->info[idx].m;
2509 /* try to replace the received mbuf */
2510 if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2511 /* drop the frame -- the old mbuf is re-cycled */
2516 /* unmap the received buffer */
2517 old_map = rx->info[idx].map;
2518 bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2519 bus_dmamap_unload(rx->dmat, old_map);
2521 /* swap the bus_dmamap_t's */
2522 rx->info[idx].map = rx->extra_map;
2523 rx->extra_map = old_map;
2525 /* mcp implicitly skips 1st 2 bytes so that packet is properly
2527 m->m_data += MXGEFW_PAD;
2529 m->m_pkthdr.rcvif = ifp;
2530 m->m_len = m->m_pkthdr.len = len;
2532 eh = mtod(m, struct ether_header *);
2533 if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2534 mxge_vlan_tag_remove(m, &csum);
2536 /* if the checksum is valid, mark it in the mbuf header */
2537 if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2538 if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2540 /* otherwise, it was a UDP frame, or a TCP frame which
2541 we could not do LRO on. Tell the stack that the
2543 m->m_pkthdr.csum_data = 0xffff;
2544 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2546 /* flowid only valid if RSS hashing is enabled */
2547 if (sc->num_slices > 1) {
2548 m->m_pkthdr.flowid = (ss - sc->ss);
2549 m->m_flags |= M_FLOWID;
2551 /* pass the frame up the stack */
2552 (*ifp->if_input)(ifp, m);
/*
 * Small-buffer counterpart of mxge_rx_done_big(): identical flow
 * (replace mbuf, swap maps, strip pad/VLAN, checksum/LRO, flowid,
 * if_input) but operating on the rx_small ring via
 * mxge_get_buf_small().
 */
2556 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2560 struct ether_header *eh;
2563 bus_dmamap_t old_map;
2565 uint16_t tcpudp_csum;
2570 idx = rx->cnt & rx->mask;
2572 /* save a pointer to the received mbuf */
2573 m = rx->info[idx].m;
2574 /* try to replace the received mbuf */
2575 if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2576 /* drop the frame -- the old mbuf is re-cycled */
2581 /* unmap the received buffer */
2582 old_map = rx->info[idx].map;
2583 bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2584 bus_dmamap_unload(rx->dmat, old_map);
2586 /* swap the bus_dmamap_t's */
2587 rx->info[idx].map = rx->extra_map;
2588 rx->extra_map = old_map;
2590 /* mcp implicitly skips 1st 2 bytes so that packet is properly
2592 m->m_data += MXGEFW_PAD;
2594 m->m_pkthdr.rcvif = ifp;
2595 m->m_len = m->m_pkthdr.len = len;
2597 eh = mtod(m, struct ether_header *);
2598 if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2599 mxge_vlan_tag_remove(m, &csum);
2601 /* if the checksum is valid, mark it in the mbuf header */
2602 if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2603 if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2605 /* otherwise, it was a UDP frame, or a TCP frame which
2606 we could not do LRO on. Tell the stack that the
2608 m->m_pkthdr.csum_data = 0xffff;
2609 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2611 /* flowid only valid if RSS hashing is enabled */
2612 if (sc->num_slices > 1) {
2613 m->m_pkthdr.flowid = (ss - sc->ss);
2614 m->m_flags |= M_FLOWID;
2616 /* pass the frame up the stack */
2617 (*ifp->if_input)(ifp, m);
/*
 * Drain the slice's receive-completion ring: dispatch each nonzero-
 * length entry to the small or big handler based on whether the
 * frame fits an mbuf header (MHLEN - MXGEFW_PAD), zeroing the entry
 * to hand it back to the firmware.  Bails out after mask/2 entries
 * to limit livelock, then flushes any pending LRO merges.
 */
2621 mxge_clean_rx_done(struct mxge_slice_state *ss)
2623 mxge_rx_done_t *rx_done = &ss->rx_done;
2629 while (rx_done->entry[rx_done->idx].length != 0) {
2630 length = ntohs(rx_done->entry[rx_done->idx].length);
2631 rx_done->entry[rx_done->idx].length = 0;
2632 checksum = rx_done->entry[rx_done->idx].checksum;
2633 if (length <= (MHLEN - MXGEFW_PAD))
2634 mxge_rx_done_small(ss, length, checksum);
2636 mxge_rx_done_big(ss, length, checksum);
2638 rx_done->idx = rx_done->cnt & rx_done->mask;
2640 /* limit potential for livelock */
2641 if (__predict_false(++limit > rx_done->mask / 2))
2645 while (!SLIST_EMPTY(&ss->lro_active)) {
2646 struct lro_entry *lro = SLIST_FIRST(&ss->lro_active);
2647 SLIST_REMOVE_HEAD(&ss->lro_active, next);
2648 mxge_lro_flush(ss, lro);
/*
 * Reclaim transmit descriptors up to the firmware's completion index
 * mcp_idx: free mbufs and unload DMA maps on first-descriptor slots
 * (flag set at submit time), account bytes/multicasts, then -- under
 * the tx lock -- clear IFF_OACTIVE and restart the transmit pump once
 * at least a quarter of the ring is free.  In the multi-queue build,
 * tell the NIC to stop polling a slice whose ring is fully drained.
 */
2655 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2666 while (tx->pkt_done != mcp_idx) {
2667 idx = tx->done & tx->mask;
2669 m = tx->info[idx].m;
2670 /* mbuf and DMA map only attached to the first
2673 ss->obytes += m->m_pkthdr.len;
2674 if (m->m_flags & M_MCAST)
2677 tx->info[idx].m = NULL;
2678 map = tx->info[idx].map;
2679 bus_dmamap_unload(tx->dmat, map);
2682 if (tx->info[idx].flag) {
2683 tx->info[idx].flag = 0;
2688 /* If we have space, clear IFF_OACTIVE to tell the stack that
2689 its OK to send packets */
2690 #ifdef IFNET_BUF_RING
2691 flags = &ss->if_flags;
2693 flags = &ifp->if_flags;
2695 lockmgr(&ss->tx.lock, LK_EXCLUSIVE);
2696 if ((*flags) & IFF_OACTIVE &&
2697 tx->req - tx->done < (tx->mask + 1)/4) {
2698 *(flags) &= ~IFF_OACTIVE;
2700 mxge_start_locked(ss);
2702 #ifdef IFNET_BUF_RING
2703 if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2704 /* let the NIC stop polling this queue, since there
2705 * are no more transmits pending */
/* NOTE(review): inner tx->req == tx->done re-check duplicates the
 * condition two lines up -- harmless but redundant as listed. */
2706 if (tx->req == tx->done) {
2708 tx->queue_active = 0;
2714 lockmgr(&ss->tx.lock, LK_RELEASE);
/*
 * Media-type lookup tables: each entry maps one bit of the module's
 * I2C compliance byte to an ifmedia type and a printable name.
 * Entries with flag 0 are media FreeBSD/DragonFly has no ifmedia
 * constant for.  The XFP table's first entry (0x7f) is matched by
 * value, not by single bit -- see mxge_media_probe().
 */
2718 static struct mxge_media_type mxge_xfp_media_types[] =
2720 {IFM_10G_CX4, 0x7f, "10GBASE-CX4 (module)"},
2721 {IFM_10G_SR, (1 << 7), "10GBASE-SR"},
2722 {IFM_10G_LR, (1 << 6), "10GBASE-LR"},
2723 {0, (1 << 5), "10GBASE-ER"},
2724 {IFM_10G_LRM, (1 << 4), "10GBASE-LRM"},
2725 {0, (1 << 3), "10GBASE-SW"},
2726 {0, (1 << 2), "10GBASE-LW"},
2727 {0, (1 << 1), "10GBASE-EW"},
2728 {0, (1 << 0), "Reserved"}
2730 static struct mxge_media_type mxge_sfp_media_types[] =
2732 {0, (1 << 7), "Reserved"},
2733 {IFM_10G_LRM, (1 << 6), "10GBASE-LRM"},
2734 {IFM_10G_LR, (1 << 5), "10GBASE-LR"},
2735 {IFM_10G_SR, (1 << 4), "10GBASE-SR"}
/*
 * OR the detected media type into sc->media_flags and register/select
 * it with the ifmedia layer.
 */
2739 mxge_set_media(mxge_softc_t *sc, int type)
2741 sc->media_flags |= type;
2742 ifmedia_add(&sc->media, sc->media_flags, 0, NULL);
2743 ifmedia_set(&sc->media, sc->media_flags);
2748 * Determine the media type for a NIC. Some XFPs will identify
2749 * themselves only when their link is up, so this is initiated via a
2750 * link up interrupt. However, this can potentially take up to
2751 * several milliseconds, so it is run via the watchdog routine, rather
2752 * than in the interrupt handler itself. This need only be done
2753 * once, not each time the link is up.
2756 mxge_media_probe(mxge_softc_t *sc)
2761 struct mxge_media_type *mxge_media_types = NULL;
2762 int i, err, ms, mxge_media_type_entries;
2765 sc->need_media_probe = 0;
2767 /* if we've already set a media type, we're done */
2768 if (sc->media_flags != (IFM_ETHER | IFM_AUTO))
2772 * parse the product code to determine the interface type
2773 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2774 * after the 3rd dash in the driver's cached copy of the
2775 * EEPROM's product code string.
2777 ptr = sc->product_code_string;
2779 device_printf(sc->dev, "Missing product code\n");
2782 for (i = 0; i < 3; i++, ptr++) {
2783 ptr = index(ptr, '-');
2785 device_printf(sc->dev,
2786 "only %d dashes in PC?!?\n", i);
2792 mxge_set_media(sc, IFM_10G_CX4);
2795 else if (*ptr == 'Q') {
2796 /* -Q is Quad Ribbon Fiber */
2797 device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2798 /* FreeBSD has no media type for Quad ribbon fiber */
/* XFP cage: select the XFP bit table and compliance register */
2804 mxge_media_types = mxge_xfp_media_types;
2805 mxge_media_type_entries =
2806 sizeof (mxge_xfp_media_types) /
2807 sizeof (mxge_xfp_media_types[0]);
2808 byte = MXGE_XFP_COMPLIANCE_BYTE;
2812 if (*ptr == 'S' || *(ptr +1) == 'S') {
2813 /* -S or -2S is SFP+ */
2814 mxge_media_types = mxge_sfp_media_types;
2815 mxge_media_type_entries =
2816 sizeof (mxge_sfp_media_types) /
2817 sizeof (mxge_sfp_media_types[0]);
2822 if (mxge_media_types == NULL) {
2823 device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2828 * At this point we know the NIC has an XFP cage, so now we
2829 * try to determine what is in the cage by using the
2830 * firmware's XFP I2C commands to read the XFP 10GbE compliance
2831 * register. We read just one byte, which may take over
2835 cmd.data0 = 0; /* just fetch 1 byte, not all 256 */
2837 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2838 if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
2839 device_printf(sc->dev, "failed to read XFP\n");
2841 if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
2842 device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
2844 if (err != MXGEFW_CMD_OK) {
2848 /* now we wait for the data to be cached */
2850 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
/* poll up to ~50ms for the firmware to cache the I2C byte */
2851 for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
2854 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2856 if (err != MXGEFW_CMD_OK) {
2857 device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
2858 cage_type, err, ms);
/* entry 0 is matched by exact value (0x7f), the rest by bit test */
2862 if (cmd.data0 == mxge_media_types[0].bitmask) {
2864 device_printf(sc->dev, "%s:%s\n", cage_type,
2865 mxge_media_types[0].name);
2866 mxge_set_media(sc, IFM_10G_CX4);
2869 for (i = 1; i < mxge_media_type_entries; i++) {
2870 if (cmd.data0 & mxge_media_types[i].bitmask) {
2872 device_printf(sc->dev, "%s:%s\n",
2874 mxge_media_types[i].name);
2876 mxge_set_media(sc, mxge_media_types[i].flag);
2880 device_printf(sc->dev, "%s media 0x%x unknown\n", cage_type,
/*
 * mxge_intr() - per-slice interrupt handler.
 *
 * Drains TX completions and the RX done ring for this slice, then hands
 * the interrupt-claim token back to the firmware.  Link and firmware
 * error statistics are only acted upon for the first slice (ss == sc->ss).
 * NOTE(review): interior lines of this extract are elided; comments below
 * describe only the statements that are visible.
 */
2887 mxge_intr(void *arg)
2889 struct mxge_slice_state *ss = arg;
2890 mxge_softc_t *sc = ss->sc;
2891 mcp_irq_data_t *stats = ss->fw_stats;
2892 mxge_tx_ring_t *tx = &ss->tx;
2893 mxge_rx_done_t *rx_done = &ss->rx_done;
2894 uint32_t send_done_count;
2898 #ifndef IFNET_BUF_RING
2899 /* an interrupt on a non-zero slice is implicitly valid
2900 since MSI-X irqs are not shared */
2902 mxge_clean_rx_done(ss);
2903 *ss->irq_claim = be32toh(3);
2908 /* make sure the DMA has finished */
2909 if (!stats->valid) {
2912 valid = stats->valid;
2914 if (sc->legacy_irq) {
2915 /* lower legacy IRQ */
2916 *sc->irq_deassert = 0;
2917 if (!mxge_deassert_wait)
2918 /* don't wait for conf. that irq is low */
2924 /* loop while waiting for legacy irq deassertion */
2926 /* check for transmit completes and receives */
2927 send_done_count = be32toh(stats->send_done_count);
2928 while ((send_done_count != tx->pkt_done) ||
2929 (rx_done->entry[rx_done->idx].length != 0)) {
2930 if (send_done_count != tx->pkt_done)
2931 mxge_tx_done(ss, (int)send_done_count);
2932 mxge_clean_rx_done(ss);
2933 send_done_count = be32toh(stats->send_done_count);
2935 if (sc->legacy_irq && mxge_deassert_wait)
/* volatile read: firmware DMAs stats->valid; re-check until it drops */
2937 } while (*((volatile uint8_t *) &stats->valid));
2939 /* fw link & error stats meaningful only on the first slice */
2940 if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
2941 if (sc->link_state != stats->link_up) {
2942 sc->link_state = stats->link_up;
2943 if (sc->link_state) {
2944 sc->ifp->if_link_state = LINK_STATE_UP;
2945 if_link_state_change(sc->ifp);
2947 device_printf(sc->dev, "link up\n");
2949 sc->ifp->if_link_state = LINK_STATE_DOWN;
2950 if_link_state_change(sc->ifp);
2952 device_printf(sc->dev, "link down\n");
/* link flap observed: schedule a media re-probe from the tick handler */
2954 sc->need_media_probe = 1;
2956 if (sc->rdma_tags_available !=
2957 be32toh(stats->rdma_tags_available)) {
2958 sc->rdma_tags_available =
2959 be32toh(stats->rdma_tags_available);
2960 device_printf(sc->dev, "RDMA timed out! %d tags "
2961 "left\n", sc->rdma_tags_available);
2964 if (stats->link_down) {
2965 sc->down_cnt += stats->link_down;
2967 sc->ifp->if_link_state = LINK_STATE_DOWN;
2968 if_link_state_change(sc->ifp);
2972 /* check to see if we have rx token to pass back */
/* writing be32toh(3) to irq_claim returns the token to firmware */
2974 *ss->irq_claim = be32toh(3);
2975 *(ss->irq_claim + 1) = be32toh(3);
/* mxge_init() - if_init entry point; body elided from this extract. */
2979 mxge_init(void *arg)
/*
 * mxge_free_slice_mbufs() - release all mbufs held by one slice:
 * the free-LRO-entry list, every posted rx_big and rx_small buffer,
 * and (first slice only) any mbufs still attached to the TX ring.
 */
2986 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
2988 struct lro_entry *lro_entry;
2991 while (!SLIST_EMPTY(&ss->lro_free)) {
2992 lro_entry = SLIST_FIRST(&ss->lro_free);
2993 SLIST_REMOVE_HEAD(&ss->lro_free, next);
2994 kfree(lro_entry, M_DEVBUF);
2997 for (i = 0; i <= ss->rx_big.mask; i++) {
2998 if (ss->rx_big.info[i].m == NULL)
3000 bus_dmamap_unload(ss->rx_big.dmat,
3001 ss->rx_big.info[i].map);
3002 m_freem(ss->rx_big.info[i].m);
3003 ss->rx_big.info[i].m = NULL;
3006 for (i = 0; i <= ss->rx_small.mask; i++) {
3007 if (ss->rx_small.info[i].m == NULL)
3009 bus_dmamap_unload(ss->rx_small.dmat,
3010 ss->rx_small.info[i].map);
3011 m_freem(ss->rx_small.info[i].m);
3012 ss->rx_small.info[i].m = NULL;
3015 /* transmit ring used only on the first slice */
3016 if (ss->tx.info == NULL)
3019 for (i = 0; i <= ss->tx.mask; i++) {
3020 ss->tx.info[i].flag = 0;
3021 if (ss->tx.info[i].m == NULL)
3023 bus_dmamap_unload(ss->tx.dmat,
3024 ss->tx.info[i].map);
3025 m_freem(ss->tx.info[i].m);
3026 ss->tx.info[i].m = NULL;
/* mxge_free_mbufs() - free the mbufs of every slice on the device. */
3031 mxge_free_mbufs(mxge_softc_t *sc)
3035 for (slice = 0; slice < sc->num_slices; slice++)
3036 mxge_free_slice_mbufs(&sc->ss[slice]);
/*
 * mxge_free_slice_rings() - tear down one slice's ring state:
 * the rx_done DMA area, TX request/segment scratch buffers, rx shadow
 * rings, and the per-entry busdma maps and tags for tx/rx_small/rx_big.
 * Each pointer is NULLed after release so the function is safe to call
 * on a partially-initialized slice.
 */
3040 mxge_free_slice_rings(struct mxge_slice_state *ss)
3045 if (ss->rx_done.entry != NULL)
3046 mxge_dma_free(&ss->rx_done.dma);
3047 ss->rx_done.entry = NULL;
3049 if (ss->tx.req_bytes != NULL)
3050 kfree(ss->tx.req_bytes, M_DEVBUF);
3051 ss->tx.req_bytes = NULL;
3053 if (ss->tx.seg_list != NULL)
3054 kfree(ss->tx.seg_list, M_DEVBUF);
3055 ss->tx.seg_list = NULL;
3057 if (ss->rx_small.shadow != NULL)
3058 kfree(ss->rx_small.shadow, M_DEVBUF);
3059 ss->rx_small.shadow = NULL;
3061 if (ss->rx_big.shadow != NULL)
3062 kfree(ss->rx_big.shadow, M_DEVBUF);
3063 ss->rx_big.shadow = NULL;
3065 if (ss->tx.info != NULL) {
3066 if (ss->tx.dmat != NULL) {
3067 for (i = 0; i <= ss->tx.mask; i++) {
3068 bus_dmamap_destroy(ss->tx.dmat,
3069 ss->tx.info[i].map);
3071 bus_dma_tag_destroy(ss->tx.dmat);
3073 kfree(ss->tx.info, M_DEVBUF);
3077 if (ss->rx_small.info != NULL) {
3078 if (ss->rx_small.dmat != NULL) {
3079 for (i = 0; i <= ss->rx_small.mask; i++) {
3080 bus_dmamap_destroy(ss->rx_small.dmat,
3081 ss->rx_small.info[i].map);
3083 bus_dmamap_destroy(ss->rx_small.dmat,
3084 ss->rx_small.extra_map);
3085 bus_dma_tag_destroy(ss->rx_small.dmat);
3087 kfree(ss->rx_small.info, M_DEVBUF);
3089 ss->rx_small.info = NULL;
3091 if (ss->rx_big.info != NULL) {
3092 if (ss->rx_big.dmat != NULL) {
3093 for (i = 0; i <= ss->rx_big.mask; i++) {
3094 bus_dmamap_destroy(ss->rx_big.dmat,
3095 ss->rx_big.info[i].map);
3097 bus_dmamap_destroy(ss->rx_big.dmat,
3098 ss->rx_big.extra_map);
3099 bus_dma_tag_destroy(ss->rx_big.dmat);
3101 kfree(ss->rx_big.info, M_DEVBUF);
3103 ss->rx_big.info = NULL;
/* mxge_free_rings() - free ring resources for every slice. */
3107 mxge_free_rings(mxge_softc_t *sc)
3111 for (slice = 0; slice < sc->num_slices; slice++)
3112 mxge_free_slice_rings(&sc->ss[slice]);
/*
 * mxge_alloc_slice_rings() - allocate one slice's host-side ring state.
 *
 * Sets the rx/rx_done masks from the caller-supplied entry counts
 * (counts are assumed to be powers of two -- mask = entries - 1),
 * allocates shadow and info arrays for both rx rings, creates the
 * rx_small/rx_big busdma tags and per-entry maps, then (without
 * IFNET_BUF_RING, first slice only) sets up the TX request block,
 * segment list, info ring, dma tag and per-slot maps.
 * NOTE(review): error-unwinding lines are elided from this extract.
 */
3116 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3117 int tx_ring_entries)
3119 mxge_softc_t *sc = ss->sc;
3125 /* allocate per-slice receive resources */
3127 ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
3128 ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3130 /* allocate the rx shadow rings */
3131 bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3132 ss->rx_small.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3133 if (ss->rx_small.shadow == NULL)
3136 bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3137 ss->rx_big.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3138 if (ss->rx_big.shadow == NULL)
3141 /* allocate the rx host info rings */
3142 bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3143 ss->rx_small.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3144 if (ss->rx_small.info == NULL)
3147 bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3148 ss->rx_big.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3149 if (ss->rx_big.info == NULL)
3152 /* allocate the rx busdma resources */
3153 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3155 4096, /* boundary */
3156 BUS_SPACE_MAXADDR, /* low */
3157 BUS_SPACE_MAXADDR, /* high */
3158 NULL, NULL, /* filter */
3159 MHLEN, /* maxsize */
3161 MHLEN, /* maxsegsize */
3162 BUS_DMA_ALLOCNOW, /* flags */
3163 &ss->rx_small.dmat); /* tag */
3165 device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3170 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3172 #if MXGE_VIRT_JUMBOS
3173 4096, /* boundary */
3177 BUS_SPACE_MAXADDR, /* low */
3178 BUS_SPACE_MAXADDR, /* high */
3179 NULL, NULL, /* filter */
3180 3*4096, /* maxsize */
3181 #if MXGE_VIRT_JUMBOS
3183 4096, /* maxsegsize*/
3186 MJUM9BYTES, /* maxsegsize*/
3188 BUS_DMA_ALLOCNOW, /* flags */
3189 &ss->rx_big.dmat); /* tag */
3191 device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3195 for (i = 0; i <= ss->rx_small.mask; i++) {
3196 err = bus_dmamap_create(ss->rx_small.dmat, 0,
3197 &ss->rx_small.info[i].map);
3199 device_printf(sc->dev, "Err %d rx_small dmamap\n",
3204 err = bus_dmamap_create(ss->rx_small.dmat, 0,
3205 &ss->rx_small.extra_map);
3207 device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3212 for (i = 0; i <= ss->rx_big.mask; i++) {
3213 err = bus_dmamap_create(ss->rx_big.dmat, 0,
3214 &ss->rx_big.info[i].map);
3216 device_printf(sc->dev, "Err %d rx_big dmamap\n",
3221 err = bus_dmamap_create(ss->rx_big.dmat, 0,
3222 &ss->rx_big.extra_map);
3224 device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3229 /* now allocate TX resouces */
3231 #ifndef IFNET_BUF_RING
3232 /* only use a single TX ring for now */
3233 if (ss != ss->sc->ss)
3237 ss->tx.mask = tx_ring_entries - 1;
3238 ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3241 /* allocate the tx request copy block */
3243 sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3244 ss->tx.req_bytes = kmalloc(bytes, M_DEVBUF, M_WAITOK);
3245 if (ss->tx.req_bytes == NULL)
3247 /* ensure req_list entries are aligned to 8 bytes */
3248 ss->tx.req_list = (mcp_kreq_ether_send_t *)
3249 ((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
3251 /* allocate the tx busdma segment list */
3252 bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3253 ss->tx.seg_list = (bus_dma_segment_t *)
3254 kmalloc(bytes, M_DEVBUF, M_WAITOK);
3255 if (ss->tx.seg_list == NULL)
3258 /* allocate the tx host info ring */
3259 bytes = tx_ring_entries * sizeof (*ss->tx.info);
3260 ss->tx.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3261 if (ss->tx.info == NULL)
3264 /* allocate the tx busdma resources */
3265 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3267 sc->tx_boundary, /* boundary */
3268 BUS_SPACE_MAXADDR, /* low */
3269 BUS_SPACE_MAXADDR, /* high */
3270 NULL, NULL, /* filter */
3271 65536 + 256, /* maxsize */
3272 ss->tx.max_desc - 2, /* num segs */
3273 sc->tx_boundary, /* maxsegsz */
3274 BUS_DMA_ALLOCNOW, /* flags */
3275 &ss->tx.dmat); /* tag */
3278 device_printf(sc->dev, "Err %d allocating tx dmat\n",
3283 /* now use these tags to setup dmamaps for each slot
3285 for (i = 0; i <= ss->tx.mask; i++) {
3286 err = bus_dmamap_create(ss->tx.dmat, 0,
3287 &ss->tx.info[i].map);
3289 device_printf(sc->dev, "Err %d tx dmamap\n",
/*
 * mxge_alloc_rings() - query the firmware for the TX ring size,
 * derive per-ring entry counts, size the ifnet send queue, and
 * allocate rings for every slice (freeing everything on failure).
 */
3299 mxge_alloc_rings(mxge_softc_t *sc)
3303 int tx_ring_entries, rx_ring_entries;
3306 /* get ring sizes */
3307 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3308 tx_ring_size = cmd.data0;
3310 device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3314 tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3315 rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3316 ifq_set_maxlen(&sc->ifp->if_snd, tx_ring_entries - 1);
3317 ifq_set_ready(&sc->ifp->if_snd);
3319 for (slice = 0; slice < sc->num_slices; slice++) {
3320 err = mxge_alloc_slice_rings(&sc->ss[slice],
3329 mxge_free_rings(sc);
/*
 * mxge_choose_params() - pick the big-buffer size, cluster size and
 * buffers-per-frame for a given MTU.  The frame must also carry the
 * Ethernet header, a VLAN tag and the firmware pad.  Small MTUs fit
 * a standard cluster; mid-size fit MJUMPAGESIZE; otherwise 9k jumbo
 * clusters are used (split into 4k pieces when MXGE_VIRT_JUMBOS).
 */
3336 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3338 int bufsize = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN + MXGEFW_PAD;
3340 if (bufsize < MCLBYTES) {
3341 /* easy, everything fits in a single buffer */
3342 *big_buf_size = MCLBYTES;
3343 *cl_size = MCLBYTES;
3348 if (bufsize < MJUMPAGESIZE) {
3349 /* still easy, everything still fits in a single buffer */
3350 *big_buf_size = MJUMPAGESIZE;
3351 *cl_size = MJUMPAGESIZE;
3355 #if MXGE_VIRT_JUMBOS
3356 /* now we need to use virtually contiguous buffers */
3357 *cl_size = MJUM9BYTES;
3358 *big_buf_size = 4096;
3359 *nbufs = mtu / 4096 + 1;
3360 /* needs to be a power of two, so round up */
3364 *cl_size = MJUM9BYTES;
3365 *big_buf_size = MJUM9BYTES;
/*
 * mxge_slice_open() - bring one slice online: preallocate LRO entries,
 * fetch the lanai (NIC SRAM) pointers for the send and receive rings
 * from firmware, then stock the small and big receive rings.
 * NOTE(review): error-return lines are elided from this extract.
 */
3371 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3376 struct lro_entry *lro_entry;
3381 slice = ss - sc->ss;
3383 SLIST_INIT(&ss->lro_free);
3384 SLIST_INIT(&ss->lro_active);
3386 for (i = 0; i < sc->lro_cnt; i++) {
3387 lro_entry = (struct lro_entry *)
3388 kmalloc(sizeof (*lro_entry), M_DEVBUF,
3390 if (lro_entry == NULL) {
3394 SLIST_INSERT_HEAD(&ss->lro_free, lro_entry, next);
3396 /* get the lanai pointers to the send and receive rings */
3399 #ifndef IFNET_BUF_RING
3400 /* We currently only send from the first slice */
3404 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3406 (volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3407 ss->tx.send_go = (volatile uint32_t *)
3408 (sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3409 ss->tx.send_stop = (volatile uint32_t *)
3410 (sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3411 #ifndef IFNET_BUF_RING
3415 err |= mxge_send_cmd(sc,
3416 MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3417 ss->rx_small.lanai =
3418 (volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3420 err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3422 (volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3425 device_printf(sc->dev,
3426 "failed to get ring sizes or locations\n");
3430 /* stock receive rings */
3431 for (i = 0; i <= ss->rx_small.mask; i++) {
3432 map = ss->rx_small.info[i].map;
3433 err = mxge_get_buf_small(ss, map, i);
3435 device_printf(sc->dev, "alloced %d/%d smalls\n",
3436 i, ss->rx_small.mask + 1);
/* poison big-ring shadow addresses so unfilled slots are obvious */
3440 for (i = 0; i <= ss->rx_big.mask; i++) {
3441 ss->rx_big.shadow[i].addr_low = 0xffffffff;
3442 ss->rx_big.shadow[i].addr_high = 0xffffffff;
3444 ss->rx_big.nbufs = nbufs;
3445 ss->rx_big.cl_size = cl_size;
3446 ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3447 EVL_ENCAPLEN + MXGEFW_PAD;
3448 for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3449 map = ss->rx_big.info[i].map;
3450 err = mxge_get_buf_big(ss, map, i);
3452 device_printf(sc->dev, "alloced %d/%d bigs\n",
3453 i, ss->rx_big.mask + 1);
/*
 * mxge_open() - bring the interface up.
 *
 * Resets the NIC, programs the RSS indirection table when running
 * multiple slices, tells firmware the MTU and buffer sizes, points
 * firmware at the per-slice stats DMA blocks, opens every slice,
 * issues MXGEFW_CMD_ETHERNET_UP, marks the ifnet RUNNING and starts
 * the periodic tick callout.  On failure the mbufs are freed.
 * NOTE(review): several error/return lines are elided from this extract.
 */
3461 mxge_open(mxge_softc_t *sc)
3464 int err, big_bytes, nbufs, slice, cl_size, i;
3466 volatile uint8_t *itable;
3467 struct mxge_slice_state *ss;
3469 /* Copy the MAC address in case it was overridden */
3470 bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3472 err = mxge_reset(sc, 1);
3474 device_printf(sc->dev, "failed to reset\n");
3478 if (sc->num_slices > 1) {
3479 /* setup the indirection table */
3480 cmd.data0 = sc->num_slices;
3481 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3484 err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3487 device_printf(sc->dev,
3488 "failed to setup rss tables\n");
3492 /* just enable an identity mapping */
3493 itable = sc->sram + cmd.data0;
3494 for (i = 0; i < sc->num_slices; i++)
3495 itable[i] = (uint8_t)i;
3498 cmd.data1 = mxge_rss_hash_type;
3499 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3501 device_printf(sc->dev, "failed to enable slices\n");
3507 mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3510 err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3512 /* error is only meaningful if we're trying to set
3513 MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3514 if (err && nbufs > 1) {
3515 device_printf(sc->dev,
3516 "Failed to set alway-use-n to %d\n",
3520 /* Give the firmware the mtu and the big and small buffer
3521 sizes. The firmware wants the big buf size to be a power
3522 of two. Luckily, FreeBSD's clusters are powers of two */
3523 cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
3524 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3525 cmd.data0 = MHLEN - MXGEFW_PAD;
3526 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3528 cmd.data0 = big_bytes;
3529 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3532 device_printf(sc->dev, "failed to setup params\n");
3536 /* Now give him the pointer to the stats block */
3538 #ifdef IFNET_BUF_RING
3539 slice < sc->num_slices;
3544 ss = &sc->ss[slice];
3546 MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3548 MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3549 cmd.data2 = sizeof(struct mcp_irq_data);
/* slice index rides in the upper 16 bits of data2 for STATS_DMA_V2 */
3550 cmd.data2 |= (slice << 16);
3551 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3555 bus = sc->ss->fw_stats_dma.bus_addr;
3556 bus += offsetof(struct mcp_irq_data, send_done_count);
3557 cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3558 cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3559 err = mxge_send_cmd(sc,
3560 MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3562 /* Firmware cannot support multicast without STATS_DMA_V2 */
3563 sc->fw_multicast_support = 0;
3565 sc->fw_multicast_support = 1;
3569 device_printf(sc->dev, "failed to setup params\n");
3573 for (slice = 0; slice < sc->num_slices; slice++) {
3574 err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3576 device_printf(sc->dev, "couldn't open slice %d\n",
3582 /* Finally, start the firmware running */
3583 err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3585 device_printf(sc->dev, "Couldn't bring up link\n");
3588 #ifdef IFNET_BUF_RING
3589 for (slice = 0; slice < sc->num_slices; slice++) {
3590 ss = &sc->ss[slice];
3591 ss->if_flags |= IFF_RUNNING;
3592 ss->if_flags &= ~IFF_OACTIVE;
3595 sc->ifp->if_flags |= IFF_RUNNING;
3596 sc->ifp->if_flags &= ~IFF_OACTIVE;
3597 callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3603 mxge_free_mbufs(sc);
/*
 * mxge_close() - bring the interface down: stop the tick callout,
 * clear RUNNING, send MXGEFW_CMD_ETHERNET_DOWN, wait briefly for the
 * "link down" interrupt (tracked via sc->down_cnt), then free mbufs.
 */
3609 mxge_close(mxge_softc_t *sc)
3612 int err, old_down_cnt;
3613 #ifdef IFNET_BUF_RING
3614 struct mxge_slice_state *ss;
3618 callout_stop(&sc->co_hdl);
3619 #ifdef IFNET_BUF_RING
3620 for (slice = 0; slice < sc->num_slices; slice++) {
3621 ss = &sc->ss[slice];
3622 ss->if_flags &= ~IFF_RUNNING;
3625 sc->ifp->if_flags &= ~IFF_RUNNING;
3626 old_down_cnt = sc->down_cnt;
3628 err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3630 device_printf(sc->dev, "Couldn't bring down link\n");
3632 if (old_down_cnt == sc->down_cnt) {
3633 /* wait for down irq */
3634 DELAY(10 * sc->intr_coal_delay);
3637 if (old_down_cnt == sc->down_cnt) {
3638 device_printf(sc->dev, "never got down irq\n");
3641 mxge_free_mbufs(sc);
/*
 * mxge_setup_cfg_space() - read the PCIe link width, raise the PCIe
 * max-read-request size (value 5 -> 4KB per the PCIe Device Control
 * encoding), and enable bus mastering and memory-space decode.
 * NOTE(review): offsets 0x12/0x8 from the express capability are the
 * Link Status and Device Control registers -- hard-coded here.
 */
3647 mxge_setup_cfg_space(mxge_softc_t *sc)
3649 device_t dev = sc->dev;
3651 uint16_t cmd, lnk, pectl;
3653 /* find the PCIe link width and set max read request to 4KB*/
3654 if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
3655 lnk = pci_read_config(dev, reg + 0x12, 2);
3656 sc->link_width = (lnk >> 4) & 0x3f;
3658 pectl = pci_read_config(dev, reg + 0x8, 2);
3659 pectl = (pectl & ~0x7000) | (5 << 12);
3660 pci_write_config(dev, reg + 0x8, pectl, 2);
3663 /* Enable DMA and Memory space access */
3664 pci_enable_busmaster(dev);
3665 cmd = pci_read_config(dev, PCIR_COMMAND, 2);
3666 cmd |= PCIM_CMD_MEMEN;
3667 pci_write_config(dev, PCIR_COMMAND, cmd, 2);
/*
 * mxge_read_reboot() - fetch the NIC's reboot-status word through the
 * vendor-specific PCI capability window: enable read32 mode, point the
 * window at register 0xfffffff0, and read the data register back.
 * Returns (uint32_t)-1 if the vendor capability cannot be found.
 */
3671 mxge_read_reboot(mxge_softc_t *sc)
3673 device_t dev = sc->dev;
3676 /* find the vendor specific offset */
3677 if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
3678 device_printf(sc->dev,
3679 "could not find vendor specific offset\n");
3680 return (uint32_t)-1;
3682 /* enable read32 mode */
3683 pci_write_config(dev, vs + 0x10, 0x3, 1);
3684 /* tell NIC which register to read */
3685 pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3686 return (pci_read_config(dev, vs + 0x14, 4));
/*
 * mxge_watchdog_reset() - recover from a wedged transmitter.
 *
 * Detects whether the NIC rebooted (PCI config space reads back 0xffff,
 * or the busmaster bit was cleared); if so, restores config space and
 * re-opens the interface.  If the NIC did NOT reboot, dumps the TX ring
 * state for the affected slice and leaves the device alone.
 */
3690 mxge_watchdog_reset(mxge_softc_t *sc, int slice)
3692 struct pci_devinfo *dinfo;
3700 device_printf(sc->dev, "Watchdog reset!\n");
3703 * check to see if the NIC rebooted. If it did, then all of
3704 * PCI config space has been reset, and things like the
3705 * busmaster bit will be zero. If this is the case, then we
3706 * must restore PCI config space before the NIC can be used
3709 cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3710 if (cmd == 0xffff) {
3712 * maybe the watchdog caught the NIC rebooting; wait
3713 * up to 100ms for it to finish. If it does not come
3714 * back, then give up
3717 cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3718 if (cmd == 0xffff) {
3719 device_printf(sc->dev, "NIC disappeared!\n");
3723 if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3724 /* print the reboot status */
3725 reboot = mxge_read_reboot(sc);
3726 device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3728 /* restore PCI configuration space */
3729 dinfo = device_get_ivars(sc->dev);
3730 pci_cfg_restore(sc->dev, dinfo);
3732 /* and redo any changes we made to our config space */
3733 mxge_setup_cfg_space(sc);
3735 if (sc->ifp->if_flags & IFF_RUNNING) {
3737 err = mxge_open(sc);
3740 tx = &sc->ss[slice].tx;
3741 device_printf(sc->dev,
3742 "NIC did not reboot, slice %d ring state:\n",
3744 device_printf(sc->dev,
3745 "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3746 tx->req, tx->done, tx->queue_active);
3747 device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3748 tx->activate, tx->deactivate);
3749 device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3751 be32toh(sc->ss->fw_stats->send_done_count));
3752 device_printf(sc->dev, "not resetting\n");
/*
 * mxge_watchdog() - periodic TX-stall detector (called from mxge_tick).
 * A slice is considered stalled when requests were queued but completion
 * (tx->done) has not advanced since the last tick; a pause-frame storm
 * (rx_pause advancing) is reported instead of resetting.  Also kicks
 * off a media probe if the interrupt handler requested one.
 */
3758 mxge_watchdog(mxge_softc_t *sc)
3761 uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3764 /* see if we have outstanding transmits, which
3765 have been pending for more than mxge_ticks */
3767 #ifdef IFNET_BUF_RING
3768 (i < sc->num_slices) && (err == 0);
3770 (i < 1) && (err == 0);
3774 if (tx->req != tx->done &&
3775 tx->watchdog_req != tx->watchdog_done &&
3776 tx->done == tx->watchdog_done) {
3777 /* check for pause blocking before resetting */
3778 if (tx->watchdog_rx_pause == rx_pause)
3779 err = mxge_watchdog_reset(sc, i);
3781 device_printf(sc->dev, "Flow control blocking "
3782 "xmits, check link partner\n");
/* snapshot counters for comparison on the next tick */
3785 tx->watchdog_req = tx->req;
3786 tx->watchdog_done = tx->done;
3787 tx->watchdog_rx_pause = rx_pause;
3790 if (sc->need_media_probe)
3791 mxge_media_probe(sc);
/*
 * mxge_update_stats() - sum the per-slice packet/error counters into
 * the shared ifnet statistics (plus byte/multicast/drop counters when
 * per-slice buf_rings are compiled in).
 */
3796 mxge_update_stats(mxge_softc_t *sc)
3798 struct mxge_slice_state *ss;
3799 u_long ipackets = 0;
3800 u_long opackets = 0;
3801 #ifdef IFNET_BUF_RING
3809 for (slice = 0; slice < sc->num_slices; slice++) {
3810 ss = &sc->ss[slice];
3811 ipackets += ss->ipackets;
3812 opackets += ss->opackets;
3813 #ifdef IFNET_BUF_RING
3814 obytes += ss->obytes;
3815 omcasts += ss->omcasts;
3816 odrops += ss->tx.br->br_drops;
3818 oerrors += ss->oerrors;
3820 sc->ifp->if_ipackets = ipackets;
3821 sc->ifp->if_opackets = opackets;
3822 #ifdef IFNET_BUF_RING
3823 sc->ifp->if_obytes = obytes;
3824 sc->ifp->if_omcasts = omcasts;
3825 sc->ifp->if_snd.ifq_drops = odrops;
3827 sc->ifp->if_oerrors = oerrors;
/*
 * mxge_tick() - periodic callout: aggregates stats every tick and runs
 * the watchdog every 5th tick (countdown reloads to 4), then
 * reschedules itself.  Runs under the driver lock.
 */
3831 mxge_tick(void *arg)
3833 mxge_softc_t *sc = arg;
3836 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
3837 /* aggregate stats from different slices */
3838 mxge_update_stats(sc);
3839 if (!sc->watchdog_countdown) {
3840 err = mxge_watchdog(sc);
3841 sc->watchdog_countdown = 4;
3843 sc->watchdog_countdown--;
3845 callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3846 lockmgr(&sc->driver_lock, LK_RELEASE);
/* mxge_media_change() - ifmedia change hook; body elided from this extract. */
3850 mxge_media_change(struct ifnet *ifp)
/*
 * mxge_change_mtu() - validate and apply a new MTU.  The on-wire frame
 * (mtu + Ethernet header + VLAN tag) must fit within sc->max_mtu and be
 * at least 60 bytes.  If the interface is running it is re-opened; on
 * failure the old MTU is restored and the interface re-opened with it.
 */
3856 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3858 struct ifnet *ifp = sc->ifp;
3859 int real_mtu, old_mtu;
3863 real_mtu = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
3864 if ((real_mtu > sc->max_mtu) || real_mtu < 60)
3866 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
3867 old_mtu = ifp->if_mtu;
3869 if (ifp->if_flags & IFF_RUNNING) {
3871 err = mxge_open(sc);
3873 ifp->if_mtu = old_mtu;
3875 (void) mxge_open(sc);
3878 lockmgr(&sc->driver_lock, LK_RELEASE);
/*
 * mxge_media_status() - ifmedia status hook: report link validity,
 * active/auto media and full duplex based on the cached link state.
 */
3883 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3885 mxge_softc_t *sc = ifp->if_softc;
3890 ifmr->ifm_status = IFM_AVALID;
3891 ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
3892 ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
3893 ifmr->ifm_active |= sc->link_state ? IFM_FDX : 0;
/*
 * mxge_ioctl() - interface ioctl handler.
 *
 * Visible cases: MTU change, IFF_UP/down transitions (open/close and
 * promiscuous/multicast refresh), multicast list updates, capability
 * toggles (TXCSUM/RXCSUM, TSO4 -- which requires TXCSUM --, LRO, VLAN
 * hw tagging) and media ioctls.  Capability work runs under the driver
 * lock.  NOTE(review): case labels and some returns are elided here.
 */
3897 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data, struct ucred *cr)
3899 mxge_softc_t *sc = ifp->if_softc;
3900 struct ifreq *ifr = (struct ifreq *)data;
3908 err = ether_ioctl(ifp, command, data);
3912 err = mxge_change_mtu(sc, ifr->ifr_mtu);
3916 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
3918 lockmgr(&sc->driver_lock, LK_RELEASE);
3921 if (ifp->if_flags & IFF_UP) {
3922 if (!(ifp->if_flags & IFF_RUNNING)) {
3923 err = mxge_open(sc);
3925 /* take care of promis can allmulti
3927 mxge_change_promisc(sc,
3928 ifp->if_flags & IFF_PROMISC);
3929 mxge_set_multicast_list(sc);
3932 if (ifp->if_flags & IFF_RUNNING) {
3936 lockmgr(&sc->driver_lock, LK_RELEASE);
3941 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
3942 mxge_set_multicast_list(sc);
3943 lockmgr(&sc->driver_lock, LK_RELEASE);
3947 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
3948 mask = ifr->ifr_reqcap ^ ifp->if_capenable;
3949 if (mask & IFCAP_TXCSUM) {
3950 if (IFCAP_TXCSUM & ifp->if_capenable) {
/* dropping TXCSUM also drops TSO4, which depends on it */
3951 ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
3952 ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
3955 ifp->if_capenable |= IFCAP_TXCSUM;
3956 ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
3958 } else if (mask & IFCAP_RXCSUM) {
3959 if (IFCAP_RXCSUM & ifp->if_capenable) {
3960 ifp->if_capenable &= ~IFCAP_RXCSUM;
3963 ifp->if_capenable |= IFCAP_RXCSUM;
3967 if (mask & IFCAP_TSO4) {
3968 if (IFCAP_TSO4 & ifp->if_capenable) {
3969 ifp->if_capenable &= ~IFCAP_TSO4;
3970 ifp->if_hwassist &= ~CSUM_TSO;
3971 } else if (IFCAP_TXCSUM & ifp->if_capenable) {
3972 ifp->if_capenable |= IFCAP_TSO4;
3973 ifp->if_hwassist |= CSUM_TSO;
3975 kprintf("mxge requires tx checksum offload"
3976 " be enabled to use TSO\n");
3980 if (mask & IFCAP_LRO) {
3981 if (IFCAP_LRO & ifp->if_capenable)
3982 err = mxge_change_lro_locked(sc, 0);
3984 err = mxge_change_lro_locked(sc, mxge_lro_cnt);
3986 if (mask & IFCAP_VLAN_HWTAGGING)
3987 ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
3988 lockmgr(&sc->driver_lock, LK_RELEASE);
3989 VLAN_CAPABILITIES(ifp);
3994 err = ifmedia_ioctl(ifp, (struct ifreq *)data,
3995 &sc->media, command);
/*
 * mxge_fetch_tunables() - pull hw.mxge.* loader tunables into globals
 * and the softc, then clamp them to sane ranges: coalescing delay
 * 0..10000us (else 30), tick period defaults to hz/2, RSS hash type
 * within the firmware's valid range, and initial MTU within
 * [ETHER_MIN_LEN, ETHERMTU_JUMBO].
 */
4005 mxge_fetch_tunables(mxge_softc_t *sc)
4008 TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4009 TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4010 &mxge_flow_control);
4011 TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4012 &mxge_intr_coal_delay);
4013 TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4014 &mxge_nvidia_ecrc_enable);
4015 TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4016 &mxge_force_firmware);
4017 TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4018 &mxge_deassert_wait);
4019 TUNABLE_INT_FETCH("hw.mxge.verbose",
4021 TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4022 TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
4023 TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
4024 TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4025 TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4026 if (sc->lro_cnt != 0)
4027 mxge_lro_cnt = sc->lro_cnt;
4031 if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4032 mxge_intr_coal_delay = 30;
4033 if (mxge_ticks == 0)
4034 mxge_ticks = hz / 2;
4035 sc->pause = mxge_flow_control;
4036 if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4037 || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4038 mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
4040 if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4041 mxge_initial_mtu < ETHER_MIN_LEN)
4042 mxge_initial_mtu = ETHERMTU_JUMBO;
/*
 * mxge_free_slices() - release per-slice allocations made by
 * mxge_alloc_slices(): firmware-stats DMA blocks, TX buf_ring and lock
 * (when IFNET_BUF_RING), rx_done DMA areas, and finally the slice
 * array itself.
 */
4047 mxge_free_slices(mxge_softc_t *sc)
4049 struct mxge_slice_state *ss;
4056 for (i = 0; i < sc->num_slices; i++) {
4058 if (ss->fw_stats != NULL) {
4059 mxge_dma_free(&ss->fw_stats_dma);
4060 ss->fw_stats = NULL;
4061 #ifdef IFNET_BUF_RING
4062 if (ss->tx.br != NULL) {
4063 drbr_free(ss->tx.br, M_DEVBUF);
4067 lockuninit(&ss->tx.lock);
4069 if (ss->rx_done.entry != NULL) {
4070 mxge_dma_free(&ss->rx_done.dma);
4071 ss->rx_done.entry = NULL;
4074 kfree(sc->ss, M_DEVBUF);
/*
 * mxge_alloc_slices() - allocate the slice array and per-slice
 * resources: a 4KB-aligned rx_done interrupt queue sized at twice the
 * rx ring entry count, a 64-byte-aligned firmware-stats DMA block, a
 * named TX lock, and (when IFNET_BUF_RING) a 2048-entry TX buf_ring.
 * On failure everything is unwound via mxge_free_slices().
 */
4079 mxge_alloc_slices(mxge_softc_t *sc)
4082 struct mxge_slice_state *ss;
4084 int err, i, max_intr_slots;
4086 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4088 device_printf(sc->dev, "Cannot determine rx ring size\n");
4091 sc->rx_ring_size = cmd.data0;
4092 max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4094 bytes = sizeof (*sc->ss) * sc->num_slices;
4095 sc->ss = kmalloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4098 for (i = 0; i < sc->num_slices; i++) {
4103 /* allocate per-slice rx interrupt queues */
4105 bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4106 err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4109 ss->rx_done.entry = ss->rx_done.dma.addr;
4110 bzero(ss->rx_done.entry, bytes);
4113 * allocate the per-slice firmware stats; stats
4114 * (including tx) are used used only on the first
4117 #ifndef IFNET_BUF_RING
4122 bytes = sizeof (*ss->fw_stats);
4123 err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4124 sizeof (*ss->fw_stats), 64);
4127 ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4128 ksnprintf(ss->tx.lock_name, sizeof(ss->tx.lock_name),
4129 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
4130 lockinit(&ss->tx.lock, ss->tx.lock_name, 0, LK_CANRECURSE);
4131 #ifdef IFNET_BUF_RING
4132 ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4140 mxge_free_slices(sc);
/*
 * mxge_slice_probe() - decide how many RSS slices to use.
 *
 * Skips multi-slice entirely when disabled by tunable or on UP systems;
 * otherwise loads the RSS-capable firmware variant matching the current
 * (aligned/unaligned) choice, resets the NIC, sizes the interrupt
 * queue, queries MXGEFW_CMD_GET_MAX_RSS_QUEUES, and caps the result by
 * MSI-X vector count, ncpus / the max_slices tunable, and rounds down
 * to a power of two.  On any failure it falls back to the original
 * single-slice firmware.
 */
4145 mxge_slice_probe(mxge_softc_t *sc)
4149 int msix_cnt, status, max_intr_slots;
4153 * don't enable multiple slices if they are not enabled,
4154 * or if this is not an SMP system
4157 if (mxge_max_slices == 0 || mxge_max_slices == 1 || ncpus < 2)
4160 /* see how many MSI-X interrupts are available */
4161 msix_cnt = pci_msix_count(sc->dev);
4165 /* now load the slice aware firmware see what it supports */
4166 old_fw = sc->fw_name;
4167 if (old_fw == mxge_fw_aligned)
4168 sc->fw_name = mxge_fw_rss_aligned;
4170 sc->fw_name = mxge_fw_rss_unaligned;
4171 status = mxge_load_firmware(sc, 0);
4173 device_printf(sc->dev, "Falling back to a single slice\n");
4177 /* try to send a reset command to the card to see if it
4179 memset(&cmd, 0, sizeof (cmd));
4180 status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4182 device_printf(sc->dev, "failed reset\n");
4186 /* get rx ring size */
4187 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4189 device_printf(sc->dev, "Cannot determine rx ring size\n");
4192 max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4194 /* tell it the size of the interrupt queues */
4195 cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4196 status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4198 device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4202 /* ask the maximum number of slices it supports */
4203 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4205 device_printf(sc->dev,
4206 "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4209 sc->num_slices = cmd.data0;
4210 if (sc->num_slices > msix_cnt)
4211 sc->num_slices = msix_cnt;
4213 if (mxge_max_slices == -1) {
4214 /* cap to number of CPUs in system */
4215 if (sc->num_slices > ncpus)
4216 sc->num_slices = ncpus;
4218 if (sc->num_slices > mxge_max_slices)
4219 sc->num_slices = mxge_max_slices;
4221 /* make sure it is a power of two */
4222 while (sc->num_slices & (sc->num_slices - 1))
4226 device_printf(sc->dev, "using %d slices\n",
/* fall back: restore the single-slice firmware on any error above */
4232 sc->fw_name = old_fw;
4233 (void) mxge_load_firmware(sc, 0);
/*
 * mxge_add_msix_irqs() - allocate the MSI-X table BAR, one MSI-X
 * vector per slice, the IRQ resources and handlers (mxge_intr with the
 * slice state as argument).  Unwinds everything in reverse order via
 * the abort_with_* labels on failure.
 * NOTE(review): the XXX serializer argument to bus_setup_intr is a
 * known porting placeholder in the original source, preserved here.
 */
4237 mxge_add_msix_irqs(mxge_softc_t *sc)
4240 int count, err, i, rid;
4243 sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4246 if (sc->msix_table_res == NULL) {
4247 device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4251 count = sc->num_slices;
4252 err = pci_alloc_msix(sc->dev, &count);
4254 device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d"
4255 "err = %d \n", sc->num_slices, err);
4256 goto abort_with_msix_table;
4258 if (count < sc->num_slices) {
4259 device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4260 count, sc->num_slices);
4261 device_printf(sc->dev,
4262 "Try setting hw.mxge.max_slices to %d\n",
4265 goto abort_with_msix;
4267 bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4268 sc->msix_irq_res = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4269 if (sc->msix_irq_res == NULL) {
4271 goto abort_with_msix;
4274 for (i = 0; i < sc->num_slices; i++) {
4276 sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4279 if (sc->msix_irq_res[i] == NULL) {
4280 device_printf(sc->dev, "couldn't allocate IRQ res"
4281 " for message %d\n", i);
4283 goto abort_with_res;
4287 bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4288 sc->msix_ih = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4290 for (i = 0; i < sc->num_slices; i++) {
4291 err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4293 mxge_intr, &sc->ss[i], &sc->msix_ih[i],
4294 XXX /* serializer */);
4296 device_printf(sc->dev, "couldn't setup intr for "
4298 goto abort_with_intr;
4303 device_printf(sc->dev, "using %d msix IRQs:",
4305 for (i = 0; i < sc->num_slices; i++)
4306 kprintf(" %ld", rman_get_start(sc->msix_irq_res[i]));
/* ---- error unwinding, reverse order of acquisition ---- */
4312 for (i = 0; i < sc->num_slices; i++) {
4313 if (sc->msix_ih[i] != NULL) {
4314 bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4316 sc->msix_ih[i] = NULL;
4319 kfree(sc->msix_ih, M_DEVBUF);
4323 for (i = 0; i < sc->num_slices; i++) {
4325 if (sc->msix_irq_res[i] != NULL)
4326 bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4327 sc->msix_irq_res[i]);
4328 sc->msix_irq_res[i] = NULL;
4330 kfree(sc->msix_irq_res, M_DEVBUF);
4334 pci_release_msi(sc->dev);
4336 abort_with_msix_table:
4337 bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4338 sc->msix_table_res);
/*
 * Interrupt setup for the single-slice case.
 *
 * Prefers a single MSI message when the device offers exactly one;
 * otherwise falls back to a shared legacy INTx line (sc->legacy_irq).
 * Hooks mxge_intr with slice 0 as its argument and releases the
 * IRQ/MSI resources if handler installation fails.
 */
4344 mxge_add_single_irq(mxge_softc_t *sc)
4346 	int count, err, rid;
/* Use MSI only when the device advertises exactly one message. */
4348 	count = pci_msi_count(sc->dev);
4349 	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4355 	sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0,
4356 	1, RF_SHAREABLE | RF_ACTIVE);
4357 	if (sc->irq_res == NULL) {
4358 		device_printf(sc->dev, "could not alloc interrupt\n");
4362 	device_printf(sc->dev, "using %s irq %ld\n",
4363 	sc->legacy_irq ? "INTx" : "MSI",
4364 	rman_get_start(sc->irq_res));
4365 	err = bus_setup_intr(sc->dev, sc->irq_res,
4367 	mxge_intr, &sc->ss[0], &sc->ih,
/*
 * FIXME: "XXX" is an unported placeholder for DragonFly's interrupt
 * serializer argument to bus_setup_intr(); will not compile as-is.
 */
4368 	XXX /* serializer */);
/* Failure unwind: rid 0 is legacy INTx, rid 1 is the MSI message. */
4370 		bus_release_resource(sc->dev, SYS_RES_IRQ,
4371 		sc->legacy_irq ? 0 : 1, sc->irq_res);
4372 		if (!sc->legacy_irq)
4373 			pci_release_msi(sc->dev);
/*
 * Tear down everything mxge_add_msix_irqs() set up: per-slice
 * interrupt handlers, per-slice IRQ resources, the MSI-X table BAR
 * mapping, and the MSI-X message allocation -- in reverse order of
 * acquisition.  Mirrors the error-unwind path of the add routine.
 */
4379 mxge_rem_msix_irqs(mxge_softc_t *sc)
4383 	for (i = 0; i < sc->num_slices; i++) {
4384 		if (sc->msix_ih[i] != NULL) {
4385 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4387 			sc->msix_ih[i] = NULL;
4390 	kfree(sc->msix_ih, M_DEVBUF);
4392 	for (i = 0; i < sc->num_slices; i++) {
4394 		if (sc->msix_irq_res[i] != NULL)
4395 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4396 			sc->msix_irq_res[i]);
4397 		sc->msix_irq_res[i] = NULL;
4399 	kfree(sc->msix_irq_res, M_DEVBUF);
/* Release the MSI-X table BAR mapped by the add routine. */
4401 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4402 	sc->msix_table_res);
4404 	pci_release_msi(sc->dev);
/*
 * Undo mxge_add_single_irq(): remove the handler, release the IRQ
 * resource (rid 0 for legacy INTx, rid 1 for MSI), and release the
 * MSI allocation if one was made.
 */
4409 mxge_rem_single_irq(mxge_softc_t *sc)
4411 	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4412 	bus_release_resource(sc->dev, SYS_RES_IRQ,
4413 	sc->legacy_irq ? 0 : 1, sc->irq_res);
4414 	if (!sc->legacy_irq)
4415 		pci_release_msi(sc->dev);
/*
 * Interrupt teardown dispatcher: multi-slice devices used MSI-X,
 * single-slice devices used a lone MSI/INTx vector.
 */
4419 mxge_rem_irq(mxge_softc_t *sc)
4421 	if (sc->num_slices > 1)
4422 		mxge_rem_msix_irqs(sc);
4424 		mxge_rem_single_irq(sc);
/*
 * Interrupt setup dispatcher: use MSI-X when running multiple
 * slices, otherwise a single MSI/INTx vector.
 */
4428 mxge_add_irq(mxge_softc_t *sc)
4432 	if (sc->num_slices > 1)
4433 		err = mxge_add_msix_irqs(sc);
4435 		err = mxge_add_single_irq(sc);
/*
 * NOTE(review): "if (0 && ...)" makes this branch deliberately dead
 * code; it looks like a debugging hook (re-add the MSI-X IRQs) kept
 * from the upstream FreeBSD driver.  Consider removing it outright.
 */
4437 	if (0 && err == 0 && sc->num_slices > 1) {
4438 		mxge_rem_msix_irqs(sc);
4439 		err = mxge_add_msix_irqs(sc);
/*
 * Device attach entry point.
 *
 * Order of operations: fetch tunables, create the parent DMA tag,
 * init the command/driver locks, map the NIC's SRAM BAR, copy and
 * parse the EEPROM strings (MAC address etc.), allocate the
 * out-of-band DMA buffers (command response, zero pad, dma bench),
 * select/load firmware, probe and allocate slices and rings, hook
 * the interrupt, and finally configure and attach the ifnet.  Every
 * failure unwinds through the abort_with_* labels in reverse order
 * of acquisition.
 */
4446 mxge_attach(device_t dev)
4448 	mxge_softc_t *sc = device_get_softc(dev);
4449 	struct ifnet *ifp = &sc->arpcom.ac_if;
4453 	 * avoid rewriting half the lines in this file to use
4454 	 * &sc->arpcom.ac_if instead
4458 	mxge_fetch_tunables(sc);
/*
 * Parent DMA tag for all of the driver's DMA memory: up to
 * 64KB + 256 bytes per mapping, MXGE_MAX_SEND_DESC segments.
 */
4460 	err = bus_dma_tag_create(NULL, /* parent */
4463 	BUS_SPACE_MAXADDR, /* low */
4464 	BUS_SPACE_MAXADDR, /* high */
4465 	NULL, NULL, /* filter */
4466 	65536 + 256, /* maxsize */
4467 	MXGE_MAX_SEND_DESC, /* num segs */
4468 	65536, /* maxsegsize */
4470 	&sc->parent_dmat); /* tag */
4473 		device_printf(sc->dev, "Err %d allocating parent dmat\n",
4475 		goto abort_with_nothing;
4479 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
/* Per-device locks, named "<unit>:cmd" and "<unit>:drv" for lock debugging. */
4481 	ksnprintf(sc->cmd_lock_name, sizeof(sc->cmd_lock_name), "%s:cmd",
4482 	device_get_nameunit(dev));
4483 	lockinit(&sc->cmd_lock, sc->cmd_lock_name, 0, LK_CANRECURSE);
4484 	ksnprintf(sc->driver_lock_name, sizeof(sc->driver_lock_name),
4485 	"%s:drv", device_get_nameunit(dev));
4486 	lockinit(&sc->driver_lock, sc->driver_lock_name,
4489 	callout_init(&sc->co_hdl);
4491 	mxge_setup_cfg_space(sc);
4493 	/* Map the board into the kernel */
4495 	sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
4497 	if (sc->mem_res == NULL) {
4498 		device_printf(dev, "could not map memory\n");
4500 		goto abort_with_lock;
4502 	sc->sram = rman_get_virtual(sc->mem_res);
/*
 * Usable SRAM: 2MB window minus reserved regions (2*48KB + 32KB)
 * minus 0x100 -- NOTE(review): the exact meaning of the reserved
 * areas is not evident from this file; presumably firmware/board
 * scratch space.  Sanity-check against the actual BAR size.
 */
4503 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4504 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4505 		device_printf(dev, "impossible memory region size %ld\n",
4506 		rman_get_size(sc->mem_res));
4508 		goto abort_with_mem_res;
4511 	/* make NULL terminated copy of the EEPROM strings section of
4513 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4514 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4515 	rman_get_bushandle(sc->mem_res),
4516 	sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4518 	MXGE_EEPROM_STRINGS_SIZE - 2);
4519 	err = mxge_parse_strings(sc);
4521 		goto abort_with_mem_res;
4523 	/* Enable write combining for efficient use of PCIe bus */
4526 	/* Allocate the out of band dma memory */
4527 	err = mxge_dma_alloc(sc, &sc->cmd_dma,
4528 	sizeof (mxge_cmd_t), 64);
4530 		goto abort_with_mem_res;
4531 	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4532 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4534 		goto abort_with_cmd_dma;
4536 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4538 		goto abort_with_zeropad_dma;
4540 	/* select & load the firmware */
4541 	err = mxge_select_firmware(sc);
4543 		goto abort_with_dmabench;
4544 	sc->intr_coal_delay = mxge_intr_coal_delay;
4546 	mxge_slice_probe(sc);
4547 	err = mxge_alloc_slices(sc);
4549 		goto abort_with_dmabench;
4551 	err = mxge_reset(sc, 0);
4553 		goto abort_with_slices;
4555 	err = mxge_alloc_rings(sc);
4557 		device_printf(sc->dev, "failed to allocate rings\n");
4558 		goto abort_with_dmabench;
4561 	err = mxge_add_irq(sc);
4563 		device_printf(sc->dev, "failed to add irq\n");
4564 		goto abort_with_rings;
/* Publish ifnet capabilities, handlers, and media, then attach. */
4567 	ifp->if_baudrate = IF_Gbps(10UL);
4568 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4571 	ifp->if_capabilities |= IFCAP_LRO;
4574 #ifdef MXGE_NEW_VLAN_API
4575 	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
/* Jumbo frames only when the loaded firmware supports >= 9000-byte MTU. */
4578 	sc->max_mtu = mxge_max_mtu(sc);
4579 	if (sc->max_mtu >= 9000)
4580 		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4582 		device_printf(dev, "MTU limited to %d. Install "
4583 		"latest firmware for 9000 byte jumbo support\n",
4584 		sc->max_mtu - ETHER_HDR_LEN);
4585 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4586 	ifp->if_capenable = ifp->if_capabilities;
4587 	if (sc->lro_cnt == 0)
4588 		ifp->if_capenable &= ~IFCAP_LRO;
4590 	ifp->if_init = mxge_init;
4592 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4593 	ifp->if_ioctl = mxge_ioctl;
4594 	ifp->if_start = mxge_start;
4595 	/* Initialise the ifmedia structure */
4596 	ifmedia_init(&sc->media, 0, mxge_media_change,
4598 	mxge_set_media(sc, IFM_ETHER | IFM_AUTO);
4599 	mxge_media_probe(sc);
4601 	ether_ifattach(ifp, sc->mac_addr);
4602 	/* ether_ifattach sets mtu to ETHERMTU */
4603 	if (mxge_initial_mtu != ETHERMTU)
4604 		mxge_change_mtu(sc, mxge_initial_mtu);
4606 	mxge_add_sysctls(sc);
4607 #ifdef IFNET_BUF_RING
4608 	ifp->if_transmit = mxge_transmit;
4609 	ifp->if_qflush = mxge_qflush;
/* Error unwind: free everything acquired above, in reverse order. */
4614 	mxge_free_rings(sc);
4616 	mxge_free_slices(sc);
4617  abort_with_dmabench:
4618 	mxge_dma_free(&sc->dmabench_dma);
4619  abort_with_zeropad_dma:
4620 	mxge_dma_free(&sc->zeropad_dma);
4622 	mxge_dma_free(&sc->cmd_dma);
4624 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4626 	pci_disable_busmaster(dev);
4627 	lockuninit(&sc->cmd_lock);
4628 	lockuninit(&sc->driver_lock);
4630  abort_with_parent_dmat:
4631 	bus_dma_tag_destroy(sc->parent_dmat);
/*
 * Device detach entry point.
 *
 * Refuses to detach while VLAN interfaces are still configured on
 * the port, stops the running interface under the driver lock,
 * detaches the ifnet, then releases every resource acquired in
 * mxge_attach() in reverse order (same sequence as its unwind path).
 */
4638 mxge_detach(device_t dev)
4640 	mxge_softc_t *sc = device_get_softc(dev);
4642 	if (mxge_vlans_active(sc)) {
4643 		device_printf(sc->dev,
4644 		"Detach vlans before removing module\n");
/* Stop the interface (if running) while holding the driver lock. */
4647 	lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
4649 	if (sc->ifp->if_flags & IFF_RUNNING)
4651 	lockmgr(&sc->driver_lock, LK_RELEASE);
4652 	ether_ifdetach(sc->ifp);
4653 	callout_drain(&sc->co_hdl);
4654 	ifmedia_removeall(&sc->media);
4655 	mxge_dummy_rdma(sc, 0);
4656 	mxge_rem_sysctls(sc);
/* Release in reverse order of acquisition, mirroring mxge_attach(). */
4658 	mxge_free_rings(sc);
4659 	mxge_free_slices(sc);
4660 	mxge_dma_free(&sc->dmabench_dma);
4661 	mxge_dma_free(&sc->zeropad_dma);
4662 	mxge_dma_free(&sc->cmd_dma);
4663 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4664 	pci_disable_busmaster(dev);
4665 	lockuninit(&sc->cmd_lock);
4666 	lockuninit(&sc->driver_lock);
4668 	bus_dma_tag_destroy(sc->parent_dmat);
/* Device shutdown entry point. */
4673 mxge_shutdown(device_t dev)
4679 This file uses Myri10GE driver indentation.
4682 c-file-style:"linux"