1 /******************************************************************************
3 Copyright (c) 2006-2009, Myricom Inc.
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
9 1. Redistributions of source code must retain the above copyright notice,
10 this list of conditions and the following disclaimer.
12 2. Neither the name of the Myricom Inc, nor the names of its
13 contributors may be used to endorse or promote products derived from
14 this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
28 ***************************************************************************/
30 #include <sys/cdefs.h>
31 /*__FBSDID("$FreeBSD: src/sys/dev/mxge/if_mxge.c,v 1.63 2009/06/26 11:45:06 rwatson Exp $");*/
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/linker.h>
36 #include <sys/firmware.h>
37 #include <sys/endian.h>
38 #include <sys/in_cksum.h>
39 #include <sys/sockio.h>
41 #include <sys/malloc.h>
42 #include <sys/kernel.h>
44 #include <sys/module.h>
45 #include <sys/socket.h>
46 #include <sys/sysctl.h>
48 /* count xmits ourselves, rather than via drbr */
51 #include <net/if_arp.h>
52 #include <net/ifq_var.h>
53 #include <net/ethernet.h>
54 #include <net/if_dl.h>
55 #include <net/if_media.h>
59 #include <net/if_types.h>
60 #include <net/vlan/if_vlan_var.h>
63 #include <netinet/in_systm.h>
64 #include <netinet/in.h>
65 #include <netinet/ip.h>
66 #include <netinet/tcp.h>
71 #include <bus/pci/pcireg.h>
72 #include <bus/pci/pcivar.h>
73 #include <bus/pci/pci_private.h> /* XXX for pci_cfg_restore */
75 #include <vm/vm.h> /* for pmap_mapdev() */
78 #if defined(__i386) || defined(__amd64)
79 #include <machine/specialreg.h>
82 #include <dev/netif/mxge/mxge_mcp.h>
83 #include <dev/netif/mxge/mcp_gen_header.h>
84 /*#define MXGE_FAKE_IFP*/
85 #include <dev/netif/mxge/if_mxge_var.h>
87 #include <sys/buf_ring.h>
93 static int mxge_nvidia_ecrc_enable = 1;
94 static int mxge_force_firmware = 0;
95 static int mxge_intr_coal_delay = 30;
96 static int mxge_deassert_wait = 1;
97 static int mxge_flow_control = 1;
98 static int mxge_verbose = 0;
99 static int mxge_lro_cnt = 8;
100 static int mxge_ticks;
101 static int mxge_max_slices = 1;
102 static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
103 static int mxge_always_promisc = 0;
104 static int mxge_initial_mtu = ETHERMTU_JUMBO;
105 static char *mxge_fw_unaligned = "mxge_ethp_z8e";
106 static char *mxge_fw_aligned = "mxge_eth_z8e";
107 static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
108 static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
110 static int mxge_probe(device_t dev);
111 static int mxge_attach(device_t dev);
112 static int mxge_detach(device_t dev);
113 static int mxge_shutdown(device_t dev);
114 static void mxge_intr(void *arg);
116 static device_method_t mxge_methods[] =
118 /* Device interface */
119 DEVMETHOD(device_probe, mxge_probe),
120 DEVMETHOD(device_attach, mxge_attach),
121 DEVMETHOD(device_detach, mxge_detach),
122 DEVMETHOD(device_shutdown, mxge_shutdown),
126 static driver_t mxge_driver =
130 sizeof(mxge_softc_t),
133 static devclass_t mxge_devclass;
135 /* Declare ourselves to be a child of the PCI bus.*/
136 DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
137 MODULE_DEPEND(mxge, firmware, 1, 1, 1);
138 MODULE_DEPEND(mxge, zlib, 1, 1, 1);
140 static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
141 static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
142 static int mxge_close(mxge_softc_t *sc);
143 static int mxge_open(mxge_softc_t *sc);
144 static void mxge_tick(void *arg);
147 mxge_probe(device_t dev)
152 if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
153 ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
154 (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
155 rev = pci_get_revid(dev);
157 case MXGE_PCI_REV_Z8E:
158 device_set_desc(dev, "Myri10G-PCIE-8A");
160 case MXGE_PCI_REV_Z8ES:
161 device_set_desc(dev, "Myri10G-PCIE-8B");
164 device_set_desc(dev, "Myri10G-PCIE-8??");
165 device_printf(dev, "Unrecognized rev %d NIC\n",
175 mxge_enable_wc(mxge_softc_t *sc)
178 #if defined(__i386) || defined(__amd64)
183 len = rman_get_size(sc->mem_res);
184 err = pmap_change_attr((vm_offset_t) sc->sram,
185 len, PAT_WRITE_COMBINING);
187 device_printf(sc->dev, "pmap_change_attr failed, %d\n",
193 sc->wc = 0; /* TBD: PAT support */
198 /* callback to get our DMA address */
200 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
204 *(bus_addr_t *) arg = segs->ds_addr;
209 mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
210 bus_size_t alignment)
213 device_t dev = sc->dev;
214 bus_size_t boundary, maxsegsize;
216 if (bytes > 4096 && alignment == 4096) {
224 /* allocate DMAable memory tags */
225 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
226 alignment, /* alignment */
227 boundary, /* boundary */
228 BUS_SPACE_MAXADDR, /* low */
229 BUS_SPACE_MAXADDR, /* high */
230 NULL, NULL, /* filter */
233 maxsegsize, /* maxsegsize */
234 BUS_DMA_COHERENT, /* flags */
235 &dma->dmat); /* tag */
237 device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
241 /* allocate DMAable memory & map */
242 err = bus_dmamem_alloc(dma->dmat, &dma->addr,
243 (BUS_DMA_WAITOK | BUS_DMA_COHERENT
244 | BUS_DMA_ZERO), &dma->map);
246 device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
247 goto abort_with_dmat;
250 /* load the memory */
251 err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
252 mxge_dmamap_callback,
253 (void *)&dma->bus_addr, 0);
255 device_printf(dev, "couldn't load map (err = %d)\n", err);
261 bus_dmamem_free(dma->dmat, dma->addr, dma->map);
263 (void)bus_dma_tag_destroy(dma->dmat);
269 mxge_dma_free(mxge_dma_t *dma)
271 bus_dmamap_unload(dma->dmat, dma->map);
272 bus_dmamem_free(dma->dmat, dma->addr, dma->map);
273 (void)bus_dma_tag_destroy(dma->dmat);
277 * The eeprom strings on the lanaiX have the format
284 mxge_parse_strings(mxge_softc_t *sc)
286 #define MXGE_NEXT_STRING(p) while(ptr < limit && *ptr++)
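/*
 * Illustrative sample of the EEPROM string block parsed below (a
 * sequence of NUL-terminated "KEY=value" strings ending with an empty
 * string).  The MAC uses Myricom's 00:60:dd OUI; the product code and
 * serial number values here are made up.
 */
#if 0	/* example only, not compiled */
static const char example_eeprom_strings[] =
	"MAC=00:60:dd:47:87:01\0"
	"PC=8B-00001\0"
	"SN=123456\0";
#endif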
291 ptr = sc->eeprom_strings;
292 limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
294 while (ptr < limit && *ptr != '\0') {
295 if (memcmp(ptr, "MAC=", 4) == 0) {
297 sc->mac_addr_string = ptr;
298 for (i = 0; i < 6; i++) {
300 if ((ptr + 2) > limit)
302 sc->mac_addr[i] = strtoul(ptr, NULL, 16);
305 } else if (memcmp(ptr, "PC=", 3) == 0) {
307 strncpy(sc->product_code_string, ptr,
308 sizeof (sc->product_code_string) - 1);
309 } else if (memcmp(ptr, "SN=", 3) == 0) {
311 strncpy(sc->serial_number_string, ptr,
312 sizeof (sc->serial_number_string) - 1);
314 MXGE_NEXT_STRING(ptr);
321 device_printf(sc->dev, "failed to parse eeprom_strings\n");
326 #if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
328 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
331 unsigned long base, off;
333 device_t pdev, mcp55;
334 uint16_t vendor_id, device_id, word;
335 uintptr_t bus, slot, func, ivend, idev;
339 if (!mxge_nvidia_ecrc_enable)
342 pdev = device_get_parent(device_get_parent(sc->dev));
344 device_printf(sc->dev, "could not find parent?\n");
347 vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
348 device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
350 if (vendor_id != 0x10de)
355 if (device_id == 0x005d) {
356 /* ck804, base address is magic */
358 } else if (device_id >= 0x0374 && device_id <= 0x378) {
359 /* mcp55, base address stored in chipset */
360 mcp55 = pci_find_bsf(0, 0, 0);
362 0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
363 0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
364 word = pci_read_config(mcp55, 0x90, 2);
365 base = ((unsigned long)word & 0x7ffeU) << 25;
372 Test below is commented because it is believed that doing
373 config read/write beyond 0xff will access the config space
374 for the next larger function. Uncomment this and remove
375 the hacky pmap_mapdev() way of accessing config space when
376 FreeBSD grows support for extended pcie config space access
379 /* See if we can, by some miracle, access the extended
381 val = pci_read_config(pdev, 0x178, 4);
382 if (val != 0xffffffff) {
384 pci_write_config(pdev, 0x178, val, 4);
388 /* Rather than using normal pci config space writes, we must
389 * map the Nvidia config space ourselves. This is because on
390 * opteron/nvidia class machine the 0xe000000 mapping is
391 * handled by the nvidia chipset, that means the internal PCI
392 * device (the on-chip northbridge), or the amd-8131 bridge
393 * and things behind them are not visible by this method.
396 BUS_READ_IVAR(device_get_parent(pdev), pdev,
398 BUS_READ_IVAR(device_get_parent(pdev), pdev,
399 PCI_IVAR_SLOT, &slot);
400 BUS_READ_IVAR(device_get_parent(pdev), pdev,
401 PCI_IVAR_FUNCTION, &func);
402 BUS_READ_IVAR(device_get_parent(pdev), pdev,
403 PCI_IVAR_VENDOR, &ivend);
404 BUS_READ_IVAR(device_get_parent(pdev), pdev,
405 PCI_IVAR_DEVICE, &idev);
408 + 0x00100000UL * (unsigned long)bus
409 + 0x00001000UL * (unsigned long)(func
412 /* map it into the kernel */
413 va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
417 device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
420 /* get a pointer to the config space mapped into the kernel */
421 cfgptr = va + (off & PAGE_MASK);
423 /* make sure that we can really access it */
424 vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
425 device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
426 if (! (vendor_id == ivend && device_id == idev)) {
427 device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
428 vendor_id, device_id);
429 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
433 ptr32 = (uint32_t*)(cfgptr + 0x178);
436 if (val == 0xffffffff) {
437 device_printf(sc->dev, "extended mapping failed\n");
438 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
442 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
444 device_printf(sc->dev,
445 "Enabled ECRC on upstream Nvidia bridge "
447 (int)bus, (int)slot, (int)func);
452 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
454 device_printf(sc->dev,
455 "Nforce 4 chipset on non-x86/amd64!?!?!\n");
462 mxge_dma_test(mxge_softc_t *sc, int test_type)
465 bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
471 /* Run a small DMA test.
472 * The magic multipliers to the length tell the firmware
473 * to do DMA read, write, or read+write tests. The
474 * results are returned in cmd.data0. The upper 16
475 * bits of the return is the number of transfers completed.
476 * The lower 16 bits is the time in 0.5us ticks that the
477 * transfers took to complete.
480 len = sc->tx_boundary;
482 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
483 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
484 cmd.data2 = len * 0x10000;
485 status = mxge_send_cmd(sc, test_type, &cmd);
490 sc->read_dma = ((cmd.data0>>16) * len * 2) /
491 (cmd.data0 & 0xffff);
492 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
493 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
494 cmd.data2 = len * 0x1;
495 status = mxge_send_cmd(sc, test_type, &cmd);
500 sc->write_dma = ((cmd.data0>>16) * len * 2) /
501 (cmd.data0 & 0xffff);
503 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
504 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
505 cmd.data2 = len * 0x10001;
506 status = mxge_send_cmd(sc, test_type, &cmd);
511 sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
512 (cmd.data0 & 0xffff);
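/*
 * Illustrative sketch (not part of the driver): how the packed result
 * in cmd.data0 becomes the MB/s figures stored above.  The high 16
 * bits count completed transfers, the low 16 bits count 0.5us ticks,
 * so bytes per microsecond (== MB/s) is transfers * len * 2 / ticks.
 * The sample numbers below are hypothetical.
 */
#if 0	/* example only, not compiled */
static inline int
example_dma_result_MBs(uint32_t data0, int len)
{
	uint32_t transfers = data0 >> 16;	/* DMAs completed */
	uint32_t ticks = data0 & 0xffff;	/* elapsed 0.5us ticks */

	/* e.g. 100 transfers of 4096 bytes in 256 ticks -> 3200 MB/s */
	return ((transfers * len * 2) / ticks);
}
#endif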
515 if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
516 device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
523 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
524 * when the PCI-E Completion packets are aligned on an 8-byte
525 * boundary. Some PCI-E chip sets always align Completion packets; on
526 * the ones that do not, the alignment can be enforced by enabling
527 * ECRC generation (if supported).
529 * When PCI-E Completion packets are not aligned, it is actually more
530 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
532 * If the driver can neither enable ECRC nor verify that it has
533 * already been enabled, then it must use a firmware image which works
534 * around unaligned completion packets (ethp_z8e.dat), and it should
535 * also ensure that it never gives the device a Read-DMA which is
536 * larger than 2KB by setting the tx_boundary to 2KB. If ECRC is
537 * enabled, then the driver should use the aligned (eth_z8e.dat)
538 * firmware image, and set tx_boundary to 4KB.
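/*
 * Illustrative sketch (not part of the driver): the Max Read Request
 * Size check in mxge_firmware_probe() below reads the PCIe Device
 * Control register (capability offset + 0x8); bits 14:12 encode the
 * MRRS as a power of two, 0 -> 128 bytes up to 5 -> 4096 bytes, which
 * is why the probe tests for the 4KB encoding (5 << 12) before
 * attempting 4KB read DMAs.
 */
#if 0	/* example only, not compiled */
static inline int
example_mrrs_bytes(uint16_t pectl)
{
	return (128 << ((pectl >> 12) & 0x7));
}
#endif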
542 mxge_firmware_probe(mxge_softc_t *sc)
544 device_t dev = sc->dev;
548 sc->tx_boundary = 4096;
550 * Verify the max read request size was set to 4KB
551 * before trying the test with 4KB.
553 if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
554 pectl = pci_read_config(dev, reg + 0x8, 2);
555 if ((pectl & (5 << 12)) != (5 << 12)) {
556 device_printf(dev, "Max Read Req. size != 4k (0x%x)\n",
558 sc->tx_boundary = 2048;
563 * load the optimized firmware (which assumes aligned PCIe
564 * completions) in order to see if it works on this host.
566 sc->fw_name = mxge_fw_aligned;
567 status = mxge_load_firmware(sc, 1);
573 * Enable ECRC if possible
575 mxge_enable_nvidia_ecrc(sc);
578 * Run a DMA test which watches for unaligned completions and
579 * aborts on the first one seen.
582 status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
584 return 0; /* keep the aligned firmware */
587 device_printf(dev, "DMA test failed: %d\n", status);
588 if (status == ENOSYS)
589 device_printf(dev, "Falling back to ethp! "
590 "Please install up to date fw\n");
595 mxge_select_firmware(mxge_softc_t *sc)
600 if (mxge_force_firmware != 0) {
601 if (mxge_force_firmware == 1)
606 device_printf(sc->dev,
607 "Assuming %s completions (forced)\n",
608 aligned ? "aligned" : "unaligned");
612 /* if the PCIe link width is 4 or less, we can use the aligned
613 firmware and skip any checks */
614 if (sc->link_width != 0 && sc->link_width <= 4) {
615 device_printf(sc->dev,
616 "PCIe x%d Link, expect reduced performance\n",
622 if (0 == mxge_firmware_probe(sc))
627 sc->fw_name = mxge_fw_aligned;
628 sc->tx_boundary = 4096;
630 sc->fw_name = mxge_fw_unaligned;
631 sc->tx_boundary = 2048;
633 return (mxge_load_firmware(sc, 0));
643 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
647 if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
648 device_printf(sc->dev, "Bad firmware type: 0x%x\n",
649 be32toh(hdr->mcp_type));
653 /* save firmware version for sysctl */
654 strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
656 device_printf(sc->dev, "firmware id: %s\n", hdr->version);
658 ksscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
659 &sc->fw_ver_minor, &sc->fw_ver_tiny);
661 if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
662 && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
663 device_printf(sc->dev, "Found firmware version %s\n",
665 device_printf(sc->dev, "Driver needs %d.%d\n",
666 MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
674 z_alloc(void *nil, u_int items, u_int size)
678 ptr = kmalloc(items * size, M_TEMP, M_NOWAIT);
683 z_free(void *nil, void *ptr)
690 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
693 const mcp_gen_header_t *hdr;
700 fw = firmware_image_load(sc->fw_name, NULL);
702 device_printf(sc->dev, "Could not find firmware image %s\n",
707 /* setup zlib and decompress f/w */
708 bzero(&zs, sizeof (zs));
711 status = inflateInit(&zs);
712 if (status != Z_OK) {
717 /* the uncompressed size is stored as the firmware version,
718 which would otherwise go unused */
719 fw_len = (size_t) fw->version;
720 inflate_buffer = kmalloc(fw_len, M_TEMP, M_NOWAIT);
721 if (inflate_buffer == NULL)
723 zs.avail_in = fw->datasize;
724 zs.next_in = __DECONST(char *, fw->data);
725 zs.avail_out = fw_len;
726 zs.next_out = inflate_buffer;
727 status = inflate(&zs, Z_FINISH);
728 if (status != Z_STREAM_END) {
729 device_printf(sc->dev, "zlib %d\n", status);
731 goto abort_with_buffer;
735 hdr_offset = htobe32(*(const uint32_t *)
736 (inflate_buffer + MCP_HEADER_PTR_OFFSET));
737 if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
738 device_printf(sc->dev, "Bad firmware file");
742 hdr = (const void*)(inflate_buffer + hdr_offset);
744 status = mxge_validate_firmware(sc, hdr);
748 /* Copy the inflated firmware to NIC SRAM. */
749 for (i = 0; i < fw_len; i += 256) {
750 mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
752 min(256U, (unsigned)(fw_len - i)));
762 kfree(inflate_buffer, M_TEMP);
767 firmware_image_unload(fw);
772 * Enable or disable periodic RDMAs from the host to make certain
773 * chipsets resend dropped PCIe messages
777 mxge_dummy_rdma(mxge_softc_t *sc, int enable)
780 volatile uint32_t *confirm;
781 volatile char *submit;
782 uint32_t *buf, dma_low, dma_high;
785 buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
787 /* clear confirmation addr */
788 confirm = (volatile uint32_t *)sc->cmd;
792 /* send an rdma command to the PCIe engine, and wait for the
793 response in the confirmation address. The firmware should
794 write a -1 there to indicate it is alive and well
797 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
798 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
799 buf[0] = htobe32(dma_high); /* confirm addr MSW */
800 buf[1] = htobe32(dma_low); /* confirm addr LSW */
801 buf[2] = htobe32(0xffffffff); /* confirm data */
802 dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
803 dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
804 buf[3] = htobe32(dma_high); /* dummy addr MSW */
805 buf[4] = htobe32(dma_low); /* dummy addr LSW */
806 buf[5] = htobe32(enable); /* enable? */
809 submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
811 mxge_pio_copy(submit, buf, 64);
816 while (*confirm != 0xffffffff && i < 20) {
820 if (*confirm != 0xffffffff) {
821 device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
822 (enable ? "enable" : "disable"), confirm,
829 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
832 char buf_bytes[sizeof(*buf) + 8];
833 volatile mcp_cmd_response_t *response = sc->cmd;
834 volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
835 uint32_t dma_low, dma_high;
836 int err, sleep_total = 0;
838 /* ensure buf is aligned to 8 bytes */
839 buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
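#if 0	/* illustrative sketch of the round-up used above, not compiled */
/*
 * Adding (align - 1) and masking the low bits yields the first
 * align-byte boundary at or after p; e.g. p = 0x1003 with align = 8
 * gives 0x1008.
 */
#define EXAMPLE_ROUND_UP(p, align) \
	(((uintptr_t)(p) + ((align) - 1)) & ~((uintptr_t)(align) - 1))
#endif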
841 buf->data0 = htobe32(data->data0);
842 buf->data1 = htobe32(data->data1);
843 buf->data2 = htobe32(data->data2);
844 buf->cmd = htobe32(cmd);
845 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
846 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
848 buf->response_addr.low = htobe32(dma_low);
849 buf->response_addr.high = htobe32(dma_high);
850 lockmgr(&sc->cmd_lock, LK_EXCLUSIVE);
851 response->result = 0xffffffff;
853 mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
855 /* wait up to 20ms */
857 for (sleep_total = 0; sleep_total < 20; sleep_total++) {
858 bus_dmamap_sync(sc->cmd_dma.dmat,
859 sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
861 switch (be32toh(response->result)) {
863 data->data0 = be32toh(response->data);
869 case MXGEFW_CMD_UNKNOWN:
872 case MXGEFW_CMD_ERROR_UNALIGNED:
875 case MXGEFW_CMD_ERROR_BUSY:
879 device_printf(sc->dev,
881 "failed, result = %d\n",
882 cmd, be32toh(response->result));
890 device_printf(sc->dev, "mxge: command %d timed out"
892 cmd, be32toh(response->result));
893 lockmgr(&sc->cmd_lock, LK_RELEASE);
898 mxge_adopt_running_firmware(mxge_softc_t *sc)
900 struct mcp_gen_header *hdr;
901 const size_t bytes = sizeof (struct mcp_gen_header);
905 /* find running firmware header */
906 hdr_offset = htobe32(*(volatile uint32_t *)
907 (sc->sram + MCP_HEADER_PTR_OFFSET));
909 if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
910 device_printf(sc->dev,
911 "Running firmware has bad header offset (%d)\n",
916 /* copy header of running firmware from SRAM to host memory to
917 * validate firmware */
918 hdr = kmalloc(bytes, M_DEVBUF, M_NOWAIT);
920 device_printf(sc->dev, "could not kmalloc firmware hdr\n");
923 bus_space_read_region_1(rman_get_bustag(sc->mem_res),
924 rman_get_bushandle(sc->mem_res),
925 hdr_offset, (char *)hdr, bytes);
926 status = mxge_validate_firmware(sc, hdr);
927 kfree(hdr, M_DEVBUF);
930 * check to see if adopted firmware has bug where adopting
931 * it will cause broadcasts to be filtered unless the NIC
932 * is kept in ALLMULTI mode
934 if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
935 sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
936 sc->adopted_rx_filter_bug = 1;
937 device_printf(sc->dev, "Adopting fw %d.%d.%d: "
938 "working around rx filter bug\n",
939 sc->fw_ver_major, sc->fw_ver_minor,
948 mxge_load_firmware(mxge_softc_t *sc, int adopt)
950 volatile uint32_t *confirm;
951 volatile char *submit;
953 uint32_t *buf, size, dma_low, dma_high;
956 buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
958 size = sc->sram_size;
959 status = mxge_load_firmware_helper(sc, &size);
963 /* Try to use the currently running firmware, if
965 status = mxge_adopt_running_firmware(sc);
967 device_printf(sc->dev,
968 "failed to adopt running firmware\n");
971 device_printf(sc->dev,
972 "Successfully adopted running firmware\n");
973 if (sc->tx_boundary == 4096) {
974 device_printf(sc->dev,
975 "Using firmware currently running on NIC"
977 device_printf(sc->dev,
978 "performance consider loading optimized "
981 sc->fw_name = mxge_fw_unaligned;
982 sc->tx_boundary = 2048;
985 /* clear confirmation addr */
986 confirm = (volatile uint32_t *)sc->cmd;
989 /* send a reload command to the bootstrap MCP, and wait for the
990 response in the confirmation address. The firmware should
991 write a -1 there to indicate it is alive and well
994 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
995 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
997 buf[0] = htobe32(dma_high); /* confirm addr MSW */
998 buf[1] = htobe32(dma_low); /* confirm addr LSW */
999 buf[2] = htobe32(0xffffffff); /* confirm data */
1001 /* FIX: All newest firmware should un-protect the bottom of
1002 the sram before handoff. However, the very first interfaces
1003 do not. Therefore the handoff copy must skip the first 8 bytes
1005 /* where the code starts*/
1006 buf[3] = htobe32(MXGE_FW_OFFSET + 8);
1007 buf[4] = htobe32(size - 8); /* length of code */
1008 buf[5] = htobe32(8); /* where to copy to */
1009 buf[6] = htobe32(0); /* where to jump to */
1011 submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
1012 mxge_pio_copy(submit, buf, 64);
1017 while (*confirm != 0xffffffff && i < 20) {
1020 bus_dmamap_sync(sc->cmd_dma.dmat,
1021 sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
1023 if (*confirm != 0xffffffff) {
1024 device_printf(sc->dev,"handoff failed (%p = 0x%x)",
1033 mxge_update_mac_address(mxge_softc_t *sc)
1036 uint8_t *addr = sc->mac_addr;
1040 cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1041 | (addr[2] << 8) | addr[3]);
1043 cmd.data1 = ((addr[4] << 8) | (addr[5]));
1045 status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
1050 mxge_change_pause(mxge_softc_t *sc, int pause)
1056 status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1059 status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1063 device_printf(sc->dev, "Failed to set flow control mode\n");
1071 mxge_change_promisc(mxge_softc_t *sc, int promisc)
1076 if (mxge_always_promisc)
1080 status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1083 status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1087 device_printf(sc->dev, "Failed to set promisc mode\n");
1092 mxge_set_multicast_list(mxge_softc_t *sc)
1095 struct ifmultiaddr *ifma;
1096 struct ifnet *ifp = sc->ifp;
1099 /* This firmware is known to not support multicast */
1100 if (!sc->fw_multicast_support)
1103 /* Disable multicast filtering while we play with the lists*/
1104 err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1106 device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
1107 " error status: %d\n", err);
1111 if (sc->adopted_rx_filter_bug)
1114 if (ifp->if_flags & IFF_ALLMULTI)
1115 /* request to disable multicast filtering, so quit here */
1118 /* Flush all the filters */
1120 err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1122 device_printf(sc->dev,
1123 "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
1124 ", error status: %d\n", err);
1128 /* Walk the multicast list, and add each address */
1130 if_maddr_rlock(ifp);
1131 LIST_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1132 if (ifma->ifma_addr->sa_family != AF_LINK)
1134 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1136 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
1138 cmd.data0 = htonl(cmd.data0);
1139 cmd.data1 = htonl(cmd.data1);
1140 err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1142 device_printf(sc->dev, "Failed "
1143 "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
1145 /* abort, leaving multicast filtering off */
1146 if_maddr_runlock(ifp);
1150 if_maddr_runlock(ifp);
1151 /* Enable multicast filtering */
1152 err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1154 device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
1155 ", error status: %d\n", err);
1160 mxge_max_mtu(mxge_softc_t *sc)
1165 if (MJUMPAGESIZE - MXGEFW_PAD > MXGEFW_MAX_MTU)
1166 return MXGEFW_MAX_MTU - MXGEFW_PAD;
1168 /* try to set nbufs to see if we can
1169 use virtually contiguous jumbos */
1171 status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1174 return MXGEFW_MAX_MTU - MXGEFW_PAD;
1176 /* otherwise, we're limited to MJUMPAGESIZE */
1177 return MJUMPAGESIZE - MXGEFW_PAD;
1181 mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1183 struct mxge_slice_state *ss;
1184 mxge_rx_done_t *rx_done;
1185 volatile uint32_t *irq_claim;
1189 /* try to send a reset command to the card to see if it
1191 memset(&cmd, 0, sizeof (cmd));
1192 status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1194 device_printf(sc->dev, "failed reset\n");
1198 mxge_dummy_rdma(sc, 1);
1201 /* set the intrq size */
1202 cmd.data0 = sc->rx_ring_size;
1203 status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1206 * Even though we already know how many slices are supported
1207 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
1208 * has magic side effects, and must be called after a reset.
1209 * It must be called prior to calling any RSS related cmds,
1210 * including assigning an interrupt queue for anything but
1211 * slice 0. It must also be called *after*
1212 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1213 * the firmware to compute offsets.
1216 if (sc->num_slices > 1) {
1217 /* ask the maximum number of slices it supports */
1218 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
1221 device_printf(sc->dev,
1222 "failed to get number of slices\n");
1226 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1227 * to setting up the interrupt queue DMA
1229 cmd.data0 = sc->num_slices;
1230 cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
1231 #ifdef IFNET_BUF_RING
1232 cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1234 status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
1237 device_printf(sc->dev,
1238 "failed to set number of slices\n");
1244 if (interrupts_setup) {
1245 /* Now exchange information about interrupts */
1246 for (slice = 0; slice < sc->num_slices; slice++) {
1247 rx_done = &sc->ss[slice].rx_done;
1248 memset(rx_done->entry, 0, sc->rx_ring_size);
1249 cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
1250 cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
1252 status |= mxge_send_cmd(sc,
1253 MXGEFW_CMD_SET_INTRQ_DMA,
1258 status |= mxge_send_cmd(sc,
1259 MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
1262 sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1264 status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1265 irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1268 status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
1270 sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1272 device_printf(sc->dev, "failed set interrupt parameters\n");
1277 *sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1280 /* run a DMA benchmark */
1281 (void) mxge_dma_test(sc, MXGEFW_DMA_TEST);
1283 for (slice = 0; slice < sc->num_slices; slice++) {
1284 ss = &sc->ss[slice];
1286 ss->irq_claim = irq_claim + (2 * slice);
1287 /* reset mcp/driver shared state back to 0 */
1288 ss->rx_done.idx = 0;
1289 ss->rx_done.cnt = 0;
1292 ss->tx.pkt_done = 0;
1293 ss->tx.queue_active = 0;
1294 ss->tx.activate = 0;
1295 ss->tx.deactivate = 0;
1300 ss->rx_small.cnt = 0;
1301 ss->lro_bad_csum = 0;
1303 ss->lro_flushed = 0;
1304 if (ss->fw_stats != NULL) {
1305 ss->fw_stats->valid = 0;
1306 ss->fw_stats->send_done_count = 0;
1309 sc->rdma_tags_available = 15;
1310 status = mxge_update_mac_address(sc);
1311 mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
1312 mxge_change_pause(sc, sc->pause);
1313 mxge_set_multicast_list(sc);
1318 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1321 unsigned int intr_coal_delay;
1325 intr_coal_delay = sc->intr_coal_delay;
1326 err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1330 if (intr_coal_delay == sc->intr_coal_delay)
1333 if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1336 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
1337 *sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1338 sc->intr_coal_delay = intr_coal_delay;
1340 lockmgr(&sc->driver_lock, LK_RELEASE);
1345 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1348 unsigned int enabled;
1352 enabled = sc->pause;
1353 err = sysctl_handle_int(oidp, &enabled, arg2, req);
1357 if (enabled == sc->pause)
1360 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
1361 err = mxge_change_pause(sc, enabled);
1362 lockmgr(&sc->driver_lock, LK_RELEASE);
1367 mxge_change_lro_locked(mxge_softc_t *sc, int lro_cnt)
1374 ifp->if_capenable &= ~IFCAP_LRO;
1376 ifp->if_capenable |= IFCAP_LRO;
1377 sc->lro_cnt = lro_cnt;
1378 if (ifp->if_flags & IFF_RUNNING) {
1380 err = mxge_open(sc);
1386 mxge_change_lro(SYSCTL_HANDLER_ARGS)
1389 unsigned int lro_cnt;
1393 lro_cnt = sc->lro_cnt;
1394 err = sysctl_handle_int(oidp, &lro_cnt, arg2, req);
1398 if (lro_cnt == sc->lro_cnt)
1404 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
1405 err = mxge_change_lro_locked(sc, lro_cnt);
1406 lockmgr(&sc->driver_lock, LK_RELEASE);
1411 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1417 arg2 = be32toh(*(int *)arg1);
1419 err = sysctl_handle_int(oidp, arg1, arg2, req);
1425 mxge_rem_sysctls(mxge_softc_t *sc)
1427 struct mxge_slice_state *ss;
1430 if (sc->slice_sysctl_tree == NULL)
1433 for (slice = 0; slice < sc->num_slices; slice++) {
1434 ss = &sc->ss[slice];
1435 if (ss == NULL || ss->sysctl_tree == NULL)
1437 sysctl_ctx_free(&ss->sysctl_ctx);
1438 ss->sysctl_tree = NULL;
1440 sysctl_ctx_free(&sc->slice_sysctl_ctx);
1441 sc->slice_sysctl_tree = NULL;
1445 mxge_add_sysctls(mxge_softc_t *sc)
1447 struct sysctl_ctx_list *ctx;
1448 struct sysctl_oid_list *children;
1450 struct mxge_slice_state *ss;
1454 ctx = &sc->sysctl_ctx;
1455 sysctl_ctx_init(ctx);
1456 sc->sysctl_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_STATIC_CHILDREN(_hw),
1458 device_get_nameunit(sc->dev),
1460 if (sc->sysctl_tree == NULL) {
1461 device_printf(sc->dev, "can't add sysctl node\n");
1465 children = SYSCTL_CHILDREN(sc->sysctl_tree);
1466 fw = sc->ss[0].fw_stats;
1468 /* random information */
1469 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1471 CTLFLAG_RD, &sc->fw_version,
1472 0, "firmware version");
1473 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1475 CTLFLAG_RD, &sc->serial_number_string,
1476 0, "serial number");
1477 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1479 CTLFLAG_RD, &sc->product_code_string,
1481 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1483 CTLFLAG_RD, &sc->link_width,
1485 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1487 CTLFLAG_RD, &sc->tx_boundary,
1489 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1491 CTLFLAG_RD, &sc->wc,
1492 0, "write combining PIO?");
1493 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1495 CTLFLAG_RD, &sc->read_dma,
1496 0, "DMA Read speed in MB/s");
1497 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1499 CTLFLAG_RD, &sc->write_dma,
1500 0, "DMA Write speed in MB/s");
1501 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1502 "read_write_dma_MBs",
1503 CTLFLAG_RD, &sc->read_write_dma,
1504 0, "DMA concurrent Read/Write speed in MB/s");
1507 /* performance related tunables */
1508 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1510 CTLTYPE_INT|CTLFLAG_RW, sc,
1511 0, mxge_change_intr_coal,
1512 "I", "interrupt coalescing delay in usecs");
1514 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1515 "flow_control_enabled",
1516 CTLTYPE_INT|CTLFLAG_RW, sc,
1517 0, mxge_change_flow_control,
1518 "I", "interrupt coalescing delay in usecs");
1520 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1522 CTLFLAG_RW, &mxge_deassert_wait,
1523 0, "Wait for IRQ line to go low in ihandler");
1525 /* stats block from firmware is in network byte order.
1527 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1529 CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1530 0, mxge_handle_be32,
1532 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1533 "rdma_tags_available",
1534 CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1535 0, mxge_handle_be32,
1536 "I", "rdma_tags_available");
1537 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1538 "dropped_bad_crc32",
1539 CTLTYPE_INT|CTLFLAG_RD,
1540 &fw->dropped_bad_crc32,
1541 0, mxge_handle_be32,
1542 "I", "dropped_bad_crc32");
1543 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1545 CTLTYPE_INT|CTLFLAG_RD,
1546 &fw->dropped_bad_phy,
1547 0, mxge_handle_be32,
1548 "I", "dropped_bad_phy");
1549 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1550 "dropped_link_error_or_filtered",
1551 CTLTYPE_INT|CTLFLAG_RD,
1552 &fw->dropped_link_error_or_filtered,
1553 0, mxge_handle_be32,
1554 "I", "dropped_link_error_or_filtered");
1555 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1556 "dropped_link_overflow",
1557 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1558 0, mxge_handle_be32,
1559 "I", "dropped_link_overflow");
1560 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1561 "dropped_multicast_filtered",
1562 CTLTYPE_INT|CTLFLAG_RD,
1563 &fw->dropped_multicast_filtered,
1564 0, mxge_handle_be32,
1565 "I", "dropped_multicast_filtered");
1566 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1567 "dropped_no_big_buffer",
1568 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1569 0, mxge_handle_be32,
1570 "I", "dropped_no_big_buffer");
1571 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1572 "dropped_no_small_buffer",
1573 CTLTYPE_INT|CTLFLAG_RD,
1574 &fw->dropped_no_small_buffer,
1575 0, mxge_handle_be32,
1576 "I", "dropped_no_small_buffer");
1577 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1579 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1580 0, mxge_handle_be32,
1581 "I", "dropped_overrun");
1582 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1584 CTLTYPE_INT|CTLFLAG_RD,
1586 0, mxge_handle_be32,
1587 "I", "dropped_pause");
1588 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1590 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1591 0, mxge_handle_be32,
1592 "I", "dropped_runt");
1594 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1595 "dropped_unicast_filtered",
1596 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
1597 0, mxge_handle_be32,
1598 "I", "dropped_unicast_filtered");
1600 /* verbose printing? */
1601 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1603 CTLFLAG_RW, &mxge_verbose,
1604 0, "verbose printing");
1607 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1609 CTLTYPE_INT|CTLFLAG_RW, sc,
1611 "I", "number of lro merge queues");
1614 /* add counters exported for debugging from all slices */
1615 sysctl_ctx_init(&sc->slice_sysctl_ctx);
1616 sc->slice_sysctl_tree =
1617 SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1618 "slice", CTLFLAG_RD, 0, "");
1620 for (slice = 0; slice < sc->num_slices; slice++) {
1621 ss = &sc->ss[slice];
1622 sysctl_ctx_init(&ss->sysctl_ctx);
1623 ctx = &ss->sysctl_ctx;
1624 children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1625 ksprintf(slice_num, "%d", slice);
1627 SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1629 children = SYSCTL_CHILDREN(ss->sysctl_tree);
1630 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1632 CTLFLAG_RD, &ss->rx_small.cnt,
1634 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1636 CTLFLAG_RD, &ss->rx_big.cnt,
1638 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1639 "lro_flushed", CTLFLAG_RD, &ss->lro_flushed,
1640 0, "number of lro merge queues flushed");
1642 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1643 "lro_queued", CTLFLAG_RD, &ss->lro_queued,
1644 0, "number of frames appended to lro merge"
1647 #ifndef IFNET_BUF_RING
1648 /* only transmit from slice 0 for now */
1652 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1654 CTLFLAG_RD, &ss->tx.req,
1657 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1659 CTLFLAG_RD, &ss->tx.done,
1661 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1663 CTLFLAG_RD, &ss->tx.pkt_done,
1665 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1667 CTLFLAG_RD, &ss->tx.stall,
1669 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1671 CTLFLAG_RD, &ss->tx.wake,
1673 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1675 CTLFLAG_RD, &ss->tx.defrag,
1677 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1679 CTLFLAG_RD, &ss->tx.queue_active,
1680 0, "tx_queue_active");
1681 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1683 CTLFLAG_RD, &ss->tx.activate,
1685 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1687 CTLFLAG_RD, &ss->tx.deactivate,
1688 0, "tx_deactivate");
1692 /* copy an array of mcp_kreq_ether_send_t's to the mcp. Copy
1693 backwards one at a time and handle ring wraps */
1696 mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1697 mcp_kreq_ether_send_t *src, int cnt)
1699 int idx, starting_slot;
1700 starting_slot = tx->req;
1703 idx = (starting_slot + cnt) & tx->mask;
1704 mxge_pio_copy(&tx->lanai[idx],
1705 &src[cnt], sizeof(*src));
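/*
 * Illustrative sketch (not part of the driver): ring indices above and
 * below wrap by masking with (ring size - 1); e.g. with a 256-entry
 * ring (mask 0xff), starting slot 250 plus offset 10 lands on index 4.
 * The ring geometry here is hypothetical.
 */
#if 0	/* example only, not compiled */
static inline int
example_ring_index(int slot, int offset, int mask)
{
	return ((slot + offset) & mask);
}
#endif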
1711 * copy an array of mcp_kreq_ether_send_t's to the mcp. Copy
1712 * at most 32 bytes at a time, so as to avoid involving the software
1713 * pio handler in the nic. We re-write the first segment's flags
1714 * to mark them valid only after writing the entire chain
1718 mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
1723 volatile uint32_t *dst_ints;
1724 mcp_kreq_ether_send_t *srcp;
1725 volatile mcp_kreq_ether_send_t *dstp, *dst;
1728 idx = tx->req & tx->mask;
1730 last_flags = src->flags;
1733 dst = dstp = &tx->lanai[idx];
1736 if ((idx + cnt) < tx->mask) {
1737 for (i = 0; i < (cnt - 1); i += 2) {
1738 mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1739 wmb(); /* force write every 32 bytes */
1744 /* submit all but the first request, and ensure
1745 that it is submitted below */
1746 mxge_submit_req_backwards(tx, src, cnt);
1750 /* submit the first request */
1751 mxge_pio_copy(dstp, srcp, sizeof(*src));
1752 wmb(); /* barrier before setting valid flag */
1755 /* re-write the last 32-bits with the valid flags */
1756 src->flags = last_flags;
1757 src_ints = (uint32_t *)src;
1759 dst_ints = (volatile uint32_t *)dst;
1761 *dst_ints = *src_ints;
1769 mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1770 int busdma_seg_cnt, int ip_off)
1773 mcp_kreq_ether_send_t *req;
1774 bus_dma_segment_t *seg;
1777 uint32_t low, high_swapped;
1778 int len, seglen, cum_len, cum_len_next;
1779 int next_is_first, chop, cnt, rdma_count, small;
1780 uint16_t pseudo_hdr_offset, cksum_offset, mss;
1781 uint8_t flags, flags_next;
1784 mss = m->m_pkthdr.tso_segsz;
1786 /* negative cum_len signifies to the
1787 * send loop that we are still in the
1788 * header portion of the TSO packet.
1791 /* ensure we have the ethernet, IP and TCP
1792 header together in the first mbuf, copy
1793 it to a scratch buffer if not */
1794 if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
1795 m_copydata(m, 0, ip_off + sizeof (*ip),
1797 ip = (struct ip *)(ss->scratch + ip_off);
1799 ip = (struct ip *)(mtod(m, char *) + ip_off);
1801 if (__predict_false(m->m_len < ip_off + (ip->ip_hl << 2)
1803 m_copydata(m, 0, ip_off + (ip->ip_hl << 2)
1804 + sizeof (*tcp), ss->scratch);
1805 ip = (struct ip *)(mtod(m, char *) + ip_off);
1808 tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
1809 cum_len = -(ip_off + ((ip->ip_hl + tcp->th_off) << 2));
1811 /* TSO implies checksum offload on this hardware */
1812 cksum_offset = ip_off + (ip->ip_hl << 2);
1813 flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1816 /* for TSO, pseudo_hdr_offset holds mss.
1817 * The firmware figures out where to put
1818 * the checksum by parsing the header. */
1819 pseudo_hdr_offset = htobe16(mss);
1826 /* "rdma_count" is the number of RDMAs belonging to the
1827 * current packet BEFORE the current send request. For
1828 * non-TSO packets, this is equal to "count".
1829 * For TSO packets, rdma_count needs to be reset
1830 * to 0 after a segment cut.
1832 * The rdma_count field of the send request is
1833 * the number of RDMAs of the packet starting at
1834 * that request. For TSO send requests with one or more cuts
1835 * in the middle, this is the number of RDMAs starting
1836 * after the last cut in the request. All previous
1837 * segments before the last cut implicitly have 1 RDMA.
1839 * Since the number of RDMAs is not known beforehand,
1840 * it must be filled-in retroactively - after each
1841 * segmentation cut or at the end of the entire packet.
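#if 0	/* illustrative sketch of the branchless updates below, not compiled */
static inline int
example_rdma_count_update(int rdma_count, int chop, int next_is_first)
{
	/*
	 * chop and next_is_first are 0 or 1.  -(chop | next_is_first)
	 * is 0 when there is no segment cut (the count is left alone)
	 * and ~0 when there is one (the count is forced to -1); the
	 * final add brings it back to 0 when the cut happened in the
	 * middle of the current descriptor.
	 */
	rdma_count |= -(chop | next_is_first);
	rdma_count += chop & !next_is_first;
	return (rdma_count);
}
#endif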
1844 while (busdma_seg_cnt) {
1845 /* Break the busdma segment up into pieces*/
1846 low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1847 high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1851 flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1853 cum_len_next = cum_len + seglen;
1854 (req-rdma_count)->rdma_count = rdma_count + 1;
1855 if (__predict_true(cum_len >= 0)) {
1857 chop = (cum_len_next > mss);
1858 cum_len_next = cum_len_next % mss;
1859 next_is_first = (cum_len_next == 0);
1860 flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1861 flags_next |= next_is_first *
1863 rdma_count |= -(chop | next_is_first);
1864 rdma_count += chop & !next_is_first;
1865 } else if (cum_len_next >= 0) {
1870 small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1871 flags_next = MXGEFW_FLAGS_TSO_PLD |
1872 MXGEFW_FLAGS_FIRST |
1873 (small * MXGEFW_FLAGS_SMALL);
1876 req->addr_high = high_swapped;
1877 req->addr_low = htobe32(low);
1878 req->pseudo_hdr_offset = pseudo_hdr_offset;
1880 req->rdma_count = 1;
1881 req->length = htobe16(seglen);
1882 req->cksum_offset = cksum_offset;
1883 req->flags = flags | ((cum_len & 1) *
1884 MXGEFW_FLAGS_ALIGN_ODD);
1887 cum_len = cum_len_next;
1892 if (__predict_false(cksum_offset > seglen))
1893 cksum_offset -= seglen;
1896 if (__predict_false(cnt > tx->max_desc))
1902 (req-rdma_count)->rdma_count = rdma_count;
1906 req->flags |= MXGEFW_FLAGS_TSO_LAST;
1907 } while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1909 tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1910 mxge_submit_req(tx, tx->req_list, cnt);
1911 #ifdef IFNET_BUF_RING
1912 if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1913 /* tell the NIC to start polling this slice */
1915 tx->queue_active = 1;
1923 bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1927 kprintf("tx->max_desc exceeded via TSO!\n");
1928 kprintf("mss = %d, %ld, %d!\n", mss,
1929 (long)seg - (long)tx->seg_list, tx->max_desc);
1936 #endif /* IFCAP_TSO4 */
1938 #ifdef MXGE_NEW_VLAN_API
1940 * We reproduce the software vlan tag insertion from
1941 * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
1942 * vlan tag insertion. We need to advertise this in order to have the
1943 * vlan interface respect our csum offload flags.
1945 static struct mbuf *
1946 mxge_vlan_tag_insert(struct mbuf *m)
1948 struct ether_vlan_header *evl;
1950 M_PREPEND(m, EVL_ENCAPLEN, MB_DONTWAIT);
1951 if (__predict_false(m == NULL))
1953 if (m->m_len < sizeof(*evl)) {
1954 m = m_pullup(m, sizeof(*evl));
1955 if (__predict_false(m == NULL))
1959 * Transform the Ethernet header into an Ethernet header
1960 * with 802.1Q encapsulation.
1962 evl = mtod(m, struct ether_vlan_header *);
1963 bcopy((char *)evl + EVL_ENCAPLEN,
1964 (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
1965 evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
1966 evl->evl_tag = htons(m->m_pkthdr.ether_vlantag);
1967 m->m_flags &= ~M_VLANTAG;
1970 #endif /* MXGE_NEW_VLAN_API */
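/*
 * Illustrative sketch (not part of the driver): the frame layout
 * produced by mxge_vlan_tag_insert() above, mirroring struct
 * ether_vlan_header -- the MAC addresses are moved 4 bytes toward the
 * start of the mbuf and the 802.1Q tag is written in front of the
 * original EtherType.
 */
#if 0	/* example only, not compiled */
struct example_dot1q_header {
	uint8_t		dst[ETHER_ADDR_LEN];
	uint8_t		src[ETHER_ADDR_LEN];
	uint16_t	encap_proto;	/* htons(ETHERTYPE_VLAN), 0x8100 */
	uint16_t	tag;		/* priority, CFI and VLAN id */
	uint16_t	proto;		/* the original EtherType */
} __packed;
#endif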
1973 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
1976 mcp_kreq_ether_send_t *req;
1977 bus_dma_segment_t *seg;
1982 int cnt, cum_len, err, i, idx, odd_flag, ip_off;
1983 uint16_t pseudo_hdr_offset;
1984 uint8_t flags, cksum_offset;
1991 ip_off = sizeof (struct ether_header);
1992 #ifdef MXGE_NEW_VLAN_API
1993 if (m->m_flags & M_VLANTAG) {
1994 m = mxge_vlan_tag_insert(m);
1995 if (__predict_false(m == NULL))
1997 ip_off += EVL_ENCAPLEN;
2000 /* (try to) map the frame for DMA */
2001 idx = tx->req & tx->mask;
2002 err = bus_dmamap_load_mbuf_segment(tx->dmat, tx->info[idx].map,
2003 m, tx->seg_list, 1, &cnt,
2005 if (__predict_false(err == EFBIG)) {
2006 /* Too many segments in the chain. Try
2008 m_tmp = m_defrag(m, M_NOWAIT);
2009 if (m_tmp == NULL) {
2014 err = bus_dmamap_load_mbuf_segment(tx->dmat,
2016 m, tx->seg_list, 1, &cnt,
2019 if (__predict_false(err != 0)) {
2020 device_printf(sc->dev, "bus_dmamap_load_mbuf_segment returned %d"
2021 " packet len = %d\n", err, m->m_pkthdr.len);
2024 bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2025 BUS_DMASYNC_PREWRITE);
2026 tx->info[idx].m = m;
2029 /* TSO is different enough, we handle it in another routine */
2030 if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2031 mxge_encap_tso(ss, m, cnt, ip_off);
2038 pseudo_hdr_offset = 0;
2039 flags = MXGEFW_FLAGS_NO_TSO;
2041 /* checksum offloading? */
2042 if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
2043 /* ensure ip header is in first mbuf, copy
2044 it to a scratch buffer if not */
2045 if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
2046 m_copydata(m, 0, ip_off + sizeof (*ip),
2048 ip = (struct ip *)(ss->scratch + ip_off);
2050 ip = (struct ip *)(mtod(m, char *) + ip_off);
2052 cksum_offset = ip_off + (ip->ip_hl << 2);
2053 pseudo_hdr_offset = cksum_offset + m->m_pkthdr.csum_data;
2054 pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2055 req->cksum_offset = cksum_offset;
2056 flags |= MXGEFW_FLAGS_CKSUM;
2057 odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2061 if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2062 flags |= MXGEFW_FLAGS_SMALL;
2064 /* convert segments into a request list */
2067 req->flags = MXGEFW_FLAGS_FIRST;
2068 for (i = 0; i < cnt; i++) {
2070 htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2072 htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2073 req->length = htobe16(seg->ds_len);
2074 req->cksum_offset = cksum_offset;
2075 if (cksum_offset > seg->ds_len)
2076 cksum_offset -= seg->ds_len;
2079 req->pseudo_hdr_offset = pseudo_hdr_offset;
2080 req->pad = 0; /* complete solid 16-byte block */
2081 req->rdma_count = 1;
2082 req->flags |= flags | ((cum_len & 1) * odd_flag);
2083 cum_len += seg->ds_len;
2089 /* pad runts to 60 bytes */
2093 htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2095 htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2096 req->length = htobe16(60 - cum_len);
2097 req->cksum_offset = 0;
2098 req->pseudo_hdr_offset = pseudo_hdr_offset;
2099 req->pad = 0; /* complete solid 16-byte block */
2100 req->rdma_count = 1;
2101 req->flags |= flags | ((cum_len & 1) * odd_flag);
2105 tx->req_list[0].rdma_count = cnt;
2107 /* print what the firmware will see */
2108 for (i = 0; i < cnt; i++) {
2109 kprintf("%d: addr: 0x%x 0x%x len:%d pso%d,"
2110 "cso:%d, flags:0x%x, rdma:%d\n",
2111 i, (int)ntohl(tx->req_list[i].addr_high),
2112 (int)ntohl(tx->req_list[i].addr_low),
2113 (int)ntohs(tx->req_list[i].length),
2114 (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2115 tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2116 tx->req_list[i].rdma_count);
2118 kprintf("--------------\n");
2120 tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2121 mxge_submit_req(tx, tx->req_list, cnt);
2122 #ifdef IFNET_BUF_RING
2123 if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2124 /* tell the NIC to start polling this slice */
2126 tx->queue_active = 1;
2139 #ifdef IFNET_BUF_RING
2141 mxge_qflush(struct ifnet *ifp)
2143 mxge_softc_t *sc = ifp->if_softc;
2148 for (slice = 0; slice < sc->num_slices; slice++) {
2149 tx = &sc->ss[slice].tx;
2150 lockmgr(&tx->lock, LK_EXCLUSIVE);
2151 while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2153 lockmgr(&tx->lock, LK_RELEASE);
2159 mxge_start_locked(struct mxge_slice_state *ss)
2170 while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2171 m = drbr_dequeue(ifp, tx->br);
2175 /* let BPF see it */
2178 /* give it to the nic */
2181 /* ran out of transmit slots */
2182 if (((ss->if_flags & IFF_OACTIVE) == 0)
2183 && (!drbr_empty(ifp, tx->br))) {
2184 ss->if_flags |= IFF_OACTIVE;
2190 mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2201 if ((ss->if_flags & (IFF_RUNNING|IFF_OACTIVE)) !=
2203 err = drbr_enqueue(ifp, tx->br, m);
2207 if (drbr_empty(ifp, tx->br) &&
2208 ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2209 /* let BPF see it */
2211 /* give it to the nic */
2213 } else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2216 if (!drbr_empty(ifp, tx->br))
2217 mxge_start_locked(ss);
2222 mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2224 mxge_softc_t *sc = ifp->if_softc;
2225 struct mxge_slice_state *ss;
2231 slice = m->m_pkthdr.flowid;
2233 slice &= (sc->num_slices - 1); /* num_slices always power of 2 */
2235 ss = &sc->ss[slice];
2238 if (lockmgr(&tx->lock, LK_EXCLUSIVE|LK_NOWAIT)) {
2239 err = mxge_transmit_locked(ss, m);
2240 lockmgr(&tx->lock, LK_RELEASE);
2242 err = drbr_enqueue(ifp, tx->br, m);
2251 mxge_start_locked(struct mxge_slice_state *ss)
2261 while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2262 m = ifq_dequeue(&ifp->if_snd, NULL);
2266 /* let BPF see it */
2269 /* give it to the nic */
2272 /* ran out of transmit slots */
2273 if ((sc->ifp->if_flags & IFF_OACTIVE) == 0) {
2274 sc->ifp->if_flags |= IFF_OACTIVE;
2280 mxge_start(struct ifnet *ifp)
2282 mxge_softc_t *sc = ifp->if_softc;
2283 struct mxge_slice_state *ss;
2285 /* only use the first slice for now */
2287 lockmgr(&ss->tx.lock, LK_EXCLUSIVE);
2288 mxge_start_locked(ss);
2289 lockmgr(&ss->tx.lock, LK_RELEASE);
2293 * copy an array of mcp_kreq_ether_recv_t's to the mcp. Copy
2294 * at most 32 bytes at a time, so as to avoid involving the software
2295 * pio handler in the nic. We re-write the first segment's low
2296 * DMA address to mark it valid only after we write the entire chunk
2300 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2301 mcp_kreq_ether_recv_t *src)
2305 low = src->addr_low;
2306 src->addr_low = 0xffffffff;
2307 mxge_pio_copy(dst, src, 4 * sizeof (*src));
2309 mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2311 src->addr_low = low;
2312 dst->addr_low = low;
2317 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2319 bus_dma_segment_t seg;
2321 mxge_rx_ring_t *rx = &ss->rx_small;
2324 m = m_gethdr(MB_DONTWAIT, MT_DATA);
2331 err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2332 &seg, 1, &cnt, BUS_DMA_NOWAIT);
2337 rx->info[idx].m = m;
2338 rx->shadow[idx].addr_low =
2339 htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2340 rx->shadow[idx].addr_high =
2341 htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2345 mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2350 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2352 bus_dma_segment_t seg[3];
2354 mxge_rx_ring_t *rx = &ss->rx_big;
2357 if (rx->cl_size == MCLBYTES)
2358 m = m_getcl(MB_DONTWAIT, MT_DATA, M_PKTHDR);
2360 m = m_getjcl(MB_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2366 m->m_len = rx->mlen;
2367 err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2368 seg, 1, &cnt, BUS_DMA_NOWAIT);
2373 rx->info[idx].m = m;
2374 rx->shadow[idx].addr_low =
2375 htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2376 rx->shadow[idx].addr_high =
2377 htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2379 #if MXGE_VIRT_JUMBOS
2380 for (i = 1; i < cnt; i++) {
2381 rx->shadow[idx + i].addr_low =
2382 htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2383 rx->shadow[idx + i].addr_high =
2384 htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2389 for (i = 0; i < rx->nbufs; i++) {
2390 if ((idx & 7) == 7) {
2391 mxge_submit_8rx(&rx->lanai[idx - 7],
2392 &rx->shadow[idx - 7]);
2400 * Myri10GE hardware checksums are not valid if the sender
2401 * padded the frame with non-zero padding. This is because
2402 * the firmware just does a simple 16-bit 1s complement
2403 * checksum across the entire frame, excluding the first 14
2404 * bytes. It is best to simply check the checksum and
2405 * tell the stack about it only if the checksum is good
2408 static inline uint16_t
2409 mxge_rx_csum(struct mbuf *m, int csum)
2411 struct ether_header *eh;
2415 eh = mtod(m, struct ether_header *);
2417 /* only deal with IPv4 TCP & UDP for now */
2418 if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2420 ip = (struct ip *)(eh + 1);
2421 if (__predict_false(ip->ip_p != IPPROTO_TCP &&
2422 ip->ip_p != IPPROTO_UDP))
2425 c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2426 htonl(ntohs(csum) + ntohs(ip->ip_len) +
2427 - (ip->ip_hl << 2) + ip->ip_p));
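/*
 * Illustrative sketch (not part of the driver): the 16-bit
 * ones-complement folding that in_pseudo() and the firmware checksum
 * rely on.  Carries are folded back into the low 16 bits and the
 * result is complemented; when the hardware checksum verifies, the
 * value computed by mxge_rx_csum() above folds to 0, which is what the
 * receive paths test for.
 */
#if 0	/* example only, not compiled */
static inline uint16_t
example_csum_fold(uint32_t sum)
{
	sum = (sum >> 16) + (sum & 0xffff);	/* fold carry words */
	sum += (sum >> 16);			/* fold the final carry */
	return (~sum & 0xffff);
}
#endif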
2436 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2438 struct ether_vlan_header *evl;
2439 struct ether_header *eh;
2442 evl = mtod(m, struct ether_vlan_header *);
2443 eh = mtod(m, struct ether_header *);
2446 * fix checksum by subtracting EVL_ENCAPLEN bytes
2447 * after what the firmware thought was the end of the ethernet
2451 /* put checksum into host byte order */
2452 *csum = ntohs(*csum);
2453 partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2454 (*csum) += ~partial;
2455 (*csum) += ((*csum) < ~partial);
2456 (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2457 (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2459 /* restore checksum to network byte order;
2460 later consumers expect this */
2461 *csum = htons(*csum);
2464 #ifdef MXGE_NEW_VLAN_API
2465 m->m_pkthdr.ether_vlantag = ntohs(evl->evl_tag);
2469 mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2473 VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2474 m_tag_prepend(m, mtag);
2478 m->m_flags |= M_VLANTAG;
2481 * Remove the 802.1q header by copying the Ethernet
2482 * addresses over it and adjusting the beginning of
2483 * the data in the mbuf. The encapsulated Ethernet
2484 * type field is already in place.
2486 bcopy((char *)evl, (char *)evl + EVL_ENCAPLEN,
2487 ETHER_HDR_LEN - ETHER_TYPE_LEN);
2488 m_adj(m, EVL_ENCAPLEN);
2493 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2498 struct ether_header *eh;
2500 bus_dmamap_t old_map;
2502 uint16_t tcpudp_csum;
2507 idx = rx->cnt & rx->mask;
2508 rx->cnt += rx->nbufs;
2509 /* save a pointer to the received mbuf */
2510 m = rx->info[idx].m;
2511 /* try to replace the received mbuf */
2512 if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2513 /* drop the frame -- the old mbuf is re-cycled */
2518 /* unmap the received buffer */
2519 old_map = rx->info[idx].map;
2520 bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2521 bus_dmamap_unload(rx->dmat, old_map);
2523 /* swap the bus_dmamap_t's */
2524 rx->info[idx].map = rx->extra_map;
2525 rx->extra_map = old_map;
2527 /* mcp implicitly skips 1st 2 bytes so that packet is properly
2529 m->m_data += MXGEFW_PAD;
2531 m->m_pkthdr.rcvif = ifp;
2532 m->m_len = m->m_pkthdr.len = len;
2534 eh = mtod(m, struct ether_header *);
2535 if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2536 mxge_vlan_tag_remove(m, &csum);
2538 /* if the checksum is valid, mark it in the mbuf header */
2539 if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2540 if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2542 /* otherwise, it was a UDP frame, or a TCP frame which
2543 we could not do LRO on. Tell the stack that the
2545 m->m_pkthdr.csum_data = 0xffff;
2546 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2549 /* flowid only valid if RSS hashing is enabled */
2550 if (sc->num_slices > 1) {
2551 m->m_pkthdr.flowid = (ss - sc->ss);
2552 m->m_flags |= M_FLOWID;
2555 /* pass the frame up the stack */
2556 (*ifp->if_input)(ifp, m);
2560 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2564 struct ether_header *eh;
2567 bus_dmamap_t old_map;
2569 uint16_t tcpudp_csum;
2574 idx = rx->cnt & rx->mask;
2576 /* save a pointer to the received mbuf */
2577 m = rx->info[idx].m;
2578 /* try to replace the received mbuf */
2579 if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2580 /* drop the frame -- the old mbuf is re-cycled */
2585 /* unmap the received buffer */
2586 old_map = rx->info[idx].map;
2587 bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2588 bus_dmamap_unload(rx->dmat, old_map);
2590 /* swap the bus_dmamap_t's */
2591 rx->info[idx].map = rx->extra_map;
2592 rx->extra_map = old_map;
2594 /* mcp implicitly skips 1st 2 bytes so that the packet is properly aligned */
2596 m->m_data += MXGEFW_PAD;
2598 m->m_pkthdr.rcvif = ifp;
2599 m->m_len = m->m_pkthdr.len = len;
2601 eh = mtod(m, struct ether_header *);
2602 if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2603 mxge_vlan_tag_remove(m, &csum);
2605 /* if the checksum is valid, mark it in the mbuf header */
2606 if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2607 if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2609 /* otherwise, it was a UDP frame, or a TCP frame which
2610 we could not do LRO on. Tell the stack that the checksum is good. */
2612 m->m_pkthdr.csum_data = 0xffff;
2613 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2616 /* flowid only valid if RSS hashing is enabled */
2617 if (sc->num_slices > 1) {
2618 m->m_pkthdr.flowid = (ss - sc->ss);
2619 m->m_flags |= M_FLOWID;
2622 /* pass the frame up the stack */
2623 (*ifp->if_input)(ifp, m);
2627 mxge_clean_rx_done(struct mxge_slice_state *ss)
2629 mxge_rx_done_t *rx_done = &ss->rx_done;
2635 while (rx_done->entry[rx_done->idx].length != 0) {
2636 length = ntohs(rx_done->entry[rx_done->idx].length);
2637 rx_done->entry[rx_done->idx].length = 0;
2638 checksum = rx_done->entry[rx_done->idx].checksum;
2639 if (length <= (MHLEN - MXGEFW_PAD))
2640 mxge_rx_done_small(ss, length, checksum);
2642 mxge_rx_done_big(ss, length, checksum);
2644 rx_done->idx = rx_done->cnt & rx_done->mask;
2646 /* limit potential for livelock */
2647 if (__predict_false(++limit > rx_done->mask / 2))
2651 while (!SLIST_EMPTY(&ss->lro_active)) {
2652 struct lro_entry *lro = SLIST_FIRST(&ss->lro_active);
2653 SLIST_REMOVE_HEAD(&ss->lro_active, next);
2654 mxge_lro_flush(ss, lro);
2661 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2672 while (tx->pkt_done != mcp_idx) {
2673 idx = tx->done & tx->mask;
2675 m = tx->info[idx].m;
2676 /* mbuf and DMA map only attached to the first
2679 ss->obytes += m->m_pkthdr.len;
2680 if (m->m_flags & M_MCAST)
2683 tx->info[idx].m = NULL;
2684 map = tx->info[idx].map;
2685 bus_dmamap_unload(tx->dmat, map);
2688 if (tx->info[idx].flag) {
2689 tx->info[idx].flag = 0;
2694 /* If we have space, clear IFF_OACTIVE to tell the stack that
2695 it's OK to send packets */
2696 #ifdef IFNET_BUF_RING
2697 flags = &ss->if_flags;
2699 flags = &ifp->if_flags;
2701 lockmgr(&ss->tx.lock, LK_EXCLUSIVE);
2702 if ((*flags) & IFF_OACTIVE &&
2703 tx->req - tx->done < (tx->mask + 1)/4) {
2704 *(flags) &= ~IFF_OACTIVE;
2706 mxge_start_locked(ss);
2708 #ifdef IFNET_BUF_RING
2709 if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2710 /* let the NIC stop polling this queue, since there
2711 * are no more transmits pending */
2712 if (tx->req == tx->done) {
2714 tx->queue_active = 0;
2720 lockmgr(&ss->tx.lock, LK_RELEASE);
2724 static struct mxge_media_type mxge_xfp_media_types[] =
2726 {IFM_10G_CX4, 0x7f, "10GBASE-CX4 (module)"},
2727 {IFM_10G_SR, (1 << 7), "10GBASE-SR"},
2728 {IFM_10G_LR, (1 << 6), "10GBASE-LR"},
2729 {0, (1 << 5), "10GBASE-ER"},
2730 {IFM_10G_LRM, (1 << 4), "10GBASE-LRM"},
2731 {0, (1 << 3), "10GBASE-SW"},
2732 {0, (1 << 2), "10GBASE-LW"},
2733 {0, (1 << 1), "10GBASE-EW"},
2734 {0, (1 << 0), "Reserved"}
2736 static struct mxge_media_type mxge_sfp_media_types[] =
2738 {0, (1 << 7), "Reserved"},
2739 {IFM_10G_LRM, (1 << 6), "10GBASE-LRM"},
2740 {IFM_10G_LR, (1 << 5), "10GBASE-LR"},
2741 {IFM_10G_SR, (1 << 4), "10GBASE-SR"}
2745 mxge_set_media(mxge_softc_t *sc, int type)
2747 sc->media_flags |= type;
2748 ifmedia_add(&sc->media, sc->media_flags, 0, NULL);
2749 ifmedia_set(&sc->media, sc->media_flags);
2754 * Determine the media type for a NIC. Some XFPs will identify
2755 * themselves only when their link is up, so this is initiated via a
2756 * link up interrupt. However, this can potentially take up to
2757 * several milliseconds, so it is run via the watchdog routine, rather
2758 * than in the interrupt handler itself. This need only be done
2759 * once, not each time the link is up.
2762 mxge_media_probe(mxge_softc_t *sc)
2767 struct mxge_media_type *mxge_media_types = NULL;
2768 int i, err, ms, mxge_media_type_entries;
2771 sc->need_media_probe = 0;
2773 /* if we've already set a media type, we're done */
2774 if (sc->media_flags != (IFM_ETHER | IFM_AUTO))
2778 * parse the product code to determine the interface type
2779 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2780 * after the 3rd dash in the driver's cached copy of the
2781 * EEPROM's product code string.
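 * (As a purely hypothetical example, a code of the form "10G-PCIE-8x-S"
 * would be scanned past its three dashes, and the 'S' after the third
 * dash would select the SFP+ table below, just as a trailing 'C' means
 * CX4 and a trailing 'Q' means Quad Ribbon Fiber.)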
2783 ptr = sc->product_code_string;
2785 device_printf(sc->dev, "Missing product code\n");
2788 for (i = 0; i < 3; i++, ptr++) {
2789 ptr = index(ptr, '-');
2791 device_printf(sc->dev,
2792 "only %d dashes in PC?!?\n", i);
2798 mxge_set_media(sc, IFM_10G_CX4);
2801 else if (*ptr == 'Q') {
2802 /* -Q is Quad Ribbon Fiber */
2803 device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2804 /* FreeBSD has no media type for Quad ribbon fiber */
2810 mxge_media_types = mxge_xfp_media_types;
2811 mxge_media_type_entries =
2812 sizeof (mxge_xfp_media_types) /
2813 sizeof (mxge_xfp_media_types[0]);
2814 byte = MXGE_XFP_COMPLIANCE_BYTE;
2818 if (*ptr == 'S' || *(ptr + 1) == 'S') {
2819 /* -S or -2S is SFP+ */
2820 mxge_media_types = mxge_sfp_media_types;
2821 mxge_media_type_entries =
2822 sizeof (mxge_sfp_media_types) /
2823 sizeof (mxge_sfp_media_types[0]);
2828 if (mxge_media_types == NULL) {
2829 device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2834 * At this point we know the NIC has an XFP cage, so now we
2835 * try to determine what is in the cage by using the
2836 * firmware's XFP I2C commands to read the XFP 10GbE compliance
2837 * register. We read just one byte, which may take over a millisecond.
2841 cmd.data0 = 0; /* just fetch 1 byte, not all 256 */
2843 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2844 if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
2845 device_printf(sc->dev, "failed to read XFP\n");
2847 if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
2848 device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
2850 if (err != MXGEFW_CMD_OK) {
2854 /* now we wait for the data to be cached */
2856 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2857 for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
2860 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2862 if (err != MXGEFW_CMD_OK) {
2863 device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
2864 cage_type, err, ms);
2868 if (cmd.data0 == mxge_media_types[0].bitmask) {
2870 device_printf(sc->dev, "%s:%s\n", cage_type,
2871 mxge_media_types[0].name);
2872 mxge_set_media(sc, IFM_10G_CX4);
2875 for (i = 1; i < mxge_media_type_entries; i++) {
2876 if (cmd.data0 & mxge_media_types[i].bitmask) {
2878 device_printf(sc->dev, "%s:%s\n",
2880 mxge_media_types[i].name);
2882 mxge_set_media(sc, mxge_media_types[i].flag);
2886 device_printf(sc->dev, "%s media 0x%x unknown\n", cage_type,
2893 mxge_intr(void *arg)
2895 struct mxge_slice_state *ss = arg;
2896 mxge_softc_t *sc = ss->sc;
2897 mcp_irq_data_t *stats = ss->fw_stats;
2898 mxge_tx_ring_t *tx = &ss->tx;
2899 mxge_rx_done_t *rx_done = &ss->rx_done;
2900 uint32_t send_done_count;
2904 #ifndef IFNET_BUF_RING
2905 /* an interrupt on a non-zero slice is implicitly valid
2906 since MSI-X irqs are not shared */
2908 mxge_clean_rx_done(ss);
2909 *ss->irq_claim = be32toh(3);
2914 /* make sure the DMA has finished */
2915 if (!stats->valid) {
2918 valid = stats->valid;
2920 if (sc->legacy_irq) {
2921 /* lower legacy IRQ */
2922 *sc->irq_deassert = 0;
2923 if (!mxge_deassert_wait)
2924 /* don't wait for confirmation that the irq is low */
2930 /* loop while waiting for legacy irq deassertion */
2932 /* check for transmit completes and receives */
2933 send_done_count = be32toh(stats->send_done_count);
2934 while ((send_done_count != tx->pkt_done) ||
2935 (rx_done->entry[rx_done->idx].length != 0)) {
2936 if (send_done_count != tx->pkt_done)
2937 mxge_tx_done(ss, (int)send_done_count);
2938 mxge_clean_rx_done(ss);
2939 send_done_count = be32toh(stats->send_done_count);
2941 if (sc->legacy_irq && mxge_deassert_wait)
2943 } while (*((volatile uint8_t *) &stats->valid));
2945 /* fw link & error stats meaningful only on the first slice */
2946 if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
2947 if (sc->link_state != stats->link_up) {
2948 sc->link_state = stats->link_up;
2949 if (sc->link_state) {
2950 sc->ifp->if_link_state = LINK_STATE_UP;
2951 if_link_state_change(sc->ifp);
2953 device_printf(sc->dev, "link up\n");
2955 sc->ifp->if_link_state = LINK_STATE_DOWN;
2956 if_link_state_change(sc->ifp);
2958 device_printf(sc->dev, "link down\n");
2960 sc->need_media_probe = 1;
2962 if (sc->rdma_tags_available !=
2963 be32toh(stats->rdma_tags_available)) {
2964 sc->rdma_tags_available =
2965 be32toh(stats->rdma_tags_available);
2966 device_printf(sc->dev, "RDMA timed out! %d tags "
2967 "left\n", sc->rdma_tags_available);
2970 if (stats->link_down) {
2971 sc->down_cnt += stats->link_down;
2973 sc->ifp->if_link_state = LINK_STATE_DOWN;
2974 if_link_state_change(sc->ifp);
2978 /* check to see if we have an rx token to pass back */
2980 *ss->irq_claim = be32toh(3);
2981 *(ss->irq_claim + 1) = be32toh(3);
2985 mxge_init(void *arg)
2992 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
2994 struct lro_entry *lro_entry;
2997 while (!SLIST_EMPTY(&ss->lro_free)) {
2998 lro_entry = SLIST_FIRST(&ss->lro_free);
2999 SLIST_REMOVE_HEAD(&ss->lro_free, next);
3000 kfree(lro_entry, M_DEVBUF);
3003 for (i = 0; i <= ss->rx_big.mask; i++) {
3004 if (ss->rx_big.info[i].m == NULL)
3006 bus_dmamap_unload(ss->rx_big.dmat,
3007 ss->rx_big.info[i].map);
3008 m_freem(ss->rx_big.info[i].m);
3009 ss->rx_big.info[i].m = NULL;
3012 for (i = 0; i <= ss->rx_small.mask; i++) {
3013 if (ss->rx_small.info[i].m == NULL)
3015 bus_dmamap_unload(ss->rx_small.dmat,
3016 ss->rx_small.info[i].map);
3017 m_freem(ss->rx_small.info[i].m);
3018 ss->rx_small.info[i].m = NULL;
3021 /* transmit ring used only on the first slice */
3022 if (ss->tx.info == NULL)
3025 for (i = 0; i <= ss->tx.mask; i++) {
3026 ss->tx.info[i].flag = 0;
3027 if (ss->tx.info[i].m == NULL)
3029 bus_dmamap_unload(ss->tx.dmat,
3030 ss->tx.info[i].map);
3031 m_freem(ss->tx.info[i].m);
3032 ss->tx.info[i].m = NULL;
3037 mxge_free_mbufs(mxge_softc_t *sc)
3041 for (slice = 0; slice < sc->num_slices; slice++)
3042 mxge_free_slice_mbufs(&sc->ss[slice]);
3046 mxge_free_slice_rings(struct mxge_slice_state *ss)
3051 if (ss->rx_done.entry != NULL)
3052 mxge_dma_free(&ss->rx_done.dma);
3053 ss->rx_done.entry = NULL;
3055 if (ss->tx.req_bytes != NULL)
3056 kfree(ss->tx.req_bytes, M_DEVBUF);
3057 ss->tx.req_bytes = NULL;
3059 if (ss->tx.seg_list != NULL)
3060 kfree(ss->tx.seg_list, M_DEVBUF);
3061 ss->tx.seg_list = NULL;
3063 if (ss->rx_small.shadow != NULL)
3064 kfree(ss->rx_small.shadow, M_DEVBUF);
3065 ss->rx_small.shadow = NULL;
3067 if (ss->rx_big.shadow != NULL)
3068 kfree(ss->rx_big.shadow, M_DEVBUF);
3069 ss->rx_big.shadow = NULL;
3071 if (ss->tx.info != NULL) {
3072 if (ss->tx.dmat != NULL) {
3073 for (i = 0; i <= ss->tx.mask; i++) {
3074 bus_dmamap_destroy(ss->tx.dmat,
3075 ss->tx.info[i].map);
3077 bus_dma_tag_destroy(ss->tx.dmat);
3079 kfree(ss->tx.info, M_DEVBUF);
3083 if (ss->rx_small.info != NULL) {
3084 if (ss->rx_small.dmat != NULL) {
3085 for (i = 0; i <= ss->rx_small.mask; i++) {
3086 bus_dmamap_destroy(ss->rx_small.dmat,
3087 ss->rx_small.info[i].map);
3089 bus_dmamap_destroy(ss->rx_small.dmat,
3090 ss->rx_small.extra_map);
3091 bus_dma_tag_destroy(ss->rx_small.dmat);
3093 kfree(ss->rx_small.info, M_DEVBUF);
3095 ss->rx_small.info = NULL;
3097 if (ss->rx_big.info != NULL) {
3098 if (ss->rx_big.dmat != NULL) {
3099 for (i = 0; i <= ss->rx_big.mask; i++) {
3100 bus_dmamap_destroy(ss->rx_big.dmat,
3101 ss->rx_big.info[i].map);
3103 bus_dmamap_destroy(ss->rx_big.dmat,
3104 ss->rx_big.extra_map);
3105 bus_dma_tag_destroy(ss->rx_big.dmat);
3107 kfree(ss->rx_big.info, M_DEVBUF);
3109 ss->rx_big.info = NULL;
3113 mxge_free_rings(mxge_softc_t *sc)
3117 for (slice = 0; slice < sc->num_slices; slice++)
3118 mxge_free_slice_rings(&sc->ss[slice]);
3122 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3123 int tx_ring_entries)
3125 mxge_softc_t *sc = ss->sc;
3131 /* allocate per-slice receive resources */
3133 ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
3134 ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3136 /* allocate the rx shadow rings */
3137 bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3138 ss->rx_small.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3139 if (ss->rx_small.shadow == NULL)
3142 bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3143 ss->rx_big.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3144 if (ss->rx_big.shadow == NULL)
3147 /* allocate the rx host info rings */
3148 bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3149 ss->rx_small.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3150 if (ss->rx_small.info == NULL)
3153 bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3154 ss->rx_big.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3155 if (ss->rx_big.info == NULL)
3158 /* allocate the rx busdma resources */
3159 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3161 4096, /* boundary */
3162 BUS_SPACE_MAXADDR, /* low */
3163 BUS_SPACE_MAXADDR, /* high */
3164 NULL, NULL, /* filter */
3165 MHLEN, /* maxsize */
3167 MHLEN, /* maxsegsize */
3168 BUS_DMA_ALLOCNOW, /* flags */
3169 &ss->rx_small.dmat); /* tag */
3171 device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3176 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3178 #if MXGE_VIRT_JUMBOS
3179 4096, /* boundary */
3183 BUS_SPACE_MAXADDR, /* low */
3184 BUS_SPACE_MAXADDR, /* high */
3185 NULL, NULL, /* filter */
3186 3*4096, /* maxsize */
3187 #if MXGE_VIRT_JUMBOS
3189 4096, /* maxsegsize*/
3192 MJUM9BYTES, /* maxsegsize*/
3194 BUS_DMA_ALLOCNOW, /* flags */
3195 &ss->rx_big.dmat); /* tag */
3197 device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3201 for (i = 0; i <= ss->rx_small.mask; i++) {
3202 err = bus_dmamap_create(ss->rx_small.dmat, 0,
3203 &ss->rx_small.info[i].map);
3205 device_printf(sc->dev, "Err %d rx_small dmamap\n",
3210 err = bus_dmamap_create(ss->rx_small.dmat, 0,
3211 &ss->rx_small.extra_map);
3213 device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3218 for (i = 0; i <= ss->rx_big.mask; i++) {
3219 err = bus_dmamap_create(ss->rx_big.dmat, 0,
3220 &ss->rx_big.info[i].map);
3222 device_printf(sc->dev, "Err %d rx_big dmamap\n",
3227 err = bus_dmamap_create(ss->rx_big.dmat, 0,
3228 &ss->rx_big.extra_map);
3230 device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3235 /* now allocate TX resources */
3237 #ifndef IFNET_BUF_RING
3238 /* only use a single TX ring for now */
3239 if (ss != ss->sc->ss)
3243 ss->tx.mask = tx_ring_entries - 1;
3244 ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3247 /* allocate the tx request copy block */
3249 sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3250 ss->tx.req_bytes = kmalloc(bytes, M_DEVBUF, M_WAITOK);
3251 if (ss->tx.req_bytes == NULL)
3253 /* ensure req_list entries are aligned to 8 bytes */
3254 ss->tx.req_list = (mcp_kreq_ether_send_t *)
3255 ((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
3257 /* allocate the tx busdma segment list */
3258 bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3259 ss->tx.seg_list = (bus_dma_segment_t *)
3260 kmalloc(bytes, M_DEVBUF, M_WAITOK);
3261 if (ss->tx.seg_list == NULL)
3264 /* allocate the tx host info ring */
3265 bytes = tx_ring_entries * sizeof (*ss->tx.info);
3266 ss->tx.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3267 if (ss->tx.info == NULL)
3270 /* allocate the tx busdma resources */
3271 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3273 sc->tx_boundary, /* boundary */
3274 BUS_SPACE_MAXADDR, /* low */
3275 BUS_SPACE_MAXADDR, /* high */
3276 NULL, NULL, /* filter */
3277 65536 + 256, /* maxsize */
3278 ss->tx.max_desc - 2, /* num segs */
3279 sc->tx_boundary, /* maxsegsz */
3280 BUS_DMA_ALLOCNOW, /* flags */
3281 &ss->tx.dmat); /* tag */
3284 device_printf(sc->dev, "Err %d allocating tx dmat\n",
3289 /* now use these tags to set up dmamaps for each slot in the ring */
3291 for (i = 0; i <= ss->tx.mask; i++) {
3292 err = bus_dmamap_create(ss->tx.dmat, 0,
3293 &ss->tx.info[i].map);
3295 device_printf(sc->dev, "Err %d tx dmamap\n",
3305 mxge_alloc_rings(mxge_softc_t *sc)
3309 int tx_ring_entries, rx_ring_entries;
3312 /* get ring sizes */
3313 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3314 tx_ring_size = cmd.data0;
3316 device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3320 tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3321 rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3322 ifq_set_maxlen(&sc->ifp->if_snd, tx_ring_entries - 1);
3323 ifq_set_ready(&sc->ifp->if_snd);
3325 for (slice = 0; slice < sc->num_slices; slice++) {
3326 err = mxge_alloc_slice_rings(&sc->ss[slice],
3335 mxge_free_rings(sc);
3342 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3344 int bufsize = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN + MXGEFW_PAD;
3346 if (bufsize < MCLBYTES) {
3347 /* easy, everything fits in a single buffer */
3348 *big_buf_size = MCLBYTES;
3349 *cl_size = MCLBYTES;
3354 if (bufsize < MJUMPAGESIZE) {
3355 /* still easy, everything still fits in a single buffer */
3356 *big_buf_size = MJUMPAGESIZE;
3357 *cl_size = MJUMPAGESIZE;
3361 #if MXGE_VIRT_JUMBOS
3362 /* now we need to use virtually contiguous buffers */
3363 *cl_size = MJUM9BYTES;
3364 *big_buf_size = 4096;
3365 *nbufs = mtu / 4096 + 1;
3366 /* needs to be a power of two, so round up */
3370 *cl_size = MJUM9BYTES;
3371 *big_buf_size = MJUM9BYTES;
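	/*
	 * Illustrative sizing (assuming the usual cluster sizes:
	 * MCLBYTES 2K, MJUMPAGESIZE one page, MJUM9BYTES 9K): a 1500
	 * byte MTU needs 1500 + 14 + 4 + 2 = 1520 bytes and fits in a
	 * single 2K cluster, while a 9000 byte MTU needs 9020 bytes and
	 * falls through to a 9K cluster here (or, with MXGE_VIRT_JUMBOS,
	 * to several page-sized pieces of that cluster).
	 */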
3377 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3382 struct lro_entry *lro_entry;
3387 slice = ss - sc->ss;
3389 SLIST_INIT(&ss->lro_free);
3390 SLIST_INIT(&ss->lro_active);
3392 for (i = 0; i < sc->lro_cnt; i++) {
3393 lro_entry = (struct lro_entry *)
3394 kmalloc(sizeof (*lro_entry), M_DEVBUF,
3396 if (lro_entry == NULL) {
3400 SLIST_INSERT_HEAD(&ss->lro_free, lro_entry, next);
3402 /* get the lanai pointers to the send and receive rings */
3405 #ifndef IFNET_BUF_RING
3406 /* We currently only send from the first slice */
3410 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3412 (volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3413 ss->tx.send_go = (volatile uint32_t *)
3414 (sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3415 ss->tx.send_stop = (volatile uint32_t *)
3416 (sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3417 #ifndef IFNET_BUF_RING
3421 err |= mxge_send_cmd(sc,
3422 MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3423 ss->rx_small.lanai =
3424 (volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3426 err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3428 (volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3431 device_printf(sc->dev,
3432 "failed to get ring sizes or locations\n");
3436 /* stock receive rings */
3437 for (i = 0; i <= ss->rx_small.mask; i++) {
3438 map = ss->rx_small.info[i].map;
3439 err = mxge_get_buf_small(ss, map, i);
3441 device_printf(sc->dev, "alloced %d/%d smalls\n",
3442 i, ss->rx_small.mask + 1);
3446 for (i = 0; i <= ss->rx_big.mask; i++) {
3447 ss->rx_big.shadow[i].addr_low = 0xffffffff;
3448 ss->rx_big.shadow[i].addr_high = 0xffffffff;
3450 ss->rx_big.nbufs = nbufs;
3451 ss->rx_big.cl_size = cl_size;
3452 ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3453 EVL_ENCAPLEN + MXGEFW_PAD;
3454 for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3455 map = ss->rx_big.info[i].map;
3456 err = mxge_get_buf_big(ss, map, i);
3458 device_printf(sc->dev, "alloced %d/%d bigs\n",
3459 i, ss->rx_big.mask + 1);
3467 mxge_open(mxge_softc_t *sc)
3470 int err, big_bytes, nbufs, slice, cl_size, i;
3472 volatile uint8_t *itable;
3473 struct mxge_slice_state *ss;
3475 /* Copy the MAC address in case it was overridden */
3476 bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3478 err = mxge_reset(sc, 1);
3480 device_printf(sc->dev, "failed to reset\n");
3484 if (sc->num_slices > 1) {
3485 /* setup the indirection table */
3486 cmd.data0 = sc->num_slices;
3487 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3490 err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3493 device_printf(sc->dev,
3494 "failed to setup rss tables\n");
3498 /* just enable an identity mapping */
3499 itable = sc->sram + cmd.data0;
3500 for (i = 0; i < sc->num_slices; i++)
3501 itable[i] = (uint8_t)i;
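		/*
		 * e.g. with 4 slices the identity mapping is simply
		 * {0, 1, 2, 3}: RSS table entry i steers its hash
		 * bucket to slice i.
		 */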
3504 cmd.data1 = mxge_rss_hash_type;
3505 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3507 device_printf(sc->dev, "failed to enable slices\n");
3513 mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3516 err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3518 /* error is only meaningful if we're trying to set
3519 MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3520 if (err && nbufs > 1) {
3521 device_printf(sc->dev,
3522 "Failed to set always-use-n to %d\n",
3526 /* Give the firmware the mtu and the big and small buffer
3527 sizes. The firmware wants the big buf size to be a power
3528 of two. Luckily, FreeBSD's clusters are powers of two */
3529 cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
3530 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3531 cmd.data0 = MHLEN - MXGEFW_PAD;
3532 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3534 cmd.data0 = big_bytes;
3535 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3538 device_printf(sc->dev, "failed to setup params\n");
3542 /* Now give the firmware the pointer to the stats block */
3544 #ifdef IFNET_BUF_RING
3545 slice < sc->num_slices;
3550 ss = &sc->ss[slice];
3552 MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3554 MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3555 cmd.data2 = sizeof(struct mcp_irq_data);
3556 cmd.data2 |= (slice << 16);
3557 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3561 bus = sc->ss->fw_stats_dma.bus_addr;
3562 bus += offsetof(struct mcp_irq_data, send_done_count);
3563 cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3564 cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3565 err = mxge_send_cmd(sc,
3566 MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3568 /* Firmware cannot support multicast without STATS_DMA_V2 */
3569 sc->fw_multicast_support = 0;
3571 sc->fw_multicast_support = 1;
3575 device_printf(sc->dev, "failed to setup params\n");
3579 for (slice = 0; slice < sc->num_slices; slice++) {
3580 err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3582 device_printf(sc->dev, "couldn't open slice %d\n",
3588 /* Finally, start the firmware running */
3589 err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3591 device_printf(sc->dev, "Couldn't bring up link\n");
3594 #ifdef IFNET_BUF_RING
3595 for (slice = 0; slice < sc->num_slices; slice++) {
3596 ss = &sc->ss[slice];
3597 ss->if_flags |= IFF_RUNNING;
3598 ss->if_flags &= ~IFF_OACTIVE;
3601 sc->ifp->if_flags |= IFF_RUNNING;
3602 sc->ifp->if_flags &= ~IFF_OACTIVE;
3603 callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3609 mxge_free_mbufs(sc);
3615 mxge_close(mxge_softc_t *sc)
3618 int err, old_down_cnt;
3619 #ifdef IFNET_BUF_RING
3620 struct mxge_slice_state *ss;
3624 callout_stop(&sc->co_hdl);
3625 #ifdef IFNET_BUF_RING
3626 for (slice = 0; slice < sc->num_slices; slice++) {
3627 ss = &sc->ss[slice];
3628 ss->if_flags &= ~IFF_RUNNING;
3631 sc->ifp->if_flags &= ~IFF_RUNNING;
3632 old_down_cnt = sc->down_cnt;
3634 err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3636 device_printf(sc->dev, "Couldn't bring down link\n");
3638 if (old_down_cnt == sc->down_cnt) {
3639 /* wait for down irq */
3640 DELAY(10 * sc->intr_coal_delay);
3643 if (old_down_cnt == sc->down_cnt) {
3644 device_printf(sc->dev, "never got down irq\n");
3647 mxge_free_mbufs(sc);
3653 mxge_setup_cfg_space(mxge_softc_t *sc)
3655 device_t dev = sc->dev;
3657 uint16_t cmd, lnk, pectl;
3659 /* find the PCIe link width and set max read request to 4KB */
3660 if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
3661 lnk = pci_read_config(dev, reg + 0x12, 2);
3662 sc->link_width = (lnk >> 4) & 0x3f;
3664 pectl = pci_read_config(dev, reg + 0x8, 2);
3665 pectl = (pectl & ~0x7000) | (5 << 12);
3666 pci_write_config(dev, reg + 0x8, pectl, 2);
3669 /* Enable DMA and Memory space access */
3670 pci_enable_busmaster(dev);
3671 cmd = pci_read_config(dev, PCIR_COMMAND, 2);
3672 cmd |= PCIM_CMD_MEMEN;
3673 pci_write_config(dev, PCIR_COMMAND, cmd, 2);
3677 mxge_read_reboot(mxge_softc_t *sc)
3679 device_t dev = sc->dev;
3682 /* find the vendor specific offset */
3683 if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
3684 device_printf(sc->dev,
3685 "could not find vendor specific offset\n");
3686 return (uint32_t)-1;
3688 /* enable read32 mode */
3689 pci_write_config(dev, vs + 0x10, 0x3, 1);
3690 /* tell NIC which register to read */
3691 pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3692 return (pci_read_config(dev, vs + 0x14, 4));
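/*
 * The three accesses above implement a small windowed-read protocol
 * through the vendor-specific capability: +0x10 selects 32-bit read
 * mode, +0x18 holds the address the NIC should read on our behalf
 * (0xfffffff0, used here to fetch the reboot status), and the result
 * comes back through +0x14.
 */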
3696 mxge_watchdog_reset(mxge_softc_t *sc, int slice)
3698 struct pci_devinfo *dinfo;
3706 device_printf(sc->dev, "Watchdog reset!\n");
3709 * check to see if the NIC rebooted. If it did, then all of
3710 * PCI config space has been reset, and things like the
3711 * busmaster bit will be zero. If this is the case, then we
3712 * must restore PCI config space before the NIC can be used again.
3715 cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3716 if (cmd == 0xffff) {
3718 * maybe the watchdog caught the NIC rebooting; wait
3719 * up to 100ms for it to finish. If it does not come
3720 * back, then give up
3723 cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3724 if (cmd == 0xffff) {
3725 device_printf(sc->dev, "NIC disappeared!\n");
3729 if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3730 /* print the reboot status */
3731 reboot = mxge_read_reboot(sc);
3732 device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3734 /* restore PCI configuration space */
3735 dinfo = device_get_ivars(sc->dev);
3736 pci_cfg_restore(sc->dev, dinfo);
3738 /* and redo any changes we made to our config space */
3739 mxge_setup_cfg_space(sc);
3741 if (sc->ifp->if_flags & IFF_RUNNING) {
3743 err = mxge_open(sc);
3746 tx = &sc->ss[slice].tx;
3747 device_printf(sc->dev,
3748 "NIC did not reboot, slice %d ring state:\n",
3750 device_printf(sc->dev,
3751 "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3752 tx->req, tx->done, tx->queue_active);
3753 device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3754 tx->activate, tx->deactivate);
3755 device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3757 be32toh(sc->ss->fw_stats->send_done_count));
3758 device_printf(sc->dev, "not resetting\n");
3764 mxge_watchdog(mxge_softc_t *sc)
3767 uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3770 /* see if we have outstanding transmits, which
3771 have been pending for more than mxge_ticks */
3773 #ifdef IFNET_BUF_RING
3774 (i < sc->num_slices) && (err == 0);
3776 (i < 1) && (err == 0);
3780 if (tx->req != tx->done &&
3781 tx->watchdog_req != tx->watchdog_done &&
3782 tx->done == tx->watchdog_done) {
3783 /* check for pause blocking before resetting */
3784 if (tx->watchdog_rx_pause == rx_pause)
3785 err = mxge_watchdog_reset(sc, i);
3787 device_printf(sc->dev, "Flow control blocking "
3788 "xmits, check link partner\n");
3791 tx->watchdog_req = tx->req;
3792 tx->watchdog_done = tx->done;
3793 tx->watchdog_rx_pause = rx_pause;
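		/*
		 * That is: a reset is attempted only when a ring had
		 * outstanding sends at the previous tick, still has them
		 * now, its done index has not advanced in between, and the
		 * stall cannot be blamed on pause frames from the link
		 * partner.
		 */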
3796 if (sc->need_media_probe)
3797 mxge_media_probe(sc);
3802 mxge_update_stats(mxge_softc_t *sc)
3804 struct mxge_slice_state *ss;
3805 u_long ipackets = 0;
3806 u_long opackets = 0;
3807 #ifdef IFNET_BUF_RING
3815 for (slice = 0; slice < sc->num_slices; slice++) {
3816 ss = &sc->ss[slice];
3817 ipackets += ss->ipackets;
3818 opackets += ss->opackets;
3819 #ifdef IFNET_BUF_RING
3820 obytes += ss->obytes;
3821 omcasts += ss->omcasts;
3822 odrops += ss->tx.br->br_drops;
3824 oerrors += ss->oerrors;
3826 sc->ifp->if_ipackets = ipackets;
3827 sc->ifp->if_opackets = opackets;
3828 #ifdef IFNET_BUF_RING
3829 sc->ifp->if_obytes = obytes;
3830 sc->ifp->if_omcasts = omcasts;
3831 sc->ifp->if_snd.ifq_drops = odrops;
3833 sc->ifp->if_oerrors = oerrors;
3837 mxge_tick(void *arg)
3839 mxge_softc_t *sc = arg;
3842 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
3843 /* aggregate stats from different slices */
3844 mxge_update_stats(sc);
3845 if (!sc->watchdog_countdown) {
3846 err = mxge_watchdog(sc);
3847 sc->watchdog_countdown = 4;
3849 sc->watchdog_countdown--;
3851 callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3852 lockmgr(&sc->driver_lock, LK_RELEASE);
3856 mxge_media_change(struct ifnet *ifp)
3862 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3864 struct ifnet *ifp = sc->ifp;
3865 int real_mtu, old_mtu;
3869 real_mtu = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
3870 if ((real_mtu > sc->max_mtu) || real_mtu < 60)
3872 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
3873 old_mtu = ifp->if_mtu;
3875 if (ifp->if_flags & IFF_RUNNING) {
3877 err = mxge_open(sc);
3879 ifp->if_mtu = old_mtu;
3881 (void) mxge_open(sc);
3884 lockmgr(&sc->driver_lock, LK_RELEASE);
3889 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3891 mxge_softc_t *sc = ifp->if_softc;
3896 ifmr->ifm_status = IFM_AVALID;
3897 ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
3898 ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
3899 ifmr->ifm_active |= sc->link_state ? IFM_FDX : 0;
3903 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data, struct ucred *cr)
3905 mxge_softc_t *sc = ifp->if_softc;
3906 struct ifreq *ifr = (struct ifreq *)data;
3914 err = ether_ioctl(ifp, command, data);
3918 err = mxge_change_mtu(sc, ifr->ifr_mtu);
3922 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
3924 lockmgr(&sc->driver_lock, LK_RELEASE);
3927 if (ifp->if_flags & IFF_UP) {
3928 if (!(ifp->if_flags & IFF_RUNNING)) {
3929 err = mxge_open(sc);
3931 /* take care of promisc and allmulti flags */
3933 mxge_change_promisc(sc,
3934 ifp->if_flags & IFF_PROMISC);
3935 mxge_set_multicast_list(sc);
3938 if (ifp->if_flags & IFF_RUNNING) {
3942 lockmgr(&sc->driver_lock, LK_RELEASE);
3947 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
3948 mxge_set_multicast_list(sc);
3949 lockmgr(&sc->driver_lock, LK_RELEASE);
3953 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
3954 mask = ifr->ifr_reqcap ^ ifp->if_capenable;
3955 if (mask & IFCAP_TXCSUM) {
3956 if (IFCAP_TXCSUM & ifp->if_capenable) {
3957 ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
3958 ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
3961 ifp->if_capenable |= IFCAP_TXCSUM;
3962 ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
3964 } else if (mask & IFCAP_RXCSUM) {
3965 if (IFCAP_RXCSUM & ifp->if_capenable) {
3966 ifp->if_capenable &= ~IFCAP_RXCSUM;
3969 ifp->if_capenable |= IFCAP_RXCSUM;
3973 if (mask & IFCAP_TSO4) {
3974 if (IFCAP_TSO4 & ifp->if_capenable) {
3975 ifp->if_capenable &= ~IFCAP_TSO4;
3976 ifp->if_hwassist &= ~CSUM_TSO;
3977 } else if (IFCAP_TXCSUM & ifp->if_capenable) {
3978 ifp->if_capenable |= IFCAP_TSO4;
3979 ifp->if_hwassist |= CSUM_TSO;
3981 kprintf("mxge requires tx checksum offload"
3982 " be enabled to use TSO\n");
3986 if (mask & IFCAP_LRO) {
3987 if (IFCAP_LRO & ifp->if_capenable)
3988 err = mxge_change_lro_locked(sc, 0);
3990 err = mxge_change_lro_locked(sc, mxge_lro_cnt);
3992 if (mask & IFCAP_VLAN_HWTAGGING)
3993 ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
3994 lockmgr(&sc->driver_lock, LK_RELEASE);
3995 VLAN_CAPABILITIES(ifp);
4000 err = ifmedia_ioctl(ifp, (struct ifreq *)data,
4001 &sc->media, command);
4011 mxge_fetch_tunables(mxge_softc_t *sc)
4014 TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4015 TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4016 &mxge_flow_control);
4017 TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4018 &mxge_intr_coal_delay);
4019 TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4020 &mxge_nvidia_ecrc_enable);
4021 TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4022 &mxge_force_firmware);
4023 TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4024 &mxge_deassert_wait);
4025 TUNABLE_INT_FETCH("hw.mxge.verbose",
4027 TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4028 TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
4029 TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
4030 TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4031 TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4032 if (sc->lro_cnt != 0)
4033 mxge_lro_cnt = sc->lro_cnt;
4037 if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4038 mxge_intr_coal_delay = 30;
4039 if (mxge_ticks == 0)
4040 mxge_ticks = hz / 2;
4041 sc->pause = mxge_flow_control;
4042 if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4043 || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4044 mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
4046 if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4047 mxge_initial_mtu < ETHER_MIN_LEN)
4048 mxge_initial_mtu = ETHERMTU_JUMBO;
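/*
 * These knobs are boot-time tunables; illustrative settings (values
 * are examples only) would normally go in /boot/loader.conf, e.g.:
 *
 *	hw.mxge.max_slices="4"
 *	hw.mxge.intr_coal_delay="30"
 *	hw.mxge.flow_control_enabled="1"
 *
 * Out-of-range values are clamped back to sane defaults above.
 */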
4053 mxge_free_slices(mxge_softc_t *sc)
4055 struct mxge_slice_state *ss;
4062 for (i = 0; i < sc->num_slices; i++) {
4064 if (ss->fw_stats != NULL) {
4065 mxge_dma_free(&ss->fw_stats_dma);
4066 ss->fw_stats = NULL;
4067 #ifdef IFNET_BUF_RING
4068 if (ss->tx.br != NULL) {
4069 drbr_free(ss->tx.br, M_DEVBUF);
4073 lockuninit(&ss->tx.lock);
4075 if (ss->rx_done.entry != NULL) {
4076 mxge_dma_free(&ss->rx_done.dma);
4077 ss->rx_done.entry = NULL;
4080 kfree(sc->ss, M_DEVBUF);
4085 mxge_alloc_slices(mxge_softc_t *sc)
4088 struct mxge_slice_state *ss;
4090 int err, i, max_intr_slots;
4092 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4094 device_printf(sc->dev, "Cannot determine rx ring size\n");
4097 sc->rx_ring_size = cmd.data0;
4098 max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4100 bytes = sizeof (*sc->ss) * sc->num_slices;
4101 sc->ss = kmalloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4104 for (i = 0; i < sc->num_slices; i++) {
4109 /* allocate per-slice rx interrupt queues */
4111 bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4112 err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4115 ss->rx_done.entry = ss->rx_done.dma.addr;
4116 bzero(ss->rx_done.entry, bytes);
4119 * allocate the per-slice firmware stats; stats
4120 * (including tx) are used only on the first
4123 #ifndef IFNET_BUF_RING
4128 bytes = sizeof (*ss->fw_stats);
4129 err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4130 sizeof (*ss->fw_stats), 64);
4133 ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4134 ksnprintf(ss->tx.lock_name, sizeof(ss->tx.lock_name),
4135 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
4136 lockinit(&ss->tx.lock, ss->tx.lock_name, 0, LK_CANRECURSE);
4137 #ifdef IFNET_BUF_RING
4138 ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4146 mxge_free_slices(sc);
4151 mxge_slice_probe(mxge_softc_t *sc)
4155 int msix_cnt, status, max_intr_slots;
4159 * don't enable multiple slices if the tunable disables them (0 or 1),
4160 * or if this is not an SMP system
4163 if (mxge_max_slices == 0 || mxge_max_slices == 1 || ncpus < 2)
4166 /* see how many MSI-X interrupts are available */
4167 msix_cnt = pci_msix_count(sc->dev);
4171 /* now load the slice-aware firmware to see what it supports */
4172 old_fw = sc->fw_name;
4173 if (old_fw == mxge_fw_aligned)
4174 sc->fw_name = mxge_fw_rss_aligned;
4176 sc->fw_name = mxge_fw_rss_unaligned;
4177 status = mxge_load_firmware(sc, 0);
4179 device_printf(sc->dev, "Falling back to a single slice\n");
4183 /* try to send a reset command to the card to see if it is alive */
4185 memset(&cmd, 0, sizeof (cmd));
4186 status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4188 device_printf(sc->dev, "failed reset\n");
4192 /* get rx ring size */
4193 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4195 device_printf(sc->dev, "Cannot determine rx ring size\n");
4198 max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4200 /* tell it the size of the interrupt queues */
4201 cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4202 status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4204 device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4208 /* ask for the maximum number of slices it supports */
4209 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4211 device_printf(sc->dev,
4212 "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4215 sc->num_slices = cmd.data0;
4216 if (sc->num_slices > msix_cnt)
4217 sc->num_slices = msix_cnt;
4219 if (mxge_max_slices == -1) {
4220 /* cap to number of CPUs in system */
4221 if (sc->num_slices > ncpus)
4222 sc->num_slices = ncpus;
4224 if (sc->num_slices > mxge_max_slices)
4225 sc->num_slices = mxge_max_slices;
4227 /* make sure it is a power of two */
4228 while (sc->num_slices & (sc->num_slices - 1))
4232 device_printf(sc->dev, "using %d slices\n",
4238 sc->fw_name = old_fw;
4239 (void) mxge_load_firmware(sc, 0);
4243 mxge_add_msix_irqs(mxge_softc_t *sc)
4246 int count, err, i, rid;
4249 sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4252 if (sc->msix_table_res == NULL) {
4253 device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4257 count = sc->num_slices;
4258 err = pci_alloc_msix(sc->dev, &count);
4260 device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d"
4261 ", err = %d\n", sc->num_slices, err);
4262 goto abort_with_msix_table;
4264 if (count < sc->num_slices) {
4265 device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4266 count, sc->num_slices);
4267 device_printf(sc->dev,
4268 "Try setting hw.mxge.max_slices to %d\n",
4271 goto abort_with_msix;
4273 bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4274 sc->msix_irq_res = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4275 if (sc->msix_irq_res == NULL) {
4277 goto abort_with_msix;
4280 for (i = 0; i < sc->num_slices; i++) {
4282 sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4285 if (sc->msix_irq_res[i] == NULL) {
4286 device_printf(sc->dev, "couldn't allocate IRQ res"
4287 " for message %d\n", i);
4289 goto abort_with_res;
4293 bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4294 sc->msix_ih = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4296 for (i = 0; i < sc->num_slices; i++) {
4297 err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4299 mxge_intr, &sc->ss[i], &sc->msix_ih[i],
4300 XXX /* serializer */);
4302 device_printf(sc->dev, "couldn't setup intr for "
4304 goto abort_with_intr;
4309 device_printf(sc->dev, "using %d msix IRQs:",
4311 for (i = 0; i < sc->num_slices; i++)
4312 kprintf(" %ld", rman_get_start(sc->msix_irq_res[i]));
4318 for (i = 0; i < sc->num_slices; i++) {
4319 if (sc->msix_ih[i] != NULL) {
4320 bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4322 sc->msix_ih[i] = NULL;
4325 kfree(sc->msix_ih, M_DEVBUF);
4329 for (i = 0; i < sc->num_slices; i++) {
4331 if (sc->msix_irq_res[i] != NULL)
4332 bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4333 sc->msix_irq_res[i]);
4334 sc->msix_irq_res[i] = NULL;
4336 kfree(sc->msix_irq_res, M_DEVBUF);
4340 pci_release_msi(sc->dev);
4342 abort_with_msix_table:
4343 bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4344 sc->msix_table_res);
4350 mxge_add_single_irq(mxge_softc_t *sc)
4352 int count, err, rid;
4354 count = pci_msi_count(sc->dev);
4355 if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4361 sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0,
4362 1, RF_SHAREABLE | RF_ACTIVE);
4363 if (sc->irq_res == NULL) {
4364 device_printf(sc->dev, "could not alloc interrupt\n");
4368 device_printf(sc->dev, "using %s irq %ld\n",
4369 sc->legacy_irq ? "INTx" : "MSI",
4370 rman_get_start(sc->irq_res));
4371 err = bus_setup_intr(sc->dev, sc->irq_res,
4373 mxge_intr, &sc->ss[0], &sc->ih,
4374 XXX /* serializer */);
4376 bus_release_resource(sc->dev, SYS_RES_IRQ,
4377 sc->legacy_irq ? 0 : 1, sc->irq_res);
4378 if (!sc->legacy_irq)
4379 pci_release_msi(sc->dev);
4385 mxge_rem_msix_irqs(mxge_softc_t *sc)
4389 for (i = 0; i < sc->num_slices; i++) {
4390 if (sc->msix_ih[i] != NULL) {
4391 bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4393 sc->msix_ih[i] = NULL;
4396 kfree(sc->msix_ih, M_DEVBUF);
4398 for (i = 0; i < sc->num_slices; i++) {
4400 if (sc->msix_irq_res[i] != NULL)
4401 bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4402 sc->msix_irq_res[i]);
4403 sc->msix_irq_res[i] = NULL;
4405 kfree(sc->msix_irq_res, M_DEVBUF);
4407 bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4408 sc->msix_table_res);
4410 pci_release_msi(sc->dev);
4415 mxge_rem_single_irq(mxge_softc_t *sc)
4417 bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4418 bus_release_resource(sc->dev, SYS_RES_IRQ,
4419 sc->legacy_irq ? 0 : 1, sc->irq_res);
4420 if (!sc->legacy_irq)
4421 pci_release_msi(sc->dev);
4425 mxge_rem_irq(mxge_softc_t *sc)
4427 if (sc->num_slices > 1)
4428 mxge_rem_msix_irqs(sc);
4430 mxge_rem_single_irq(sc);
4434 mxge_add_irq(mxge_softc_t *sc)
4438 if (sc->num_slices > 1)
4439 err = mxge_add_msix_irqs(sc);
4441 err = mxge_add_single_irq(sc);
4443 if (0 && err == 0 && sc->num_slices > 1) {
4444 mxge_rem_msix_irqs(sc);
4445 err = mxge_add_msix_irqs(sc);
4452 mxge_attach(device_t dev)
4454 mxge_softc_t *sc = device_get_softc(dev);
4455 struct ifnet *ifp = &sc->arpcom.ac_if;
4459 * avoid rewriting half the lines in this file to use
4460 * &sc->arpcom.ac_if instead
4464 mxge_fetch_tunables(sc);
4466 err = bus_dma_tag_create(NULL, /* parent */
4469 BUS_SPACE_MAXADDR, /* low */
4470 BUS_SPACE_MAXADDR, /* high */
4471 NULL, NULL, /* filter */
4472 65536 + 256, /* maxsize */
4473 MXGE_MAX_SEND_DESC, /* num segs */
4474 65536, /* maxsegsize */
4476 &sc->parent_dmat); /* tag */
4479 device_printf(sc->dev, "Err %d allocating parent dmat\n",
4481 goto abort_with_nothing;
4485 if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4487 ksnprintf(sc->cmd_lock_name, sizeof(sc->cmd_lock_name), "%s:cmd",
4488 device_get_nameunit(dev));
4489 lockinit(&sc->cmd_lock, sc->cmd_lock_name, 0, LK_CANRECURSE);
4490 ksnprintf(sc->driver_lock_name, sizeof(sc->driver_lock_name),
4491 "%s:drv", device_get_nameunit(dev));
4492 lockinit(&sc->driver_lock, sc->driver_lock_name,
4495 callout_init(&sc->co_hdl);
4497 mxge_setup_cfg_space(sc);
4499 /* Map the board into the kernel */
4501 sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
4503 if (sc->mem_res == NULL) {
4504 device_printf(dev, "could not map memory\n");
4506 goto abort_with_lock;
4508 sc->sram = rman_get_virtual(sc->mem_res);
4509 sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
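	/*
	 * That is, 2MB of board SRAM less what is presumably reserved for
	 * firmware use (two 48KB blocks plus one 32KB block) and a 256
	 * byte pad: 2097152 - 131072 - 256 = 1965824 bytes of usable
	 * space.
	 */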
4510 if (sc->sram_size > rman_get_size(sc->mem_res)) {
4511 device_printf(dev, "impossible memory region size %ld\n",
4512 rman_get_size(sc->mem_res));
4514 goto abort_with_mem_res;
4517 /* make NULL terminated copy of the EEPROM strings section of lanai SRAM */
4519 bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4520 bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4521 rman_get_bushandle(sc->mem_res),
4522 sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4524 MXGE_EEPROM_STRINGS_SIZE - 2);
4525 err = mxge_parse_strings(sc);
4527 goto abort_with_mem_res;
4529 /* Enable write combining for efficient use of PCIe bus */
4532 /* Allocate the out of band dma memory */
4533 err = mxge_dma_alloc(sc, &sc->cmd_dma,
4534 sizeof (mxge_cmd_t), 64);
4536 goto abort_with_mem_res;
4537 sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4538 err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4540 goto abort_with_cmd_dma;
4542 err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4544 goto abort_with_zeropad_dma;
4546 /* select & load the firmware */
4547 err = mxge_select_firmware(sc);
4549 goto abort_with_dmabench;
4550 sc->intr_coal_delay = mxge_intr_coal_delay;
4552 mxge_slice_probe(sc);
4553 err = mxge_alloc_slices(sc);
4555 goto abort_with_dmabench;
4557 err = mxge_reset(sc, 0);
4559 goto abort_with_slices;
4561 err = mxge_alloc_rings(sc);
4563 device_printf(sc->dev, "failed to allocate rings\n");
4564 goto abort_with_dmabench;
4567 err = mxge_add_irq(sc);
4569 device_printf(sc->dev, "failed to add irq\n");
4570 goto abort_with_rings;
4573 ifp->if_baudrate = IF_Gbps(10UL);
4574 ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4577 ifp->if_capabilities |= IFCAP_LRO;
4580 #ifdef MXGE_NEW_VLAN_API
4581 ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4584 sc->max_mtu = mxge_max_mtu(sc);
4585 if (sc->max_mtu >= 9000)
4586 ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4588 device_printf(dev, "MTU limited to %d. Install "
4589 "latest firmware for 9000 byte jumbo support\n",
4590 sc->max_mtu - ETHER_HDR_LEN);
4591 ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4592 ifp->if_capenable = ifp->if_capabilities;
4593 if (sc->lro_cnt == 0)
4594 ifp->if_capenable &= ~IFCAP_LRO;
4596 ifp->if_init = mxge_init;
4598 ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4599 ifp->if_ioctl = mxge_ioctl;
4600 ifp->if_start = mxge_start;
4601 /* Initialise the ifmedia structure */
4602 ifmedia_init(&sc->media, 0, mxge_media_change,
4604 mxge_set_media(sc, IFM_ETHER | IFM_AUTO);
4605 mxge_media_probe(sc);
4607 ether_ifattach(ifp, sc->mac_addr, NULL);
4608 /* ether_ifattach sets mtu to ETHERMTU */
4609 if (mxge_initial_mtu != ETHERMTU)
4610 mxge_change_mtu(sc, mxge_initial_mtu);
4612 mxge_add_sysctls(sc);
4613 #ifdef IFNET_BUF_RING
4614 ifp->if_transmit = mxge_transmit;
4615 ifp->if_qflush = mxge_qflush;
4620 mxge_free_rings(sc);
4622 mxge_free_slices(sc);
4623 abort_with_dmabench:
4624 mxge_dma_free(&sc->dmabench_dma);
4625 abort_with_zeropad_dma:
4626 mxge_dma_free(&sc->zeropad_dma);
4628 mxge_dma_free(&sc->cmd_dma);
4630 bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4632 pci_disable_busmaster(dev);
4633 lockuninit(&sc->cmd_lock);
4634 lockuninit(&sc->driver_lock);
4635 bus_dma_tag_destroy(sc->parent_dmat);
4641 mxge_detach(device_t dev)
4643 mxge_softc_t *sc = device_get_softc(dev);
4645 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
4647 if (sc->ifp->if_flags & IFF_RUNNING)
4649 lockmgr(&sc->driver_lock, LK_RELEASE);
4650 ether_ifdetach(sc->ifp);
4651 callout_drain(&sc->co_hdl);
4652 ifmedia_removeall(&sc->media);
4653 mxge_dummy_rdma(sc, 0);
4654 mxge_rem_sysctls(sc);
4656 mxge_free_rings(sc);
4657 mxge_free_slices(sc);
4658 mxge_dma_free(&sc->dmabench_dma);
4659 mxge_dma_free(&sc->zeropad_dma);
4660 mxge_dma_free(&sc->cmd_dma);
4661 bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4662 pci_disable_busmaster(dev);
4663 lockuninit(&sc->cmd_lock);
4664 lockuninit(&sc->driver_lock);
4665 bus_dma_tag_destroy(sc->parent_dmat);
4670 mxge_shutdown(device_t dev)
4676 This file uses Myri10GE driver indentation.
4679 c-file-style:"linux"