/******************************************************************************

Copyright (c) 2006-2009, Myricom Inc.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Myricom Inc, nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/

#include <sys/cdefs.h>
/*__FBSDID("$FreeBSD: src/sys/dev/mxge/if_mxge.c,v 1.63 2009/06/26 11:45:06 rwatson Exp $");*/

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/linker.h>
#include <sys/firmware.h>
#include <sys/endian.h>
#include <sys/sockio.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/socket.h>
#include <sys/sysctl.h>

/* count xmits ourselves, rather than via drbr */
#include <net/if_arp.h>
#include <net/ethernet.h>
#include <net/if_dl.h>
#include <net/if_media.h>
#include <net/if_types.h>
#include <net/vlan/if_vlan_var.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/tcp.h>

#include <machine/resource.h>

#include <bus/pci/pcireg.h>
#include <bus/pci/pcivar.h>
#include <bus/pci/pci_private.h> /* XXX for pci_cfg_restore */

#include <vm/vm.h> /* for pmap_mapdev() */

#if defined(__i386) || defined(__amd64)
#include <machine/specialreg.h>

#include <dev/netif/mxge/mxge_mcp.h>
#include <dev/netif/mxge/mcp_gen_header.h>
/*#define MXGE_FAKE_IFP*/
#include <dev/netif/mxge/if_mxge_var.h>
#include <sys/buf_ring.h>

static int mxge_nvidia_ecrc_enable = 1;
static int mxge_force_firmware = 0;
static int mxge_intr_coal_delay = 30;
static int mxge_deassert_wait = 1;
static int mxge_flow_control = 1;
static int mxge_verbose = 0;
static int mxge_lro_cnt = 8;
static int mxge_ticks;
static int mxge_max_slices = 1;
static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
static int mxge_always_promisc = 0;
static int mxge_initial_mtu = ETHERMTU_JUMBO;
static char *mxge_fw_unaligned = "mxge_ethp_z8e";
static char *mxge_fw_aligned = "mxge_eth_z8e";
static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";

static int mxge_probe(device_t dev);
static int mxge_attach(device_t dev);
static int mxge_detach(device_t dev);
static int mxge_shutdown(device_t dev);
static void mxge_intr(void *arg);

static device_method_t mxge_methods[] =
    /* Device interface */
    DEVMETHOD(device_probe, mxge_probe),
    DEVMETHOD(device_attach, mxge_attach),
    DEVMETHOD(device_detach, mxge_detach),
    DEVMETHOD(device_shutdown, mxge_shutdown),

static driver_t mxge_driver =
    sizeof(mxge_softc_t),

static devclass_t mxge_devclass;

/* Declare ourselves to be a child of the PCI bus.*/
DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
MODULE_DEPEND(mxge, firmware, 1, 1, 1);
MODULE_DEPEND(mxge, zlib, 1, 1, 1);

static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
static int mxge_close(mxge_softc_t *sc);
static int mxge_open(mxge_softc_t *sc);
static void mxge_tick(void *arg);

mxge_probe(device_t dev)
    if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
        ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
         (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
        rev = pci_get_revid(dev);
        case MXGE_PCI_REV_Z8E:
            device_set_desc(dev, "Myri10G-PCIE-8A");
        case MXGE_PCI_REV_Z8ES:
            device_set_desc(dev, "Myri10G-PCIE-8B");
            device_set_desc(dev, "Myri10G-PCIE-8??");
            device_printf(dev, "Unrecognized rev %d NIC\n",

mxge_enable_wc(mxge_softc_t *sc)
#if defined(__i386) || defined(__amd64)
    len = rman_get_size(sc->mem_res);
    err = pmap_change_attr((vm_offset_t) sc->sram,
        len, PAT_WRITE_COMBINING);
        device_printf(sc->dev, "pmap_change_attr failed, %d\n",

/* callback to get our DMA address */
mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
    *(bus_addr_t *) arg = segs->ds_addr;

mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
    bus_size_t alignment)
    device_t dev = sc->dev;
    bus_size_t boundary, maxsegsize;

    if (bytes > 4096 && alignment == 4096) {

    /* allocate DMAable memory tags */
    err = bus_dma_tag_create(sc->parent_dmat,  /* parent */
        alignment,         /* alignment */
        boundary,          /* boundary */
        BUS_SPACE_MAXADDR, /* low */
        BUS_SPACE_MAXADDR, /* high */
        NULL, NULL,        /* filter */
        maxsegsize,        /* maxsegsize */
        BUS_DMA_COHERENT,  /* flags */
        &dma->dmat);       /* tag */
        device_printf(dev, "couldn't alloc tag (err = %d)\n", err);

    /* allocate DMAable memory & map */
    err = bus_dmamem_alloc(dma->dmat, &dma->addr,
        (BUS_DMA_WAITOK | BUS_DMA_COHERENT
         | BUS_DMA_ZERO), &dma->map);
        device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
        goto abort_with_dmat;

    /* load the memory */
    err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
        mxge_dmamap_callback,
        (void *)&dma->bus_addr, 0);
        device_printf(dev, "couldn't load map (err = %d)\n", err);

    bus_dmamem_free(dma->dmat, dma->addr, dma->map);
    (void)bus_dma_tag_destroy(dma->dmat);

mxge_dma_free(mxge_dma_t *dma)
    bus_dmamap_unload(dma->dmat, dma->map);
    bus_dmamem_free(dma->dmat, dma->addr, dma->map);
    (void)bus_dma_tag_destroy(dma->dmat);

/*
 * The eeprom strings on the lanaiX have the format
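 * (as inferred from the parser below): NUL-separated "KEY=value"
 * strings, e.g. "MAC=x:x:x:x:x:x", "PC=<product code>" and
 * "SN=<serial number>".
 */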
mxge_parse_strings(mxge_softc_t *sc)
#define MXGE_NEXT_STRING(p) while(ptr < limit && *ptr++)

    ptr = sc->eeprom_strings;
    limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
    while (ptr < limit && *ptr != '\0') {
        if (memcmp(ptr, "MAC=", 4) == 0) {
            sc->mac_addr_string = ptr;
            for (i = 0; i < 6; i++) {
                if ((ptr + 2) > limit)
                sc->mac_addr[i] = strtoul(ptr, NULL, 16);
        } else if (memcmp(ptr, "PC=", 3) == 0) {
            strncpy(sc->product_code_string, ptr,
                sizeof (sc->product_code_string) - 1);
        } else if (memcmp(ptr, "SN=", 3) == 0) {
            strncpy(sc->serial_number_string, ptr,
                sizeof (sc->serial_number_string) - 1);
        MXGE_NEXT_STRING(ptr);

    device_printf(sc->dev, "failed to parse eeprom_strings\n");

#if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
    unsigned long base, off;
    device_t pdev, mcp55;
    uint16_t vendor_id, device_id, word;
    uintptr_t bus, slot, func, ivend, idev;

    if (!mxge_nvidia_ecrc_enable)

    pdev = device_get_parent(device_get_parent(sc->dev));
        device_printf(sc->dev, "could not find parent?\n");
    vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
    device_id = pci_read_config(pdev, PCIR_DEVICE, 2);

    if (vendor_id != 0x10de)

    if (device_id == 0x005d) {
        /* ck804, base address is magic */
    } else if (device_id >= 0x0374 && device_id <= 0x378) {
        /* mcp55, base address stored in chipset */
        mcp55 = pci_find_bsf(0, 0, 0);
            0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
            0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
            word = pci_read_config(mcp55, 0x90, 2);
            base = ((unsigned long)word & 0x7ffeU) << 25;

    /*
     * Test below is commented because it is believed that doing
     * config read/write beyond 0xff will access the config space
     * for the next larger function. Uncomment this and remove
     * the hacky pmap_mapdev() way of accessing config space when
     * FreeBSD grows support for extended pcie config space access.
     */

    /* See if we can, by some miracle, access the extended
    val = pci_read_config(pdev, 0x178, 4);
    if (val != 0xffffffff) {
        pci_write_config(pdev, 0x178, val, 4);

    /* Rather than using normal pci config space writes, we must
     * map the Nvidia config space ourselves. This is because on
     * opteron/nvidia class machine the 0xe000000 mapping is
     * handled by the nvidia chipset, that means the internal PCI
     * device (the on-chip northbridge), or the amd-8131 bridge
     * and things behind them are not visible by this method.
     */
    BUS_READ_IVAR(device_get_parent(pdev), pdev,
    BUS_READ_IVAR(device_get_parent(pdev), pdev,
        PCI_IVAR_SLOT, &slot);
    BUS_READ_IVAR(device_get_parent(pdev), pdev,
        PCI_IVAR_FUNCTION, &func);
    BUS_READ_IVAR(device_get_parent(pdev), pdev,
        PCI_IVAR_VENDOR, &ivend);
    BUS_READ_IVAR(device_get_parent(pdev), pdev,
        PCI_IVAR_DEVICE, &idev);

        + 0x00100000UL * (unsigned long)bus
        + 0x00001000UL * (unsigned long)(func

    /* map it into the kernel */
    va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
        device_printf(sc->dev, "pmap_kenter_temporary didn't\n");

    /* get a pointer to the config space mapped into the kernel */
    cfgptr = va + (off & PAGE_MASK);

    /* make sure that we can really access it */
    vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
    device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
    if (! (vendor_id == ivend && device_id == idev)) {
        device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
            vendor_id, device_id);
        pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);

    ptr32 = (uint32_t*)(cfgptr + 0x178);

    if (val == 0xffffffff) {
        device_printf(sc->dev, "extended mapping failed\n");
        pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);

    pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
        device_printf(sc->dev,
            "Enabled ECRC on upstream Nvidia bridge "
            (int)bus, (int)slot, (int)func);

mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
    device_printf(sc->dev,
        "Nforce 4 chipset on non-x86/amd64!?!?!\n");

mxge_dma_test(mxge_softc_t *sc, int test_type)
    bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;

    /* Run a small DMA test.
     * The magic multipliers to the length tell the firmware
     * to do DMA read, write, or read+write tests. The
     * results are returned in cmd.data0. The upper 16
     * bits of the return is the number of transfers completed.
     * The lower 16 bits is the time in 0.5us ticks that the
     * transfers took to complete.
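     *
     * With N = transfers completed (upper 16 bits) and T = elapsed
     * time in 0.5us ticks (lower 16 bits), a test moves N * len bytes
     * in T / 2 microseconds, so bytes/us (which equals MB/s) is
     * N * len * 2 / T, exactly the expressions computed below; the
     * read+write test moves twice the data per transfer, hence its
     * extra factor of 2.
     */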
    len = sc->tx_boundary;

    cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
    cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
    cmd.data2 = len * 0x10000;
    status = mxge_send_cmd(sc, test_type, &cmd);
    sc->read_dma = ((cmd.data0>>16) * len * 2) /
        (cmd.data0 & 0xffff);
    cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
    cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
    cmd.data2 = len * 0x1;
    status = mxge_send_cmd(sc, test_type, &cmd);
    sc->write_dma = ((cmd.data0>>16) * len * 2) /
        (cmd.data0 & 0xffff);

    cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
    cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
    cmd.data2 = len * 0x10001;
    status = mxge_send_cmd(sc, test_type, &cmd);
    sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
        (cmd.data0 & 0xffff);

    if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
        device_printf(sc->dev, "DMA %s benchmark failed: %d\n",

/*
 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
 * when the PCI-E Completion packets are aligned on an 8-byte
 * boundary. Some PCI-E chip sets always align Completion packets; on
 * the ones that do not, the alignment can be enforced by enabling
 * ECRC generation (if supported).
 *
 * When PCI-E Completion packets are not aligned, it is actually more
 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
 *
 * If the driver can neither enable ECRC nor verify that it has
 * already been enabled, then it must use a firmware image which works
 * around unaligned completion packets (ethp_z8e.dat), and it should
 * also ensure that it never gives the device a Read-DMA which is
 * larger than 2KB by setting the tx_boundary to 2KB. If ECRC is
 * enabled, then the driver should use the aligned (eth_z8e.dat)
 * firmware image, and set tx_boundary to 4KB.
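 *
 * In short: aligned completions (or ECRC enabled) mean the eth_z8e
 * firmware with tx_boundary = 4096; otherwise the ethp_z8e firmware
 * with tx_boundary = 2048. mxge_firmware_probe() below determines
 * which case applies on this host.
 */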
mxge_firmware_probe(mxge_softc_t *sc)
    device_t dev = sc->dev;

    sc->tx_boundary = 4096;
    /*
     * Verify the max read request size was set to 4KB
     * before trying the test with 4KB.
     */
    if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
        pectl = pci_read_config(dev, reg + 0x8, 2);
        if ((pectl & (5 << 12)) != (5 << 12)) {
            device_printf(dev, "Max Read Req. size != 4k (0x%x\n",
            sc->tx_boundary = 2048;

    /*
     * load the optimized firmware (which assumes aligned PCIe
     * completions) in order to see if it works on this host.
     */
    sc->fw_name = mxge_fw_aligned;
    status = mxge_load_firmware(sc, 1);

    /*
     * Enable ECRC if possible
     */
    mxge_enable_nvidia_ecrc(sc);

    /*
     * Run a DMA test which watches for unaligned completions and
     * aborts on the first one seen.
     */
    status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
        return 0; /* keep the aligned firmware */

    device_printf(dev, "DMA test failed: %d\n", status);
    if (status == ENOSYS)
        device_printf(dev, "Falling back to ethp! "
            "Please install up to date fw\n");

mxge_select_firmware(mxge_softc_t *sc)
    if (mxge_force_firmware != 0) {
        if (mxge_force_firmware == 1)
        device_printf(sc->dev,
            "Assuming %s completions (forced)\n",
            aligned ? "aligned" : "unaligned");

    /* if the PCIe link width is 4 or less, we can use the aligned
       firmware and skip any checks */
    if (sc->link_width != 0 && sc->link_width <= 4) {
        device_printf(sc->dev,
            "PCIe x%d Link, expect reduced performance\n",

    if (0 == mxge_firmware_probe(sc))

    sc->fw_name = mxge_fw_aligned;
    sc->tx_boundary = 4096;
    sc->fw_name = mxge_fw_unaligned;
    sc->tx_boundary = 2048;
    return (mxge_load_firmware(sc, 0));

mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
    if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
        device_printf(sc->dev, "Bad firmware type: 0x%x\n",
            be32toh(hdr->mcp_type));

    /* save firmware version for sysctl */
    strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
        device_printf(sc->dev, "firmware id: %s\n", hdr->version);

    ksscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
        &sc->fw_ver_minor, &sc->fw_ver_tiny);

    if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
          && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
        device_printf(sc->dev, "Found firmware version %s\n",
        device_printf(sc->dev, "Driver needs %d.%d\n",
            MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);

z_alloc(void *nil, u_int items, u_int size)
    ptr = kmalloc(items * size, M_TEMP, M_NOWAIT);

z_free(void *nil, void *ptr)

mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
    const mcp_gen_header_t *hdr;

    fw = firmware_image_load(sc->fw_name, NULL);
        device_printf(sc->dev, "Could not find firmware image %s\n",

    /* setup zlib and decompress f/w */
    bzero(&zs, sizeof (zs));
    status = inflateInit(&zs);
    if (status != Z_OK) {

    /* the uncompressed size is stored as the firmware version,
       which would otherwise go unused */
    fw_len = (size_t) fw->version;
    inflate_buffer = kmalloc(fw_len, M_TEMP, M_NOWAIT);
    if (inflate_buffer == NULL)
    zs.avail_in = fw->datasize;
    zs.next_in = __DECONST(char *, fw->data);
    zs.avail_out = fw_len;
    zs.next_out = inflate_buffer;
    status = inflate(&zs, Z_FINISH);
    if (status != Z_STREAM_END) {
        device_printf(sc->dev, "zlib %d\n", status);
        goto abort_with_buffer;

    hdr_offset = htobe32(*(const uint32_t *)
        (fw->fw_image + MCP_HEADER_PTR_OFFSET));
    if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
        device_printf(sc->dev, "Bad firmware file");
    hdr = (const void*)(fw->fw_image + hdr_offset);

    status = mxge_validate_firmware(sc, hdr);

    /* Copy the inflated firmware to NIC SRAM. */
    for (i = 0; i < fw_len; i += 256) {
        mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
            min(256U, (unsigned)(fw_len - i)));

    kfree(inflate_buffer, M_TEMP);
    firmware_image_unload(fw);

/*
 * Enable or disable periodic RDMAs from the host to make certain
 * chipsets resend dropped PCIe messages
 */
mxge_dummy_rdma(mxge_softc_t *sc, int enable)
    volatile uint32_t *confirm;
    volatile char *submit;
    uint32_t *buf, dma_low, dma_high;

    buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

    /* clear confirmation addr */
    confirm = (volatile uint32_t *)sc->cmd;

    /* send an rdma command to the PCIe engine, and wait for the
       response in the confirmation address. The firmware should
       write a -1 there to indicate it is alive and well
    */
    dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
    dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
    buf[0] = htobe32(dma_high);    /* confirm addr MSW */
    buf[1] = htobe32(dma_low);     /* confirm addr LSW */
    buf[2] = htobe32(0xffffffff);  /* confirm data */
    dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
    dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
    buf[3] = htobe32(dma_high);    /* dummy addr MSW */
    buf[4] = htobe32(dma_low);     /* dummy addr LSW */
    buf[5] = htobe32(enable);      /* enable? */

    submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);

    mxge_pio_copy(submit, buf, 64);
    while (*confirm != 0xffffffff && i < 20) {
    if (*confirm != 0xffffffff) {
        device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
            (enable ? "enable" : "disable"), confirm,

mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
    char buf_bytes[sizeof(*buf) + 8];
    volatile mcp_cmd_response_t *response = sc->cmd;
    volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
    uint32_t dma_low, dma_high;
    int err, sleep_total = 0;

    /* ensure buf is aligned to 8 bytes */
    buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
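    /*
     * (round-up idiom: adding 7 and then clearing the low three bits
     * rounds the address up to the next 8-byte boundary, e.g.
     * 0x1001 -> 0x1008, while an already aligned 0x1008 is unchanged)
     */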
    buf->data0 = htobe32(data->data0);
    buf->data1 = htobe32(data->data1);
    buf->data2 = htobe32(data->data2);
    buf->cmd = htobe32(cmd);
    dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
    dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

    buf->response_addr.low = htobe32(dma_low);
    buf->response_addr.high = htobe32(dma_high);
    lockmgr(&sc->cmd_lock, LK_EXCLUSIVE);
    response->result = 0xffffffff;
    mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));

    /* wait up to 20ms */
    for (sleep_total = 0; sleep_total < 20; sleep_total++) {
        bus_dmamap_sync(sc->cmd_dma.dmat,
            sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
        switch (be32toh(response->result)) {
            data->data0 = be32toh(response->data);
        case MXGEFW_CMD_UNKNOWN:
        case MXGEFW_CMD_ERROR_UNALIGNED:
        case MXGEFW_CMD_ERROR_BUSY:
            device_printf(sc->dev,
                "failed, result = %d\n",
                cmd, be32toh(response->result));

    device_printf(sc->dev, "mxge: command %d timed out"
        cmd, be32toh(response->result));
    lockmgr(&sc->cmd_lock, LK_RELEASE);

mxge_adopt_running_firmware(mxge_softc_t *sc)
    struct mcp_gen_header *hdr;
    const size_t bytes = sizeof (struct mcp_gen_header);

    /* find running firmware header */
    hdr_offset = htobe32(*(volatile uint32_t *)
        (sc->sram + MCP_HEADER_PTR_OFFSET));

    if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
        device_printf(sc->dev,
            "Running firmware has bad header offset (%d)\n",

    /* copy header of running firmware from SRAM to host memory to
     * validate firmware */
    hdr = kmalloc(bytes, M_DEVBUF, M_NOWAIT);
        device_printf(sc->dev, "could not kmalloc firmware hdr\n");
    bus_space_read_region_1(rman_get_bustag(sc->mem_res),
        rman_get_bushandle(sc->mem_res),
        hdr_offset, (char *)hdr, bytes);
    status = mxge_validate_firmware(sc, hdr);
    kfree(hdr, M_DEVBUF);

    /*
     * check to see if adopted firmware has bug where adopting
     * it will cause broadcasts to be filtered unless the NIC
     * is kept in ALLMULTI mode
     */
    if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
        sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
        sc->adopted_rx_filter_bug = 1;
        device_printf(sc->dev, "Adopting fw %d.%d.%d: "
            "working around rx filter bug\n",
            sc->fw_ver_major, sc->fw_ver_minor,

mxge_load_firmware(mxge_softc_t *sc, int adopt)
    volatile uint32_t *confirm;
    volatile char *submit;
    uint32_t *buf, size, dma_low, dma_high;

    buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

    size = sc->sram_size;
    status = mxge_load_firmware_helper(sc, &size);
        /* Try to use the currently running firmware, if
        status = mxge_adopt_running_firmware(sc);
            device_printf(sc->dev,
                "failed to adopt running firmware\n");
        device_printf(sc->dev,
            "Successfully adopted running firmware\n");
        if (sc->tx_boundary == 4096) {
            device_printf(sc->dev,
                "Using firmware currently running on NIC"
            device_printf(sc->dev,
                "performance consider loading optimized "

        sc->fw_name = mxge_fw_unaligned;
        sc->tx_boundary = 2048;

    /* clear confirmation addr */
    confirm = (volatile uint32_t *)sc->cmd;

    /* send a reload command to the bootstrap MCP, and wait for the
       response in the confirmation address. The firmware should
       write a -1 there to indicate it is alive and well
    */
    dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
    dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

    buf[0] = htobe32(dma_high);    /* confirm addr MSW */
    buf[1] = htobe32(dma_low);     /* confirm addr LSW */
    buf[2] = htobe32(0xffffffff);  /* confirm data */

    /* FIX: All newest firmware should un-protect the bottom of
       the sram before handoff. However, the very first interfaces
       do not. Therefore the handoff copy must skip the first 8 bytes
    */
    /* where the code starts*/
    buf[3] = htobe32(MXGE_FW_OFFSET + 8);
    buf[4] = htobe32(size - 8);    /* length of code */
    buf[5] = htobe32(8);           /* where to copy to */
    buf[6] = htobe32(0);           /* where to jump to */

    submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
    mxge_pio_copy(submit, buf, 64);
    while (*confirm != 0xffffffff && i < 20) {
        bus_dmamap_sync(sc->cmd_dma.dmat,
            sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
    if (*confirm != 0xffffffff) {
        device_printf(sc->dev,"handoff failed (%p = 0x%x)",

mxge_update_mac_address(mxge_softc_t *sc)
    uint8_t *addr = sc->mac_addr;

    cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
        | (addr[2] << 8) | addr[3]);
    cmd.data1 = ((addr[4] << 8) | (addr[5]));
    status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);

mxge_change_pause(mxge_softc_t *sc, int pause)
        status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
        status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
        device_printf(sc->dev, "Failed to set flow control mode\n");

mxge_change_promisc(mxge_softc_t *sc, int promisc)
    if (mxge_always_promisc)
        status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
        status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
        device_printf(sc->dev, "Failed to set promisc mode\n");

mxge_set_multicast_list(mxge_softc_t *sc)
    struct ifmultiaddr *ifma;
    struct ifnet *ifp = sc->ifp;

    /* This firmware is known to not support multicast */
    if (!sc->fw_multicast_support)

    /* Disable multicast filtering while we play with the lists*/
    err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
        device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
            " error status: %d\n", err);

    if (sc->adopted_rx_filter_bug)

    if (ifp->if_flags & IFF_ALLMULTI)
        /* request to disable multicast filtering, so quit here */

    /* Flush all the filters */
    err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
        device_printf(sc->dev,
            "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
            ", error status: %d\n", err);

    /* Walk the multicast list, and add each address */
    if_maddr_rlock(ifp);
    TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
        if (ifma->ifma_addr->sa_family != AF_LINK)
        bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
        bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
        cmd.data0 = htonl(cmd.data0);
        cmd.data1 = htonl(cmd.data1);
        err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
            device_printf(sc->dev, "Failed "
                "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
            /* abort, leaving multicast filtering off */
            if_maddr_runlock(ifp);
    if_maddr_runlock(ifp);

    /* Enable multicast filtering */
    err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
        device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
            ", error status: %d\n", err);

mxge_max_mtu(mxge_softc_t *sc)
    if (MJUMPAGESIZE - MXGEFW_PAD > MXGEFW_MAX_MTU)
        return MXGEFW_MAX_MTU - MXGEFW_PAD;

    /* try to set nbufs to see if we can
       use virtually contiguous jumbos */
    status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
        return MXGEFW_MAX_MTU - MXGEFW_PAD;

    /* otherwise, we're limited to MJUMPAGESIZE */
    return MJUMPAGESIZE - MXGEFW_PAD;

mxge_reset(mxge_softc_t *sc, int interrupts_setup)
    struct mxge_slice_state *ss;
    mxge_rx_done_t *rx_done;
    volatile uint32_t *irq_claim;

    /* try to send a reset command to the card to see if it
    memset(&cmd, 0, sizeof (cmd));
    status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
        device_printf(sc->dev, "failed reset\n");

    mxge_dummy_rdma(sc, 1);

    /* set the intrq size */
    cmd.data0 = sc->rx_ring_size;
    status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);

    /*
     * Even though we already know how many slices are supported
     * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
     * has magic side effects, and must be called after a reset.
     * It must be called prior to calling any RSS related cmds,
     * including assigning an interrupt queue for anything but
     * slice 0. It must also be called *after*
     * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
     * the firmware to compute offsets.
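     *
     * The resulting command order on reset is therefore:
     * MXGEFW_CMD_RESET, then MXGEFW_CMD_SET_INTRQ_SIZE, then
     * MXGEFW_CMD_GET_MAX_RSS_QUEUES / MXGEFW_CMD_ENABLE_RSS_QUEUES,
     * and only then the per-slice MXGEFW_CMD_SET_INTRQ_DMA commands.
     */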
    if (sc->num_slices > 1) {
        /* ask the maximum number of slices it supports */
        status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
            device_printf(sc->dev,
                "failed to get number of slices\n");

        /*
         * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
         * to setting up the interrupt queue DMA
         */
        cmd.data0 = sc->num_slices;
        cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
#ifdef IFNET_BUF_RING
        cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
        status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
            device_printf(sc->dev,
                "failed to set number of slices\n");

    if (interrupts_setup) {
        /* Now exchange information about interrupts */
        for (slice = 0; slice < sc->num_slices; slice++) {
            rx_done = &sc->ss[slice].rx_done;
            memset(rx_done->entry, 0, sc->rx_ring_size);
            cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
            cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
            status |= mxge_send_cmd(sc,
                MXGEFW_CMD_SET_INTRQ_DMA,

    status |= mxge_send_cmd(sc,
        MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
    sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);

    status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
    irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);

    status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
    sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
        device_printf(sc->dev, "failed set interrupt parameters\n");

    *sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);

    /* run a DMA benchmark */
    (void) mxge_dma_test(sc, MXGEFW_DMA_TEST);

    for (slice = 0; slice < sc->num_slices; slice++) {
        ss = &sc->ss[slice];
        ss->irq_claim = irq_claim + (2 * slice);

        /* reset mcp/driver shared state back to 0 */
        ss->rx_done.idx = 0;
        ss->rx_done.cnt = 0;
        ss->tx.pkt_done = 0;
        ss->tx.queue_active = 0;
        ss->tx.activate = 0;
        ss->tx.deactivate = 0;
        ss->rx_small.cnt = 0;
        ss->lro_bad_csum = 0;
        ss->lro_flushed = 0;
        if (ss->fw_stats != NULL) {
            ss->fw_stats->valid = 0;
            ss->fw_stats->send_done_count = 0;

    sc->rdma_tags_available = 15;
    status = mxge_update_mac_address(sc);
    mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
    mxge_change_pause(sc, sc->pause);
    mxge_set_multicast_list(sc);

mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
    unsigned int intr_coal_delay;

    intr_coal_delay = sc->intr_coal_delay;
    err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
    if (intr_coal_delay == sc->intr_coal_delay)
    if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)

    lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
    *sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
    sc->intr_coal_delay = intr_coal_delay;
    lockmgr(&sc->driver_lock, LK_RELEASE);

mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
    unsigned int enabled;

    enabled = sc->pause;
    err = sysctl_handle_int(oidp, &enabled, arg2, req);
    if (enabled == sc->pause)

    lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
    err = mxge_change_pause(sc, enabled);
    lockmgr(&sc->driver_lock, LK_RELEASE);

mxge_change_lro_locked(mxge_softc_t *sc, int lro_cnt)
        ifp->if_capenable &= ~IFCAP_LRO;
        ifp->if_capenable |= IFCAP_LRO;
    sc->lro_cnt = lro_cnt;
    if (ifp->if_flags & IFF_RUNNING) {
        err = mxge_open(sc);

mxge_change_lro(SYSCTL_HANDLER_ARGS)
    unsigned int lro_cnt;

    lro_cnt = sc->lro_cnt;
    err = sysctl_handle_int(oidp, &lro_cnt, arg2, req);
    if (lro_cnt == sc->lro_cnt)

    lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
    err = mxge_change_lro_locked(sc, lro_cnt);
    lockmgr(&sc->driver_lock, LK_RELEASE);

mxge_handle_be32(SYSCTL_HANDLER_ARGS)
    arg2 = be32toh(*(int *)arg1);
    err = sysctl_handle_int(oidp, arg1, arg2, req);

mxge_rem_sysctls(mxge_softc_t *sc)
    struct mxge_slice_state *ss;

    if (sc->slice_sysctl_tree == NULL)

    for (slice = 0; slice < sc->num_slices; slice++) {
        ss = &sc->ss[slice];
        if (ss == NULL || ss->sysctl_tree == NULL)
        sysctl_ctx_free(&ss->sysctl_ctx);
        ss->sysctl_tree = NULL;
    sysctl_ctx_free(&sc->slice_sysctl_ctx);
    sc->slice_sysctl_tree = NULL;

mxge_add_sysctls(mxge_softc_t *sc)
    struct sysctl_ctx_list *ctx;
    struct sysctl_oid_list *children;
    struct mxge_slice_state *ss;

    ctx = &sc->sysctl_ctx;
    sysctl_ctx_init(ctx);
    sc->sysctl_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_STATIC_CHILDREN(_hw),
        device_get_nameunit(sc->dev),
    if (sc->sysctl_tree == NULL) {
        device_printf(sc->dev, "can't add sysctl node\n");

    children = SYSCTL_CHILDREN(sc->sysctl_tree);
    fw = sc->ss[0].fw_stats;

    /* random information */
    SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
        CTLFLAG_RD, &sc->fw_version,
        0, "firmware version");
    SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
        CTLFLAG_RD, &sc->serial_number_string,
        0, "serial number");
    SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
        CTLFLAG_RD, &sc->product_code_string,
    SYSCTL_ADD_INT(ctx, children, OID_AUTO,
        CTLFLAG_RD, &sc->link_width,
    SYSCTL_ADD_INT(ctx, children, OID_AUTO,
        CTLFLAG_RD, &sc->tx_boundary,
    SYSCTL_ADD_INT(ctx, children, OID_AUTO,
        CTLFLAG_RD, &sc->wc,
        0, "write combining PIO?");
    SYSCTL_ADD_INT(ctx, children, OID_AUTO,
        CTLFLAG_RD, &sc->read_dma,
        0, "DMA Read speed in MB/s");
    SYSCTL_ADD_INT(ctx, children, OID_AUTO,
        CTLFLAG_RD, &sc->write_dma,
        0, "DMA Write speed in MB/s");
    SYSCTL_ADD_INT(ctx, children, OID_AUTO,
        "read_write_dma_MBs",
        CTLFLAG_RD, &sc->read_write_dma,
        0, "DMA concurrent Read/Write speed in MB/s");

    /* performance related tunables */
    SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
        CTLTYPE_INT|CTLFLAG_RW, sc,
        0, mxge_change_intr_coal,
        "I", "interrupt coalescing delay in usecs");

    SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
        "flow_control_enabled",
        CTLTYPE_INT|CTLFLAG_RW, sc,
        0, mxge_change_flow_control,
        "I", "enable flow control");

    SYSCTL_ADD_INT(ctx, children, OID_AUTO,
        CTLFLAG_RW, &mxge_deassert_wait,
        0, "Wait for IRQ line to go low in ihandler");

    /* stats block from firmware is in network byte order.
    SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
        CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
        0, mxge_handle_be32,
    SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
        "rdma_tags_available",
        CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
        0, mxge_handle_be32,
        "I", "rdma_tags_available");
    SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
        "dropped_bad_crc32",
        CTLTYPE_INT|CTLFLAG_RD,
        &fw->dropped_bad_crc32,
        0, mxge_handle_be32,
        "I", "dropped_bad_crc32");
    SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
        CTLTYPE_INT|CTLFLAG_RD,
        &fw->dropped_bad_phy,
        0, mxge_handle_be32,
        "I", "dropped_bad_phy");
    SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
        "dropped_link_error_or_filtered",
        CTLTYPE_INT|CTLFLAG_RD,
        &fw->dropped_link_error_or_filtered,
        0, mxge_handle_be32,
        "I", "dropped_link_error_or_filtered");
    SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
        "dropped_link_overflow",
        CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
        0, mxge_handle_be32,
        "I", "dropped_link_overflow");
    SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
        "dropped_multicast_filtered",
        CTLTYPE_INT|CTLFLAG_RD,
        &fw->dropped_multicast_filtered,
        0, mxge_handle_be32,
        "I", "dropped_multicast_filtered");
    SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
        "dropped_no_big_buffer",
        CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
        0, mxge_handle_be32,
        "I", "dropped_no_big_buffer");
    SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
        "dropped_no_small_buffer",
        CTLTYPE_INT|CTLFLAG_RD,
        &fw->dropped_no_small_buffer,
        0, mxge_handle_be32,
        "I", "dropped_no_small_buffer");
    SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
        CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
        0, mxge_handle_be32,
        "I", "dropped_overrun");
    SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
        CTLTYPE_INT|CTLFLAG_RD,
        0, mxge_handle_be32,
        "I", "dropped_pause");
    SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
        CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
        0, mxge_handle_be32,
        "I", "dropped_runt");
    SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
        "dropped_unicast_filtered",
        CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
        0, mxge_handle_be32,
        "I", "dropped_unicast_filtered");

    /* verbose printing? */
    SYSCTL_ADD_INT(ctx, children, OID_AUTO,
        CTLFLAG_RW, &mxge_verbose,
        0, "verbose printing");

    SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
        CTLTYPE_INT|CTLFLAG_RW, sc,
        "I", "number of lro merge queues");

    /* add counters exported for debugging from all slices */
    sysctl_ctx_init(&sc->slice_sysctl_ctx);
    sc->slice_sysctl_tree =
        SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
            "slice", CTLFLAG_RD, 0, "");

    for (slice = 0; slice < sc->num_slices; slice++) {
        ss = &sc->ss[slice];
        sysctl_ctx_init(&ss->sysctl_ctx);
        ctx = &ss->sysctl_ctx;
        children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
        ksprintf(slice_num, "%d", slice);
        SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
        children = SYSCTL_CHILDREN(ss->sysctl_tree);
        SYSCTL_ADD_INT(ctx, children, OID_AUTO,
            CTLFLAG_RD, &ss->rx_small.cnt,
        SYSCTL_ADD_INT(ctx, children, OID_AUTO,
            CTLFLAG_RD, &ss->rx_big.cnt,
        SYSCTL_ADD_INT(ctx, children, OID_AUTO,
            "lro_flushed", CTLFLAG_RD, &ss->lro_flushed,
            0, "number of lro merge queues flushed");
        SYSCTL_ADD_INT(ctx, children, OID_AUTO,
            "lro_queued", CTLFLAG_RD, &ss->lro_queued,
            0, "number of frames appended to lro merge"
#ifndef IFNET_BUF_RING
        /* only transmit from slice 0 for now */
        SYSCTL_ADD_INT(ctx, children, OID_AUTO,
            CTLFLAG_RD, &ss->tx.req,
        SYSCTL_ADD_INT(ctx, children, OID_AUTO,
            CTLFLAG_RD, &ss->tx.done,
        SYSCTL_ADD_INT(ctx, children, OID_AUTO,
            CTLFLAG_RD, &ss->tx.pkt_done,
        SYSCTL_ADD_INT(ctx, children, OID_AUTO,
            CTLFLAG_RD, &ss->tx.stall,
        SYSCTL_ADD_INT(ctx, children, OID_AUTO,
            CTLFLAG_RD, &ss->tx.wake,
        SYSCTL_ADD_INT(ctx, children, OID_AUTO,
            CTLFLAG_RD, &ss->tx.defrag,
        SYSCTL_ADD_INT(ctx, children, OID_AUTO,
            CTLFLAG_RD, &ss->tx.queue_active,
            0, "tx_queue_active");
        SYSCTL_ADD_INT(ctx, children, OID_AUTO,
            CTLFLAG_RD, &ss->tx.activate,
        SYSCTL_ADD_INT(ctx, children, OID_AUTO,
            CTLFLAG_RD, &ss->tx.deactivate,
            0, "tx_deactivate");

/* copy an array of mcp_kreq_ether_send_t's to the mcp. Copy
   backwards one at a time and handle ring wraps */
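/*
 * Copying from the tail toward the head means the first descriptor
 * (whose flags mark the whole chain valid) is deliberately left for
 * mxge_submit_req() to write last, so the NIC never starts on a
 * partially written chain.
 */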
mxge_submit_req_backwards(mxge_tx_ring_t *tx,
    mcp_kreq_ether_send_t *src, int cnt)
    int idx, starting_slot;

    starting_slot = tx->req;
        idx = (starting_slot + cnt) & tx->mask;
        mxge_pio_copy(&tx->lanai[idx],
            &src[cnt], sizeof(*src));

/*
 * copy an array of mcp_kreq_ether_send_t's to the mcp. Copy
 * at most 32 bytes at a time, so as to avoid involving the software
 * pio handler in the nic. We re-write the first segment's flags
 * to mark them valid only after writing the entire chain
 */
mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
    volatile uint32_t *dst_ints;
    mcp_kreq_ether_send_t *srcp;
    volatile mcp_kreq_ether_send_t *dstp, *dst;

    idx = tx->req & tx->mask;
    last_flags = src->flags;
    dst = dstp = &tx->lanai[idx];
    if ((idx + cnt) < tx->mask) {
        for (i = 0; i < (cnt - 1); i += 2) {
            mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
            wmb(); /* force write every 32 bytes */
        /* submit all but the first request, and ensure
           that it is submitted below */
        mxge_submit_req_backwards(tx, src, cnt);

    /* submit the first request */
    mxge_pio_copy(dstp, srcp, sizeof(*src));
    wmb(); /* barrier before setting valid flag */

    /* re-write the last 32-bits with the valid flags */
    src->flags = last_flags;
    src_ints = (uint32_t *)src;
    dst_ints = (volatile uint32_t *)dst;
    *dst_ints = *src_ints;

mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
    int busdma_seg_cnt, int ip_off)
    mcp_kreq_ether_send_t *req;
    bus_dma_segment_t *seg;
    uint32_t low, high_swapped;
    int len, seglen, cum_len, cum_len_next;
    int next_is_first, chop, cnt, rdma_count, small;
    uint16_t pseudo_hdr_offset, cksum_offset, mss;
    uint8_t flags, flags_next;

    mss = m->m_pkthdr.tso_segsz;

    /* negative cum_len signifies to the
     * send loop that we are still in the
     * header portion of the TSO packet.
     */

    /* ensure we have the ethernet, IP and TCP
       header together in the first mbuf, copy
       it to a scratch buffer if not */
    if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
        m_copydata(m, 0, ip_off + sizeof (*ip),
        ip = (struct ip *)(ss->scratch + ip_off);
        ip = (struct ip *)(mtod(m, char *) + ip_off);
    if (__predict_false(m->m_len < ip_off + (ip->ip_hl << 2)
        m_copydata(m, 0, ip_off + (ip->ip_hl << 2)
            + sizeof (*tcp), ss->scratch);
        ip = (struct ip *)(mtod(m, char *) + ip_off);
    tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
    cum_len = -(ip_off + ((ip->ip_hl + tcp->th_off) << 2));
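    /*
     * (e.g. a plain Ethernet frame (ip_off = 14) with minimal 20-byte
     * IP and TCP headers (ip_hl = th_off = 5) starts at cum_len = -54;
     * cum_len crosses zero exactly where the TSO payload begins)
     */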
    /* TSO implies checksum offload on this hardware */
    cksum_offset = ip_off + (ip->ip_hl << 2);
    flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;

    /* for TSO, pseudo_hdr_offset holds mss.
     * The firmware figures out where to put
     * the checksum by parsing the header. */
    pseudo_hdr_offset = htobe16(mss);

    /* "rdma_count" is the number of RDMAs belonging to the
     * current packet BEFORE the current send request. For
     * non-TSO packets, this is equal to "count".
     * For TSO packets, rdma_count needs to be reset
     * to 0 after a segment cut.
     *
     * The rdma_count field of the send request is
     * the number of RDMAs of the packet starting at
     * that request. For TSO send requests with one or more cuts
     * in the middle, this is the number of RDMAs starting
     * after the last cut in the request. All previous
     * segments before the last cut implicitly have 1 RDMA.
     *
     * Since the number of RDMAs is not known beforehand,
     * it must be filled-in retroactively - after each
     * segmentation cut or at the end of the entire packet.
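     *
     * Worked example (a sketch): when a payload segment straddles one
     * MSS boundary, the request at the cut gets MXGEFW_FLAGS_TSO_CHOP
     * and rdma_count restarts at 0; the (req - rdma_count)->rdma_count
     * back-patches in the loop below then fill in the RDMA count for
     * the run that just ended.
     */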
    while (busdma_seg_cnt) {
        /* Break the busdma segment up into pieces*/
        low = MXGE_LOWPART_TO_U32(seg->ds_addr);
        high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
            flags_next = flags & ~MXGEFW_FLAGS_FIRST;
            cum_len_next = cum_len + seglen;
            (req-rdma_count)->rdma_count = rdma_count + 1;
            if (__predict_true(cum_len >= 0)) {
                chop = (cum_len_next > mss);
                cum_len_next = cum_len_next % mss;
                next_is_first = (cum_len_next == 0);
                flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
                flags_next |= next_is_first *
                rdma_count |= -(chop | next_is_first);
                rdma_count += chop & !next_is_first;
            } else if (cum_len_next >= 0) {
                small = (mss <= MXGEFW_SEND_SMALL_SIZE);
                flags_next = MXGEFW_FLAGS_TSO_PLD |
                    MXGEFW_FLAGS_FIRST |
                    (small * MXGEFW_FLAGS_SMALL);
            req->addr_high = high_swapped;
            req->addr_low = htobe32(low);
            req->pseudo_hdr_offset = pseudo_hdr_offset;
            req->rdma_count = 1;
            req->length = htobe16(seglen);
            req->cksum_offset = cksum_offset;
            req->flags = flags | ((cum_len & 1) *
                MXGEFW_FLAGS_ALIGN_ODD);
            cum_len = cum_len_next;
            if (__predict_false(cksum_offset > seglen))
                cksum_offset -= seglen;
        if (__predict_false(cnt > tx->max_desc))

    (req-rdma_count)->rdma_count = rdma_count;
        req->flags |= MXGEFW_FLAGS_TSO_LAST;
    } while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));

    tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
    mxge_submit_req(tx, tx->req_list, cnt);
#ifdef IFNET_BUF_RING
    if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
        /* tell the NIC to start polling this slice */
        tx->queue_active = 1;

    bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
    kprintf("tx->max_desc exceeded via TSO!\n");
    kprintf("mss = %d, %ld, %d!\n", mss,
        (long)seg - (long)tx->seg_list, tx->max_desc);

#endif /* IFCAP_TSO4 */

#ifdef MXGE_NEW_VLAN_API
/*
 * We reproduce the software vlan tag insertion from
 * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
 * vlan tag insertion. We need to advertise this in order to have the
 * vlan interface respect our csum offload flags.
 */
static struct mbuf *
mxge_vlan_tag_insert(struct mbuf *m)
    struct ether_vlan_header *evl;

    M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, MB_DONTWAIT);
    if (__predict_false(m == NULL))
    if (m->m_len < sizeof(*evl)) {
        m = m_pullup(m, sizeof(*evl));
        if (__predict_false(m == NULL))

    /*
     * Transform the Ethernet header into an Ethernet header
     * with 802.1Q encapsulation.
     */
    evl = mtod(m, struct ether_vlan_header *);
    bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
        (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
    evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
    evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
    m->m_flags &= ~M_VLANTAG;
#endif /* MXGE_NEW_VLAN_API */

mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
    mcp_kreq_ether_send_t *req;
    bus_dma_segment_t *seg;
    int cnt, cum_len, err, i, idx, odd_flag, ip_off;
    uint16_t pseudo_hdr_offset;
    uint8_t flags, cksum_offset;

    ip_off = sizeof (struct ether_header);
#ifdef MXGE_NEW_VLAN_API
    if (m->m_flags & M_VLANTAG) {
        m = mxge_vlan_tag_insert(m);
        if (__predict_false(m == NULL))
        ip_off += ETHER_VLAN_ENCAP_LEN;

    /* (try to) map the frame for DMA */
    idx = tx->req & tx->mask;
    err = bus_dmamap_load_mbuf_segment(tx->dmat, tx->info[idx].map,
        m, tx->seg_list, 1, &cnt,
    if (__predict_false(err == EFBIG)) {
        /* Too many segments in the chain. Try
        m_tmp = m_defrag(m, M_NOWAIT);
        if (m_tmp == NULL) {
        err = bus_dmamap_load_mbuf_segment(tx->dmat,
            m, tx->seg_list, 1, &cnt,
    if (__predict_false(err != 0)) {
        device_printf(sc->dev, "bus_dmamap_load_mbuf_segment returned %d"
            " packet len = %d\n", err, m->m_pkthdr.len);
    bus_dmamap_sync(tx->dmat, tx->info[idx].map,
        BUS_DMASYNC_PREWRITE);
    tx->info[idx].m = m;

    /* TSO is different enough, we handle it in another routine */
    if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
        mxge_encap_tso(ss, m, cnt, ip_off);

    pseudo_hdr_offset = 0;
    flags = MXGEFW_FLAGS_NO_TSO;

    /* checksum offloading? */
    if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
        /* ensure ip header is in first mbuf, copy
           it to a scratch buffer if not */
        if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
            m_copydata(m, 0, ip_off + sizeof (*ip),
            ip = (struct ip *)(ss->scratch + ip_off);
            ip = (struct ip *)(mtod(m, char *) + ip_off);
        cksum_offset = ip_off + (ip->ip_hl << 2);
        pseudo_hdr_offset = cksum_offset + m->m_pkthdr.csum_data;
        pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
        req->cksum_offset = cksum_offset;
        flags |= MXGEFW_FLAGS_CKSUM;
        odd_flag = MXGEFW_FLAGS_ALIGN_ODD;

    if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
        flags |= MXGEFW_FLAGS_SMALL;

    /* convert segments into a request list */
    req->flags = MXGEFW_FLAGS_FIRST;
    for (i = 0; i < cnt; i++) {
            htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
            htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
        req->length = htobe16(seg->ds_len);
        req->cksum_offset = cksum_offset;
        if (cksum_offset > seg->ds_len)
            cksum_offset -= seg->ds_len;
        req->pseudo_hdr_offset = pseudo_hdr_offset;
        req->pad = 0; /* complete solid 16-byte block */
        req->rdma_count = 1;
        req->flags |= flags | ((cum_len & 1) * odd_flag);
        cum_len += seg->ds_len;

    /* pad runts to 60 bytes */
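    /*
     * (60 is the minimum Ethernet frame length less the 4-byte FCS,
     * which the hardware appends; the extra descriptor below points
     * at the shared zeropad_dma buffer to supply the zero padding)
     */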
        htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
        htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
        req->length = htobe16(60 - cum_len);
        req->cksum_offset = 0;
        req->pseudo_hdr_offset = pseudo_hdr_offset;
        req->pad = 0; /* complete solid 16-byte block */
        req->rdma_count = 1;
        req->flags |= flags | ((cum_len & 1) * odd_flag);

    tx->req_list[0].rdma_count = cnt;

    /* print what the firmware will see */
    for (i = 0; i < cnt; i++) {
        kprintf("%d: addr: 0x%x 0x%x len:%d pso%d,"
            "cso:%d, flags:0x%x, rdma:%d\n",
            i, (int)ntohl(tx->req_list[i].addr_high),
            (int)ntohl(tx->req_list[i].addr_low),
            (int)ntohs(tx->req_list[i].length),
            (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
            tx->req_list[i].cksum_offset, tx->req_list[i].flags,
            tx->req_list[i].rdma_count);
    kprintf("--------------\n");

    tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
    mxge_submit_req(tx, tx->req_list, cnt);
#ifdef IFNET_BUF_RING
    if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
        /* tell the NIC to start polling this slice */
        tx->queue_active = 1;

#ifdef IFNET_BUF_RING
mxge_qflush(struct ifnet *ifp)
    mxge_softc_t *sc = ifp->if_softc;

    for (slice = 0; slice < sc->num_slices; slice++) {
        tx = &sc->ss[slice].tx;
        lockmgr(&tx->lock, LK_EXCLUSIVE);
        while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
        lockmgr(&tx->lock, LK_RELEASE);

mxge_start_locked(struct mxge_slice_state *ss)
    while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
        m = drbr_dequeue(ifp, tx->br);
        /* let BPF see it */
        /* give it to the nic */
    /* ran out of transmit slots */
    if (((ss->if_flags & IFF_OACTIVE) == 0)
        && (!drbr_empty(ifp, tx->br))) {
        ss->if_flags |= IFF_OACTIVE;

mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
    if ((ss->if_flags & (IFF_RUNNING|IFF_OACTIVE)) !=
        err = drbr_enqueue(ifp, tx->br, m);

    if (drbr_empty(ifp, tx->br) &&
        ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
        /* let BPF see it */
        /* give it to the nic */
    } else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
    if (!drbr_empty(ifp, tx->br))
        mxge_start_locked(ss);

mxge_transmit(struct ifnet *ifp, struct mbuf *m)
    mxge_softc_t *sc = ifp->if_softc;
    struct mxge_slice_state *ss;

    slice = m->m_pkthdr.flowid;
    slice &= (sc->num_slices - 1);  /* num_slices always power of 2 */

    ss = &sc->ss[slice];
    if (lockmgr(&tx->lock, LK_EXCLUSIVE|LK_NOWAIT)) {
        err = mxge_transmit_locked(ss, m);
        lockmgr(&tx->lock, LK_RELEASE);
        err = drbr_enqueue(ifp, tx->br, m);

mxge_start_locked(struct mxge_slice_state *ss)
    while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
        IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
        /* let BPF see it */
        /* give it to the nic */
    /* ran out of transmit slots */
    if ((sc->ifp->if_flags & IFF_OACTIVE) == 0) {
        sc->ifp->if_flags |= IFF_OACTIVE;

mxge_start(struct ifnet *ifp)
    mxge_softc_t *sc = ifp->if_softc;
    struct mxge_slice_state *ss;

    /* only use the first slice for now */
    lockmgr(&ss->tx.lock, LK_EXCLUSIVE);
    mxge_start_locked(ss);
    lockmgr(&ss->tx.lock, LK_RELEASE);

/*
 * copy an array of mcp_kreq_ether_recv_t's to the mcp. Copy
 * at most 32 bytes at a time, so as to avoid involving the software
 * pio handler in the nic. We re-write the first segment's low
 * DMA address to mark it valid only after we write the entire chunk
 */
mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
    mcp_kreq_ether_recv_t *src)
    low = src->addr_low;
    src->addr_low = 0xffffffff;
    mxge_pio_copy(dst, src, 4 * sizeof (*src));
    mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
    src->addr_low = low;
    dst->addr_low = low;

mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
    bus_dma_segment_t seg;
    mxge_rx_ring_t *rx = &ss->rx_small;

    m = m_gethdr(MB_DONTWAIT, MT_DATA);
    err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
        &seg, 1, &cnt, BUS_DMA_NOWAIT);
    rx->info[idx].m = m;
    rx->shadow[idx].addr_low =
        htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
    rx->shadow[idx].addr_high =
        htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
        mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);

mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
    bus_dma_segment_t seg[3];
    mxge_rx_ring_t *rx = &ss->rx_big;

    if (rx->cl_size == MCLBYTES)
        m = m_getcl(MB_DONTWAIT, MT_DATA, M_PKTHDR);
        m = m_getjcl(MB_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
    m->m_len = rx->mlen;
    err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
        seg, 1, &cnt, BUS_DMA_NOWAIT);
    rx->info[idx].m = m;
    rx->shadow[idx].addr_low =
        htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
    rx->shadow[idx].addr_high =
        htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));

#if MXGE_VIRT_JUMBOS
    for (i = 1; i < cnt; i++) {
        rx->shadow[idx + i].addr_low =
            htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
        rx->shadow[idx + i].addr_high =
            htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));

    for (i = 0; i < rx->nbufs; i++) {
        if ((idx & 7) == 7) {
            mxge_submit_8rx(&rx->lanai[idx - 7],
                &rx->shadow[idx - 7]);

/*
 * Myri10GE hardware checksums are not valid if the sender
 * padded the frame with non-zero padding. This is because
 * the firmware just does a simple 16-bit 1s complement
 * checksum across the entire frame, excluding the first 14
 * bytes. It is best to simply check the checksum and
 * tell the stack about it only if the checksum is good
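 * (callers treat a return of 0 from mxge_rx_csum() as "checksum good";
 * folding the firmware's raw sum with the IPv4 pseudo-header via
 * in_pseudo() below yields 0 for a valid TCP or UDP checksum).
 */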
static inline uint16_t
mxge_rx_csum(struct mbuf *m, int csum)
    struct ether_header *eh;

    eh = mtod(m, struct ether_header *);

    /* only deal with IPv4 TCP & UDP for now */
    if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
    ip = (struct ip *)(eh + 1);
    if (__predict_false(ip->ip_p != IPPROTO_TCP &&
                        ip->ip_p != IPPROTO_UDP))

    c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
        htonl(ntohs(csum) + ntohs(ip->ip_len) +
              - (ip->ip_hl << 2) + ip->ip_p));

mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
    struct ether_vlan_header *evl;
    struct ether_header *eh;

    evl = mtod(m, struct ether_vlan_header *);
    eh = mtod(m, struct ether_header *);

    /*
     * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
     * after what the firmware thought was the end of the ethernet
     */

    /* put checksum into host byte order */
    *csum = ntohs(*csum);
    partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
    (*csum) += ~partial;
    (*csum) += ((*csum) < ~partial);
    (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
    (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
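    /*
     * (adding ~partial is a one's-complement subtraction of the 4-byte
     * 802.1q header from the checksum; the two folds above reduce the
     * 32-bit accumulator back to 16 bits with end-around carry)
     */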
2452 /* restore checksum to network byte order;
2453 later consumers expect this */
2454 *csum = htons(*csum);
2457 #ifdef MXGE_NEW_VLAN_API
2458 m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
2462 mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2466 VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2467 m_tag_prepend(m, mtag);
2471 m->m_flags |= M_VLANTAG;
2474 * Remove the 802.1q header by copying the Ethernet
2475 * addresses over it and adjusting the beginning of
2476 * the data in the mbuf. The encapsulated Ethernet
2477 * type field is already in place.
2479 bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2480 ETHER_HDR_LEN - ETHER_TYPE_LEN);
2481 m_adj(m, ETHER_VLAN_ENCAP_LEN);
2486 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2491 struct ether_header *eh;
2493 bus_dmamap_t old_map;
2495 uint16_t tcpudp_csum;
2500 idx = rx->cnt & rx->mask;
2501 rx->cnt += rx->nbufs;
2502 /* save a pointer to the received mbuf */
2503 m = rx->info[idx].m;
2504 /* try to replace the received mbuf */
2505 if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2506 /* drop the frame -- the old mbuf is re-cycled */
2511 /* unmap the received buffer */
2512 old_map = rx->info[idx].map;
2513 bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2514 bus_dmamap_unload(rx->dmat, old_map);
2516 /* swap the bus_dmamap_t's */
2517 rx->info[idx].map = rx->extra_map;
2518 rx->extra_map = old_map;
2520 /* mcp implicitly skips 1st 2 bytes so that packet is properly
2522 m->m_data += MXGEFW_PAD;
2524 m->m_pkthdr.rcvif = ifp;
2525 m->m_len = m->m_pkthdr.len = len;
2527 eh = mtod(m, struct ether_header *);
2528 if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2529 mxge_vlan_tag_remove(m, &csum);
2531 /* if the checksum is valid, mark it in the mbuf header */
2532 if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2533 if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2535 /* otherwise, it was a UDP frame, or a TCP frame which
2536 we could not do LRO on. Tell the stack that the
2538 m->m_pkthdr.csum_data = 0xffff;
2539 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2541 /* flowid only valid if RSS hashing is enabled */
2542 if (sc->num_slices > 1) {
2543 m->m_pkthdr.flowid = (ss - sc->ss);
2544 m->m_flags |= M_FLOWID;
2546 /* pass the frame up the stack */
2547 (*ifp->if_input)(ifp, m);
2551 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2555 struct ether_header *eh;
2558 bus_dmamap_t old_map;
2560 uint16_t tcpudp_csum;
2565 idx = rx->cnt & rx->mask;
2567 /* save a pointer to the received mbuf */
2568 m = rx->info[idx].m;
2569 /* try to replace the received mbuf */
2570 if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2571 /* drop the frame -- the old mbuf is re-cycled */
2576 /* unmap the received buffer */
2577 old_map = rx->info[idx].map;
2578 bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2579 bus_dmamap_unload(rx->dmat, old_map);
2581 /* swap the bus_dmamap_t's */
2582 rx->info[idx].map = rx->extra_map;
2583 rx->extra_map = old_map;
	/* mcp implicitly skips 1st 2 bytes so that packet is properly
	 * aligned */
2587 m->m_data += MXGEFW_PAD;
2589 m->m_pkthdr.rcvif = ifp;
2590 m->m_len = m->m_pkthdr.len = len;
2592 eh = mtod(m, struct ether_header *);
2593 if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2594 mxge_vlan_tag_remove(m, &csum);
2596 /* if the checksum is valid, mark it in the mbuf header */
2597 if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2598 if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
		/* otherwise, it was a UDP frame, or a TCP frame which
		   we could not do LRO on.  Tell the stack that the
		   checksum is good */
2603 m->m_pkthdr.csum_data = 0xffff;
2604 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2606 /* flowid only valid if RSS hashing is enabled */
2607 if (sc->num_slices > 1) {
2608 m->m_pkthdr.flowid = (ss - sc->ss);
2609 m->m_flags |= M_FLOWID;
2611 /* pass the frame up the stack */
2612 (*ifp->if_input)(ifp, m);
2616 mxge_clean_rx_done(struct mxge_slice_state *ss)
2618 mxge_rx_done_t *rx_done = &ss->rx_done;
2624 while (rx_done->entry[rx_done->idx].length != 0) {
2625 length = ntohs(rx_done->entry[rx_done->idx].length);
2626 rx_done->entry[rx_done->idx].length = 0;
2627 checksum = rx_done->entry[rx_done->idx].checksum;
2628 if (length <= (MHLEN - MXGEFW_PAD))
2629 mxge_rx_done_small(ss, length, checksum);
2631 mxge_rx_done_big(ss, length, checksum);
2633 rx_done->idx = rx_done->cnt & rx_done->mask;
2635 /* limit potential for livelock */
2636 if (__predict_false(++limit > rx_done->mask / 2))
2640 while (!SLIST_EMPTY(&ss->lro_active)) {
2641 struct lro_entry *lro = SLIST_FIRST(&ss->lro_active);
2642 SLIST_REMOVE_HEAD(&ss->lro_active, next);
2643 mxge_lro_flush(ss, lro);
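	/* flush any active LRO state so partially merged segments are
	 * handed to the stack now instead of being held across
	 * interrupts */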
2650 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2661 while (tx->pkt_done != mcp_idx) {
2662 idx = tx->done & tx->mask;
2664 m = tx->info[idx].m;
		/* mbuf and DMA map only attached to the first
		   segment per-mbuf */
2668 ss->obytes += m->m_pkthdr.len;
2669 if (m->m_flags & M_MCAST)
2672 tx->info[idx].m = NULL;
2673 map = tx->info[idx].map;
2674 bus_dmamap_unload(tx->dmat, map);
2677 if (tx->info[idx].flag) {
2678 tx->info[idx].flag = 0;
	/* If we have space, clear IFF_OACTIVE to tell the stack that
	   it's OK to send packets */
2685 #ifdef IFNET_BUF_RING
2686 flags = &ss->if_flags;
2688 flags = &ifp->if_flags;
2690 lockmgr(&ss->tx.lock, LK_EXCLUSIVE);
2691 if ((*flags) & IFF_OACTIVE &&
2692 tx->req - tx->done < (tx->mask + 1)/4) {
2693 *(flags) &= ~IFF_OACTIVE;
2695 mxge_start_locked(ss);
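		/* the (tx->mask + 1) / 4 threshold adds hysteresis: the
		 * queue is re-enabled only after roughly 3/4 of the ring
		 * has drained, so the stack is not woken just to block
		 * again on the very next packet */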
2697 #ifdef IFNET_BUF_RING
2698 if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2699 /* let the NIC stop polling this queue, since there
2700 * are no more transmits pending */
2701 if (tx->req == tx->done) {
2703 tx->queue_active = 0;
2709 lockmgr(&ss->tx.lock, LK_RELEASE);
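/*
 * Tables mapping bits of the transceiver's compliance byte to ifmedia
 * types.  The bit positions are assumed to follow the XFP MSA
 * compliance byte for mxge_xfp_media_types and the SFP+ (SFF-8472)
 * 10GbE compliance codes for mxge_sfp_media_types; entries with a 0
 * flag have no corresponding ifmedia type.
 */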
2713 static struct mxge_media_type mxge_xfp_media_types[] =
2715 {IFM_10G_CX4, 0x7f, "10GBASE-CX4 (module)"},
2716 {IFM_10G_SR, (1 << 7), "10GBASE-SR"},
2717 {IFM_10G_LR, (1 << 6), "10GBASE-LR"},
2718 {0, (1 << 5), "10GBASE-ER"},
2719 {IFM_10G_LRM, (1 << 4), "10GBASE-LRM"},
2720 {0, (1 << 3), "10GBASE-SW"},
2721 {0, (1 << 2), "10GBASE-LW"},
2722 {0, (1 << 1), "10GBASE-EW"},
2723 {0, (1 << 0), "Reserved"}
2725 static struct mxge_media_type mxge_sfp_media_types[] =
2727 {0, (1 << 7), "Reserved"},
2728 {IFM_10G_LRM, (1 << 6), "10GBASE-LRM"},
2729 {IFM_10G_LR, (1 << 5), "10GBASE-LR"},
2730 {IFM_10G_SR, (1 << 4), "10GBASE-SR"}
2734 mxge_set_media(mxge_softc_t *sc, int type)
2736 sc->media_flags |= type;
2737 ifmedia_add(&sc->media, sc->media_flags, 0, NULL);
2738 ifmedia_set(&sc->media, sc->media_flags);
2743 * Determine the media type for a NIC. Some XFPs will identify
2744 * themselves only when their link is up, so this is initiated via a
2745 * link up interrupt. However, this can potentially take up to
2746 * several milliseconds, so it is run via the watchdog routine, rather
2747 * than in the interrupt handler itself. This need only be done
2748 * once, not each time the link is up.
2751 mxge_media_probe(mxge_softc_t *sc)
2756 struct mxge_media_type *mxge_media_types = NULL;
2757 int i, err, ms, mxge_media_type_entries;
2760 sc->need_media_probe = 0;
2762 /* if we've already set a media type, we're done */
2763 if (sc->media_flags != (IFM_ETHER | IFM_AUTO))
	 * parse the product code to determine the interface type
2768 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2769 * after the 3rd dash in the driver's cached copy of the
2770 * EEPROM's product code string.
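	 * (For illustration only: a hypothetical product code such as
	 * "10G-PCIE-8B-S" would yield 'S' after the third dash and thus
	 * SFP+, while a trailing "-C" would select CX4.)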
2772 ptr = sc->product_code_string;
2774 device_printf(sc->dev, "Missing product code\n");
2777 for (i = 0; i < 3; i++, ptr++) {
2778 ptr = index(ptr, '-');
2780 device_printf(sc->dev,
			      "only %d dashes in product code?!?\n", i);
2787 mxge_set_media(sc, IFM_10G_CX4);
2790 else if (*ptr == 'Q') {
2791 /* -Q is Quad Ribbon Fiber */
2792 device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2793 /* FreeBSD has no media type for Quad ribbon fiber */
2799 mxge_media_types = mxge_xfp_media_types;
2800 mxge_media_type_entries =
2801 sizeof (mxge_xfp_media_types) /
2802 sizeof (mxge_xfp_media_types[0]);
2803 byte = MXGE_XFP_COMPLIANCE_BYTE;
2807 if (*ptr == 'S' || *(ptr +1) == 'S') {
2808 /* -S or -2S is SFP+ */
2809 mxge_media_types = mxge_sfp_media_types;
2810 mxge_media_type_entries =
2811 sizeof (mxge_sfp_media_types) /
2812 sizeof (mxge_sfp_media_types[0]);
2817 if (mxge_media_types == NULL) {
2818 device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2823 * At this point we know the NIC has an XFP cage, so now we
2824 * try to determine what is in the cage by using the
	 * firmware's XFP I2C commands to read the XFP 10GbE compliance
	 * register.  We read just one byte, which may take over
	 * a millisecond, so the result is polled for below.
	 */
2830 cmd.data0 = 0; /* just fetch 1 byte, not all 256 */
2832 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2833 if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
2834 device_printf(sc->dev, "failed to read XFP\n");
2836 if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
2837 device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
2839 if (err != MXGEFW_CMD_OK) {
2843 /* now we wait for the data to be cached */
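	/* the I2C transaction completes asynchronously in the firmware:
	 * MXGEFW_CMD_I2C_BYTE returns EBUSY until the byte has been
	 * fetched, so it is polled for up to ~50ms below */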
2845 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2846 for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
2849 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2851 if (err != MXGEFW_CMD_OK) {
2852 device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
2853 cage_type, err, ms);
2857 if (cmd.data0 == mxge_media_types[0].bitmask) {
2859 device_printf(sc->dev, "%s:%s\n", cage_type,
2860 mxge_media_types[0].name);
2861 mxge_set_media(sc, IFM_10G_CX4);
2864 for (i = 1; i < mxge_media_type_entries; i++) {
2865 if (cmd.data0 & mxge_media_types[i].bitmask) {
2867 device_printf(sc->dev, "%s:%s\n",
				      cage_type,
				      mxge_media_types[i].name);
2871 mxge_set_media(sc, mxge_media_types[i].flag);
	device_printf(sc->dev, "%s media 0x%x unknown\n", cage_type,
		      cmd.data0);
2882 mxge_intr(void *arg)
2884 struct mxge_slice_state *ss = arg;
2885 mxge_softc_t *sc = ss->sc;
2886 mcp_irq_data_t *stats = ss->fw_stats;
2887 mxge_tx_ring_t *tx = &ss->tx;
2888 mxge_rx_done_t *rx_done = &ss->rx_done;
2889 uint32_t send_done_count;
2893 #ifndef IFNET_BUF_RING
2894 /* an interrupt on a non-zero slice is implicitly valid
2895 since MSI-X irqs are not shared */
2897 mxge_clean_rx_done(ss);
2898 *ss->irq_claim = be32toh(3);
2903 /* make sure the DMA has finished */
2904 if (!stats->valid) {
2907 valid = stats->valid;
2909 if (sc->legacy_irq) {
2910 /* lower legacy IRQ */
2911 *sc->irq_deassert = 0;
2912 if (!mxge_deassert_wait)
			/* don't wait for confirmation that the irq is low */
2919 /* loop while waiting for legacy irq deassertion */
2921 /* check for transmit completes and receives */
2922 send_done_count = be32toh(stats->send_done_count);
2923 while ((send_done_count != tx->pkt_done) ||
2924 (rx_done->entry[rx_done->idx].length != 0)) {
2925 if (send_done_count != tx->pkt_done)
2926 mxge_tx_done(ss, (int)send_done_count);
2927 mxge_clean_rx_done(ss);
2928 send_done_count = be32toh(stats->send_done_count);
2930 if (sc->legacy_irq && mxge_deassert_wait)
2932 } while (*((volatile uint8_t *) &stats->valid));
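	/* with a legacy interrupt the NIC keeps the irq (and
	 * stats->valid) asserted until it observes the deassert write
	 * above, so keep draining tx completions and rx work in the
	 * meantime to avoid missing events that race with the
	 * deassertion */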
2934 /* fw link & error stats meaningful only on the first slice */
2935 if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
2936 if (sc->link_state != stats->link_up) {
2937 sc->link_state = stats->link_up;
2938 if (sc->link_state) {
2939 sc->ifp->if_link_state = LINK_STATE_UP;
2940 if_link_state_change(sc->ifp);
2942 device_printf(sc->dev, "link up\n");
2944 sc->ifp->if_link_state = LINK_STATE_DOWN;
2945 if_link_state_change(sc->ifp);
2947 device_printf(sc->dev, "link down\n");
2949 sc->need_media_probe = 1;
2951 if (sc->rdma_tags_available !=
2952 be32toh(stats->rdma_tags_available)) {
2953 sc->rdma_tags_available =
2954 be32toh(stats->rdma_tags_available);
2955 device_printf(sc->dev, "RDMA timed out! %d tags "
2956 "left\n", sc->rdma_tags_available);
2959 if (stats->link_down) {
2960 sc->down_cnt += stats->link_down;
2962 sc->ifp->if_link_state = LINK_STATE_DOWN;
2963 if_link_state_change(sc->ifp);
2967 /* check to see if we have rx token to pass back */
2969 *ss->irq_claim = be32toh(3);
2970 *(ss->irq_claim + 1) = be32toh(3);
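	/* writing the claim register(s) hands the interrupt token back
	 * to the NIC so it may raise the next interrupt; the second
	 * word appears to return the rx token mentioned above (this
	 * reading of the firmware contract is an assumption) */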
2974 mxge_init(void *arg)
2981 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
2983 struct lro_entry *lro_entry;
2986 while (!SLIST_EMPTY(&ss->lro_free)) {
2987 lro_entry = SLIST_FIRST(&ss->lro_free);
2988 SLIST_REMOVE_HEAD(&ss->lro_free, next);
2989 kfree(lro_entry, M_DEVBUF);
2992 for (i = 0; i <= ss->rx_big.mask; i++) {
2993 if (ss->rx_big.info[i].m == NULL)
2995 bus_dmamap_unload(ss->rx_big.dmat,
2996 ss->rx_big.info[i].map);
2997 m_freem(ss->rx_big.info[i].m);
2998 ss->rx_big.info[i].m = NULL;
3001 for (i = 0; i <= ss->rx_small.mask; i++) {
3002 if (ss->rx_small.info[i].m == NULL)
3004 bus_dmamap_unload(ss->rx_small.dmat,
3005 ss->rx_small.info[i].map);
3006 m_freem(ss->rx_small.info[i].m);
3007 ss->rx_small.info[i].m = NULL;
3010 /* transmit ring used only on the first slice */
3011 if (ss->tx.info == NULL)
3014 for (i = 0; i <= ss->tx.mask; i++) {
3015 ss->tx.info[i].flag = 0;
3016 if (ss->tx.info[i].m == NULL)
3018 bus_dmamap_unload(ss->tx.dmat,
3019 ss->tx.info[i].map);
3020 m_freem(ss->tx.info[i].m);
3021 ss->tx.info[i].m = NULL;
3026 mxge_free_mbufs(mxge_softc_t *sc)
3030 for (slice = 0; slice < sc->num_slices; slice++)
3031 mxge_free_slice_mbufs(&sc->ss[slice]);
3035 mxge_free_slice_rings(struct mxge_slice_state *ss)
3040 if (ss->rx_done.entry != NULL)
3041 mxge_dma_free(&ss->rx_done.dma);
3042 ss->rx_done.entry = NULL;
3044 if (ss->tx.req_bytes != NULL)
3045 kfree(ss->tx.req_bytes, M_DEVBUF);
3046 ss->tx.req_bytes = NULL;
3048 if (ss->tx.seg_list != NULL)
3049 kfree(ss->tx.seg_list, M_DEVBUF);
3050 ss->tx.seg_list = NULL;
3052 if (ss->rx_small.shadow != NULL)
3053 kfree(ss->rx_small.shadow, M_DEVBUF);
3054 ss->rx_small.shadow = NULL;
3056 if (ss->rx_big.shadow != NULL)
3057 kfree(ss->rx_big.shadow, M_DEVBUF);
3058 ss->rx_big.shadow = NULL;
3060 if (ss->tx.info != NULL) {
3061 if (ss->tx.dmat != NULL) {
3062 for (i = 0; i <= ss->tx.mask; i++) {
3063 bus_dmamap_destroy(ss->tx.dmat,
3064 ss->tx.info[i].map);
3066 bus_dma_tag_destroy(ss->tx.dmat);
3068 kfree(ss->tx.info, M_DEVBUF);
3072 if (ss->rx_small.info != NULL) {
3073 if (ss->rx_small.dmat != NULL) {
3074 for (i = 0; i <= ss->rx_small.mask; i++) {
3075 bus_dmamap_destroy(ss->rx_small.dmat,
3076 ss->rx_small.info[i].map);
3078 bus_dmamap_destroy(ss->rx_small.dmat,
3079 ss->rx_small.extra_map);
3080 bus_dma_tag_destroy(ss->rx_small.dmat);
3082 kfree(ss->rx_small.info, M_DEVBUF);
3084 ss->rx_small.info = NULL;
3086 if (ss->rx_big.info != NULL) {
3087 if (ss->rx_big.dmat != NULL) {
3088 for (i = 0; i <= ss->rx_big.mask; i++) {
3089 bus_dmamap_destroy(ss->rx_big.dmat,
3090 ss->rx_big.info[i].map);
3092 bus_dmamap_destroy(ss->rx_big.dmat,
3093 ss->rx_big.extra_map);
3094 bus_dma_tag_destroy(ss->rx_big.dmat);
3096 kfree(ss->rx_big.info, M_DEVBUF);
3098 ss->rx_big.info = NULL;
3102 mxge_free_rings(mxge_softc_t *sc)
3106 for (slice = 0; slice < sc->num_slices; slice++)
3107 mxge_free_slice_rings(&sc->ss[slice]);
3111 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3112 int tx_ring_entries)
3114 mxge_softc_t *sc = ss->sc;
3120 /* allocate per-slice receive resources */
3122 ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
3123 ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3125 /* allocate the rx shadow rings */
3126 bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3127 ss->rx_small.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3128 if (ss->rx_small.shadow == NULL)
3131 bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3132 ss->rx_big.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3133 if (ss->rx_big.shadow == NULL)
3136 /* allocate the rx host info rings */
3137 bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3138 ss->rx_small.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3139 if (ss->rx_small.info == NULL)
3142 bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3143 ss->rx_big.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3144 if (ss->rx_big.info == NULL)
3147 /* allocate the rx busdma resources */
3148 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3150 4096, /* boundary */
3151 BUS_SPACE_MAXADDR, /* low */
3152 BUS_SPACE_MAXADDR, /* high */
3153 NULL, NULL, /* filter */
3154 MHLEN, /* maxsize */
3156 MHLEN, /* maxsegsize */
3157 BUS_DMA_ALLOCNOW, /* flags */
3158 &ss->rx_small.dmat); /* tag */
3160 device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3165 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3167 #if MXGE_VIRT_JUMBOS
3168 4096, /* boundary */
3172 BUS_SPACE_MAXADDR, /* low */
3173 BUS_SPACE_MAXADDR, /* high */
3174 NULL, NULL, /* filter */
3175 3*4096, /* maxsize */
3176 #if MXGE_VIRT_JUMBOS
3178 4096, /* maxsegsize*/
3181 MJUM9BYTES, /* maxsegsize*/
3183 BUS_DMA_ALLOCNOW, /* flags */
3184 &ss->rx_big.dmat); /* tag */
3186 device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3190 for (i = 0; i <= ss->rx_small.mask; i++) {
3191 err = bus_dmamap_create(ss->rx_small.dmat, 0,
3192 &ss->rx_small.info[i].map);
3194 device_printf(sc->dev, "Err %d rx_small dmamap\n",
3199 err = bus_dmamap_create(ss->rx_small.dmat, 0,
3200 &ss->rx_small.extra_map);
3202 device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3207 for (i = 0; i <= ss->rx_big.mask; i++) {
3208 err = bus_dmamap_create(ss->rx_big.dmat, 0,
3209 &ss->rx_big.info[i].map);
3211 device_printf(sc->dev, "Err %d rx_big dmamap\n",
3216 err = bus_dmamap_create(ss->rx_big.dmat, 0,
3217 &ss->rx_big.extra_map);
3219 device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
	/* now allocate TX resources */
3226 #ifndef IFNET_BUF_RING
3227 /* only use a single TX ring for now */
3228 if (ss != ss->sc->ss)
3232 ss->tx.mask = tx_ring_entries - 1;
3233 ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3236 /* allocate the tx request copy block */
3238 sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3239 ss->tx.req_bytes = kmalloc(bytes, M_DEVBUF, M_WAITOK);
3240 if (ss->tx.req_bytes == NULL)
3242 /* ensure req_list entries are aligned to 8 bytes */
3243 ss->tx.req_list = (mcp_kreq_ether_send_t *)
3244 ((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
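	/* (req_bytes + 7) & ~7 rounds the pointer up to the next
	 * multiple of 8: an allocation ending in ...0x3 becomes ...0x8,
	 * while an already 8-byte-aligned one is left unchanged */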
3246 /* allocate the tx busdma segment list */
3247 bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3248 ss->tx.seg_list = (bus_dma_segment_t *)
3249 kmalloc(bytes, M_DEVBUF, M_WAITOK);
3250 if (ss->tx.seg_list == NULL)
3253 /* allocate the tx host info ring */
3254 bytes = tx_ring_entries * sizeof (*ss->tx.info);
3255 ss->tx.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3256 if (ss->tx.info == NULL)
3259 /* allocate the tx busdma resources */
3260 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3262 sc->tx_boundary, /* boundary */
3263 BUS_SPACE_MAXADDR, /* low */
3264 BUS_SPACE_MAXADDR, /* high */
3265 NULL, NULL, /* filter */
3266 65536 + 256, /* maxsize */
3267 ss->tx.max_desc - 2, /* num segs */
3268 sc->tx_boundary, /* maxsegsz */
3269 BUS_DMA_ALLOCNOW, /* flags */
3270 &ss->tx.dmat); /* tag */
3273 device_printf(sc->dev, "Err %d allocating tx dmat\n",
	/* now use these tags to setup dmamaps for each slot
	   in the ring */
3280 for (i = 0; i <= ss->tx.mask; i++) {
3281 err = bus_dmamap_create(ss->tx.dmat, 0,
3282 &ss->tx.info[i].map);
3284 device_printf(sc->dev, "Err %d tx dmamap\n",
3294 mxge_alloc_rings(mxge_softc_t *sc)
3298 int tx_ring_entries, rx_ring_entries;
3301 /* get ring sizes */
3302 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3303 tx_ring_size = cmd.data0;
3305 device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3309 tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3310 rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3311 IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
3312 sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
3313 IFQ_SET_READY(&sc->ifp->if_snd);
3315 for (slice = 0; slice < sc->num_slices; slice++) {
3316 err = mxge_alloc_slice_rings(&sc->ss[slice],
3325 mxge_free_rings(sc);
3332 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3334 int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3336 if (bufsize < MCLBYTES) {
3337 /* easy, everything fits in a single buffer */
3338 *big_buf_size = MCLBYTES;
3339 *cl_size = MCLBYTES;
3344 if (bufsize < MJUMPAGESIZE) {
3345 /* still easy, everything still fits in a single buffer */
3346 *big_buf_size = MJUMPAGESIZE;
3347 *cl_size = MJUMPAGESIZE;
3351 #if MXGE_VIRT_JUMBOS
3352 /* now we need to use virtually contiguous buffers */
3353 *cl_size = MJUM9BYTES;
3354 *big_buf_size = 4096;
3355 *nbufs = mtu / 4096 + 1;
3356 /* needs to be a power of two, so round up */
3360 *cl_size = MJUM9BYTES;
3361 *big_buf_size = MJUM9BYTES;
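	/*
	 * Worked example (assuming the usual MXGEFW_PAD of 2): a
	 * 9000-byte MTU gives bufsize = 9000 + 14 + 4 + 2 = 9020, which
	 * exceeds both MCLBYTES and MJUMPAGESIZE, so the 9KB path above
	 * is taken; one MJUM9BYTES cluster per frame or, with
	 * MXGE_VIRT_JUMBOS, three 4KB chunks (nbufs = 9000/4096 + 1).
	 */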
3367 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3372 struct lro_entry *lro_entry;
3377 slice = ss - sc->ss;
3379 SLIST_INIT(&ss->lro_free);
3380 SLIST_INIT(&ss->lro_active);
3382 for (i = 0; i < sc->lro_cnt; i++) {
3383 lro_entry = (struct lro_entry *)
3384 kmalloc(sizeof (*lro_entry), M_DEVBUF,
3386 if (lro_entry == NULL) {
3390 SLIST_INSERT_HEAD(&ss->lro_free, lro_entry, next);
3392 /* get the lanai pointers to the send and receive rings */
3395 #ifndef IFNET_BUF_RING
3396 /* We currently only send from the first slice */
3400 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3402 (volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3403 ss->tx.send_go = (volatile uint32_t *)
3404 (sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3405 ss->tx.send_stop = (volatile uint32_t *)
3406 (sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3407 #ifndef IFNET_BUF_RING
3411 err |= mxge_send_cmd(sc,
3412 MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3413 ss->rx_small.lanai =
3414 (volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3416 err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3418 (volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3421 device_printf(sc->dev,
3422 "failed to get ring sizes or locations\n");
3426 /* stock receive rings */
3427 for (i = 0; i <= ss->rx_small.mask; i++) {
3428 map = ss->rx_small.info[i].map;
3429 err = mxge_get_buf_small(ss, map, i);
3431 device_printf(sc->dev, "alloced %d/%d smalls\n",
3432 i, ss->rx_small.mask + 1);
3436 for (i = 0; i <= ss->rx_big.mask; i++) {
3437 ss->rx_big.shadow[i].addr_low = 0xffffffff;
3438 ss->rx_big.shadow[i].addr_high = 0xffffffff;
3440 ss->rx_big.nbufs = nbufs;
3441 ss->rx_big.cl_size = cl_size;
3442 ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3443 ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3444 for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3445 map = ss->rx_big.info[i].map;
3446 err = mxge_get_buf_big(ss, map, i);
3448 device_printf(sc->dev, "alloced %d/%d bigs\n",
3449 i, ss->rx_big.mask + 1);
3457 mxge_open(mxge_softc_t *sc)
3460 int err, big_bytes, nbufs, slice, cl_size, i;
3462 volatile uint8_t *itable;
3463 struct mxge_slice_state *ss;
3465 /* Copy the MAC address in case it was overridden */
3466 bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3468 err = mxge_reset(sc, 1);
3470 device_printf(sc->dev, "failed to reset\n");
3474 if (sc->num_slices > 1) {
3475 /* setup the indirection table */
3476 cmd.data0 = sc->num_slices;
3477 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3480 err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3483 device_printf(sc->dev,
3484 "failed to setup rss tables\n");
3488 /* just enable an identity mapping */
3489 itable = sc->sram + cmd.data0;
3490 for (i = 0; i < sc->num_slices; i++)
3491 itable[i] = (uint8_t)i;
3494 cmd.data1 = mxge_rss_hash_type;
3495 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3497 device_printf(sc->dev, "failed to enable slices\n");
3503 mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3506 err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3508 /* error is only meaningful if we're trying to set
3509 MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3510 if (err && nbufs > 1) {
3511 device_printf(sc->dev,
			      "Failed to set always-use-n to %d\n",
3516 /* Give the firmware the mtu and the big and small buffer
3517 sizes. The firmware wants the big buf size to be a power
3518 of two. Luckily, FreeBSD's clusters are powers of two */
3519 cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3520 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3521 cmd.data0 = MHLEN - MXGEFW_PAD;
3522 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3524 cmd.data0 = big_bytes;
3525 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3528 device_printf(sc->dev, "failed to setup params\n");
	/* Now give the firmware the pointer to the stats block */
3534 #ifdef IFNET_BUF_RING
3535 slice < sc->num_slices;
3540 ss = &sc->ss[slice];
3542 MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3544 MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3545 cmd.data2 = sizeof(struct mcp_irq_data);
3546 cmd.data2 |= (slice << 16);
3547 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3551 bus = sc->ss->fw_stats_dma.bus_addr;
3552 bus += offsetof(struct mcp_irq_data, send_done_count);
3553 cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3554 cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3555 err = mxge_send_cmd(sc,
3556 MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3558 /* Firmware cannot support multicast without STATS_DMA_V2 */
3559 sc->fw_multicast_support = 0;
3561 sc->fw_multicast_support = 1;
3565 device_printf(sc->dev, "failed to setup params\n");
3569 for (slice = 0; slice < sc->num_slices; slice++) {
3570 err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3572 device_printf(sc->dev, "couldn't open slice %d\n",
3578 /* Finally, start the firmware running */
3579 err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3581 device_printf(sc->dev, "Couldn't bring up link\n");
3584 #ifdef IFNET_BUF_RING
3585 for (slice = 0; slice < sc->num_slices; slice++) {
3586 ss = &sc->ss[slice];
3587 ss->if_flags |= IFF_RUNNING;
3588 ss->if_flags &= ~IFF_OACTIVE;
3591 sc->ifp->if_flags |= IFF_RUNNING;
3592 sc->ifp->if_flags &= ~IFF_OACTIVE;
3593 callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3599 mxge_free_mbufs(sc);
3605 mxge_close(mxge_softc_t *sc)
3608 int err, old_down_cnt;
3609 #ifdef IFNET_BUF_RING
3610 struct mxge_slice_state *ss;
3614 callout_stop(&sc->co_hdl);
3615 #ifdef IFNET_BUF_RING
3616 for (slice = 0; slice < sc->num_slices; slice++) {
3617 ss = &sc->ss[slice];
3618 ss->if_flags &= ~IFF_RUNNING;
3621 sc->ifp->if_flags &= ~IFF_RUNNING;
3622 old_down_cnt = sc->down_cnt;
3624 err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3626 device_printf(sc->dev, "Couldn't bring down link\n");
3628 if (old_down_cnt == sc->down_cnt) {
3629 /* wait for down irq */
3630 DELAY(10 * sc->intr_coal_delay);
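		/* the firmware's final "link down" interrupt is what
		 * increments down_cnt; ten coalescing intervals gives it
		 * ample time to arrive before we conclude it never will */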
3633 if (old_down_cnt == sc->down_cnt) {
3634 device_printf(sc->dev, "never got down irq\n");
3637 mxge_free_mbufs(sc);
3643 mxge_setup_cfg_space(mxge_softc_t *sc)
3645 device_t dev = sc->dev;
3647 uint16_t cmd, lnk, pectl;
3649 /* find the PCIe link width and set max read request to 4KB*/
	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
3651 lnk = pci_read_config(dev, reg + 0x12, 2);
3652 sc->link_width = (lnk >> 4) & 0x3f;
3654 pectl = pci_read_config(dev, reg + 0x8, 2);
3655 pectl = (pectl & ~0x7000) | (5 << 12);
3656 pci_write_config(dev, reg + 0x8, pectl, 2);
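		/* max read request size lives in bits 14:12 of the PCIe
		 * device control register (offset 0x8 into the
		 * capability); the encoding 5 selects 4096 bytes */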
3659 /* Enable DMA and Memory space access */
3660 pci_enable_busmaster(dev);
3661 cmd = pci_read_config(dev, PCIR_COMMAND, 2);
3662 cmd |= PCIM_CMD_MEMEN;
3663 pci_write_config(dev, PCIR_COMMAND, cmd, 2);
3667 mxge_read_reboot(mxge_softc_t *sc)
3669 device_t dev = sc->dev;
3672 /* find the vendor specific offset */
3673 if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
3674 device_printf(sc->dev,
3675 "could not find vendor specific offset\n");
3676 return (uint32_t)-1;
3678 /* enable read32 mode */
3679 pci_write_config(dev, vs + 0x10, 0x3, 1);
3680 /* tell NIC which register to read */
3681 pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3682 return (pci_read_config(dev, vs + 0x14, 4));
3686 mxge_watchdog_reset(mxge_softc_t *sc, int slice)
3688 struct pci_devinfo *dinfo;
3696 device_printf(sc->dev, "Watchdog reset!\n");
3699 * check to see if the NIC rebooted. If it did, then all of
3700 * PCI config space has been reset, and things like the
3701 * busmaster bit will be zero. If this is the case, then we
	 * must restore PCI config space before the NIC can be used
	 * again
	 */
3705 cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3706 if (cmd == 0xffff) {
3708 * maybe the watchdog caught the NIC rebooting; wait
3709 * up to 100ms for it to finish. If it does not come
3710 * back, then give up
3713 cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3714 if (cmd == 0xffff) {
3715 device_printf(sc->dev, "NIC disappeared!\n");
3719 if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3720 /* print the reboot status */
3721 reboot = mxge_read_reboot(sc);
3722 device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3724 /* restore PCI configuration space */
3725 dinfo = device_get_ivars(sc->dev);
3726 pci_cfg_restore(sc->dev, dinfo);
3728 /* and redo any changes we made to our config space */
3729 mxge_setup_cfg_space(sc);
3731 if (sc->ifp->if_flags & IFF_RUNNING) {
3733 err = mxge_open(sc);
3736 tx = &sc->ss[slice].tx;
3737 device_printf(sc->dev,
3738 "NIC did not reboot, slice %d ring state:\n",
3740 device_printf(sc->dev,
3741 "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3742 tx->req, tx->done, tx->queue_active);
3743 device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3744 tx->activate, tx->deactivate);
3745 device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3747 be32toh(sc->ss->fw_stats->send_done_count));
3748 device_printf(sc->dev, "not resetting\n");
3754 mxge_watchdog(mxge_softc_t *sc)
3757 uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3760 /* see if we have outstanding transmits, which
3761 have been pending for more than mxge_ticks */
3763 #ifdef IFNET_BUF_RING
3764 (i < sc->num_slices) && (err == 0);
3766 (i < 1) && (err == 0);
3770 if (tx->req != tx->done &&
3771 tx->watchdog_req != tx->watchdog_done &&
3772 tx->done == tx->watchdog_done) {
3773 /* check for pause blocking before resetting */
3774 if (tx->watchdog_rx_pause == rx_pause)
3775 err = mxge_watchdog_reset(sc, i);
3777 device_printf(sc->dev, "Flow control blocking "
3778 "xmits, check link partner\n");
3781 tx->watchdog_req = tx->req;
3782 tx->watchdog_done = tx->done;
3783 tx->watchdog_rx_pause = rx_pause;
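		/* reset only when a transmit has been stuck for a whole
		 * watchdog period: requests were already outstanding at
		 * the previous tick (watchdog_req != watchdog_done) and
		 * no completion has arrived since (done == watchdog_done).
		 * If dropped_pause advanced instead, the stall is just
		 * 802.3x flow control from the link partner. */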
3786 if (sc->need_media_probe)
3787 mxge_media_probe(sc);
3792 mxge_update_stats(mxge_softc_t *sc)
3794 struct mxge_slice_state *ss;
3795 u_long ipackets = 0;
3796 u_long opackets = 0;
3797 #ifdef IFNET_BUF_RING
3805 for (slice = 0; slice < sc->num_slices; slice++) {
3806 ss = &sc->ss[slice];
3807 ipackets += ss->ipackets;
3808 opackets += ss->opackets;
3809 #ifdef IFNET_BUF_RING
3810 obytes += ss->obytes;
3811 omcasts += ss->omcasts;
3812 odrops += ss->tx.br->br_drops;
3814 oerrors += ss->oerrors;
3816 sc->ifp->if_ipackets = ipackets;
3817 sc->ifp->if_opackets = opackets;
3818 #ifdef IFNET_BUF_RING
3819 sc->ifp->if_obytes = obytes;
3820 sc->ifp->if_omcasts = omcasts;
3821 sc->ifp->if_snd.ifq_drops = odrops;
3823 sc->ifp->if_oerrors = oerrors;
3827 mxge_tick(void *arg)
3829 mxge_softc_t *sc = arg;
3832 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
3833 /* aggregate stats from different slices */
3834 mxge_update_stats(sc);
3835 if (!sc->watchdog_countdown) {
3836 err = mxge_watchdog(sc);
3837 sc->watchdog_countdown = 4;
3839 sc->watchdog_countdown--;
3841 callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3842 lockmgr(&sc->driver_lock, LK_RELEASE);
3846 mxge_media_change(struct ifnet *ifp)
3852 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3854 struct ifnet *ifp = sc->ifp;
3855 int real_mtu, old_mtu;
3859 real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3860 if ((real_mtu > sc->max_mtu) || real_mtu < 60)
3862 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
3863 old_mtu = ifp->if_mtu;
3865 if (ifp->if_flags & IFF_RUNNING) {
3867 err = mxge_open(sc);
3869 ifp->if_mtu = old_mtu;
3871 (void) mxge_open(sc);
3874 lockmgr(&sc->driver_lock, LK_RELEASE);
3879 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3881 mxge_softc_t *sc = ifp->if_softc;
3886 ifmr->ifm_status = IFM_AVALID;
3887 ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
3888 ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
3889 ifmr->ifm_active |= sc->link_state ? IFM_FDX : 0;
3893 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data, struct ucred *cr)
3895 mxge_softc_t *sc = ifp->if_softc;
3896 struct ifreq *ifr = (struct ifreq *)data;
3904 err = ether_ioctl(ifp, command, data);
3908 err = mxge_change_mtu(sc, ifr->ifr_mtu);
3912 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
3914 lockmgr(&sc->driver_lock, LK_RELEASE);
3917 if (ifp->if_flags & IFF_UP) {
3918 if (!(ifp->if_flags & IFF_RUNNING)) {
3919 err = mxge_open(sc);
				/* take care of promisc and allmulti
				   flag changes */
3923 mxge_change_promisc(sc,
3924 ifp->if_flags & IFF_PROMISC);
3925 mxge_set_multicast_list(sc);
3928 if (ifp->if_flags & IFF_RUNNING) {
3932 lockmgr(&sc->driver_lock, LK_RELEASE);
3937 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
3938 mxge_set_multicast_list(sc);
3939 lockmgr(&sc->driver_lock, LK_RELEASE);
3943 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
3944 mask = ifr->ifr_reqcap ^ ifp->if_capenable;
3945 if (mask & IFCAP_TXCSUM) {
3946 if (IFCAP_TXCSUM & ifp->if_capenable) {
3947 ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
3948 ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
3951 ifp->if_capenable |= IFCAP_TXCSUM;
3952 ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
3954 } else if (mask & IFCAP_RXCSUM) {
3955 if (IFCAP_RXCSUM & ifp->if_capenable) {
3956 ifp->if_capenable &= ~IFCAP_RXCSUM;
3959 ifp->if_capenable |= IFCAP_RXCSUM;
3963 if (mask & IFCAP_TSO4) {
3964 if (IFCAP_TSO4 & ifp->if_capenable) {
3965 ifp->if_capenable &= ~IFCAP_TSO4;
3966 ifp->if_hwassist &= ~CSUM_TSO;
3967 } else if (IFCAP_TXCSUM & ifp->if_capenable) {
3968 ifp->if_capenable |= IFCAP_TSO4;
3969 ifp->if_hwassist |= CSUM_TSO;
3971 kprintf("mxge requires tx checksum offload"
3972 " be enabled to use TSO\n");
3976 if (mask & IFCAP_LRO) {
3977 if (IFCAP_LRO & ifp->if_capenable)
3978 err = mxge_change_lro_locked(sc, 0);
3980 err = mxge_change_lro_locked(sc, mxge_lro_cnt);
3982 if (mask & IFCAP_VLAN_HWTAGGING)
3983 ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
3984 lockmgr(&sc->driver_lock, LK_RELEASE);
3985 VLAN_CAPABILITIES(ifp);
3990 err = ifmedia_ioctl(ifp, (struct ifreq *)data,
3991 &sc->media, command);
4001 mxge_fetch_tunables(mxge_softc_t *sc)
4004 TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4005 TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4006 &mxge_flow_control);
4007 TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4008 &mxge_intr_coal_delay);
4009 TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4010 &mxge_nvidia_ecrc_enable);
4011 TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4012 &mxge_force_firmware);
4013 TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4014 &mxge_deassert_wait);
4015 TUNABLE_INT_FETCH("hw.mxge.verbose",
4017 TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4018 TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
4019 TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
4020 TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4021 TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4022 if (sc->lro_cnt != 0)
4023 mxge_lro_cnt = sc->lro_cnt;
4027 if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4028 mxge_intr_coal_delay = 30;
4029 if (mxge_ticks == 0)
4030 mxge_ticks = hz / 2;
4031 sc->pause = mxge_flow_control;
4032 if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4033 || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4034 mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
4036 if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4037 mxge_initial_mtu < ETHER_MIN_LEN)
4038 mxge_initial_mtu = ETHERMTU_JUMBO;
4043 mxge_free_slices(mxge_softc_t *sc)
4045 struct mxge_slice_state *ss;
4052 for (i = 0; i < sc->num_slices; i++) {
4054 if (ss->fw_stats != NULL) {
4055 mxge_dma_free(&ss->fw_stats_dma);
4056 ss->fw_stats = NULL;
4057 #ifdef IFNET_BUF_RING
4058 if (ss->tx.br != NULL) {
4059 drbr_free(ss->tx.br, M_DEVBUF);
4063 lockuninit(&ss->tx.lock);
4065 if (ss->rx_done.entry != NULL) {
4066 mxge_dma_free(&ss->rx_done.dma);
4067 ss->rx_done.entry = NULL;
4070 kfree(sc->ss, M_DEVBUF);
4075 mxge_alloc_slices(mxge_softc_t *sc)
4078 struct mxge_slice_state *ss;
4080 int err, i, max_intr_slots;
4082 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4084 device_printf(sc->dev, "Cannot determine rx ring size\n");
4087 sc->rx_ring_size = cmd.data0;
4088 max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4090 bytes = sizeof (*sc->ss) * sc->num_slices;
4091 sc->ss = kmalloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4094 for (i = 0; i < sc->num_slices; i++) {
4099 /* allocate per-slice rx interrupt queues */
4101 bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4102 err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4105 ss->rx_done.entry = ss->rx_done.dma.addr;
4106 bzero(ss->rx_done.entry, bytes);
4109 * allocate the per-slice firmware stats; stats
	 * (including tx) are used only on the first
	 * slice for now
	 */
4113 #ifndef IFNET_BUF_RING
4118 bytes = sizeof (*ss->fw_stats);
4119 err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4120 sizeof (*ss->fw_stats), 64);
4123 ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4124 ksnprintf(ss->tx.lock_name, sizeof(ss->tx.lock_name),
4125 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
4126 lockinit(&ss->tx.lock, ss->tx.lock_name, 0, LK_CANRECURSE);
4127 #ifdef IFNET_BUF_RING
4128 ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4136 mxge_free_slices(sc);
4141 mxge_slice_probe(mxge_softc_t *sc)
4145 int msix_cnt, status, max_intr_slots;
	 * don't enable multiple slices if the tunable has them
	 * disabled, or if this is not an SMP system
	 */
4153 if (mxge_max_slices == 0 || mxge_max_slices == 1 || ncpus < 2)
4156 /* see how many MSI-X interrupts are available */
4157 msix_cnt = pci_msix_count(sc->dev);
	/* now load the slice aware firmware to see what it supports */
4162 old_fw = sc->fw_name;
4163 if (old_fw == mxge_fw_aligned)
4164 sc->fw_name = mxge_fw_rss_aligned;
4166 sc->fw_name = mxge_fw_rss_unaligned;
4167 status = mxge_load_firmware(sc, 0);
4169 device_printf(sc->dev, "Falling back to a single slice\n");
	/* try to send a reset command to the card to see if it
	   is alive */
4175 memset(&cmd, 0, sizeof (cmd));
4176 status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4178 device_printf(sc->dev, "failed reset\n");
4182 /* get rx ring size */
4183 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4185 device_printf(sc->dev, "Cannot determine rx ring size\n");
4188 max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4190 /* tell it the size of the interrupt queues */
4191 cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4192 status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4194 device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
	/* ask for the maximum number of slices it supports */
4199 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4201 device_printf(sc->dev,
4202 "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4205 sc->num_slices = cmd.data0;
4206 if (sc->num_slices > msix_cnt)
4207 sc->num_slices = msix_cnt;
4209 if (mxge_max_slices == -1) {
4210 /* cap to number of CPUs in system */
4211 if (sc->num_slices > ncpus)
4212 sc->num_slices = ncpus;
4214 if (sc->num_slices > mxge_max_slices)
4215 sc->num_slices = mxge_max_slices;
4217 /* make sure it is a power of two */
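	/* n & (n - 1) clears the lowest set bit, so it is non-zero
	 * exactly when num_slices is not a power of two */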
4218 while (sc->num_slices & (sc->num_slices - 1))
4222 device_printf(sc->dev, "using %d slices\n",
4228 sc->fw_name = old_fw;
4229 (void) mxge_load_firmware(sc, 0);
4233 mxge_add_msix_irqs(mxge_softc_t *sc)
4236 int count, err, i, rid;
4239 sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4242 if (sc->msix_table_res == NULL) {
4243 device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4247 count = sc->num_slices;
4248 err = pci_alloc_msix(sc->dev, &count);
4250 device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d"
			      ", err = %d\n", sc->num_slices, err);
4252 goto abort_with_msix_table;
4254 if (count < sc->num_slices) {
4255 device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4256 count, sc->num_slices);
4257 device_printf(sc->dev,
4258 "Try setting hw.mxge.max_slices to %d\n",
4261 goto abort_with_msix;
4263 bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4264 sc->msix_irq_res = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4265 if (sc->msix_irq_res == NULL) {
4267 goto abort_with_msix;
4270 for (i = 0; i < sc->num_slices; i++) {
4272 sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4275 if (sc->msix_irq_res[i] == NULL) {
4276 device_printf(sc->dev, "couldn't allocate IRQ res"
4277 " for message %d\n", i);
4279 goto abort_with_res;
4283 bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4284 sc->msix_ih = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4286 for (i = 0; i < sc->num_slices; i++) {
4287 err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4289 mxge_intr, &sc->ss[i], &sc->msix_ih[i],
				     NULL /* XXX serializer (none yet; NULL assumed OK) */);
4292 device_printf(sc->dev, "couldn't setup intr for "
4294 goto abort_with_intr;
4299 device_printf(sc->dev, "using %d msix IRQs:",
4301 for (i = 0; i < sc->num_slices; i++)
4302 kprintf(" %ld", rman_get_start(sc->msix_irq_res[i]));
4308 for (i = 0; i < sc->num_slices; i++) {
4309 if (sc->msix_ih[i] != NULL) {
4310 bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4312 sc->msix_ih[i] = NULL;
4315 kfree(sc->msix_ih, M_DEVBUF);
4319 for (i = 0; i < sc->num_slices; i++) {
4321 if (sc->msix_irq_res[i] != NULL)
4322 bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4323 sc->msix_irq_res[i]);
4324 sc->msix_irq_res[i] = NULL;
4326 kfree(sc->msix_irq_res, M_DEVBUF);
4330 pci_release_msi(sc->dev);
4332 abort_with_msix_table:
4333 bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4334 sc->msix_table_res);
4340 mxge_add_single_irq(mxge_softc_t *sc)
4342 int count, err, rid;
4344 count = pci_msi_count(sc->dev);
4345 if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4351 sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0,
4352 1, RF_SHAREABLE | RF_ACTIVE);
4353 if (sc->irq_res == NULL) {
4354 device_printf(sc->dev, "could not alloc interrupt\n");
4358 device_printf(sc->dev, "using %s irq %ld\n",
4359 sc->legacy_irq ? "INTx" : "MSI",
4360 rman_get_start(sc->irq_res));
4361 err = bus_setup_intr(sc->dev, sc->irq_res,
4363 mxge_intr, &sc->ss[0], &sc->ih,
			     NULL /* XXX serializer (none yet; NULL assumed OK) */);
4366 bus_release_resource(sc->dev, SYS_RES_IRQ,
4367 sc->legacy_irq ? 0 : 1, sc->irq_res);
4368 if (!sc->legacy_irq)
4369 pci_release_msi(sc->dev);
4375 mxge_rem_msix_irqs(mxge_softc_t *sc)
4379 for (i = 0; i < sc->num_slices; i++) {
4380 if (sc->msix_ih[i] != NULL) {
4381 bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4383 sc->msix_ih[i] = NULL;
4386 kfree(sc->msix_ih, M_DEVBUF);
4388 for (i = 0; i < sc->num_slices; i++) {
4390 if (sc->msix_irq_res[i] != NULL)
4391 bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4392 sc->msix_irq_res[i]);
4393 sc->msix_irq_res[i] = NULL;
4395 kfree(sc->msix_irq_res, M_DEVBUF);
4397 bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4398 sc->msix_table_res);
4400 pci_release_msi(sc->dev);
4405 mxge_rem_single_irq(mxge_softc_t *sc)
4407 bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4408 bus_release_resource(sc->dev, SYS_RES_IRQ,
4409 sc->legacy_irq ? 0 : 1, sc->irq_res);
4410 if (!sc->legacy_irq)
4411 pci_release_msi(sc->dev);
4415 mxge_rem_irq(mxge_softc_t *sc)
4417 if (sc->num_slices > 1)
4418 mxge_rem_msix_irqs(sc);
4420 mxge_rem_single_irq(sc);
4424 mxge_add_irq(mxge_softc_t *sc)
4428 if (sc->num_slices > 1)
4429 err = mxge_add_msix_irqs(sc);
4431 err = mxge_add_single_irq(sc);
4433 if (0 && err == 0 && sc->num_slices > 1) {
4434 mxge_rem_msix_irqs(sc);
4435 err = mxge_add_msix_irqs(sc);
4442 mxge_attach(device_t dev)
4444 mxge_softc_t *sc = device_get_softc(dev);
4445 struct ifnet *ifp = &sc->arpcom.ac_if;
4449 * avoid rewriting half the lines in this file to use
4450 * &sc->arpcom.ac_if instead
4454 mxge_fetch_tunables(sc);
4456 err = bus_dma_tag_create(NULL, /* parent */
4459 BUS_SPACE_MAXADDR, /* low */
4460 BUS_SPACE_MAXADDR, /* high */
4461 NULL, NULL, /* filter */
4462 65536 + 256, /* maxsize */
4463 MXGE_MAX_SEND_DESC, /* num segs */
4464 65536, /* maxsegsize */
4466 &sc->parent_dmat); /* tag */
4469 device_printf(sc->dev, "Err %d allocating parent dmat\n",
4471 goto abort_with_nothing;
4475 if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4477 ksnprintf(sc->cmd_lock_name, sizeof(sc->cmd_lock_name), "%s:cmd",
4478 device_get_nameunit(dev));
4479 lockinit(&sc->cmd_lock, sc->cmd_lock_name, 0, LK_CANRECURSE);
4480 ksnprintf(sc->driver_lock_name, sizeof(sc->driver_lock_name),
4481 "%s:drv", device_get_nameunit(dev));
4482 lockinit(&sc->driver_lock, sc->driver_lock_name,
4485 callout_init(&sc->co_hdl);
4487 mxge_setup_cfg_space(sc);
4489 /* Map the board into the kernel */
4491 sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
4493 if (sc->mem_res == NULL) {
4494 device_printf(dev, "could not map memory\n");
4496 goto abort_with_lock;
4498 sc->sram = rman_get_virtual(sc->mem_res);
4499 sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4500 if (sc->sram_size > rman_get_size(sc->mem_res)) {
4501 device_printf(dev, "impossible memory region size %ld\n",
4502 rman_get_size(sc->mem_res));
4504 goto abort_with_mem_res;
	/* make NULL terminated copy of the EEPROM strings section of
	   lanai SRAM */
4509 bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4510 bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4511 rman_get_bushandle(sc->mem_res),
4512 sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4514 MXGE_EEPROM_STRINGS_SIZE - 2);
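	/* the bzero above plus copying only MXGE_EEPROM_STRINGS_SIZE - 2
	 * bytes guarantees the copied strings remain NUL-terminated even
	 * if the EEPROM content is garbage */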
4515 err = mxge_parse_strings(sc);
4517 goto abort_with_mem_res;
4519 /* Enable write combining for efficient use of PCIe bus */
4522 /* Allocate the out of band dma memory */
4523 err = mxge_dma_alloc(sc, &sc->cmd_dma,
4524 sizeof (mxge_cmd_t), 64);
4526 goto abort_with_mem_res;
4527 sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4528 err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4530 goto abort_with_cmd_dma;
4532 err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4534 goto abort_with_zeropad_dma;
4536 /* select & load the firmware */
4537 err = mxge_select_firmware(sc);
4539 goto abort_with_dmabench;
4540 sc->intr_coal_delay = mxge_intr_coal_delay;
4542 mxge_slice_probe(sc);
4543 err = mxge_alloc_slices(sc);
4545 goto abort_with_dmabench;
4547 err = mxge_reset(sc, 0);
4549 goto abort_with_slices;
4551 err = mxge_alloc_rings(sc);
4553 device_printf(sc->dev, "failed to allocate rings\n");
4554 goto abort_with_dmabench;
4557 err = mxge_add_irq(sc);
4559 device_printf(sc->dev, "failed to add irq\n");
4560 goto abort_with_rings;
4563 ifp->if_baudrate = IF_Gbps(10UL);
4564 ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4567 ifp->if_capabilities |= IFCAP_LRO;
4570 #ifdef MXGE_NEW_VLAN_API
4571 ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4574 sc->max_mtu = mxge_max_mtu(sc);
4575 if (sc->max_mtu >= 9000)
4576 ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4578 device_printf(dev, "MTU limited to %d. Install "
4579 "latest firmware for 9000 byte jumbo support\n",
4580 sc->max_mtu - ETHER_HDR_LEN);
4581 ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4582 ifp->if_capenable = ifp->if_capabilities;
4583 if (sc->lro_cnt == 0)
4584 ifp->if_capenable &= ~IFCAP_LRO;
4586 ifp->if_init = mxge_init;
4588 ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4589 ifp->if_ioctl = mxge_ioctl;
4590 ifp->if_start = mxge_start;
4591 /* Initialise the ifmedia structure */
4592 ifmedia_init(&sc->media, 0, mxge_media_change,
4594 mxge_set_media(sc, IFM_ETHER | IFM_AUTO);
4595 mxge_media_probe(sc);
4597 ether_ifattach(ifp, sc->mac_addr);
4598 /* ether_ifattach sets mtu to ETHERMTU */
4599 if (mxge_initial_mtu != ETHERMTU)
4600 mxge_change_mtu(sc, mxge_initial_mtu);
4602 mxge_add_sysctls(sc);
4603 #ifdef IFNET_BUF_RING
4604 ifp->if_transmit = mxge_transmit;
4605 ifp->if_qflush = mxge_qflush;
4610 mxge_free_rings(sc);
4612 mxge_free_slices(sc);
4613 abort_with_dmabench:
4614 mxge_dma_free(&sc->dmabench_dma);
4615 abort_with_zeropad_dma:
4616 mxge_dma_free(&sc->zeropad_dma);
4618 mxge_dma_free(&sc->cmd_dma);
4620 bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4622 pci_disable_busmaster(dev);
4623 lockuninit(&sc->cmd_lock);
4624 lockuninit(&sc->driver_lock);
4626 abort_with_parent_dmat:
4627 bus_dma_tag_destroy(sc->parent_dmat);
4634 mxge_detach(device_t dev)
4636 mxge_softc_t *sc = device_get_softc(dev);
4638 if (mxge_vlans_active(sc)) {
4639 device_printf(sc->dev,
4640 "Detach vlans before removing module\n");
4643 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
4645 if (sc->ifp->if_flags & IFF_RUNNING)
4647 lockmgr(&sc->driver_lock, LK_RELEASE);
4648 ether_ifdetach(sc->ifp);
4649 callout_drain(&sc->co_hdl);
4650 ifmedia_removeall(&sc->media);
4651 mxge_dummy_rdma(sc, 0);
4652 mxge_rem_sysctls(sc);
4654 mxge_free_rings(sc);
4655 mxge_free_slices(sc);
4656 mxge_dma_free(&sc->dmabench_dma);
4657 mxge_dma_free(&sc->zeropad_dma);
4658 mxge_dma_free(&sc->cmd_dma);
4659 bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4660 pci_disable_busmaster(dev);
4661 lockuninit(&sc->cmd_lock);
4662 lockuninit(&sc->driver_lock);
4664 bus_dma_tag_destroy(sc->parent_dmat);
4669 mxge_shutdown(device_t dev)
4675 This file uses Myri10GE driver indentation.
4678 c-file-style:"linux"