1 /******************************************************************************
3 Copyright (c) 2006-2009, Myricom Inc.
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
9 1. Redistributions of source code must retain the above copyright notice,
10 this list of conditions and the following disclaimer.
12 2. Neither the name of the Myricom Inc, nor the names of its
13 contributors may be used to endorse or promote products derived from
14 this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
28 ***************************************************************************/
30 #include <sys/cdefs.h>
31 /*__FBSDID("$FreeBSD: src/sys/dev/mxge/if_mxge.c,v 1.63 2009/06/26 11:45:06 rwatson Exp $");*/
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/linker.h>
36 #include <sys/firmware.h>
37 #include <sys/endian.h>
38 #include <sys/sockio.h>
40 #include <sys/malloc.h>
41 #include <sys/kernel.h>
43 #include <sys/module.h>
44 #include <sys/socket.h>
45 #include <sys/sysctl.h>
47 /* count xmits ourselves, rather than via drbr */
50 #include <net/if_arp.h>
51 #include <net/ethernet.h>
52 #include <net/if_dl.h>
53 #include <net/if_media.h>
57 #include <net/if_types.h>
58 #include <net/vlan/if_vlan_var.h>
61 #include <netinet/in_systm.h>
62 #include <netinet/in.h>
63 #include <netinet/ip.h>
64 #include <netinet/tcp.h>
66 #include <machine/resource.h>
70 #include <bus/pci/pcireg.h>
71 #include <bus/pci/pcivar.h>
72 #include <bus/pci/pci_private.h> /* XXX for pci_cfg_restore */
74 #include <vm/vm.h> /* for pmap_mapdev() */
77 #if defined(__i386) || defined(__amd64)
78 #include <machine/specialreg.h>
81 #include <dev/netif/mxge/mxge_mcp.h>
82 #include <dev/netif/mxge/mcp_gen_header.h>
83 /*#define MXGE_FAKE_IFP*/
84 #include <dev/netif/mxge/if_mxge_var.h>
86 #include <sys/buf_ring.h>
/*
 * Driver tunables (defaults below; presumably hooked up to loader
 * tunables/sysctls elsewhere in the file -- not visible in this chunk).
 */
92 static int mxge_nvidia_ecrc_enable = 1;
93 static int mxge_force_firmware = 0;
/* interrupt coalescing delay, in microseconds (see mxge_change_intr_coal) */
94 static int mxge_intr_coal_delay = 30;
95 static int mxge_deassert_wait = 1;
96 static int mxge_flow_control = 1;
97 static int mxge_verbose = 0;
98 static int mxge_lro_cnt = 8;
/* NOTE(review): mxge_ticks is deliberately left 0 here; presumably
   initialized from hz at attach time -- confirm against full source. */
99 static int mxge_ticks;
100 static int mxge_max_slices = 1;
101 static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
102 static int mxge_always_promisc = 0;
103 static int mxge_initial_mtu = ETHERMTU_JUMBO;
/*
 * Firmware image names.  "aligned" images assume the PCIe chipset
 * delivers 8-byte-aligned completions; "unaligned" (ethp) images work
 * around misaligned completions.  The "rss" variants support multiple
 * slices (see the long comment before mxge_firmware_probe below).
 */
104 static char *mxge_fw_unaligned = "mxge_ethp_z8e";
105 static char *mxge_fw_aligned = "mxge_eth_z8e";
106 static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
107 static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
/* newbus device interface entry points, defined later in this file */
109 static int mxge_probe(device_t dev);
110 static int mxge_attach(device_t dev);
111 static int mxge_detach(device_t dev);
112 static int mxge_shutdown(device_t dev);
113 static void mxge_intr(void *arg);
115 static device_method_t mxge_methods[] =
117 /* Device interface */
118 DEVMETHOD(device_probe, mxge_probe),
119 DEVMETHOD(device_attach, mxge_attach),
120 DEVMETHOD(device_detach, mxge_detach),
121 DEVMETHOD(device_shutdown, mxge_shutdown),
125 static driver_t mxge_driver =
129 sizeof(mxge_softc_t),
132 static devclass_t mxge_devclass;
134 /* Declare ourselves to be a child of the PCI bus.*/
135 DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
/* firmware(9) and zlib are required to load/decompress the NIC firmware */
136 MODULE_DEPEND(mxge, firmware, 1, 1, 1);
137 MODULE_DEPEND(mxge, zlib, 1, 1, 1);
/* forward declarations for helpers used before their definitions */
139 static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
140 static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
141 static int mxge_close(mxge_softc_t *sc);
142 static int mxge_open(mxge_softc_t *sc);
143 static void mxge_tick(void *arg);
/*
 * Device probe: match Myricom Z8E / Z8E_9 10GbE NICs and set a
 * human-readable description based on the PCI revision ID.
 * NOTE(review): the switch header, breaks and return statements are
 * missing from this view of the source -- do not edit logic here.
 */
146 mxge_probe(device_t dev)
151 if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
152 ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
153 (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
154 rev = pci_get_revid(dev);
156 case MXGE_PCI_REV_Z8E:
157 device_set_desc(dev, "Myri10G-PCIE-8A");
159 case MXGE_PCI_REV_Z8ES:
160 device_set_desc(dev, "Myri10G-PCIE-8B");
/* unknown revision: still attach, but warn */
163 device_set_desc(dev, "Myri10G-PCIE-8??");
164 device_printf(dev, "Unrecognized rev %d NIC\n",
/*
 * Enable write-combining on the mapped NIC SRAM (x86/amd64 only) by
 * changing the mapping's page attribute to PAT_WRITE_COMBINING.
 */
174 mxge_enable_wc(mxge_softc_t *sc)
176 #if defined(__i386) || defined(__amd64)
181 len = rman_get_size(sc->mem_res);
182 err = pmap_change_attr((vm_offset_t) sc->sram,
183 len, PAT_WRITE_COMBINING);
185 device_printf(sc->dev, "pmap_change_attr failed, %d\n",
193 /* callback to get our DMA address */
/* bus_dmamap_load() callback: stores the single segment's bus address
   into the bus_addr_t pointed to by arg (assumes nsegs == 1). */
195 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
199 *(bus_addr_t *) arg = segs->ds_addr;
/*
 * Allocate a coherent DMA-able buffer of 'bytes' with the given
 * alignment: create a tag, allocate+zero the memory, then load the map
 * to obtain dma->bus_addr (via mxge_dmamap_callback).  On failure the
 * steps completed so far are unwound goto-style.
 */
204 mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
205 bus_size_t alignment)
208 device_t dev = sc->dev;
209 bus_size_t boundary, maxsegsize;
/* NOTE(review): the else-branch setting boundary/maxsegsize is missing
   from this view; presumably 4KB-aligned multi-page buffers get a 4KB
   boundary -- confirm against full source. */
211 if (bytes > 4096 && alignment == 4096) {
219 /* allocate DMAable memory tags */
220 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
221 alignment, /* alignment */
222 boundary, /* boundary */
223 BUS_SPACE_MAXADDR, /* low */
224 BUS_SPACE_MAXADDR, /* high */
225 NULL, NULL, /* filter */
228 maxsegsize, /* maxsegsize */
229 BUS_DMA_COHERENT, /* flags */
230 &dma->dmat); /* tag */
232 device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
236 /* allocate DMAable memory & map */
237 err = bus_dmamem_alloc(dma->dmat, &dma->addr,
238 (BUS_DMA_WAITOK | BUS_DMA_COHERENT
239 | BUS_DMA_ZERO), &dma->map);
241 device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
242 goto abort_with_dmat;
245 /* load the memory */
246 err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
247 mxge_dmamap_callback,
248 (void *)&dma->bus_addr, 0);
250 device_printf(dev, "couldn't load map (err = %d)\n", err);
/* error unwind: free memory, then destroy the tag */
256 bus_dmamem_free(dma->dmat, dma->addr, dma->map);
258 (void)bus_dma_tag_destroy(dma->dmat);
/* Release a buffer allocated by mxge_dma_alloc(): unload, free, destroy. */
264 mxge_dma_free(mxge_dma_t *dma)
266 bus_dmamap_unload(dma->dmat, dma->map);
267 bus_dmamem_free(dma->dmat, dma->addr, dma->map);
268 (void)bus_dma_tag_destroy(dma->dmat);
272 * The eeprom strings on the lanaiX have the format
/*
 * Parse the NUL-separated "KEY=value" strings from the EEPROM into the
 * softc: MAC address ("MAC=xx:xx:..."), product code ("PC="), and
 * serial number ("SN=").  Scanning is bounded by
 * MXGE_EEPROM_STRINGS_SIZE and stops at an empty string.
 */
279 mxge_parse_strings(mxge_softc_t *sc)
/* advance ptr past the current NUL-terminated string (bounds-checked) */
281 #define MXGE_NEXT_STRING(p) while(ptr < limit && *ptr++)
286 ptr = sc->eeprom_strings;
287 limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
289 while (ptr < limit && *ptr != '\0') {
290 if (memcmp(ptr, "MAC=", 4) == 0) {
292 sc->mac_addr_string = ptr;
/* parse 6 colon-separated hex bytes of the MAC address */
293 for (i = 0; i < 6; i++) {
295 if ((ptr + 2) > limit)
297 sc->mac_addr[i] = strtoul(ptr, NULL, 16);
300 } else if (memcmp(ptr, "PC=", 3) == 0) {
302 strncpy(sc->product_code_string, ptr,
303 sizeof (sc->product_code_string) - 1);
304 } else if (memcmp(ptr, "SN=", 3) == 0) {
306 strncpy(sc->serial_number_string, ptr,
307 sizeof (sc->serial_number_string) - 1);
309 MXGE_NEXT_STRING(ptr);
/* reached only when no valid MAC was found (abort path) */
316 device_printf(sc->dev, "failed to parse eeprom_strings\n");
321 #if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
/*
 * Enable ECRC generation on an upstream Nvidia (CK804/MCP55) PCIe
 * bridge so that completions arriving at the NIC are 8-byte aligned.
 * Because these chipsets do not expose extended config space through
 * normal config accesses, the register at offset 0x178 is reached by
 * pmap_mapdev()'ing the chipset's memory-mapped config window directly.
 * Controlled by the mxge_nvidia_ecrc_enable tunable.
 */
323 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
326 unsigned long base, off;
328 device_t pdev, mcp55;
329 uint16_t vendor_id, device_id, word;
330 uintptr_t bus, slot, func, ivend, idev;
334 if (!mxge_nvidia_ecrc_enable)
/* the bridge of interest is the grandparent of the NIC device */
337 pdev = device_get_parent(device_get_parent(sc->dev));
339 device_printf(sc->dev, "could not find parent?\n");
342 vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
343 device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
/* 0x10de == Nvidia; bail silently on other vendors */
345 if (vendor_id != 0x10de)
350 if (device_id == 0x005d) {
351 /* ck804, base address is magic */
353 } else if (device_id >= 0x0374 && device_id <= 0x378) {
354 /* mcp55, base address stored in chipset */
355 mcp55 = pci_find_bsf(0, 0, 0);
357 0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
358 0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
/* config-window base lives in bits of register 0x90 of dev 0:0:0 */
359 word = pci_read_config(mcp55, 0x90, 2);
360 base = ((unsigned long)word & 0x7ffeU) << 25;
367 Test below is commented because it is believed that doing
368 config read/write beyond 0xff will access the config space
369 for the next larger function. Uncomment this and remove
370 the hacky pmap_mapdev() way of accessing config space when
371 FreeBSD grows support for extended pcie config space access
374 /* See if we can, by some miracle, access the extended
376 val = pci_read_config(pdev, 0x178, 4);
377 if (val != 0xffffffff) {
379 pci_write_config(pdev, 0x178, val, 4);
383 /* Rather than using normal pci config space writes, we must
384 * map the Nvidia config space ourselves. This is because on
385 * opteron/nvidia class machine the 0xe000000 mapping is
386 * handled by the nvidia chipset, that means the internal PCI
387 * device (the on-chip northbridge), or the amd-8131 bridge
388 * and things behind them are not visible by this method.
/* fetch the bridge's bus/slot/function and IDs via bus IVARs */
391 BUS_READ_IVAR(device_get_parent(pdev), pdev,
393 BUS_READ_IVAR(device_get_parent(pdev), pdev,
394 PCI_IVAR_SLOT, &slot);
395 BUS_READ_IVAR(device_get_parent(pdev), pdev,
396 PCI_IVAR_FUNCTION, &func);
397 BUS_READ_IVAR(device_get_parent(pdev), pdev,
398 PCI_IVAR_VENDOR, &ivend);
399 BUS_READ_IVAR(device_get_parent(pdev), pdev,
400 PCI_IVAR_DEVICE, &idev);
/* compute the physical address of this function's config space */
403 + 0x00100000UL * (unsigned long)bus
404 + 0x00001000UL * (unsigned long)(func
407 /* map it into the kernel */
408 va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
412 device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
415 /* get a pointer to the config space mapped into the kernel */
416 cfgptr = va + (off & PAGE_MASK);
418 /* make sure that we can really access it */
419 vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
420 device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
421 if (! (vendor_id == ivend && device_id == idev)) {
422 device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
423 vendor_id, device_id);
424 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
/* extended config register 0x178 holds the ECRC enable bits */
428 ptr32 = (uint32_t*)(cfgptr + 0x178);
431 if (val == 0xffffffff) {
432 device_printf(sc->dev, "extended mapping failed\n");
433 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
437 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
439 device_printf(sc->dev,
440 "Enabled ECRC on upstream Nvidia bridge "
442 (int)bus, (int)slot, (int)func);
/* Non-x86 stub: Nforce4 chipsets only exist on x86/amd64 hosts. */
447 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
449 device_printf(sc->dev,
450 "Nforce 4 chipset on non-x86/amd64!?!?!\n");
/*
 * Ask the firmware to run read, write, and read+write DMA benchmarks
 * against the dmabench buffer, storing the resulting MB/s figures in
 * sc->read_dma / sc->write_dma / sc->read_write_dma.  test_type is
 * either MXGEFW_DMA_TEST (benchmark) or MXGEFW_CMD_UNALIGNED_TEST
 * (abort on the first unaligned completion).
 */
457 mxge_dma_test(mxge_softc_t *sc, int test_type)
460 bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
466 /* Run a small DMA test.
467 * The magic multipliers to the length tell the firmware
468 * to do DMA read, write, or read+write tests. The
469 * results are returned in cmd.data0. The upper 16
470 * bits of the return is the number of transfers completed.
471 * The lower 16 bits is the time in 0.5us ticks that the
472 * transfers took to complete.
475 len = sc->tx_boundary;
/* read test: length * 0x10000 */
477 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
478 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
479 cmd.data2 = len * 0x10000;
480 status = mxge_send_cmd(sc, test_type, &cmd);
/* MB/s = (transfers * len bytes) / (ticks * 0.5us); the *2 converts
   0.5us ticks to a per-microsecond rate */
485 sc->read_dma = ((cmd.data0>>16) * len * 2) /
486 (cmd.data0 & 0xffff);
/* write test: length * 0x1 */
487 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
488 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
489 cmd.data2 = len * 0x1;
490 status = mxge_send_cmd(sc, test_type, &cmd);
495 sc->write_dma = ((cmd.data0>>16) * len * 2) /
496 (cmd.data0 & 0xffff);
/* combined read+write test: length * 0x10001; extra *2 because each
   transfer moves data in both directions */
498 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
499 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
500 cmd.data2 = len * 0x10001;
501 status = mxge_send_cmd(sc, test_type, &cmd);
506 sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
507 (cmd.data0 & 0xffff);
/* for the unaligned test, failure is an expected outcome, so stay quiet */
510 if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
511 device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
518 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
519 * when the PCI-E Completion packets are aligned on an 8-byte
520 * boundary. Some PCI-E chip sets always align Completion packets; on
521 * the ones that do not, the alignment can be enforced by enabling
522 * ECRC generation (if supported).
524 * When PCI-E Completion packets are not aligned, it is actually more
525 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
527 * If the driver can neither enable ECRC nor verify that it has
528 * already been enabled, then it must use a firmware image which works
529 * around unaligned completion packets (ethp_z8e.dat), and it should
530 * also ensure that it never gives the device a Read-DMA which is
531 * larger than 2KB by setting the tx_boundary to 2KB. If ECRC is
532 * enabled, then the driver should use the aligned (eth_z8e.dat)
533 * firmware image, and set tx_boundary to 4KB.
/*
 * Try the optimized (aligned-completion) firmware: verify the max read
 * request size is 4KB, enable ECRC on an Nvidia bridge if present, then
 * run the firmware's unaligned-completion detector.  Returns 0 if the
 * aligned firmware is safe to keep; non-zero means the caller should
 * fall back to the unaligned (ethp) image.
 */
537 mxge_firmware_probe(mxge_softc_t *sc)
539 device_t dev = sc->dev;
543 sc->tx_boundary = 4096;
545 * Verify the max read request size was set to 4KB
546 * before trying the test with 4KB.
/* read PCIe device control (cap offset + 0x8) to check MRRS encoding */
548 if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
549 pectl = pci_read_config(dev, reg + 0x8, 2);
550 if ((pectl & (5 << 12)) != (5 << 12)) {
551 device_printf(dev, "Max Read Req. size != 4k (0x%x\n",
553 sc->tx_boundary = 2048;
558 * load the optimized firmware (which assumes aligned PCIe
559 * completions) in order to see if it works on this host.
561 sc->fw_name = mxge_fw_aligned;
562 status = mxge_load_firmware(sc, 1);
568 * Enable ECRC if possible
570 mxge_enable_nvidia_ecrc(sc);
573 * Run a DMA test which watches for unaligned completions and
574 * aborts on the first one seen.
577 status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
579 return 0; /* keep the aligned firmware */
582 device_printf(dev, "DMA test failed: %d\n", status);
583 if (status == ENOSYS)
584 device_printf(dev, "Falling back to ethp! "
585 "Please install up to date fw\n");
/*
 * Choose between the aligned and unaligned firmware images (see the
 * long comment above) and load the selection.  The mxge_force_firmware
 * tunable overrides the probe; links of width <= 4 can always use the
 * aligned image since they cannot exceed the 2KB completion issue.
 */
590 mxge_select_firmware(mxge_softc_t *sc)
595 if (mxge_force_firmware != 0) {
596 if (mxge_force_firmware == 1)
601 device_printf(sc->dev,
602 "Assuming %s completions (forced)\n",
603 aligned ? "aligned" : "unaligned");
607 /* if the PCIe link width is 4 or less, we can use the aligned
608 firmware and skip any checks */
609 if (sc->link_width != 0 && sc->link_width <= 4) {
610 device_printf(sc->dev,
611 "PCIe x%d Link, expect reduced performance\n",
/* probe succeeded: aligned firmware is already loaded and verified */
617 if (0 == mxge_firmware_probe(sc))
622 sc->fw_name = mxge_fw_aligned;
623 sc->tx_boundary = 4096;
/* unaligned fallback limits Read-DMAs to 2KB */
625 sc->fw_name = mxge_fw_unaligned;
626 sc->tx_boundary = 2048;
628 return (mxge_load_firmware(sc, 0));
/*
 * Sanity-check a firmware image header: correct MCP type, and a
 * major.minor version matching what this driver was built against.
 * Also records the version string in the softc for sysctl export.
 */
638 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
642 if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
643 device_printf(sc->dev, "Bad firmware type: 0x%x\n",
644 be32toh(hdr->mcp_type));
648 /* save firmware version for sysctl */
649 strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
651 device_printf(sc->dev, "firmware id: %s\n", hdr->version);
653 ksscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
654 &sc->fw_ver_minor, &sc->fw_ver_tiny);
/* tiny version may differ; major.minor must match exactly */
656 if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
657 && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
658 device_printf(sc->dev, "Found firmware version %s\n",
660 device_printf(sc->dev, "Driver needs %d.%d\n",
661 MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
/* zlib allocator callbacks backed by kmalloc(M_TEMP)/kfree */
669 z_alloc(void *nil, u_int items, u_int size)
673 ptr = kmalloc(items * size, M_TEMP, M_NOWAIT);
678 z_free(void *nil, void *ptr)
/*
 * Fetch the gzip-compressed firmware image via firmware(9), inflate it
 * with zlib, validate its embedded header, and copy it into NIC SRAM
 * at MXGE_FW_OFFSET in 256-byte PIO chunks.  *limit is presumably
 * updated to the firmware size on success (assignment not visible in
 * this view -- confirm against full source).
 */
685 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
688 char *inflate_buffer;
689 const struct firmware *fw;
690 const mcp_gen_header_t *hdr;
697 fw = firmware_get(sc->fw_name);
699 device_printf(sc->dev, "Could not find firmware image %s\n",
706 /* setup zlib and decompress f/w */
707 bzero(&zs, sizeof (zs));
710 status = inflateInit(&zs);
711 if (status != Z_OK) {
716 /* the uncompressed size is stored as the firmware version,
717 which would otherwise go unused */
718 fw_len = (size_t) fw->version;
719 inflate_buffer = kmalloc(fw_len, M_TEMP, M_NOWAIT);
720 if (inflate_buffer == NULL)
722 zs.avail_in = fw->datasize;
723 zs.next_in = __DECONST(char *, fw->data);
724 zs.avail_out = fw_len;
725 zs.next_out = inflate_buffer;
726 status = inflate(&zs, Z_FINISH);
/* Z_FINISH must consume the whole stream in one call */
727 if (status != Z_STREAM_END) {
728 device_printf(sc->dev, "zlib %d\n", status);
730 goto abort_with_buffer;
/* header pointer is stored big-endian at a fixed offset in the image;
   reject misaligned or out-of-range offsets */
734 hdr_offset = htobe32(*(const uint32_t *)
735 (inflate_buffer + MCP_HEADER_PTR_OFFSET));
736 if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
737 device_printf(sc->dev, "Bad firmware file");
739 goto abort_with_buffer;
741 hdr = (const void*)(inflate_buffer + hdr_offset);
743 status = mxge_validate_firmware(sc, hdr);
745 goto abort_with_buffer;
747 /* Copy the inflated firmware to NIC SRAM. */
748 for (i = 0; i < fw_len; i += 256) {
749 mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
751 min(256U, (unsigned)(fw_len - i)));
/* cleanup paths: free the inflate buffer, release the firmware ref */
760 kfree(inflate_buffer, M_TEMP);
764 firmware_put(fw, FIRMWARE_UNLOAD);
769 * Enable or disable periodic RDMAs from the host to make certain
770 * chipsets resend dropped PCIe messages
/*
 * Build an 8-byte-aligned command block on the stack containing the
 * confirmation address, confirmation value (-1) and the enable flag,
 * PIO-copy it to the MXGEFW_BOOT_DUMMY_RDMA doorbell, then poll the
 * confirmation word until the firmware acknowledges (up to 20 tries).
 */
774 mxge_dummy_rdma(mxge_softc_t *sc, int enable)
777 volatile uint32_t *confirm;
778 volatile char *submit;
779 uint32_t *buf, dma_low, dma_high;
/* align buf to 8 bytes within buf_bytes */
782 buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
784 /* clear confirmation addr */
785 confirm = (volatile uint32_t *)sc->cmd;
789 /* send an rdma command to the PCIe engine, and wait for the
790 response in the confirmation address. The firmware should
791 write a -1 there to indicate it is alive and well
794 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
795 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
796 buf[0] = htobe32(dma_high); /* confirm addr MSW */
797 buf[1] = htobe32(dma_low); /* confirm addr LSW */
798 buf[2] = htobe32(0xffffffff); /* confirm data */
799 dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
800 dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
801 buf[3] = htobe32(dma_high); /* dummy addr MSW */
802 buf[4] = htobe32(dma_low); /* dummy addr LSW */
803 buf[5] = htobe32(enable); /* enable? */
806 submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
808 mxge_pio_copy(submit, buf, 64);
/* poll for the firmware's -1 acknowledgement */
813 while (*confirm != 0xffffffff && i < 20) {
817 if (*confirm != 0xffffffff) {
818 device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
819 (enable ? "enable" : "disable"), confirm,
/*
 * Issue a command to the firmware through the MXGEFW_ETH_CMD mailbox
 * and busy-wait (up to ~20ms) for the DMA'd response.  The response
 * result code is translated to an errno; on success the response data
 * is returned in data->data0.  Serialized by sc->cmd_lock.
 */
826 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
829 char buf_bytes[sizeof(*buf) + 8];
830 volatile mcp_cmd_response_t *response = sc->cmd;
831 volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
832 uint32_t dma_low, dma_high;
833 int err, sleep_total = 0;
835 /* ensure buf is aligned to 8 bytes */
836 buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
/* command payload is big-endian on the wire */
838 buf->data0 = htobe32(data->data0);
839 buf->data1 = htobe32(data->data1);
840 buf->data2 = htobe32(data->data2);
841 buf->cmd = htobe32(cmd);
842 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
843 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
845 buf->response_addr.low = htobe32(dma_low);
846 buf->response_addr.high = htobe32(dma_high);
847 lockmgr(&sc->cmd_lock, LK_EXCLUSIVE);
/* sentinel: firmware overwrites this when the response lands */
848 response->result = 0xffffffff;
850 mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
852 /* wait up to 20ms */
854 for (sleep_total = 0; sleep_total < 20; sleep_total++) {
855 bus_dmamap_sync(sc->cmd_dma.dmat,
856 sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
858 switch (be32toh(response->result)) {
860 data->data0 = be32toh(response->data);
866 case MXGEFW_CMD_UNKNOWN:
869 case MXGEFW_CMD_ERROR_UNALIGNED:
/* firmware busy: keep polling until the timeout expires */
872 case MXGEFW_CMD_ERROR_BUSY:
876 device_printf(sc->dev,
878 "failed, result = %d\n",
879 cmd, be32toh(response->result));
/* fell out of the loop: still busy after 20ms -> timeout */
887 device_printf(sc->dev, "mxge: command %d timed out"
889 cmd, be32toh(response->result));
890 lockmgr(&sc->cmd_lock, LK_RELEASE);
/*
 * Validate the firmware already running on the NIC (e.g. left over
 * from a previous driver or option ROM) so it can be adopted instead
 * of reloaded.  Reads the header pointer from SRAM, copies the header
 * to host memory, validates it, and flags the known 1.4.4-1.4.11
 * broadcast-filtering bug.
 */
895 mxge_adopt_running_firmware(mxge_softc_t *sc)
897 struct mcp_gen_header *hdr;
898 const size_t bytes = sizeof (struct mcp_gen_header);
902 /* find running firmware header */
903 hdr_offset = htobe32(*(volatile uint32_t *)
904 (sc->sram + MCP_HEADER_PTR_OFFSET));
906 if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
907 device_printf(sc->dev,
908 "Running firmware has bad header offset (%d)\n",
913 /* copy header of running firmware from SRAM to host memory to
914 * validate firmware */
915 hdr = kmalloc(bytes, M_DEVBUF, M_NOWAIT);
917 device_printf(sc->dev, "could not kmalloc firmware hdr\n");
920 bus_space_read_region_1(rman_get_bustag(sc->mem_res),
921 rman_get_bushandle(sc->mem_res),
922 hdr_offset, (char *)hdr, bytes);
923 status = mxge_validate_firmware(sc, hdr);
924 kfree(hdr, M_DEVBUF);
927 * check to see if adopted firmware has bug where adopting
928 * it will cause broadcasts to be filtered unless the NIC
929 * is kept in ALLMULTI mode
931 if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
932 sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
933 sc->adopted_rx_filter_bug = 1;
934 device_printf(sc->dev, "Adopting fw %d.%d.%d: "
935 "working around rx filter bug\n",
936 sc->fw_ver_major, sc->fw_ver_minor,
/*
 * Load sc->fw_name into the NIC: decompress and copy to SRAM via
 * mxge_load_firmware_helper(), then hand it off through the
 * MXGEFW_BOOT_HANDOFF doorbell and poll for the firmware's -1
 * acknowledgement.  If loading fails and 'adopt' is set, fall back to
 * adopting the firmware already running on the NIC.
 */
945 mxge_load_firmware(mxge_softc_t *sc, int adopt)
947 volatile uint32_t *confirm;
948 volatile char *submit;
950 uint32_t *buf, size, dma_low, dma_high;
/* align the handoff command block to 8 bytes */
953 buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
955 size = sc->sram_size;
956 status = mxge_load_firmware_helper(sc, &size);
960 /* Try to use the currently running firmware, if
962 status = mxge_adopt_running_firmware(sc);
964 device_printf(sc->dev,
965 "failed to adopt running firmware\n");
968 device_printf(sc->dev,
969 "Successfully adopted running firmware\n");
970 if (sc->tx_boundary == 4096) {
971 device_printf(sc->dev,
972 "Using firmware currently running on NIC"
974 device_printf(sc->dev,
975 "performance consider loading optimized "
/* adopted firmware: assume worst-case unaligned behavior */
978 sc->fw_name = mxge_fw_unaligned;
979 sc->tx_boundary = 2048;
982 /* clear confirmation addr */
983 confirm = (volatile uint32_t *)sc->cmd;
986 /* send a reload command to the bootstrap MCP, and wait for the
987 response in the confirmation address. The firmware should
988 write a -1 there to indicate it is alive and well
991 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
992 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
994 buf[0] = htobe32(dma_high); /* confirm addr MSW */
995 buf[1] = htobe32(dma_low); /* confirm addr LSW */
996 buf[2] = htobe32(0xffffffff); /* confirm data */
998 /* FIX: All newest firmware should un-protect the bottom of
999 the sram before handoff. However, the very first interfaces
1000 do not. Therefore the handoff copy must skip the first 8 bytes
1002 /* where the code starts*/
1003 buf[3] = htobe32(MXGE_FW_OFFSET + 8);
1004 buf[4] = htobe32(size - 8); /* length of code */
1005 buf[5] = htobe32(8); /* where to copy to */
1006 buf[6] = htobe32(0); /* where to jump to */
1008 submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
1009 mxge_pio_copy(submit, buf, 64);
/* poll up to 20 iterations for the boot acknowledgement */
1014 while (*confirm != 0xffffffff && i < 20) {
1017 bus_dmamap_sync(sc->cmd_dma.dmat,
1018 sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
1020 if (*confirm != 0xffffffff) {
1021 device_printf(sc->dev,"handoff failed (%p = 0x%x)",
/*
 * Push sc->mac_addr to the firmware: first four bytes in data0,
 * last two in data1.
 */
1030 mxge_update_mac_address(mxge_softc_t *sc)
1033 uint8_t *addr = sc->mac_addr;
1037 cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1038 | (addr[2] << 8) | addr[3]);
1040 cmd.data1 = ((addr[4] << 8) | (addr[5]));
1042 status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
/* Enable or disable firmware-level 802.3x flow control (pause frames). */
1047 mxge_change_pause(mxge_softc_t *sc, int pause)
1053 status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1056 status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1060 device_printf(sc->dev, "Failed to set flow control mode\n");
/* Enable or disable promiscuous RX; mxge_always_promisc forces it on. */
1068 mxge_change_promisc(mxge_softc_t *sc, int promisc)
1073 if (mxge_always_promisc)
1077 status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1080 status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1084 device_printf(sc->dev, "Failed to set promisc mode\n");
/*
 * Program the firmware multicast filter from the interface's multicast
 * address list.  Sequence: enable ALLMULTI while editing, flush all
 * groups, join each address on the list, then re-enable filtering.
 * Stays in ALLMULTI when IFF_ALLMULTI is set, when the adopted-firmware
 * rx filter bug is present, or on any command failure.
 */
1089 mxge_set_multicast_list(mxge_softc_t *sc)
1092 struct ifmultiaddr *ifma;
1093 struct ifnet *ifp = sc->ifp;
1096 /* This firmware is known to not support multicast */
1097 if (!sc->fw_multicast_support)
1100 /* Disable multicast filtering while we play with the lists*/
1101 err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1103 device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
1104 " error status: %d\n", err);
/* buggy adopted firmware must stay in ALLMULTI (see adopt path) */
1108 if (sc->adopted_rx_filter_bug)
1111 if (ifp->if_flags & IFF_ALLMULTI)
1112 /* request to disable multicast filtering, so quit here */
1115 /* Flush all the filters */
1117 err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1119 device_printf(sc->dev,
1120 "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
1121 ", error status: %d\n", err);
1125 /* Walk the multicast list, and add each address */
1127 if_maddr_rlock(ifp);
1128 TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1129 if (ifma->ifma_addr->sa_family != AF_LINK)
/* split the 6-byte link-level address across data0/data1 */
1131 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1133 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
1135 cmd.data0 = htonl(cmd.data0);
1136 cmd.data1 = htonl(cmd.data1);
1137 err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1139 device_printf(sc->dev, "Failed "
1140 "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
1142 /* abort, leaving multicast filtering off */
1143 if_maddr_runlock(ifp);
1147 if_maddr_runlock(ifp);
1148 /* Enable multicast filtering */
1149 err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1151 device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
1152 ", error status: %d\n", err);
/*
 * Determine the largest MTU the firmware/mbuf configuration supports:
 * full MXGEFW_MAX_MTU when page-sized jumbo clusters suffice or the
 * firmware can chain multiple big buffers; otherwise limited by
 * MJUMPAGESIZE.  MXGEFW_PAD accounts for firmware padding overhead.
 */
1157 mxge_max_mtu(mxge_softc_t *sc)
1162 if (MJUMPAGESIZE - MXGEFW_PAD > MXGEFW_MAX_MTU)
1163 return MXGEFW_MAX_MTU - MXGEFW_PAD;
1165 /* try to set nbufs to see if it we can
1166 use virtually contiguous jumbos */
1168 status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1171 return MXGEFW_MAX_MTU - MXGEFW_PAD;
1173 /* otherwise, we're limited to MJUMPAGESIZE */
1174 return MJUMPAGESIZE - MXGEFW_PAD;
/*
 * Reset the firmware and re-establish all driver/firmware shared
 * state: intrq size, slice count, per-slice interrupt queue DMA
 * (when interrupts_setup), coalescing/irq-ack/deassert offsets, a DMA
 * benchmark, and zeroed per-slice counters.  Finishes by reprogramming
 * MAC address, promisc, pause, and the multicast list.
 */
1178 mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1180 struct mxge_slice_state *ss;
1181 mxge_rx_done_t *rx_done;
1182 volatile uint32_t *irq_claim;
1186 /* try to send a reset command to the card to see if it
1188 memset(&cmd, 0, sizeof (cmd));
1189 status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1191 device_printf(sc->dev, "failed reset\n");
/* restart the dummy-RDMA keepalive after the reset */
1195 mxge_dummy_rdma(sc, 1);
1198 /* set the intrq size */
1199 cmd.data0 = sc->rx_ring_size;
1200 status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1203 * Even though we already know how many slices are supported
1204 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
1205 * has magic side effects, and must be called after a reset.
1206 * It must be called prior to calling any RSS related cmds,
1207 * including assigning an interrupt queue for anything but
1208 * slice 0. It must also be called *after*
1209 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1210 * the firmware to compute offsets.
1213 if (sc->num_slices > 1) {
1214 /* ask the maximum number of slices it supports */
1215 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
1218 device_printf(sc->dev,
1219 "failed to get number of slices\n");
1223 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1224 * to setting up the interrupt queue DMA
1226 cmd.data0 = sc->num_slices;
1227 cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
1228 #ifdef IFNET_BUF_RING
1229 cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1231 status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
1234 device_printf(sc->dev,
1235 "failed to set number of slices\n");
1241 if (interrupts_setup) {
1242 /* Now exchange information about interrupts */
1243 for (slice = 0; slice < sc->num_slices; slice++) {
1244 rx_done = &sc->ss[slice].rx_done;
1245 memset(rx_done->entry, 0, sc->rx_ring_size);
1246 cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
1247 cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
1249 status |= mxge_send_cmd(sc,
1250 MXGEFW_CMD_SET_INTRQ_DMA,
/* fetch SRAM offsets for intr coalescing / irq ack / irq deassert;
   errors are OR'd into status and checked once below */
1255 status |= mxge_send_cmd(sc,
1256 MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
1259 sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1261 status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1262 irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1265 status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
1267 sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1269 device_printf(sc->dev, "failed set interrupt parameters\n");
1274 *sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1277 /* run a DMA benchmark */
1278 (void) mxge_dma_test(sc, MXGEFW_DMA_TEST);
1280 for (slice = 0; slice < sc->num_slices; slice++) {
1281 ss = &sc->ss[slice];
/* each slice's irq-claim register is 2 words apart in SRAM */
1283 ss->irq_claim = irq_claim + (2 * slice);
1284 /* reset mcp/driver shared state back to 0 */
1285 ss->rx_done.idx = 0;
1286 ss->rx_done.cnt = 0;
1289 ss->tx.pkt_done = 0;
1290 ss->tx.queue_active = 0;
1291 ss->tx.activate = 0;
1292 ss->tx.deactivate = 0;
1297 ss->rx_small.cnt = 0;
1298 ss->lro_bad_csum = 0;
1300 ss->lro_flushed = 0;
1301 if (ss->fw_stats != NULL) {
1302 ss->fw_stats->valid = 0;
1303 ss->fw_stats->send_done_count = 0;
1306 sc->rdma_tags_available = 15;
/* re-apply host-side addressing/filtering state to the fresh firmware */
1307 status = mxge_update_mac_address(sc);
1308 mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
1309 mxge_change_pause(sc, sc->pause);
1310 mxge_set_multicast_list(sc);
/*
 * sysctl handler: read/write the interrupt coalescing delay (usecs).
 * Valid range is 1..1000000; the new value is written directly to the
 * firmware's SRAM register under the driver lock.
 */
1315 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1318 unsigned int intr_coal_delay;
1322 intr_coal_delay = sc->intr_coal_delay;
1323 err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1327 if (intr_coal_delay == sc->intr_coal_delay)
1330 if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1333 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
1334 *sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1335 sc->intr_coal_delay = intr_coal_delay;
1337 lockmgr(&sc->driver_lock, LK_RELEASE);
/* sysctl handler: toggle 802.3x flow control via mxge_change_pause(). */
1342 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1345 unsigned int enabled;
1349 enabled = sc->pause;
1350 err = sysctl_handle_int(oidp, &enabled, arg2, req);
1354 if (enabled == sc->pause)
1357 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
1358 err = mxge_change_pause(sc, enabled);
1359 lockmgr(&sc->driver_lock, LK_RELEASE);
/*
 * Apply a new LRO segment count (0 disables LRO); requires the driver
 * lock.  Restarts the interface if it is currently running so the new
 * setting takes effect.
 */
1364 mxge_change_lro_locked(mxge_softc_t *sc, int lro_cnt)
1371 ifp->if_capenable &= ~IFCAP_LRO;
1373 ifp->if_capenable |= IFCAP_LRO;
1374 sc->lro_cnt = lro_cnt;
1375 if (ifp->if_flags & IFF_RUNNING) {
1377 err = mxge_open(sc);
/* sysctl handler wrapping mxge_change_lro_locked() with the driver lock */
1383 mxge_change_lro(SYSCTL_HANDLER_ARGS)
1386 unsigned int lro_cnt;
1390 lro_cnt = sc->lro_cnt;
1391 err = sysctl_handle_int(oidp, &lro_cnt, arg2, req);
1395 if (lro_cnt == sc->lro_cnt)
1401 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
1402 err = mxge_change_lro_locked(sc, lro_cnt);
1403 lockmgr(&sc->driver_lock, LK_RELEASE);
/* sysctl handler: export a big-endian firmware counter as host-order int */
1408 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1414 arg2 = be32toh(*(int *)arg1);
1416 err = sysctl_handle_int(oidp, arg1, arg2, req);
/*
 * Tear down the per-slice sysctl trees and the slice parent node.
 * Safe to call when nothing was registered (slice_sysctl_tree NULL).
 */
1422 mxge_rem_sysctls(mxge_softc_t *sc)
1424 struct mxge_slice_state *ss;
1427 if (sc->slice_sysctl_tree == NULL)
1430 for (slice = 0; slice < sc->num_slices; slice++) {
1431 ss = &sc->ss[slice];
1432 if (ss == NULL || ss->sysctl_tree == NULL)
1434 sysctl_ctx_free(&ss->sysctl_ctx);
1435 ss->sysctl_tree = NULL;
1437 sysctl_ctx_free(&sc->slice_sysctl_ctx);
1438 sc->slice_sysctl_tree = NULL;
/*
 * Build the hw.mxgeN sysctl tree: a node named after the device, then
 * read-only identification strings, PCIe/DMA benchmark results, and
 * read/write performance tunables.  fw points at slice 0's firmware
 * stats block (used further down for the BE32 counter sysctls).
 */
1442 mxge_add_sysctls(mxge_softc_t *sc)
1444 struct sysctl_ctx_list *ctx;
1445 struct sysctl_oid_list *children;
1447 struct mxge_slice_state *ss;
1451 ctx = &sc->sysctl_ctx;
1452 sysctl_ctx_init(ctx);
1453 sc->sysctl_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_STATIC_CHILDREN(_hw),
1455 device_get_nameunit(sc->dev),
1457 if (sc->sysctl_tree == NULL) {
1458 device_printf(sc->dev, "can't add sysctl node\n");
1462 children = SYSCTL_CHILDREN(sc->sysctl_tree);
1463 fw = sc->ss[0].fw_stats;
1465 /* random information */
1466 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1468 CTLFLAG_RD, &sc->fw_version,
1469 0, "firmware version");
1470 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1472 CTLFLAG_RD, &sc->serial_number_string,
1473 0, "serial number");
1474 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1476 CTLFLAG_RD, &sc->product_code_string,
1478 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1480 CTLFLAG_RD, &sc->link_width,
1482 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1484 CTLFLAG_RD, &sc->tx_boundary,
1486 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1488 CTLFLAG_RD, &sc->wc,
1489 0, "write combining PIO?");
/* results of the boot-time DMA benchmarks */
1490 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1492 CTLFLAG_RD, &sc->read_dma,
1493 0, "DMA Read speed in MB/s");
1494 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1496 CTLFLAG_RD, &sc->write_dma,
1497 0, "DMA Write speed in MB/s");
1498 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1499 "read_write_dma_MBs",
1500 CTLFLAG_RD, &sc->read_write_dma,
1501 0, "DMA concurrent Read/Write speed in MB/s");
1504 /* performance related tunables */
1505 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1507 CTLTYPE_INT|CTLFLAG_RW, sc,
1508 0, mxge_change_intr_coal,
1509 "I", "interrupt coalescing delay in usecs");
/* register the flow-control toggle; description fixed — it was a
 * copy/paste of the intr-coalescing text above */
1511 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1512 "flow_control_enabled",
1513 CTLTYPE_INT|CTLFLAG_RW, sc,
1514 0, mxge_change_flow_control,
1515 "I", "enable/disable flow control");
/* driver-wide tunables and the firmware drop counters (big-endian in
 * the stats DMA block, hence the mxge_handle_be32 handler), followed by
 * per-slice debug counters under hw.mxgeN.slice.M */
1517 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1519 CTLFLAG_RW, &mxge_deassert_wait,
1520 0, "Wait for IRQ line to go low in ihandler");
1522 /* stats block from firmware is in network byte order.
1524 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1526 CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1527 0, mxge_handle_be32,
1529 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1530 "rdma_tags_available",
1531 CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1532 0, mxge_handle_be32,
1533 "I", "rdma_tags_available");
1534 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1535 "dropped_bad_crc32",
1536 CTLTYPE_INT|CTLFLAG_RD,
1537 &fw->dropped_bad_crc32,
1538 0, mxge_handle_be32,
1539 "I", "dropped_bad_crc32");
1540 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1542 CTLTYPE_INT|CTLFLAG_RD,
1543 &fw->dropped_bad_phy,
1544 0, mxge_handle_be32,
1545 "I", "dropped_bad_phy");
1546 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1547 "dropped_link_error_or_filtered",
1548 CTLTYPE_INT|CTLFLAG_RD,
1549 &fw->dropped_link_error_or_filtered,
1550 0, mxge_handle_be32,
1551 "I", "dropped_link_error_or_filtered");
1552 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1553 "dropped_link_overflow",
1554 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1555 0, mxge_handle_be32,
1556 "I", "dropped_link_overflow");
1557 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1558 "dropped_multicast_filtered",
1559 CTLTYPE_INT|CTLFLAG_RD,
1560 &fw->dropped_multicast_filtered,
1561 0, mxge_handle_be32,
1562 "I", "dropped_multicast_filtered");
1563 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1564 "dropped_no_big_buffer",
1565 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1566 0, mxge_handle_be32,
1567 "I", "dropped_no_big_buffer");
1568 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1569 "dropped_no_small_buffer",
1570 CTLTYPE_INT|CTLFLAG_RD,
1571 &fw->dropped_no_small_buffer,
1572 0, mxge_handle_be32,
1573 "I", "dropped_no_small_buffer");
1574 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1576 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1577 0, mxge_handle_be32,
1578 "I", "dropped_overrun");
1579 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1581 CTLTYPE_INT|CTLFLAG_RD,
1583 0, mxge_handle_be32,
1584 "I", "dropped_pause");
1585 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1587 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1588 0, mxge_handle_be32,
1589 "I", "dropped_runt");
1591 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1592 "dropped_unicast_filtered",
1593 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
1594 0, mxge_handle_be32,
1595 "I", "dropped_unicast_filtered");
1597 /* verbose printing? */
1598 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1600 CTLFLAG_RW, &mxge_verbose,
1601 0, "verbose printing");
1604 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1606 CTLTYPE_INT|CTLFLAG_RW, sc,
1608 "I", "number of lro merge queues");
1611 /* add counters exported for debugging from all slices */
1612 sysctl_ctx_init(&sc->slice_sysctl_ctx);
1613 sc->slice_sysctl_tree =
1614 SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1615 "slice", CTLFLAG_RD, 0, "");
/* each slice gets its own context so mxge_rem_sysctls() can free
 * them individually */
1617 for (slice = 0; slice < sc->num_slices; slice++) {
1618 ss = &sc->ss[slice];
1619 sysctl_ctx_init(&ss->sysctl_ctx);
1620 ctx = &ss->sysctl_ctx;
1621 children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1622 ksprintf(slice_num, "%d", slice);
1624 SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1626 children = SYSCTL_CHILDREN(ss->sysctl_tree);
1627 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1629 CTLFLAG_RD, &ss->rx_small.cnt,
1631 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1633 CTLFLAG_RD, &ss->rx_big.cnt,
1635 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1636 "lro_flushed", CTLFLAG_RD, &ss->lro_flushed,
1637 0, "number of lro merge queues flushed");
1639 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1640 "lro_queued", CTLFLAG_RD, &ss->lro_queued,
1641 0, "number of frames appended to lro merge"
1644 #ifndef IFNET_BUF_RING
1645 /* only transmit from slice 0 for now */
1649 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1651 CTLFLAG_RD, &ss->tx.req,
1654 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1656 CTLFLAG_RD, &ss->tx.done,
1658 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1660 CTLFLAG_RD, &ss->tx.pkt_done,
1662 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1664 CTLFLAG_RD, &ss->tx.stall,
1666 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1668 CTLFLAG_RD, &ss->tx.wake,
1670 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1672 CTLFLAG_RD, &ss->tx.defrag,
1674 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1676 CTLFLAG_RD, &ss->tx.queue_active,
1677 0, "tx_queue_active");
1678 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1680 CTLFLAG_RD, &ss->tx.activate,
1682 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1684 CTLFLAG_RD, &ss->tx.deactivate,
1685 0, "tx_deactivate");
1689 /* copy an array of mcp_kreq_ether_send_t's to the mcp. Copy
1690 backwards one at a time and handle ring wraps */
/*
 * Slow path used by mxge_submit_req() when the request list would wrap
 * the ring: PIO-copy one descriptor at a time from the highest slot
 * downward, masking the index so the wrap is handled per descriptor.
 */
1693 mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1694 mcp_kreq_ether_send_t *src, int cnt)
1696 int idx, starting_slot;
1697 starting_slot = tx->req;
1700 idx = (starting_slot + cnt) & tx->mask;
1701 mxge_pio_copy(&tx->lanai[idx],
1702 &src[cnt], sizeof(*src));
1708 * copy an array of mcp_kreq_ether_send_t's to the mcp. Copy
1709 * at most 32 bytes at a time, so as to avoid involving the software
1710 * pio handler in the nic. We re-write the first segment's flags
1711 * to mark them valid only after writing the entire chain
1715 mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
1720 volatile uint32_t *dst_ints;
1721 mcp_kreq_ether_send_t *srcp;
1722 volatile mcp_kreq_ether_send_t *dstp, *dst;
1725 idx = tx->req & tx->mask;
/* stash the real flags; the chain is written with the first
 * descriptor invalid and validated last (see tail of function) */
1727 last_flags = src->flags;
1730 dst = dstp = &tx->lanai[idx];
/* fast path: the whole list fits without wrapping the ring */
1733 if ((idx + cnt) < tx->mask) {
1734 for (i = 0; i < (cnt - 1); i += 2) {
1735 mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1736 wmb(); /* force write every 32 bytes */
1741 /* submit all but the first request, and ensure
1742 that it is submitted below */
1743 mxge_submit_req_backwards(tx, src, cnt);
1747 /* submit the first request */
1748 mxge_pio_copy(dstp, srcp, sizeof(*src));
1749 wmb(); /* barrier before setting valid flag */
1752 /* re-write the last 32-bits with the valid flags */
1753 src->flags = last_flags;
1754 src_ints = (uint32_t *)src;
1756 dst_ints = (volatile uint32_t *)dst;
/* this single 32-bit store makes the first descriptor — and thus
 * the whole chain — visible to the NIC */
1758 *dst_ints = *src_ints;
/*
 * Build and submit the send-descriptor chain for a TSO packet.  Walks
 * the busdma segment list, slicing each segment at MSS boundaries
 * (TSO "cuts") and patching rdma_count retroactively, since the number
 * of RDMAs per cut is only known after the fact (see comment below).
 * NOTE(review): elided view — ring-full/error unwinding at the "drop"
 * label is only partially visible.
 */
1766 mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1767 int busdma_seg_cnt, int ip_off)
1770 mcp_kreq_ether_send_t *req;
1771 bus_dma_segment_t *seg;
1774 uint32_t low, high_swapped;
1775 int len, seglen, cum_len, cum_len_next;
1776 int next_is_first, chop, cnt, rdma_count, small;
1777 uint16_t pseudo_hdr_offset, cksum_offset, mss;
1778 uint8_t flags, flags_next;
1781 mss = m->m_pkthdr.tso_segsz;
1783 /* negative cum_len signifies to the
1784 * send loop that we are still in the
1785 * header portion of the TSO packet.
1788 /* ensure we have the ethernet, IP and TCP
1789 header together in the first mbuf, copy
1790 it to a scratch buffer if not */
1791 if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
1792 m_copydata(m, 0, ip_off + sizeof (*ip),
1794 ip = (struct ip *)(ss->scratch + ip_off);
1796 ip = (struct ip *)(mtod(m, char *) + ip_off);
1798 if (__predict_false(m->m_len < ip_off + (ip->ip_hl << 2)
1800 m_copydata(m, 0, ip_off + (ip->ip_hl << 2)
1801 + sizeof (*tcp), ss->scratch);
1802 ip = (struct ip *)(mtod(m, char *) + ip_off);
1805 tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
1806 cum_len = -(ip_off + ((ip->ip_hl + tcp->th_off) << 2));
1808 /* TSO implies checksum offload on this hardware */
1809 cksum_offset = ip_off + (ip->ip_hl << 2);
1810 flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1813 /* for TSO, pseudo_hdr_offset holds mss.
1814 * The firmware figures out where to put
1815 * the checksum by parsing the header. */
1816 pseudo_hdr_offset = htobe16(mss);
1823 /* "rdma_count" is the number of RDMAs belonging to the
1824 * current packet BEFORE the current send request. For
1825 * non-TSO packets, this is equal to "count".
1826 * For TSO packets, rdma_count needs to be reset
1827 * to 0 after a segment cut.
1829 * The rdma_count field of the send request is
1830 * the number of RDMAs of the packet starting at
1831 * that request. For TSO send requests with one ore more cuts
1832 * in the middle, this is the number of RDMAs starting
1833 * after the last cut in the request. All previous
1834 * segments before the last cut implicitly have 1 RDMA.
1836 * Since the number of RDMAs is not known beforehand,
1837 * it must be filled-in retroactively - after each
1838 * segmentation cut or at the end of the entire packet.
1841 while (busdma_seg_cnt) {
1842 /* Break the busdma segment up into pieces*/
1843 low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1844 high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1848 flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1850 cum_len_next = cum_len + seglen;
/* back-patch the rdma count of the request that started the
 * current run of RDMAs */
1851 (req-rdma_count)->rdma_count = rdma_count + 1;
1852 if (__predict_true(cum_len >= 0)) {
1854 chop = (cum_len_next > mss);
1855 cum_len_next = cum_len_next % mss;
1856 next_is_first = (cum_len_next == 0);
/* branchless flag math: chop/next_is_first are 0 or 1 */
1857 flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1858 flags_next |= next_is_first *
1860 rdma_count |= -(chop | next_is_first);
1861 rdma_count += chop & !next_is_first;
1862 } else if (cum_len_next >= 0) {
1867 small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1868 flags_next = MXGEFW_FLAGS_TSO_PLD |
1869 MXGEFW_FLAGS_FIRST |
1870 (small * MXGEFW_FLAGS_SMALL);
/* fill in the current descriptor */
1873 req->addr_high = high_swapped;
1874 req->addr_low = htobe32(low);
1875 req->pseudo_hdr_offset = pseudo_hdr_offset;
1877 req->rdma_count = 1;
1878 req->length = htobe16(seglen);
1879 req->cksum_offset = cksum_offset;
1880 req->flags = flags | ((cum_len & 1) *
1881 MXGEFW_FLAGS_ALIGN_ODD);
1884 cum_len = cum_len_next;
1889 if (__predict_false(cksum_offset > seglen))
1890 cksum_offset -= seglen;
/* bail out if the chain outgrew the per-packet descriptor budget */
1893 if (__predict_false(cnt > tx->max_desc))
1899 (req-rdma_count)->rdma_count = rdma_count;
/* mark trailing descriptors of the last segment as TSO_LAST */
1903 req->flags |= MXGEFW_FLAGS_TSO_LAST;
1904 } while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1906 tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1907 mxge_submit_req(tx, tx->req_list, cnt);
1908 #ifdef IFNET_BUF_RING
1909 if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1910 /* tell the NIC to start polling this slice */
1912 tx->queue_active = 1;
1920 bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1924 kprintf("tx->max_desc exceeded via TSO!\n");
1925 kprintf("mss = %d, %ld, %d!\n", mss,
1926 (long)seg - (long)tx->seg_list, tx->max_desc);
1933 #endif /* IFCAP_TSO4 */
1935 #ifdef MXGE_NEW_VLAN_API
1937 * We reproduce the software vlan tag insertion from
1938 * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
1939 * vlan tag insertion. We need to advertise this in order to have the
1940 * vlan interface respect our csum offload flags.
/*
 * Prepend ETHER_VLAN_ENCAP_LEN bytes, shift the MAC addresses up, and
 * materialize the 802.1Q header from m_pkthdr.ether_vtag; clears
 * M_VLANTAG so the tag is not inserted twice.  Returns the (possibly
 * reallocated) mbuf, or NULL if allocation/pullup failed.
 */
1942 static struct mbuf *
1943 mxge_vlan_tag_insert(struct mbuf *m)
1945 struct ether_vlan_header *evl;
1947 M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, MB_DONTWAIT);
1948 if (__predict_false(m == NULL))
1950 if (m->m_len < sizeof(*evl)) {
1951 m = m_pullup(m, sizeof(*evl));
1952 if (__predict_false(m == NULL))
1956 * Transform the Ethernet header into an Ethernet header
1957 * with 802.1Q encapsulation.
1959 evl = mtod(m, struct ether_vlan_header *);
1960 bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
1961 (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
1962 evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
1963 evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
1964 m->m_flags &= ~M_VLANTAG;
1967 #endif /* MXGE_NEW_VLAN_API */
/*
 * Map a non-TSO outbound mbuf chain for DMA and build its send
 * descriptors: optional software VLAN insertion, optional checksum
 * offload setup, runt padding via the zeropad DMA buffer, then submit
 * through mxge_submit_req().  TSO packets are diverted to
 * mxge_encap_tso().  NOTE(review): elided view — error/drop paths are
 * only partially visible.
 */
1970 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
1973 mcp_kreq_ether_send_t *req;
1974 bus_dma_segment_t *seg;
1979 int cnt, cum_len, err, i, idx, odd_flag, ip_off;
1980 uint16_t pseudo_hdr_offset;
1981 uint8_t flags, cksum_offset;
1988 ip_off = sizeof (struct ether_header);
1989 #ifdef MXGE_NEW_VLAN_API
1990 if (m->m_flags & M_VLANTAG) {
1991 m = mxge_vlan_tag_insert(m);
1992 if (__predict_false(m == NULL))
/* IP header moved up by the 802.1Q encapsulation */
1994 ip_off += ETHER_VLAN_ENCAP_LEN;
1997 /* (try to) map the frame for DMA */
1998 idx = tx->req & tx->mask;
1999 err = bus_dmamap_load_mbuf_segment(tx->dmat, tx->info[idx].map,
2000 m, tx->seg_list, 1, &cnt,
2002 if (__predict_false(err == EFBIG)) {
2003 /* Too many segments in the chain. Try
2005 m_tmp = m_defrag(m, M_NOWAIT);
2006 if (m_tmp == NULL) {
2011 err = bus_dmamap_load_mbuf_segment(tx->dmat,
2013 m, tx->seg_list, 1, &cnt,
2016 if (__predict_false(err != 0)) {
2017 device_printf(sc->dev, "bus_dmamap_load_mbuf_segment returned %d"
2018 " packet len = %d\n", err, m->m_pkthdr.len);
2021 bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2022 BUS_DMASYNC_PREWRITE);
2023 tx->info[idx].m = m;
2026 /* TSO is different enough, we handle it in another routine */
2027 if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2028 mxge_encap_tso(ss, m, cnt, ip_off);
2035 pseudo_hdr_offset = 0;
2036 flags = MXGEFW_FLAGS_NO_TSO;
2038 /* checksum offloading? */
2039 if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
2040 /* ensure ip header is in first mbuf, copy
2041 it to a scratch buffer if not */
2042 if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
2043 m_copydata(m, 0, ip_off + sizeof (*ip),
2045 ip = (struct ip *)(ss->scratch + ip_off);
2047 ip = (struct ip *)(mtod(m, char *) + ip_off);
2049 cksum_offset = ip_off + (ip->ip_hl << 2);
2050 pseudo_hdr_offset = cksum_offset + m->m_pkthdr.csum_data;
2051 pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2052 req->cksum_offset = cksum_offset;
2053 flags |= MXGEFW_FLAGS_CKSUM;
2054 odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2058 if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2059 flags |= MXGEFW_FLAGS_SMALL;
2061 /* convert segments into a request list */
2064 req->flags = MXGEFW_FLAGS_FIRST;
2065 for (i = 0; i < cnt; i++) {
2067 htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2069 htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2070 req->length = htobe16(seg->ds_len);
2071 req->cksum_offset = cksum_offset;
/* cksum_offset only applies to the segment containing it */
2072 if (cksum_offset > seg->ds_len)
2073 cksum_offset -= seg->ds_len;
2076 req->pseudo_hdr_offset = pseudo_hdr_offset;
2077 req->pad = 0; /* complete solid 16-byte block */
2078 req->rdma_count = 1;
2079 req->flags |= flags | ((cum_len & 1) * odd_flag);
2080 cum_len += seg->ds_len;
2086 /* pad runts to 60 bytes */
/* extra descriptor pointing at the shared zero-filled buffer */
2090 htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2092 htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2093 req->length = htobe16(60 - cum_len);
2094 req->cksum_offset = 0;
2095 req->pseudo_hdr_offset = pseudo_hdr_offset;
2096 req->pad = 0; /* complete solid 16-byte block */
2097 req->rdma_count = 1;
2098 req->flags |= flags | ((cum_len & 1) * odd_flag);
2102 tx->req_list[0].rdma_count = cnt;
2104 /* print what the firmware will see */
2105 for (i = 0; i < cnt; i++) {
2106 kprintf("%d: addr: 0x%x 0x%x len:%d pso%d,"
2107 "cso:%d, flags:0x%x, rdma:%d\n",
2108 i, (int)ntohl(tx->req_list[i].addr_high),
2109 (int)ntohl(tx->req_list[i].addr_low),
2110 (int)ntohs(tx->req_list[i].length),
2111 (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2112 tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2113 tx->req_list[i].rdma_count);
2115 kprintf("--------------\n");
2117 tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2118 mxge_submit_req(tx, tx->req_list, cnt);
2119 #ifdef IFNET_BUF_RING
2120 if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2121 /* tell the NIC to start polling this slice */
2123 tx->queue_active = 1;
2136 #ifdef IFNET_BUF_RING
/*
 * if_qflush method (multi-queue build): drain and free every mbuf
 * queued on each slice's buf_ring, under the per-ring tx lock.
 */
2138 mxge_qflush(struct ifnet *ifp)
2140 mxge_softc_t *sc = ifp->if_softc;
2145 for (slice = 0; slice < sc->num_slices; slice++) {
2146 tx = &sc->ss[slice].tx;
2147 lockmgr(&tx->lock, LK_EXCLUSIVE);
2148 while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2150 lockmgr(&tx->lock, LK_RELEASE);
/*
 * Per-slice transmit pump (IFNET_BUF_RING build): dequeue from the
 * slice's drbr while descriptor space remains, handing each frame to
 * the encap path; sets IFF_OACTIVE if frames remain but the ring is
 * full.  Caller holds the slice tx lock.
 */
2156 mxge_start_locked(struct mxge_slice_state *ss)
2167 while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2168 m = drbr_dequeue(ifp, tx->br);
2172 /* let BPF see it */
2175 /* give it to the nic */
2178 /* ran out of transmit slots */
2179 if (((ss->if_flags & IFF_OACTIVE) == 0)
2180 && (!drbr_empty(ifp, tx->br))) {
2181 ss->if_flags |= IFF_OACTIVE;
/*
 * Queue one frame on a slice (tx lock held): if the slice is not
 * running/active, just enqueue; if the drbr is empty and descriptor
 * space exists, transmit directly, otherwise enqueue and drain via
 * mxge_start_locked().
 */
2187 mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2198 if ((ss->if_flags & (IFF_RUNNING|IFF_OACTIVE)) !=
2200 err = drbr_enqueue(ifp, tx->br, m);
/* fast path: nothing queued ahead of us and the ring has room */
2204 if (drbr_empty(ifp, tx->br) &&
2205 ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2206 /* let BPF see it */
2208 /* give it to the nic */
2210 } else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2213 if (!drbr_empty(ifp, tx->br))
2214 mxge_start_locked(ss);
/*
 * if_transmit method: pick a slice from the packet's flowid (num_slices
 * is a power of two, so masking suffices), then either transmit under
 * the tx lock or, if the lock is contended, just enqueue on the ring.
 */
2219 mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2221 mxge_softc_t *sc = ifp->if_softc;
2222 struct mxge_slice_state *ss;
2227 slice = m->m_pkthdr.flowid;
2228 slice &= (sc->num_slices - 1); /* num_slices always power of 2 */
2230 ss = &sc->ss[slice];
/* trylock: do not block the caller on a busy tx queue */
2233 if (lockmgr(&tx->lock, LK_EXCLUSIVE|LK_NOWAIT)) {
2234 err = mxge_transmit_locked(ss, m);
2235 lockmgr(&tx->lock, LK_RELEASE);
2237 err = drbr_enqueue(ifp, tx->br, m);
/*
 * Transmit pump (legacy single-queue build): drain the interface send
 * queue while descriptor space remains; sets IFF_OACTIVE on the ifnet
 * when the ring fills.  Caller holds the tx lock.
 */
2246 mxge_start_locked(struct mxge_slice_state *ss)
2256 while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2257 IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
2261 /* let BPF see it */
2264 /* give it to the nic */
2267 /* ran out of transmit slots */
2268 if ((sc->ifp->if_flags & IFF_OACTIVE) == 0) {
2269 sc->ifp->if_flags |= IFF_OACTIVE;
/*
 * if_start method: serialize on slice 0's tx lock and run the pump.
 * (Single-queue build — all transmits go through the first slice.)
 */
2275 mxge_start(struct ifnet *ifp)
2277 mxge_softc_t *sc = ifp->if_softc;
2278 struct mxge_slice_state *ss;
2280 /* only use the first slice for now */
2282 lockmgr(&ss->tx.lock, LK_EXCLUSIVE);
2283 mxge_start_locked(ss);
2284 lockmgr(&ss->tx.lock, LK_RELEASE);
2288 * copy an array of mcp_kreq_ether_recv_t's to the mcp. Copy
2289 * at most 32 bytes at a time, so as to avoid involving the software
2290 * pio handler in the nic. We re-write the first segment's low
2291 * DMA address to mark it valid only after we write the entire chunk
2295 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2296 mcp_kreq_ether_recv_t *src)
/* poison the first descriptor's low address while copying, then
 * restore it last so the NIC never sees a half-written batch */
2300 low = src->addr_low;
2301 src->addr_low = 0xffffffff;
2302 mxge_pio_copy(dst, src, 4 * sizeof (*src));
2304 mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2306 src->addr_low = low;
2307 dst->addr_low = low;
/*
 * Allocate and DMA-map a small (header-sized) receive mbuf for ring
 * slot idx, record its address in the shadow ring, and submit to the
 * NIC in batches of 8 once a batch boundary is reached.
 */
2312 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2314 bus_dma_segment_t seg;
2316 mxge_rx_ring_t *rx = &ss->rx_small;
2319 m = m_gethdr(MB_DONTWAIT, MT_DATA);
2326 err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2327 &seg, 1, &cnt, BUS_DMA_NOWAIT);
2332 rx->info[idx].m = m;
2333 rx->shadow[idx].addr_low =
2334 htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2335 rx->shadow[idx].addr_high =
2336 htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
/* hand the last 8 shadow entries to the NIC in one PIO burst */
2340 mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
/*
 * Allocate and DMA-map a big (cluster/jumbo) receive mbuf for ring
 * slot idx.  With MXGE_VIRT_JUMBOS a jumbo frame may span several
 * physical segments, each getting its own shadow entry; submission to
 * the NIC happens in batches of 8 as slots fill.
 */
2345 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2347 bus_dma_segment_t seg[3];
2349 mxge_rx_ring_t *rx = &ss->rx_big;
/* pick the allocator matching the configured cluster size */
2352 if (rx->cl_size == MCLBYTES)
2353 m = m_getcl(MB_DONTWAIT, MT_DATA, M_PKTHDR);
2355 m = m_getjcl(MB_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2361 m->m_len = rx->mlen;
2362 err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2363 seg, 1, &cnt, BUS_DMA_NOWAIT);
2368 rx->info[idx].m = m;
2369 rx->shadow[idx].addr_low =
2370 htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2371 rx->shadow[idx].addr_high =
2372 htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2374 #if MXGE_VIRT_JUMBOS
2375 for (i = 1; i < cnt; i++) {
2376 rx->shadow[idx + i].addr_low =
2377 htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2378 rx->shadow[idx + i].addr_high =
2379 htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2384 for (i = 0; i < rx->nbufs; i++) {
2385 if ((idx & 7) == 7) {
2386 mxge_submit_8rx(&rx->lanai[idx - 7],
2387 &rx->shadow[idx - 7]);
2395 * Myri10GE hardware checksums are not valid if the sender
2396 * padded the frame with non-zero padding. This is because
2397 * the firmware just does a simple 16-bit 1s complement
2398 * checksum across the entire frame, excluding the first 14
2399 * bytes. It is best to simply to check the checksum and
2400 * tell the stack about it only if the checksum is good
/*
 * Validate the firmware's raw frame checksum for IPv4 TCP/UDP by
 * folding in the pseudo-header; non-IPv4 or non-TCP/UDP frames are
 * rejected (elided branches presumably return nonzero — the caller
 * treats 0 as "checksum good").
 */
2403 static inline uint16_t
2404 mxge_rx_csum(struct mbuf *m, int csum)
2406 struct ether_header *eh;
2410 eh = mtod(m, struct ether_header *);
2412 /* only deal with IPv4 TCP & UDP for now */
2413 if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2415 ip = (struct ip *)(eh + 1);
2416 if (__predict_false(ip->ip_p != IPPROTO_TCP &&
2417 ip->ip_p != IPPROTO_UDP))
2420 c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2421 htonl(ntohs(csum) + ntohs(ip->ip_len) +
2422 - (ip->ip_hl << 2) + ip->ip_p));
/*
 * Strip an 802.1Q header from a received frame: adjust the firmware's
 * partial checksum to exclude the 4 encapsulation bytes (one's-
 * complement arithmetic with carry folding), attach the VLAN tag to
 * the mbuf (new or old API), then slide the MAC addresses over the
 * encapsulation and trim it off.
 */
2431 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2433 struct ether_vlan_header *evl;
2434 struct ether_header *eh;
2437 evl = mtod(m, struct ether_vlan_header *);
2438 eh = mtod(m, struct ether_header *);
2441 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
2442 * after what the firmware thought was the end of the ethernet
2446 /* put checksum into host byte order */
2447 *csum = ntohs(*csum);
2448 partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
/* one's-complement subtract of the 4 VLAN bytes, folding carries */
2449 (*csum) += ~partial;
2450 (*csum) += ((*csum) < ~partial);
2451 (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2452 (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2454 /* restore checksum to network byte order;
2455 later consumers expect this */
2456 *csum = htons(*csum);
2459 #ifdef MXGE_NEW_VLAN_API
2460 m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
2464 mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2468 VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2469 m_tag_prepend(m, mtag);
2473 m->m_flags |= M_VLANTAG;
2476 * Remove the 802.1q header by copying the Ethernet
2477 * addresses over it and adjusting the beginning of
2478 * the data in the mbuf. The encapsulated Ethernet
2479 * type field is already in place.
2481 bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2482 ETHER_HDR_LEN - ETHER_TYPE_LEN);
2483 m_adj(m, ETHER_VLAN_ENCAP_LEN);
/*
 * Receive-completion handler for the big-buffer ring: replace the
 * filled mbuf with a fresh one (dropping the frame if allocation
 * fails, so the ring never loses a buffer), unmap, strip any VLAN tag,
 * validate the hardware checksum, attempt LRO, and finally hand the
 * frame to the stack via if_input.
 */
2488 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2493 struct ether_header *eh;
2495 bus_dmamap_t old_map;
2497 uint16_t tcpudp_csum;
2502 idx = rx->cnt & rx->mask;
2503 rx->cnt += rx->nbufs;
2504 /* save a pointer to the received mbuf */
2505 m = rx->info[idx].m;
2506 /* try to replace the received mbuf */
2507 if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2508 /* drop the frame -- the old mbuf is re-cycled */
2513 /* unmap the received buffer */
2514 old_map = rx->info[idx].map;
2515 bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2516 bus_dmamap_unload(rx->dmat, old_map);
2518 /* swap the bus_dmamap_t's */
2519 rx->info[idx].map = rx->extra_map;
2520 rx->extra_map = old_map;
2522 /* mcp implicitly skips 1st 2 bytes so that packet is properly
2524 m->m_data += MXGEFW_PAD;
2526 m->m_pkthdr.rcvif = ifp;
2527 m->m_len = m->m_pkthdr.len = len;
2529 eh = mtod(m, struct ether_header *);
2530 if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2531 mxge_vlan_tag_remove(m, &csum);
2533 /* if the checksum is valid, mark it in the mbuf header */
2534 if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
/* LRO consumed the frame; do not pass it up separately */
2535 if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2537 /* otherwise, it was a UDP frame, or a TCP frame which
2538 we could not do LRO on. Tell the stack that the
2540 m->m_pkthdr.csum_data = 0xffff;
2541 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2543 /* flowid only valid if RSS hashing is enabled */
2544 if (sc->num_slices > 1) {
2545 m->m_pkthdr.flowid = (ss - sc->ss);
2546 m->m_flags |= M_FLOWID;
2548 /* pass the frame up the stack */
2549 (*ifp->if_input)(ifp, m);
/*
 * Receive-completion handler for the small-buffer ring; mirrors
 * mxge_rx_done_big() but refills via mxge_get_buf_small().
 */
2553 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2557 struct ether_header *eh;
2560 bus_dmamap_t old_map;
2562 uint16_t tcpudp_csum;
2567 idx = rx->cnt & rx->mask;
2569 /* save a pointer to the received mbuf */
2570 m = rx->info[idx].m;
2571 /* try to replace the received mbuf */
2572 if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2573 /* drop the frame -- the old mbuf is re-cycled */
2578 /* unmap the received buffer */
2579 old_map = rx->info[idx].map;
2580 bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2581 bus_dmamap_unload(rx->dmat, old_map);
2583 /* swap the bus_dmamap_t's */
2584 rx->info[idx].map = rx->extra_map;
2585 rx->extra_map = old_map;
2587 /* mcp implicitly skips 1st 2 bytes so that packet is properly
2589 m->m_data += MXGEFW_PAD;
2591 m->m_pkthdr.rcvif = ifp;
2592 m->m_len = m->m_pkthdr.len = len;
2594 eh = mtod(m, struct ether_header *);
2595 if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2596 mxge_vlan_tag_remove(m, &csum);
2598 /* if the checksum is valid, mark it in the mbuf header */
2599 if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2600 if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2602 /* otherwise, it was a UDP frame, or a TCP frame which
2603 we could not do LRO on. Tell the stack that the
2605 m->m_pkthdr.csum_data = 0xffff;
2606 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2608 /* flowid only valid if RSS hashing is enabled */
2609 if (sc->num_slices > 1) {
2610 m->m_pkthdr.flowid = (ss - sc->ss);
2611 m->m_flags |= M_FLOWID;
2613 /* pass the frame up the stack */
2614 (*ifp->if_input)(ifp, m);
/*
 * Drain the slice's receive-completion ring: dispatch each entry to
 * the small/big handler by length, clearing the entry so the firmware
 * can reuse it.  Bounded to half the ring per call to avoid livelock,
 * then flush any LRO sessions left active.
 */
2618 mxge_clean_rx_done(struct mxge_slice_state *ss)
2620 mxge_rx_done_t *rx_done = &ss->rx_done;
2626 while (rx_done->entry[rx_done->idx].length != 0) {
2627 length = ntohs(rx_done->entry[rx_done->idx].length);
2628 rx_done->entry[rx_done->idx].length = 0;
2629 checksum = rx_done->entry[rx_done->idx].checksum;
/* frames that fit in an mbuf header came from the small ring */
2630 if (length <= (MHLEN - MXGEFW_PAD))
2631 mxge_rx_done_small(ss, length, checksum);
2633 mxge_rx_done_big(ss, length, checksum);
2635 rx_done->idx = rx_done->cnt & rx_done->mask;
2637 /* limit potential for livelock */
2638 if (__predict_false(++limit > rx_done->mask / 2))
2642 while (!SLIST_EMPTY(&ss->lro_active)) {
2643 struct lro_entry *lro = SLIST_FIRST(&ss->lro_active);
2644 SLIST_REMOVE_HEAD(&ss->lro_active, next);
2645 mxge_lro_flush(ss, lro);
/*
 * Reap completed transmit descriptors up to the firmware's reported
 * completion index (mcp_idx): account bytes/mcast, unload DMA maps on
 * descriptors that carried an mbuf, then clear IFF_OACTIVE and restart
 * the pump if at least a quarter of the ring is free.  In multi-queue
 * builds, also tell the NIC to stop polling a fully drained queue.
 */
2652 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2663 while (tx->pkt_done != mcp_idx) {
2664 idx = tx->done & tx->mask;
2666 m = tx->info[idx].m;
2667 /* mbuf and DMA map only attached to the first
2670 ss->obytes += m->m_pkthdr.len;
2671 if (m->m_flags & M_MCAST)
2674 tx->info[idx].m = NULL;
2675 map = tx->info[idx].map;
2676 bus_dmamap_unload(tx->dmat, map);
/* flag marks the last descriptor of a packet (see encap) */
2679 if (tx->info[idx].flag) {
2680 tx->info[idx].flag = 0;
2685 /* If we have space, clear IFF_OACTIVE to tell the stack that
2686 its OK to send packets */
2687 #ifdef IFNET_BUF_RING
2688 flags = &ss->if_flags;
2690 flags = &ifp->if_flags;
2692 lockmgr(&ss->tx.lock, LK_EXCLUSIVE);
2693 if ((*flags) & IFF_OACTIVE &&
2694 tx->req - tx->done < (tx->mask + 1)/4) {
2695 *(flags) &= ~IFF_OACTIVE;
2697 mxge_start_locked(ss);
2699 #ifdef IFNET_BUF_RING
2700 if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2701 /* let the NIC stop polling this queue, since there
2702 * are no more transmits pending */
2703 if (tx->req == tx->done) {
2705 tx->queue_active = 0;
2711 lockmgr(&ss->tx.lock, LK_RELEASE);
/*
 * XFP/SFP+ compliance-byte decode tables used by mxge_media_probe():
 * each entry maps one bit of the module's 10GbE compliance register to
 * an ifmedia type (0 = no corresponding IFM_* media word).
 */
2715 static struct mxge_media_type mxge_xfp_media_types[] =
2717 {IFM_10G_CX4, 0x7f, "10GBASE-CX4 (module)"},
2718 {IFM_10G_SR, (1 << 7), "10GBASE-SR"},
2719 {IFM_10G_LR, (1 << 6), "10GBASE-LR"},
2720 {0, (1 << 5), "10GBASE-ER"},
2721 {IFM_10G_LRM, (1 << 4), "10GBASE-LRM"},
2722 {0, (1 << 3), "10GBASE-SW"},
2723 {0, (1 << 2), "10GBASE-LW"},
2724 {0, (1 << 1), "10GBASE-EW"},
2725 {0, (1 << 0), "Reserved"}
2727 static struct mxge_media_type mxge_sfp_media_types[] =
2729 {0, (1 << 7), "Reserved"},
2730 {IFM_10G_LRM, (1 << 6), "10GBASE-LRM"},
2731 {IFM_10G_LR, (1 << 5), "10GBASE-LR"},
2732 {IFM_10G_SR, (1 << 4), "10GBASE-SR"}
/*
 * Record the detected media type in the softc and register/select it
 * with the ifmedia layer.
 */
2736 mxge_set_media(mxge_softc_t *sc, int type)
2738 sc->media_flags |= type;
2739 ifmedia_add(&sc->media, sc->media_flags, 0, NULL);
2740 ifmedia_set(&sc->media, sc->media_flags);
2745 * Determine the media type for a NIC. Some XFPs will identify
2746 * themselves only when their link is up, so this is initiated via a
2747 * link up interrupt. However, this can potentially take up to
2748 * several milliseconds, so it is run via the watchdog routine, rather
2749 * than in the interrupt handler itself. This need only be done
2750 * once, not each time the link is up.
2753 mxge_media_probe(mxge_softc_t *sc)
2758 struct mxge_media_type *mxge_media_types = NULL;
2759 int i, err, ms, mxge_media_type_entries;
2762 sc->need_media_probe = 0;
2764 /* if we've already set a media type, we're done */
2765 if (sc->media_flags != (IFM_ETHER | IFM_AUTO))
2769 * parse the product code to deterimine the interface type
2770 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2771 * after the 3rd dash in the driver's cached copy of the
2772 * EEPROM's product code string.
2774 ptr = sc->product_code_string;
2776 device_printf(sc->dev, "Missing product code\n");
/* walk to the character after the third '-' */
2779 for (i = 0; i < 3; i++, ptr++) {
2780 ptr = index(ptr, '-');
2782 device_printf(sc->dev,
2783 "only %d dashes in PC?!?\n", i);
/* -C is CX4: no module cage, media type is known immediately */
2789 mxge_set_media(sc, IFM_10G_CX4);
2792 else if (*ptr == 'Q') {
2793 /* -Q is Quad Ribbon Fiber */
2794 device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2795 /* FreeBSD has no media type for Quad ribbon fiber */
2801 mxge_media_types = mxge_xfp_media_types;
2802 mxge_media_type_entries =
2803 sizeof (mxge_xfp_media_types) /
2804 sizeof (mxge_xfp_media_types[0]);
2805 byte = MXGE_XFP_COMPLIANCE_BYTE;
2809 if (*ptr == 'S' || *(ptr +1) == 'S') {
2810 /* -S or -2S is SFP+ */
2811 mxge_media_types = mxge_sfp_media_types;
2812 mxge_media_type_entries =
2813 sizeof (mxge_sfp_media_types) /
2814 sizeof (mxge_sfp_media_types[0]);
2819 if (mxge_media_types == NULL) {
2820 device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2825 * At this point we know the NIC has an XFP cage, so now we
2826 * try to determine what is in the cage by using the
2827 * firmware's XFP I2C commands to read the XFP 10GbE compilance
2828 * register. We read just one byte, which may take over
2832 cmd.data0 = 0; /* just fetch 1 byte, not all 256 */
2834 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2835 if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
2836 device_printf(sc->dev, "failed to read XFP\n");
2838 if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
2839 device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
2841 if (err != MXGEFW_CMD_OK) {
2845 /* now we wait for the data to be cached */
/* poll up to ~50ms for the firmware to cache the I2C byte */
2847 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2848 for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
2851 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2853 if (err != MXGEFW_CMD_OK) {
2854 device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
2855 cage_type, err, ms);
/* entry 0 is special: its bitmask is a multi-bit CX4 pattern */
2859 if (cmd.data0 == mxge_media_types[0].bitmask) {
2861 device_printf(sc->dev, "%s:%s\n", cage_type,
2862 mxge_media_types[0].name);
2863 mxge_set_media(sc, IFM_10G_CX4);
2866 for (i = 1; i < mxge_media_type_entries; i++) {
2867 if (cmd.data0 & mxge_media_types[i].bitmask) {
2869 device_printf(sc->dev, "%s:%s\n",
2871 mxge_media_types[i].name);
2873 mxge_set_media(sc, mxge_media_types[i].flag);
2877 device_printf(sc->dev, "%s media 0x%x unknown\n", cage_type,
/*
 * Per-slice interrupt handler (registered for MSI-X, MSI, and legacy INTx).
 * Drains tx completions and rx events for the slice, then writes the irq
 * claim register(s) to hand the interrupt token back to the NIC.  Firmware
 * link and error statistics are processed only for slice 0 (ss == sc->ss).
 * NOTE(review): this listing is decimated (original line numbers embedded,
 * some lines absent) — verify control-flow details against the full source.
 */
2884 mxge_intr(void *arg)
2886 struct mxge_slice_state *ss = arg;
2887 mxge_softc_t *sc = ss->sc;
2888 mcp_irq_data_t *stats = ss->fw_stats;
2889 mxge_tx_ring_t *tx = &ss->tx;
2890 mxge_rx_done_t *rx_done = &ss->rx_done;
2891 uint32_t send_done_count;
2895 #ifndef IFNET_BUF_RING
2896 /* an interrupt on a non-zero slice is implicitly valid
2897 since MSI-X irqs are not shared */
2899 mxge_clean_rx_done(ss);
2900 *ss->irq_claim = be32toh(3);
2905 /* make sure the DMA has finished */
2906 if (!stats->valid) {
2909 valid = stats->valid;
2911 if (sc->legacy_irq) {
2912 /* lower legacy IRQ */
2913 *sc->irq_deassert = 0;
2914 if (!mxge_deassert_wait)
2915 /* don't wait for conf. that irq is low */
2921 /* loop while waiting for legacy irq deassertion */
2923 /* check for transmit completes and receives */
2924 send_done_count = be32toh(stats->send_done_count);
2925 while ((send_done_count != tx->pkt_done) ||
2926 (rx_done->entry[rx_done->idx].length != 0)) {
2927 if (send_done_count != tx->pkt_done)
2928 mxge_tx_done(ss, (int)send_done_count);
2929 mxge_clean_rx_done(ss);
2930 send_done_count = be32toh(stats->send_done_count);
/* re-check firmware 'valid' byte: loop until NIC stops posting events */
2932 if (sc->legacy_irq && mxge_deassert_wait)
2934 } while (*((volatile uint8_t *) &stats->valid));
2936 /* fw link & error stats meaningful only on the first slice */
2937 if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
2938 if (sc->link_state != stats->link_up) {
2939 sc->link_state = stats->link_up;
2940 if (sc->link_state) {
2941 sc->ifp->if_link_state = LINK_STATE_UP;
2942 if_link_state_change(sc->ifp);
2944 device_printf(sc->dev, "link up\n");
2946 sc->ifp->if_link_state = LINK_STATE_DOWN;
2947 if_link_state_change(sc->ifp);
2949 device_printf(sc->dev, "link down\n");
/* link flapped — schedule a media re-probe from the tick handler */
2951 sc->need_media_probe = 1;
2953 if (sc->rdma_tags_available !=
2954 be32toh(stats->rdma_tags_available)) {
2955 sc->rdma_tags_available =
2956 be32toh(stats->rdma_tags_available);
2957 device_printf(sc->dev, "RDMA timed out! %d tags "
2958 "left\n", sc->rdma_tags_available);
2961 if (stats->link_down) {
2962 sc->down_cnt += stats->link_down;
2964 sc->ifp->if_link_state = LINK_STATE_DOWN;
2965 if_link_state_change(sc->ifp);
2969 /* check to see if we have rx token to pass back */
2971 *ss->irq_claim = be32toh(3);
2972 *(ss->irq_claim + 1) = be32toh(3);
/*
 * Interface init entry point.
 * NOTE(review): body not visible in this decimated listing — presumably
 * forwards to mxge_open(); confirm against the full source.
 */
2976 mxge_init(void *arg)
/*
 * Release all mbufs held by one slice: the LRO free list, every posted
 * big/small rx buffer (unloading its DMA map first), and — on the first
 * slice only — any mbufs still attached to tx ring slots.
 */
2983 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
2985 struct lro_entry *lro_entry;
2988 while (!SLIST_EMPTY(&ss->lro_free)) {
2989 lro_entry = SLIST_FIRST(&ss->lro_free);
2990 SLIST_REMOVE_HEAD(&ss->lro_free, next);
2991 kfree(lro_entry, M_DEVBUF);
2994 for (i = 0; i <= ss->rx_big.mask; i++) {
2995 if (ss->rx_big.info[i].m == NULL)
2997 bus_dmamap_unload(ss->rx_big.dmat,
2998 ss->rx_big.info[i].map);
2999 m_freem(ss->rx_big.info[i].m);
3000 ss->rx_big.info[i].m = NULL;
3003 for (i = 0; i <= ss->rx_small.mask; i++) {
3004 if (ss->rx_small.info[i].m == NULL)
3006 bus_dmamap_unload(ss->rx_small.dmat,
3007 ss->rx_small.info[i].map);
3008 m_freem(ss->rx_small.info[i].m);
3009 ss->rx_small.info[i].m = NULL;
3012 /* transmit ring used only on the first slice */
3013 if (ss->tx.info == NULL)
3016 for (i = 0; i <= ss->tx.mask; i++) {
3017 ss->tx.info[i].flag = 0;
3018 if (ss->tx.info[i].m == NULL)
3020 bus_dmamap_unload(ss->tx.dmat,
3021 ss->tx.info[i].map);
3022 m_freem(ss->tx.info[i].m);
3023 ss->tx.info[i].m = NULL;
/* Free the mbufs of every slice (see mxge_free_slice_mbufs above). */
3028 mxge_free_mbufs(mxge_softc_t *sc)
3032 for (slice = 0; slice < sc->num_slices; slice++)
3033 mxge_free_slice_mbufs(&sc->ss[slice]);
/*
 * Tear down one slice's ring resources in reverse order of allocation:
 * rx_done DMA block, tx request/segment scratch buffers, rx shadow rings,
 * then the info arrays together with their per-slot busdma maps and tags.
 * Each pointer is NULLed after release so the function is safe to call
 * on a partially-initialized slice.
 */
3037 mxge_free_slice_rings(struct mxge_slice_state *ss)
3042 if (ss->rx_done.entry != NULL)
3043 mxge_dma_free(&ss->rx_done.dma);
3044 ss->rx_done.entry = NULL;
3046 if (ss->tx.req_bytes != NULL)
3047 kfree(ss->tx.req_bytes, M_DEVBUF);
3048 ss->tx.req_bytes = NULL;
3050 if (ss->tx.seg_list != NULL)
3051 kfree(ss->tx.seg_list, M_DEVBUF);
3052 ss->tx.seg_list = NULL;
3054 if (ss->rx_small.shadow != NULL)
3055 kfree(ss->rx_small.shadow, M_DEVBUF);
3056 ss->rx_small.shadow = NULL;
3058 if (ss->rx_big.shadow != NULL)
3059 kfree(ss->rx_big.shadow, M_DEVBUF);
3060 ss->rx_big.shadow = NULL;
3062 if (ss->tx.info != NULL) {
3063 if (ss->tx.dmat != NULL) {
3064 for (i = 0; i <= ss->tx.mask; i++) {
3065 bus_dmamap_destroy(ss->tx.dmat,
3066 ss->tx.info[i].map);
3068 bus_dma_tag_destroy(ss->tx.dmat);
3070 kfree(ss->tx.info, M_DEVBUF);
3074 if (ss->rx_small.info != NULL) {
3075 if (ss->rx_small.dmat != NULL) {
3076 for (i = 0; i <= ss->rx_small.mask; i++) {
3077 bus_dmamap_destroy(ss->rx_small.dmat,
3078 ss->rx_small.info[i].map);
3080 bus_dmamap_destroy(ss->rx_small.dmat,
3081 ss->rx_small.extra_map);
3082 bus_dma_tag_destroy(ss->rx_small.dmat);
3084 kfree(ss->rx_small.info, M_DEVBUF);
3086 ss->rx_small.info = NULL;
3088 if (ss->rx_big.info != NULL) {
3089 if (ss->rx_big.dmat != NULL) {
3090 for (i = 0; i <= ss->rx_big.mask; i++) {
3091 bus_dmamap_destroy(ss->rx_big.dmat,
3092 ss->rx_big.info[i].map);
3094 bus_dmamap_destroy(ss->rx_big.dmat,
3095 ss->rx_big.extra_map);
3096 bus_dma_tag_destroy(ss->rx_big.dmat);
3098 kfree(ss->rx_big.info, M_DEVBUF);
3100 ss->rx_big.info = NULL;
/* Free the ring resources of every slice (see mxge_free_slice_rings). */
3104 mxge_free_rings(mxge_softc_t *sc)
3108 for (slice = 0; slice < sc->num_slices; slice++)
3109 mxge_free_slice_rings(&sc->ss[slice]);
/*
 * Allocate one slice's host-side ring state: rx shadow/info arrays, busdma
 * tags and per-slot maps for the small and big rx rings, and (first slice
 * only unless IFNET_BUF_RING) the tx request block, segment list, info ring
 * and tx busdma resources.  Ring masks are set to entries-1 (sizes are
 * powers of two).  Returns 0 or an errno; callers unwind via
 * mxge_free_slice_rings.
 */
3113 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3114 int tx_ring_entries)
3116 mxge_softc_t *sc = ss->sc;
3122 /* allocate per-slice receive resources */
3124 ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
3125 ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3127 /* allocate the rx shadow rings */
3128 bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3129 ss->rx_small.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3130 if (ss->rx_small.shadow == NULL)
3133 bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3134 ss->rx_big.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3135 if (ss->rx_big.shadow == NULL)
3138 /* allocate the rx host info rings */
3139 bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3140 ss->rx_small.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3141 if (ss->rx_small.info == NULL)
3144 bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3145 ss->rx_big.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3146 if (ss->rx_big.info == NULL)
3149 /* allocate the rx busdma resources */
3150 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3152 4096, /* boundary */
3153 BUS_SPACE_MAXADDR, /* low */
3154 BUS_SPACE_MAXADDR, /* high */
3155 NULL, NULL, /* filter */
3156 MHLEN, /* maxsize */
3158 MHLEN, /* maxsegsize */
3159 BUS_DMA_ALLOCNOW, /* flags */
3160 &ss->rx_small.dmat); /* tag */
3162 device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
/* big-rx tag: geometry differs when MXGE_VIRT_JUMBOS stitches 4KB pages */
3167 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3169 #if MXGE_VIRT_JUMBOS
3170 4096, /* boundary */
3174 BUS_SPACE_MAXADDR, /* low */
3175 BUS_SPACE_MAXADDR, /* high */
3176 NULL, NULL, /* filter */
3177 3*4096, /* maxsize */
3178 #if MXGE_VIRT_JUMBOS
3180 4096, /* maxsegsize*/
3183 MJUM9BYTES, /* maxsegsize*/
3185 BUS_DMA_ALLOCNOW, /* flags */
3186 &ss->rx_big.dmat); /* tag */
3188 device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3192 for (i = 0; i <= ss->rx_small.mask; i++) {
3193 err = bus_dmamap_create(ss->rx_small.dmat, 0,
3194 &ss->rx_small.info[i].map);
3196 device_printf(sc->dev, "Err %d rx_small dmamap\n",
3201 err = bus_dmamap_create(ss->rx_small.dmat, 0,
3202 &ss->rx_small.extra_map);
3204 device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3209 for (i = 0; i <= ss->rx_big.mask; i++) {
3210 err = bus_dmamap_create(ss->rx_big.dmat, 0,
3211 &ss->rx_big.info[i].map);
3213 device_printf(sc->dev, "Err %d rx_big dmamap\n",
3218 err = bus_dmamap_create(ss->rx_big.dmat, 0,
3219 &ss->rx_big.extra_map);
3221 device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3226 /* now allocate TX resources */
3228 #ifndef IFNET_BUF_RING
3229 /* only use a single TX ring for now */
3230 if (ss != ss->sc->ss)
3234 ss->tx.mask = tx_ring_entries - 1;
3235 ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3238 /* allocate the tx request copy block */
3240 sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3241 ss->tx.req_bytes = kmalloc(bytes, M_DEVBUF, M_WAITOK);
3242 if (ss->tx.req_bytes == NULL)
3244 /* ensure req_list entries are aligned to 8 bytes */
3245 ss->tx.req_list = (mcp_kreq_ether_send_t *)
3246 ((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
3248 /* allocate the tx busdma segment list */
3249 bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3250 ss->tx.seg_list = (bus_dma_segment_t *)
3251 kmalloc(bytes, M_DEVBUF, M_WAITOK);
3252 if (ss->tx.seg_list == NULL)
3255 /* allocate the tx host info ring */
3256 bytes = tx_ring_entries * sizeof (*ss->tx.info);
3257 ss->tx.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3258 if (ss->tx.info == NULL)
3261 /* allocate the tx busdma resources */
3262 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3264 sc->tx_boundary, /* boundary */
3265 BUS_SPACE_MAXADDR, /* low */
3266 BUS_SPACE_MAXADDR, /* high */
3267 NULL, NULL, /* filter */
3268 65536 + 256, /* maxsize */
3269 ss->tx.max_desc - 2, /* num segs */
3270 sc->tx_boundary, /* maxsegsz */
3271 BUS_DMA_ALLOCNOW, /* flags */
3272 &ss->tx.dmat); /* tag */
3275 device_printf(sc->dev, "Err %d allocating tx dmat\n",
3280 /* now use these tags to setup dmamaps for each slot
3282 for (i = 0; i <= ss->tx.mask; i++) {
3283 err = bus_dmamap_create(ss->tx.dmat, 0,
3284 &ss->tx.info[i].map);
3286 device_printf(sc->dev, "Err %d tx dmamap\n",
/*
 * Query the firmware for ring sizes, size the ifnet send queue from the tx
 * entry count, then allocate every slice's rings; frees everything via
 * mxge_free_rings on failure.
 */
3296 mxge_alloc_rings(mxge_softc_t *sc)
3300 int tx_ring_entries, rx_ring_entries;
3303 /* get ring sizes */
3304 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3305 tx_ring_size = cmd.data0;
3307 device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3311 tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3312 rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3313 IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
3314 sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
3315 IFQ_SET_READY(&sc->ifp->if_snd);
3317 for (slice = 0; slice < sc->num_slices; slice++) {
3318 err = mxge_alloc_slice_rings(&sc->ss[slice],
3327 mxge_free_rings(sc);
/*
 * Pick the rx "big buffer" strategy for a given MTU: a single 2KB cluster,
 * a single page-size cluster, or (with MXGE_VIRT_JUMBOS) several 4KB
 * firmware buffers carved from one 9KB cluster.  Outputs: the firmware
 * buffer size, the mbuf cluster size, and buffers-per-frame count.
 */
3334 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3336 int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3338 if (bufsize < MCLBYTES) {
3339 /* easy, everything fits in a single buffer */
3340 *big_buf_size = MCLBYTES;
3341 *cl_size = MCLBYTES;
3346 if (bufsize < MJUMPAGESIZE) {
3347 /* still easy, everything still fits in a single buffer */
3348 *big_buf_size = MJUMPAGESIZE;
3349 *cl_size = MJUMPAGESIZE;
3353 #if MXGE_VIRT_JUMBOS
3354 /* now we need to use virtually contiguous buffers */
3355 *cl_size = MJUM9BYTES;
3356 *big_buf_size = 4096;
3357 *nbufs = mtu / 4096 + 1;
3358 /* needs to be a power of two, so round up */
3362 *cl_size = MJUM9BYTES;
3363 *big_buf_size = MJUM9BYTES;
/*
 * Bring one slice online: populate its LRO free list, fetch the NIC SRAM
 * (lanai) addresses of the send and rx rings from firmware, then pre-post
 * mbufs into the small and big rx rings.  Big-ring shadow addresses start
 * as 0xffffffff sentinels before real buffers are attached.
 */
3369 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3374 struct lro_entry *lro_entry;
3379 slice = ss - sc->ss;
3381 SLIST_INIT(&ss->lro_free);
3382 SLIST_INIT(&ss->lro_active);
3384 for (i = 0; i < sc->lro_cnt; i++) {
3385 lro_entry = (struct lro_entry *)
3386 kmalloc(sizeof (*lro_entry), M_DEVBUF,
3388 if (lro_entry == NULL) {
3392 SLIST_INSERT_HEAD(&ss->lro_free, lro_entry, next);
3394 /* get the lanai pointers to the send and receive rings */
3397 #ifndef IFNET_BUF_RING
3398 /* We currently only send from the first slice */
3402 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3404 (volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
/* doorbell registers live at fixed 64-byte strides per slice in SRAM */
3405 ss->tx.send_go = (volatile uint32_t *)
3406 (sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3407 ss->tx.send_stop = (volatile uint32_t *)
3408 (sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3409 #ifndef IFNET_BUF_RING
3413 err |= mxge_send_cmd(sc,
3414 MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3415 ss->rx_small.lanai =
3416 (volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3418 err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3420 (volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3423 device_printf(sc->dev,
3424 "failed to get ring sizes or locations\n");
3428 /* stock receive rings */
3429 for (i = 0; i <= ss->rx_small.mask; i++) {
3430 map = ss->rx_small.info[i].map;
3431 err = mxge_get_buf_small(ss, map, i);
3433 device_printf(sc->dev, "alloced %d/%d smalls\n",
3434 i, ss->rx_small.mask + 1);
3438 for (i = 0; i <= ss->rx_big.mask; i++) {
3439 ss->rx_big.shadow[i].addr_low = 0xffffffff;
3440 ss->rx_big.shadow[i].addr_high = 0xffffffff;
3442 ss->rx_big.nbufs = nbufs;
3443 ss->rx_big.cl_size = cl_size;
3444 ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3445 ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3446 for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3447 map = ss->rx_big.info[i].map;
3448 err = mxge_get_buf_big(ss, map, i);
3450 device_printf(sc->dev, "alloced %d/%d bigs\n",
3451 i, ss->rx_big.mask + 1);
/*
 * Bring the interface up: reset the NIC, program the RSS indirection
 * table (multi-slice only), tell firmware the MTU and buffer sizes, point
 * it at the per-slice stats DMA blocks (falling back to the obsolete
 * single-block command on old firmware, which also disables multicast
 * support), open every slice, then issue ETHERNET_UP and start the tick
 * callout.  On failure, rx mbufs already posted are freed.
 */
3459 mxge_open(mxge_softc_t *sc)
3462 int err, big_bytes, nbufs, slice, cl_size, i;
3464 volatile uint8_t *itable;
3465 struct mxge_slice_state *ss;
3467 /* Copy the MAC address in case it was overridden */
3468 bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3470 err = mxge_reset(sc, 1);
3472 device_printf(sc->dev, "failed to reset\n");
3476 if (sc->num_slices > 1) {
3477 /* setup the indirection table */
3478 cmd.data0 = sc->num_slices;
3479 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3482 err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3485 device_printf(sc->dev,
3486 "failed to setup rss tables\n");
3490 /* just enable an identity mapping */
3491 itable = sc->sram + cmd.data0;
3492 for (i = 0; i < sc->num_slices; i++)
3493 itable[i] = (uint8_t)i;
3496 cmd.data1 = mxge_rss_hash_type;
3497 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3499 device_printf(sc->dev, "failed to enable slices\n");
3505 mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3508 err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3510 /* error is only meaningful if we're trying to set
3511 MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3512 if (err && nbufs > 1) {
3513 device_printf(sc->dev,
3514 "Failed to set alway-use-n to %d\n",
3518 /* Give the firmware the mtu and the big and small buffer
3519 sizes. The firmware wants the big buf size to be a power
3520 of two. Luckily, FreeBSD's clusters are powers of two */
3521 cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3522 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3523 cmd.data0 = MHLEN - MXGEFW_PAD;
3524 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3526 cmd.data0 = big_bytes;
3527 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3530 device_printf(sc->dev, "failed to setup params\n");
3534 /* Now give him the pointer to the stats block */
3536 #ifdef IFNET_BUF_RING
3537 slice < sc->num_slices;
3542 ss = &sc->ss[slice];
3544 MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3546 MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3547 cmd.data2 = sizeof(struct mcp_irq_data);
3548 cmd.data2 |= (slice << 16);
3549 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
/* old firmware path: single stats block, obsolete command */
3553 bus = sc->ss->fw_stats_dma.bus_addr;
3554 bus += offsetof(struct mcp_irq_data, send_done_count);
3555 cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3556 cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3557 err = mxge_send_cmd(sc,
3558 MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3560 /* Firmware cannot support multicast without STATS_DMA_V2 */
3561 sc->fw_multicast_support = 0;
3563 sc->fw_multicast_support = 1;
3567 device_printf(sc->dev, "failed to setup params\n");
3571 for (slice = 0; slice < sc->num_slices; slice++) {
3572 err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3574 device_printf(sc->dev, "couldn't open slice %d\n",
3580 /* Finally, start the firmware running */
3581 err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3583 device_printf(sc->dev, "Couldn't bring up link\n");
3586 #ifdef IFNET_BUF_RING
3587 for (slice = 0; slice < sc->num_slices; slice++) {
3588 ss = &sc->ss[slice];
3589 ss->if_flags |= IFF_RUNNING;
3590 ss->if_flags &= ~IFF_OACTIVE;
3593 sc->ifp->if_flags |= IFF_RUNNING;
3594 sc->ifp->if_flags &= ~IFF_OACTIVE;
3595 callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3601 mxge_free_mbufs(sc);
/*
 * Bring the interface down: stop the tick callout, clear RUNNING, send
 * ETHERNET_DOWN, then wait (bounded by the interrupt coalescing delay)
 * for the firmware's "down" interrupt to bump sc->down_cnt before freeing
 * all posted mbufs.
 */
3607 mxge_close(mxge_softc_t *sc)
3610 int err, old_down_cnt;
3611 #ifdef IFNET_BUF_RING
3612 struct mxge_slice_state *ss;
3616 callout_stop(&sc->co_hdl);
3617 #ifdef IFNET_BUF_RING
3618 for (slice = 0; slice < sc->num_slices; slice++) {
3619 ss = &sc->ss[slice];
3620 ss->if_flags &= ~IFF_RUNNING;
3623 sc->ifp->if_flags &= ~IFF_RUNNING;
3624 old_down_cnt = sc->down_cnt;
3626 err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3628 device_printf(sc->dev, "Couldn't bring down link\n");
3630 if (old_down_cnt == sc->down_cnt) {
3631 /* wait for down irq */
3632 DELAY(10 * sc->intr_coal_delay);
3635 if (old_down_cnt == sc->down_cnt) {
3636 device_printf(sc->dev, "never got down irq\n");
3639 mxge_free_mbufs(sc);
/*
 * (Re)program our PCI config space: record the negotiated PCIe link width
 * from the Link Status register, raise the Device Control max read request
 * size to 4KB (value 5 in bits 14:12), and enable bus mastering plus
 * memory-space decoding.  Also called after a NIC reboot to redo these
 * settings (see mxge_watchdog_reset).
 *
 * Fix: the pci_find_extcap() call contained a mojibake "(R)" character
 * where "&reg" belongs (an HTML-entity mangling of "&reg"); restored the
 * address-of argument so the capability offset is returned in 'reg'.
 */
3645 mxge_setup_cfg_space(mxge_softc_t *sc)
3647 device_t dev = sc->dev;
3649 uint16_t cmd, lnk, pectl;
3651 /* find the PCIe link width and set max read request to 4KB*/
3652 if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
3653 lnk = pci_read_config(dev, reg + 0x12, 2);
3654 sc->link_width = (lnk >> 4) & 0x3f;
3656 pectl = pci_read_config(dev, reg + 0x8, 2);
3657 pectl = (pectl & ~0x7000) | (5 << 12);
3658 pci_write_config(dev, reg + 0x8, pectl, 2);
3661 /* Enable DMA and Memory space access */
3662 pci_enable_busmaster(dev);
3663 cmd = pci_read_config(dev, PCIR_COMMAND, 2);
3664 cmd |= PCIM_CMD_MEMEN;
3665 pci_write_config(dev, PCIR_COMMAND, cmd, 2);
/*
 * Read the NIC's reboot-status register through the vendor-specific PCI
 * capability's indirect read32 window: enable read32 mode, select register
 * 0xfffffff0, then read the data port.  Returns (uint32_t)-1 if the
 * vendor-specific capability cannot be found.
 */
3669 mxge_read_reboot(mxge_softc_t *sc)
3671 device_t dev = sc->dev;
3674 /* find the vendor specific offset */
3675 if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
3676 device_printf(sc->dev,
3677 "could not find vendor specific offset\n");
3678 return (uint32_t)-1;
3680 /* enable read32 mode */
3681 pci_write_config(dev, vs + 0x10, 0x3, 1);
3682 /* tell NIC which register to read */
3683 pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3684 return (pci_read_config(dev, vs + 0x14, 4));
/*
 * Attempt recovery after the tx watchdog fires.  If PCI config space reads
 * back as 0xffff the NIC may still be mid-reboot, so re-poll briefly.  If
 * bus mastering was cleared, the NIC rebooted: log its reboot status,
 * restore config space from the saved copy, redo our own config tweaks,
 * and reopen the interface if it was running.  Otherwise just dump the
 * slice's tx ring state and leave the NIC alone.
 */
3688 mxge_watchdog_reset(mxge_softc_t *sc, int slice)
3690 struct pci_devinfo *dinfo;
3698 device_printf(sc->dev, "Watchdog reset!\n");
3701 * check to see if the NIC rebooted. If it did, then all of
3702 * PCI config space has been reset, and things like the
3703 * busmaster bit will be zero. If this is the case, then we
3704 * must restore PCI config space before the NIC can be used
3707 cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3708 if (cmd == 0xffff) {
3710 * maybe the watchdog caught the NIC rebooting; wait
3711 * up to 100ms for it to finish. If it does not come
3712 * back, then give up
3715 cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3716 if (cmd == 0xffff) {
3717 device_printf(sc->dev, "NIC disappeared!\n");
3721 if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3722 /* print the reboot status */
3723 reboot = mxge_read_reboot(sc);
3724 device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3726 /* restore PCI configuration space */
3727 dinfo = device_get_ivars(sc->dev);
3728 pci_cfg_restore(sc->dev, dinfo);
3730 /* and redo any changes we made to our config space */
3731 mxge_setup_cfg_space(sc);
3733 if (sc->ifp->if_flags & IFF_RUNNING) {
3735 err = mxge_open(sc);
3738 tx = &sc->ss[slice].tx;
3739 device_printf(sc->dev,
3740 "NIC did not reboot, slice %d ring state:\n",
3742 device_printf(sc->dev,
3743 "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3744 tx->req, tx->done, tx->queue_active);
3745 device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3746 tx->activate, tx->deactivate);
3747 device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3749 be32toh(sc->ss->fw_stats->send_done_count));
3750 device_printf(sc->dev, "not resetting\n");
/*
 * Periodic tx-hang check, run from mxge_tick.  A slice is considered hung
 * when requests were queued but the done counter has not advanced across
 * two watchdog intervals; if the firmware's dropped_pause counter also did
 * not move, the stall is blamed on the NIC and mxge_watchdog_reset runs,
 * otherwise a flow-control warning is printed.  Also triggers a deferred
 * media probe when the interrupt handler requested one.
 */
3756 mxge_watchdog(mxge_softc_t *sc)
3759 uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3762 /* see if we have outstanding transmits, which
3763 have been pending for more than mxge_ticks */
3765 #ifdef IFNET_BUF_RING
3766 (i < sc->num_slices) && (err == 0);
3768 (i < 1) && (err == 0);
3772 if (tx->req != tx->done &&
3773 tx->watchdog_req != tx->watchdog_done &&
3774 tx->done == tx->watchdog_done) {
3775 /* check for pause blocking before resetting */
3776 if (tx->watchdog_rx_pause == rx_pause)
3777 err = mxge_watchdog_reset(sc, i);
3779 device_printf(sc->dev, "Flow control blocking "
3780 "xmits, check link partner\n");
/* snapshot counters for comparison on the next watchdog pass */
3783 tx->watchdog_req = tx->req;
3784 tx->watchdog_done = tx->done;
3785 tx->watchdog_rx_pause = rx_pause;
3788 if (sc->need_media_probe)
3789 mxge_media_probe(sc);
/*
 * Sum the per-slice packet/error counters into the ifnet statistics.
 * With IFNET_BUF_RING additional byte, multicast, and buf_ring drop
 * counters are aggregated as well.
 */
3794 mxge_update_stats(mxge_softc_t *sc)
3796 struct mxge_slice_state *ss;
3797 u_long ipackets = 0;
3798 u_long opackets = 0;
3799 #ifdef IFNET_BUF_RING
3807 for (slice = 0; slice < sc->num_slices; slice++) {
3808 ss = &sc->ss[slice];
3809 ipackets += ss->ipackets;
3810 opackets += ss->opackets;
3811 #ifdef IFNET_BUF_RING
3812 obytes += ss->obytes;
3813 omcasts += ss->omcasts;
3814 odrops += ss->tx.br->br_drops;
3816 oerrors += ss->oerrors;
3818 sc->ifp->if_ipackets = ipackets;
3819 sc->ifp->if_opackets = opackets;
3820 #ifdef IFNET_BUF_RING
3821 sc->ifp->if_obytes = obytes;
3822 sc->ifp->if_omcasts = omcasts;
3823 sc->ifp->if_snd.ifq_drops = odrops;
3825 sc->ifp->if_oerrors = oerrors;
/*
 * Periodic callout (every mxge_ticks): aggregate stats, and run the tx
 * watchdog on every 5th invocation (countdown from 4).  Reschedules
 * itself; all work is done under the driver lock.
 */
3829 mxge_tick(void *arg)
3831 mxge_softc_t *sc = arg;
3834 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
3835 /* aggregate stats from different slices */
3836 mxge_update_stats(sc);
3837 if (!sc->watchdog_countdown) {
3838 err = mxge_watchdog(sc);
3839 sc->watchdog_countdown = 4;
3841 sc->watchdog_countdown--;
3843 callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3844 lockmgr(&sc->driver_lock, LK_RELEASE);
/*
 * ifmedia change callback.
 * NOTE(review): body not visible in this decimated listing — likely a
 * stub (media is fixed 10G); confirm against the full source.
 */
3848 mxge_media_change(struct ifnet *ifp)
/*
 * Set a new MTU.  Rejects frame sizes above sc->max_mtu or below 60
 * bytes (real_mtu includes Ethernet + VLAN headers).  If the interface
 * is running it is closed and reopened; on reopen failure the old MTU
 * is restored and a best-effort reopen is attempted.
 */
3854 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3856 struct ifnet *ifp = sc->ifp;
3857 int real_mtu, old_mtu;
3861 real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3862 if ((real_mtu > sc->max_mtu) || real_mtu < 60)
3864 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
3865 old_mtu = ifp->if_mtu;
3867 if (ifp->if_flags & IFF_RUNNING) {
3869 err = mxge_open(sc);
3871 ifp->if_mtu = old_mtu;
3873 (void) mxge_open(sc);
3876 lockmgr(&sc->driver_lock, LK_RELEASE);
/*
 * ifmedia status callback: report media as valid, autoselect Ethernet,
 * and mark ACTIVE/FDX according to the cached link state.
 */
3881 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3883 mxge_softc_t *sc = ifp->if_softc;
3888 ifmr->ifm_status = IFM_AVALID;
3889 ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
3890 ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
3891 ifmr->ifm_active |= sc->link_state ? IFM_FDX : 0;
/*
 * Interface ioctl handler.  Dispatches (switch labels elided in this
 * listing) to: ether_ioctl for address ops, MTU changes, IFF_UP/DOWN
 * open/close plus promiscuous/multicast updates, multicast list reloads,
 * capability toggles (TXCSUM implies TSO4 eligibility; disabling TXCSUM
 * also drops TSO4; LRO count switches via mxge_change_lro_locked), and
 * ifmedia ioctls.  Driver lock is taken around state-changing paths.
 */
3895 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data, struct ucred *cr)
3897 mxge_softc_t *sc = ifp->if_softc;
3898 struct ifreq *ifr = (struct ifreq *)data;
3906 err = ether_ioctl(ifp, command, data);
3910 err = mxge_change_mtu(sc, ifr->ifr_mtu);
3914 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
3916 lockmgr(&sc->driver_lock, LK_RELEASE);
3919 if (ifp->if_flags & IFF_UP) {
3920 if (!(ifp->if_flags & IFF_RUNNING)) {
3921 err = mxge_open(sc);
3923 /* take care of promisc and allmulti
3925 mxge_change_promisc(sc,
3926 ifp->if_flags & IFF_PROMISC);
3927 mxge_set_multicast_list(sc);
3930 if (ifp->if_flags & IFF_RUNNING) {
3934 lockmgr(&sc->driver_lock, LK_RELEASE);
3939 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
3940 mxge_set_multicast_list(sc);
3941 lockmgr(&sc->driver_lock, LK_RELEASE);
3945 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
3946 mask = ifr->ifr_reqcap ^ ifp->if_capenable;
3947 if (mask & IFCAP_TXCSUM) {
3948 if (IFCAP_TXCSUM & ifp->if_capenable) {
3949 ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
3950 ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
3953 ifp->if_capenable |= IFCAP_TXCSUM;
3954 ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
3956 } else if (mask & IFCAP_RXCSUM) {
3957 if (IFCAP_RXCSUM & ifp->if_capenable) {
3958 ifp->if_capenable &= ~IFCAP_RXCSUM;
3961 ifp->if_capenable |= IFCAP_RXCSUM;
3965 if (mask & IFCAP_TSO4) {
3966 if (IFCAP_TSO4 & ifp->if_capenable) {
3967 ifp->if_capenable &= ~IFCAP_TSO4;
3968 ifp->if_hwassist &= ~CSUM_TSO;
3969 } else if (IFCAP_TXCSUM & ifp->if_capenable) {
3970 ifp->if_capenable |= IFCAP_TSO4;
3971 ifp->if_hwassist |= CSUM_TSO;
3973 kprintf("mxge requires tx checksum offload"
3974 " be enabled to use TSO\n");
3978 if (mask & IFCAP_LRO) {
3979 if (IFCAP_LRO & ifp->if_capenable)
3980 err = mxge_change_lro_locked(sc, 0);
3982 err = mxge_change_lro_locked(sc, mxge_lro_cnt);
3984 if (mask & IFCAP_VLAN_HWTAGGING)
3985 ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
3986 lockmgr(&sc->driver_lock, LK_RELEASE);
3987 VLAN_CAPABILITIES(ifp);
3992 err = ifmedia_ioctl(ifp, (struct ifreq *)data,
3993 &sc->media, command);
/*
 * Read the hw.mxge.* loader tunables into the module globals (and
 * sc->lro_cnt), then clamp each to a sane range: coalescing delay to
 * [0, 10ms] (default 30us), tick period to hz/2 when unset, RSS hash
 * type into its valid enum range, and initial MTU into
 * [ETHER_MIN_LEN, ETHERMTU_JUMBO].
 */
4003 mxge_fetch_tunables(mxge_softc_t *sc)
4006 TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4007 TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4008 &mxge_flow_control);
4009 TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4010 &mxge_intr_coal_delay);
4011 TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4012 &mxge_nvidia_ecrc_enable);
4013 TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4014 &mxge_force_firmware);
4015 TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4016 &mxge_deassert_wait);
4017 TUNABLE_INT_FETCH("hw.mxge.verbose",
4019 TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4020 TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
4021 TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
4022 TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4023 TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4024 if (sc->lro_cnt != 0)
4025 mxge_lro_cnt = sc->lro_cnt;
4029 if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4030 mxge_intr_coal_delay = 30;
4031 if (mxge_ticks == 0)
4032 mxge_ticks = hz / 2;
4033 sc->pause = mxge_flow_control;
4034 if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4035 || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4036 mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
4038 if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4039 mxge_initial_mtu < ETHER_MIN_LEN)
4040 mxge_initial_mtu = ETHERMTU_JUMBO;
/*
 * Free everything mxge_alloc_slices created: each slice's firmware stats
 * DMA block, its tx buf_ring (IFNET_BUF_RING) and tx lock, and its
 * rx_done DMA block; finally the slice array itself.
 */
4045 mxge_free_slices(mxge_softc_t *sc)
4047 struct mxge_slice_state *ss;
4054 for (i = 0; i < sc->num_slices; i++) {
4056 if (ss->fw_stats != NULL) {
4057 mxge_dma_free(&ss->fw_stats_dma);
4058 ss->fw_stats = NULL;
4059 #ifdef IFNET_BUF_RING
4060 if (ss->tx.br != NULL) {
4061 drbr_free(ss->tx.br, M_DEVBUF);
4065 lockuninit(&ss->tx.lock);
4067 if (ss->rx_done.entry != NULL) {
4068 mxge_dma_free(&ss->rx_done.dma);
4069 ss->rx_done.entry = NULL;
4072 kfree(sc->ss, M_DEVBUF);
/*
 * Allocate the slice array and, per slice: a page-aligned rx_done
 * interrupt queue (2 slots per rx ring entry, zeroed), a 64-byte-aligned
 * firmware stats block, a named tx lock, and (IFNET_BUF_RING) a 2048-entry
 * tx buf_ring.  Unwinds through mxge_free_slices on failure.
 */
4077 mxge_alloc_slices(mxge_softc_t *sc)
4080 struct mxge_slice_state *ss;
4082 int err, i, max_intr_slots;
4084 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4086 device_printf(sc->dev, "Cannot determine rx ring size\n");
4089 sc->rx_ring_size = cmd.data0;
4090 max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4092 bytes = sizeof (*sc->ss) * sc->num_slices;
4093 sc->ss = kmalloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4096 for (i = 0; i < sc->num_slices; i++) {
4101 /* allocate per-slice rx interrupt queues */
4103 bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4104 err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4107 ss->rx_done.entry = ss->rx_done.dma.addr;
4108 bzero(ss->rx_done.entry, bytes);
4111 * allocate the per-slice firmware stats; stats
4112 * (including tx) are used only on the first
4115 #ifndef IFNET_BUF_RING
4120 bytes = sizeof (*ss->fw_stats);
4121 err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4122 sizeof (*ss->fw_stats), 64);
4125 ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4126 ksnprintf(ss->tx.lock_name, sizeof(ss->tx.lock_name),
4127 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
4128 lockinit(&ss->tx.lock, ss->tx.lock_name, 0, LK_CANRECURSE);
4129 #ifdef IFNET_BUF_RING
4130 ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4138 mxge_free_slices(sc);
/*
 * Decide how many rx slices to use.  Requires the multi-slice tunable,
 * an SMP system, and available MSI-X vectors; then loads the RSS-capable
 * firmware, verifies it with a RESET, sizes the interrupt queues, and
 * asks the firmware for its maximum RSS queue count.  The result is
 * capped by MSI-X vectors, CPU count (or the tunable), and rounded down
 * to a power of two.  On any failure the original firmware is reloaded
 * and the driver falls back to a single slice.
 */
4143 mxge_slice_probe(mxge_softc_t *sc)
4147 int msix_cnt, status, max_intr_slots;
4151 * don't enable multiple slices if they are not enabled,
4152 * or if this is not an SMP system
4155 if (mxge_max_slices == 0 || mxge_max_slices == 1 || ncpus < 2)
4158 /* see how many MSI-X interrupts are available */
4159 msix_cnt = pci_msix_count(sc->dev);
4163 /* now load the slice aware firmware see what it supports */
4164 old_fw = sc->fw_name;
4165 if (old_fw == mxge_fw_aligned)
4166 sc->fw_name = mxge_fw_rss_aligned;
4168 sc->fw_name = mxge_fw_rss_unaligned;
4169 status = mxge_load_firmware(sc, 0);
4171 device_printf(sc->dev, "Falling back to a single slice\n");
4175 /* try to send a reset command to the card to see if it
4177 memset(&cmd, 0, sizeof (cmd));
4178 status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4180 device_printf(sc->dev, "failed reset\n");
4184 /* get rx ring size */
4185 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4187 device_printf(sc->dev, "Cannot determine rx ring size\n");
4190 max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4192 /* tell it the size of the interrupt queues */
4193 cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4194 status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4196 device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4200 /* ask the maximum number of slices it supports */
4201 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4203 device_printf(sc->dev,
4204 "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4207 sc->num_slices = cmd.data0;
4208 if (sc->num_slices > msix_cnt)
4209 sc->num_slices = msix_cnt;
4211 if (mxge_max_slices == -1) {
4212 /* cap to number of CPUs in system */
4213 if (sc->num_slices > ncpus)
4214 sc->num_slices = ncpus;
4216 if (sc->num_slices > mxge_max_slices)
4217 sc->num_slices = mxge_max_slices;
4219 /* make sure it is a power of two */
4220 while (sc->num_slices & (sc->num_slices - 1))
4224 device_printf(sc->dev, "using %d slices\n",
4230 sc->fw_name = old_fw;
4231 (void) mxge_load_firmware(sc, 0);
/*
 * Allocate and wire one MSI-X vector per slice: map the MSI-X table BAR,
 * allocate the vectors, the per-slice IRQ resources, and the interrupt
 * handlers (mxge_intr with the slice as argument).  Uses goto-label
 * unwind (abort_with_intr/res/msix/msix_table) to release everything in
 * reverse order on failure.
 * NOTE(review): "XXX" serializer argument is a DragonFly-port placeholder
 * still to be filled in.
 */
4235 mxge_add_msix_irqs(mxge_softc_t *sc)
4238 int count, err, i, rid;
4241 sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4244 if (sc->msix_table_res == NULL) {
4245 device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4249 count = sc->num_slices;
4250 err = pci_alloc_msix(sc->dev, &count);
4252 device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d"
4253 "err = %d \n", sc->num_slices, err);
4254 goto abort_with_msix_table;
4256 if (count < sc->num_slices) {
4257 device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4258 count, sc->num_slices);
4259 device_printf(sc->dev,
4260 "Try setting hw.mxge.max_slices to %d\n",
4263 goto abort_with_msix;
4265 bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4266 sc->msix_irq_res = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4267 if (sc->msix_irq_res == NULL) {
4269 goto abort_with_msix;
4272 for (i = 0; i < sc->num_slices; i++) {
4274 sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4277 if (sc->msix_irq_res[i] == NULL) {
4278 device_printf(sc->dev, "couldn't allocate IRQ res"
4279 " for message %d\n", i);
4281 goto abort_with_res;
4285 bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4286 sc->msix_ih = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4288 for (i = 0; i < sc->num_slices; i++) {
4289 err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4291 mxge_intr, &sc->ss[i], &sc->msix_ih[i],
4292 XXX /* serializer */);
4294 device_printf(sc->dev, "couldn't setup intr for "
4296 goto abort_with_intr;
4301 device_printf(sc->dev, "using %d msix IRQs:",
4303 for (i = 0; i < sc->num_slices; i++)
4304 kprintf(" %ld", rman_get_start(sc->msix_irq_res[i]));
/* error unwind: tear down in reverse order of setup */
4310 for (i = 0; i < sc->num_slices; i++) {
4311 if (sc->msix_ih[i] != NULL) {
4312 bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4314 sc->msix_ih[i] = NULL;
4317 kfree(sc->msix_ih, M_DEVBUF);
4321 for (i = 0; i < sc->num_slices; i++) {
4323 if (sc->msix_irq_res[i] != NULL)
4324 bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4325 sc->msix_irq_res[i]);
4326 sc->msix_irq_res[i] = NULL;
4328 kfree(sc->msix_irq_res, M_DEVBUF);
4332 pci_release_msi(sc->dev);
4334 abort_with_msix_table:
4335 bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4336 sc->msix_table_res);
/*
 * Set up a single interrupt for the non-multislice case: prefer MSI
 * when exactly one message is available, otherwise fall back to legacy
 * INTx, then attach mxge_intr() for slice 0.
 *
 * NOTE(review): lines are elided in this excerpt (return type, braces,
 * rid/legacy_irq assignments, return paths); comments cover only the
 * visible code.
 */
4342 mxge_add_single_irq(mxge_softc_t *sc)
4344 int count, err, rid;
/* Use MSI when the device advertises exactly one message and
 * allocation succeeds; otherwise the legacy INTx path is taken. */
4346 count = pci_msi_count(sc->dev);
4347 if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4353 sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0,
4354 1, RF_SHAREABLE | RF_ACTIVE);
4355 if (sc->irq_res == NULL) {
4356 device_printf(sc->dev, "could not alloc interrupt\n");
4360 device_printf(sc->dev, "using %s irq %ld\n",
4361 sc->legacy_irq ? "INTx" : "MSI",
4362 rman_get_start(sc->irq_res));
/* NOTE(review): XXX placeholder — serializer argument for this port
 * has not been filled in yet. */
4363 err = bus_setup_intr(sc->dev, sc->irq_res,
4365 mxge_intr, &sc->ss[0], &sc->ih,
4366 XXX /* serializer */);
/* On setup failure: release the IRQ (rid 0 for INTx, 1 for MSI) and
 * free the MSI message if one was allocated. */
4368 bus_release_resource(sc->dev, SYS_RES_IRQ,
4369 sc->legacy_irq ? 0 : 1, sc->irq_res);
4370 if (!sc->legacy_irq)
4371 pci_release_msi(sc->dev);
/*
 * Tear down everything mxge_add_msix_irqs() set up, in reverse order:
 * interrupt handlers, per-slice IRQ resources, the MSI-X table mapping,
 * and the MSI-X vectors themselves.
 *
 * NOTE(review): lines are elided in this excerpt (return type, braces,
 * rid initialization); comments cover only the visible code.
 */
4377 mxge_rem_msix_irqs(mxge_softc_t *sc)
/* Detach each slice's interrupt handler and free the cookie array. */
4381 for (i = 0; i < sc->num_slices; i++) {
4382 if (sc->msix_ih[i] != NULL) {
4383 bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4385 sc->msix_ih[i] = NULL;
4388 kfree(sc->msix_ih, M_DEVBUF);
/* Release each slice's IRQ resource and free the resource array. */
4390 for (i = 0; i < sc->num_slices; i++) {
4392 if (sc->msix_irq_res[i] != NULL)
4393 bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4394 sc->msix_irq_res[i]);
4395 sc->msix_irq_res[i] = NULL;
4397 kfree(sc->msix_irq_res, M_DEVBUF);
/* Unmap the MSI-X table BAR and give the vectors back to the bus. */
4399 bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4400 sc->msix_table_res);
4402 pci_release_msi(sc->dev);
/*
 * Undo mxge_add_single_irq(): detach the handler, release the IRQ
 * resource (rid 0 for legacy INTx, rid 1 for MSI), and free the MSI
 * message when one was in use.
 */
4407 mxge_rem_single_irq(mxge_softc_t *sc)
4409 bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4410 bus_release_resource(sc->dev, SYS_RES_IRQ,
4411 sc->legacy_irq ? 0 : 1, sc->irq_res);
4412 if (!sc->legacy_irq)
4413 pci_release_msi(sc->dev);
/*
 * Remove whichever interrupt setup is active: MSI-X teardown for the
 * multi-slice configuration, single-IRQ teardown otherwise.
 * NOTE(review): the else keyword for the single-IRQ call is on an
 * elided line in this excerpt.
 */
4417 mxge_rem_irq(mxge_softc_t *sc)
4419 if (sc->num_slices > 1)
4420 mxge_rem_msix_irqs(sc);
4422 mxge_rem_single_irq(sc);
/*
 * Install interrupts: MSI-X when running multiple slices, a single
 * MSI/INTx otherwise.  Returns the error from whichever path ran.
 */
4426 mxge_add_irq(mxge_softc_t *sc)
4430 if (sc->num_slices > 1)
4431 err = mxge_add_msix_irqs(sc);
4433 err = mxge_add_single_irq(sc);
/* Deliberately disabled ("0 &&") debug path: re-running the MSI-X
 * setup to exercise the teardown code.  Dead code by design — do not
 * "fix" the condition without understanding why it was disabled. */
4435 if (0 && err == 0 && sc->num_slices > 1) {
4436 mxge_rem_msix_irqs(sc);
4437 err = mxge_add_msix_irqs(sc);
/*
 * Device attach: bring the NIC from cold PCI device to a registered
 * ethernet interface.  Order: tunables -> parent DMA tag -> locks ->
 * config space -> BAR mapping -> EEPROM strings -> out-of-band DMA
 * buffers -> firmware selection -> slices -> rings -> IRQs -> ifnet
 * capabilities -> ifmedia -> ether_ifattach -> sysctls.  Every failure
 * unwinds through the goto ladder at the bottom, releasing resources
 * in reverse acquisition order.
 *
 * NOTE(review): many lines (declarations, braces, if (err) tests,
 * return statements, some labels) are elided in this excerpt; comments
 * describe only the visible code.
 */
4444 mxge_attach(device_t dev)
4446 mxge_softc_t *sc = device_get_softc(dev);
4447 struct ifnet *ifp = &sc->arpcom.ac_if;
4451 * avoid rewriting half the lines in this file to use
4452 * &sc->arpcom.ac_if instead
4456 mxge_fetch_tunables(sc);
/* Parent DMA tag all other tags derive from; maxsize 65536+256 covers
 * a 64KB TSO frame plus header slop. */
4458 err = bus_dma_tag_create(NULL, /* parent */
4461 BUS_SPACE_MAXADDR, /* low */
4462 BUS_SPACE_MAXADDR, /* high */
4463 NULL, NULL, /* filter */
4464 65536 + 256, /* maxsize */
4465 MXGE_MAX_SEND_DESC, /* num segs */
4466 65536, /* maxsegsize */
4468 &sc->parent_dmat); /* tag */
4471 device_printf(sc->dev, "Err %d allocating parent dmat\n",
4473 goto abort_with_nothing;
4477 if_initname(ifp, device_get_name(dev), device_get_unit(dev));
/* Command and driver locks, named after the device for lock debugging. */
4479 ksnprintf(sc->cmd_lock_name, sizeof(sc->cmd_lock_name), "%s:cmd",
4480 device_get_nameunit(dev));
4481 lockinit(&sc->cmd_lock, sc->cmd_lock_name, 0, LK_CANRECURSE);
4482 ksnprintf(sc->driver_lock_name, sizeof(sc->driver_lock_name),
4483 "%s:drv", device_get_nameunit(dev));
4484 lockinit(&sc->driver_lock, sc->driver_lock_name,
4487 callout_init(&sc->co_hdl);
4489 mxge_setup_cfg_space(sc);
4491 /* Map the board into the kernel */
4493 sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
4495 if (sc->mem_res == NULL) {
4496 device_printf(dev, "could not map memory\n");
4498 goto abort_with_lock;
4500 sc->sram = rman_get_virtual(sc->mem_res);
/* Usable SRAM: 2MB minus reserved regions minus 0x100 slack.
 * NOTE(review): the exact layout behind these constants is firmware-
 * defined — confirm against the Myri10GE MCP documentation. */
4501 sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4502 if (sc->sram_size > rman_get_size(sc->mem_res)) {
4503 device_printf(dev, "impossible memory region size %ld\n",
4504 rman_get_size(sc->mem_res));
4506 goto abort_with_mem_res;
4509 /* make NULL terminated copy of the EEPROM strings section of
4511 bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
/* The strings live at the top of SRAM; copy 2 bytes fewer than the
 * buffer so the bzero above guarantees NUL termination. */
4512 bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4513 rman_get_bushandle(sc->mem_res),
4514 sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4516 MXGE_EEPROM_STRINGS_SIZE - 2);
4517 err = mxge_parse_strings(sc);
4519 goto abort_with_mem_res;
4521 /* Enable write combining for efficient use of PCIe bus */
4524 /* Allocate the out of band dma memory */
4525 err = mxge_dma_alloc(sc, &sc->cmd_dma,
4526 sizeof (mxge_cmd_t), 64);
4528 goto abort_with_mem_res;
4529 sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
/* 64-byte zero pad used for short-frame padding; 4KB dmabench buffer
 * for the firmware's DMA benchmark. */
4530 err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4532 goto abort_with_cmd_dma;
4534 err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4536 goto abort_with_zeropad_dma;
4538 /* select & load the firmware */
4539 err = mxge_select_firmware(sc);
4541 goto abort_with_dmabench;
4542 sc->intr_coal_delay = mxge_intr_coal_delay;
/* Decide slice count, then allocate per-slice state and rings. */
4544 mxge_slice_probe(sc);
4545 err = mxge_alloc_slices(sc);
4547 goto abort_with_dmabench;
4549 err = mxge_reset(sc, 0);
4551 goto abort_with_slices;
4553 err = mxge_alloc_rings(sc);
4555 device_printf(sc->dev, "failed to allocate rings\n");
4556 goto abort_with_dmabench;
4559 err = mxge_add_irq(sc);
4561 device_printf(sc->dev, "failed to add irq\n");
4562 goto abort_with_rings;
/* Advertise capabilities; LRO and jumbo support are conditional. */
4565 ifp->if_baudrate = IF_Gbps(10UL);
4566 ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4569 ifp->if_capabilities |= IFCAP_LRO;
4572 #ifdef MXGE_NEW_VLAN_API
4573 ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
/* Jumbo frames require firmware that reports an MTU of at least 9000. */
4576 sc->max_mtu = mxge_max_mtu(sc);
4577 if (sc->max_mtu >= 9000)
4578 ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4580 device_printf(dev, "MTU limited to %d. Install "
4581 "latest firmware for 9000 byte jumbo support\n",
4582 sc->max_mtu - ETHER_HDR_LEN);
4583 ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4584 ifp->if_capenable = ifp->if_capabilities;
4585 if (sc->lro_cnt == 0)
4586 ifp->if_capenable &= ~IFCAP_LRO;
4588 ifp->if_init = mxge_init;
4590 ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4591 ifp->if_ioctl = mxge_ioctl;
4592 ifp->if_start = mxge_start;
4593 /* Initialise the ifmedia structure */
4594 ifmedia_init(&sc->media, 0, mxge_media_change,
4596 mxge_set_media(sc, IFM_ETHER | IFM_AUTO);
4597 mxge_media_probe(sc);
4599 ether_ifattach(ifp, sc->mac_addr);
4600 /* ether_ifattach sets mtu to ETHERMTU */
4601 if (mxge_initial_mtu != ETHERMTU)
4602 mxge_change_mtu(sc, mxge_initial_mtu);
4604 mxge_add_sysctls(sc);
4605 #ifdef IFNET_BUF_RING
4606 ifp->if_transmit = mxge_transmit;
4607 ifp->if_qflush = mxge_qflush;
/* Error unwind: release everything acquired above, newest first. */
4612 mxge_free_rings(sc);
4614 mxge_free_slices(sc);
4615 abort_with_dmabench:
4616 mxge_dma_free(&sc->dmabench_dma);
4617 abort_with_zeropad_dma:
4618 mxge_dma_free(&sc->zeropad_dma);
4620 mxge_dma_free(&sc->cmd_dma);
4622 bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4624 pci_disable_busmaster(dev);
4625 lockuninit(&sc->cmd_lock);
4626 lockuninit(&sc->driver_lock);
4628 abort_with_parent_dmat:
4629 bus_dma_tag_destroy(sc->parent_dmat);
/*
 * Device detach: refuse while VLANs are attached, stop the interface
 * under the driver lock, then release all resources in roughly the
 * reverse of mxge_attach()'s acquisition order.
 *
 * NOTE(review): lines are elided in this excerpt (declarations, braces,
 * the close/stop call under the lock, IRQ removal, return); comments
 * cover only the visible code.
 */
4636 mxge_detach(device_t dev)
4638 mxge_softc_t *sc = device_get_softc(dev);
/* VLAN interfaces hold references; the operator must detach them first. */
4640 if (mxge_vlans_active(sc)) {
4641 device_printf(sc->dev,
4642 "Detach vlans before removing module\n");
/* Quiesce the interface while holding the driver lock. */
4645 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
4647 if (sc->ifp->if_flags & IFF_RUNNING)
4649 lockmgr(&sc->driver_lock, LK_RELEASE);
/* Unregister from the network stack before freeing driver state. */
4650 ether_ifdetach(sc->ifp);
4651 callout_drain(&sc->co_hdl);
4652 ifmedia_removeall(&sc->media);
/* Tell the firmware to stop its dummy RDMA before tearing down DMA. */
4653 mxge_dummy_rdma(sc, 0);
4654 mxge_rem_sysctls(sc);
4656 mxge_free_rings(sc);
4657 mxge_free_slices(sc);
4658 mxge_dma_free(&sc->dmabench_dma);
4659 mxge_dma_free(&sc->zeropad_dma);
4660 mxge_dma_free(&sc->cmd_dma);
4661 bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4662 pci_disable_busmaster(dev);
4663 lockuninit(&sc->cmd_lock);
4664 lockuninit(&sc->driver_lock);
4666 bus_dma_tag_destroy(sc->parent_dmat);
4671 mxge_shutdown(device_t dev)
4677 This file uses Myri10GE driver indentation.
4680 c-file-style:"linux"