1 /******************************************************************************
3 Copyright (c) 2006-2009, Myricom Inc.
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
9 1. Redistributions of source code must retain the above copyright notice,
10 this list of conditions and the following disclaimer.
12 2. Neither the name of the Myricom Inc, nor the names of its
13 contributors may be used to endorse or promote products derived from
14 this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
28 ***************************************************************************/
30 #include <sys/cdefs.h>
31 /*__FBSDID("$FreeBSD: src/sys/dev/mxge/if_mxge.c,v 1.63 2009/06/26 11:45:06 rwatson Exp $");*/
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/linker.h>
36 #include <sys/firmware.h>
37 #include <sys/endian.h>
38 #include <sys/in_cksum.h>
39 #include <sys/sockio.h>
41 #include <sys/malloc.h>
42 #include <sys/kernel.h>
43 #include <sys/module.h>
44 #include <sys/serialize.h>
45 #include <sys/socket.h>
46 #include <sys/sysctl.h>
48 /* count xmits ourselves, rather than via drbr */
51 #include <net/if_arp.h>
52 #include <net/ifq_var.h>
53 #include <net/ethernet.h>
54 #include <net/if_dl.h>
55 #include <net/if_media.h>
59 #include <net/if_types.h>
60 #include <net/vlan/if_vlan_var.h>
63 #include <netinet/in_systm.h>
64 #include <netinet/in.h>
65 #include <netinet/ip.h>
66 #include <netinet/tcp.h>
71 #include <bus/pci/pcireg.h>
72 #include <bus/pci/pcivar.h>
73 #include <bus/pci/pci_private.h> /* XXX for pci_cfg_restore */
75 #include <vm/vm.h> /* for pmap_mapdev() */
78 #if defined(__i386) || defined(__amd64)
79 #include <machine/specialreg.h>
82 #include <dev/netif/mxge/mxge_mcp.h>
83 #include <dev/netif/mxge/mcp_gen_header.h>
84 /*#define MXGE_FAKE_IFP*/
85 #include <dev/netif/mxge/if_mxge_var.h>
87 #include <sys/buf_ring.h>
93 static int mxge_nvidia_ecrc_enable = 1;
94 static int mxge_force_firmware = 0;
95 static int mxge_intr_coal_delay = 30;
96 static int mxge_deassert_wait = 1;
97 static int mxge_flow_control = 1;
98 static int mxge_verbose = 0;
99 static int mxge_lro_cnt = 8;
100 static int mxge_ticks;
101 static int mxge_max_slices = 1;
102 static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
103 static int mxge_always_promisc = 0;
104 static int mxge_initial_mtu = ETHERMTU_JUMBO;
105 static char *mxge_fw_unaligned = "mxge_ethp_z8e";
106 static char *mxge_fw_aligned = "mxge_eth_z8e";
107 static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
108 static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
110 static int mxge_probe(device_t dev);
111 static int mxge_attach(device_t dev);
112 static int mxge_detach(device_t dev);
113 static int mxge_shutdown(device_t dev);
114 static void mxge_intr(void *arg);
116 static device_method_t mxge_methods[] =
118 /* Device interface */
119 DEVMETHOD(device_probe, mxge_probe),
120 DEVMETHOD(device_attach, mxge_attach),
121 DEVMETHOD(device_detach, mxge_detach),
122 DEVMETHOD(device_shutdown, mxge_shutdown),
126 static driver_t mxge_driver =
130 sizeof(mxge_softc_t),
133 static devclass_t mxge_devclass;
135 /* Declare ourselves to be a child of the PCI bus.*/
136 DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
137 MODULE_DEPEND(mxge, firmware, 1, 1, 1);
138 MODULE_DEPEND(mxge, zlib, 1, 1, 1);
140 static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
141 static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
142 static int mxge_close(mxge_softc_t *sc);
143 static int mxge_open(mxge_softc_t *sc);
144 static void mxge_tick(void *arg);
147 mxge_probe(device_t dev)
152 if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
153 ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
154 (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
155 rev = pci_get_revid(dev);
157 case MXGE_PCI_REV_Z8E:
158 device_set_desc(dev, "Myri10G-PCIE-8A");
160 case MXGE_PCI_REV_Z8ES:
161 device_set_desc(dev, "Myri10G-PCIE-8B");
164 device_set_desc(dev, "Myri10G-PCIE-8??");
165 device_printf(dev, "Unrecognized rev %d NIC\n",
175 mxge_enable_wc(mxge_softc_t *sc)
178 #if defined(__i386) || defined(__amd64)
183 len = rman_get_size(sc->mem_res);
184 err = pmap_change_attr((vm_offset_t) sc->sram,
185 len, PAT_WRITE_COMBINING);
187 device_printf(sc->dev, "pmap_change_attr failed, %d\n",
193 sc->wc = 0; /* TBD: PAT support */
198 /* callback to get our DMA address */
200 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
204 *(bus_addr_t *) arg = segs->ds_addr;
209 mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
210 bus_size_t alignment)
213 device_t dev = sc->dev;
214 bus_size_t boundary, maxsegsize;
216 if (bytes > 4096 && alignment == 4096) {
224 /* allocate DMAable memory tags */
225 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
226 alignment, /* alignment */
227 boundary, /* boundary */
228 BUS_SPACE_MAXADDR, /* low */
229 BUS_SPACE_MAXADDR, /* high */
230 NULL, NULL, /* filter */
233 maxsegsize, /* maxsegsize */
234 BUS_DMA_COHERENT, /* flags */
235 &dma->dmat); /* tag */
237 device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
241 /* allocate DMAable memory & map */
242 err = bus_dmamem_alloc(dma->dmat, &dma->addr,
243 (BUS_DMA_WAITOK | BUS_DMA_COHERENT
244 | BUS_DMA_ZERO), &dma->map);
246 device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
247 goto abort_with_dmat;
250 /* load the memory */
251 err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
252 mxge_dmamap_callback,
253 (void *)&dma->bus_addr, 0);
255 device_printf(dev, "couldn't load map (err = %d)\n", err);
261 bus_dmamem_free(dma->dmat, dma->addr, dma->map);
263 (void)bus_dma_tag_destroy(dma->dmat);
269 mxge_dma_free(mxge_dma_t *dma)
271 bus_dmamap_unload(dma->dmat, dma->map);
272 bus_dmamem_free(dma->dmat, dma->addr, dma->map);
273 (void)bus_dma_tag_destroy(dma->dmat);
277 * The eeprom strings on the lanaiX have the format
284 mxge_parse_strings(mxge_softc_t *sc)
286 #define MXGE_NEXT_STRING(p) while(ptr < limit && *ptr++)
291 ptr = sc->eeprom_strings;
292 limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
294 while (ptr < limit && *ptr != '\0') {
295 if (memcmp(ptr, "MAC=", 4) == 0) {
297 sc->mac_addr_string = ptr;
298 for (i = 0; i < 6; i++) {
300 if ((ptr + 2) > limit)
302 sc->mac_addr[i] = strtoul(ptr, NULL, 16);
305 } else if (memcmp(ptr, "PC=", 3) == 0) {
307 strncpy(sc->product_code_string, ptr,
308 sizeof (sc->product_code_string) - 1);
309 } else if (memcmp(ptr, "SN=", 3) == 0) {
311 strncpy(sc->serial_number_string, ptr,
312 sizeof (sc->serial_number_string) - 1);
314 MXGE_NEXT_STRING(ptr);
321 device_printf(sc->dev, "failed to parse eeprom_strings\n");
326 #if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
328 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
331 unsigned long base, off;
333 device_t pdev, mcp55;
334 uint16_t vendor_id, device_id, word;
335 uintptr_t bus, slot, func, ivend, idev;
339 if (!mxge_nvidia_ecrc_enable)
342 pdev = device_get_parent(device_get_parent(sc->dev));
344 device_printf(sc->dev, "could not find parent?\n");
347 vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
348 device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
350 if (vendor_id != 0x10de)
355 if (device_id == 0x005d) {
356 /* ck804, base address is magic */
358 } else if (device_id >= 0x0374 && device_id <= 0x378) {
359 /* mcp55, base address stored in chipset */
360 mcp55 = pci_find_bsf(0, 0, 0);
362 0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
363 0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
364 word = pci_read_config(mcp55, 0x90, 2);
365 base = ((unsigned long)word & 0x7ffeU) << 25;
372 Test below is commented because it is believed that doing
373 config read/write beyond 0xff will access the config space
374 for the next larger function. Uncomment this and remove
375 the hacky pmap_mapdev() way of accessing config space when
376 FreeBSD grows support for extended pcie config space access
379 /* See if we can, by some miracle, access the extended
381 val = pci_read_config(pdev, 0x178, 4);
382 if (val != 0xffffffff) {
384 pci_write_config(pdev, 0x178, val, 4);
388 /* Rather than using normal PCI config space writes, we must
389 * map the Nvidia config space ourselves. This is because on
390 * Opteron/Nvidia class machines the 0xe000000 mapping is
391 * handled by the Nvidia chipset; that means the internal PCI
392 * device (the on-chip northbridge), the AMD-8131 bridge,
393 * and things behind them are not visible via this method.
396 BUS_READ_IVAR(device_get_parent(pdev), pdev,
398 BUS_READ_IVAR(device_get_parent(pdev), pdev,
399 PCI_IVAR_SLOT, &slot);
400 BUS_READ_IVAR(device_get_parent(pdev), pdev,
401 PCI_IVAR_FUNCTION, &func);
402 BUS_READ_IVAR(device_get_parent(pdev), pdev,
403 PCI_IVAR_VENDOR, &ivend);
404 BUS_READ_IVAR(device_get_parent(pdev), pdev,
405 PCI_IVAR_DEVICE, &idev);
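/* Compute this device's window within the chipset's extended config
 * aperture; the arithmetic below follows the usual PCIe extended config
 * layout of 1 MiB per bus and 4 KiB per function. */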
408 + 0x00100000UL * (unsigned long)bus
409 + 0x00001000UL * (unsigned long)(func
412 /* map it into the kernel */
413 va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
417 device_printf(sc->dev, "pmap_mapdev() failed\n");
420 /* get a pointer to the config space mapped into the kernel */
421 cfgptr = va + (off & PAGE_MASK);
423 /* make sure that we can really access it */
424 vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
425 device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
426 if (! (vendor_id == ivend && device_id == idev)) {
427 device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
428 vendor_id, device_id);
429 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
433 ptr32 = (uint32_t*)(cfgptr + 0x178);
436 if (val == 0xffffffff) {
437 device_printf(sc->dev, "extended mapping failed\n");
438 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
442 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
444 device_printf(sc->dev,
445 "Enabled ECRC on upstream Nvidia bridge "
447 (int)bus, (int)slot, (int)func);
452 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
454 device_printf(sc->dev,
455 "Nforce 4 chipset on non-x86/amd64!?!?!\n");
462 mxge_dma_test(mxge_softc_t *sc, int test_type)
465 bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
471 /* Run a small DMA test.
472 * The magic multipliers to the length tell the firmware
473 * to do DMA read, write, or read+write tests. The
474 * results are returned in cmd.data0. The upper 16
475 * bits of the return value are the number of transfers completed.
476 * The lower 16 bits are the time in 0.5us ticks that the
477 * transfers took to complete.
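 * As a rough illustration (made-up numbers): a read test that reports
 * 2000 transfers of tx_boundary = 4096 bytes in 8000 half-microsecond
 * ticks works out, via the arithmetic below, to (2000 * 4096 * 2) / 8000,
 * i.e. about 2048 bytes per microsecond, reported as ~2048 MB/s.
 */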
480 len = sc->tx_boundary;
482 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
483 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
484 cmd.data2 = len * 0x10000;
485 status = mxge_send_cmd(sc, test_type, &cmd);
490 sc->read_dma = ((cmd.data0>>16) * len * 2) /
491 (cmd.data0 & 0xffff);
492 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
493 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
494 cmd.data2 = len * 0x1;
495 status = mxge_send_cmd(sc, test_type, &cmd);
500 sc->write_dma = ((cmd.data0>>16) * len * 2) /
501 (cmd.data0 & 0xffff);
503 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
504 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
505 cmd.data2 = len * 0x10001;
506 status = mxge_send_cmd(sc, test_type, &cmd);
511 sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
512 (cmd.data0 & 0xffff);
515 if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
516 device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
523 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
524 * when the PCI-E Completion packets are aligned on an 8-byte
525 * boundary. Some PCI-E chip sets always align Completion packets; on
526 * the ones that do not, the alignment can be enforced by enabling
527 * ECRC generation (if supported).
529 * When PCI-E Completion packets are not aligned, it is actually more
530 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
532 * If the driver can neither enable ECRC nor verify that it has
533 * already been enabled, then it must use a firmware image which works
534 * around unaligned completion packets (ethp_z8e.dat), and it should
535 * also ensure that it never gives the device a Read-DMA which is
536 * larger than 2KB by setting the tx_boundary to 2KB. If ECRC is
537 * enabled, then the driver should use the aligned (eth_z8e.dat)
538 * firmware image, and set tx_boundary to 4KB.
542 mxge_firmware_probe(mxge_softc_t *sc)
544 device_t dev = sc->dev;
548 sc->tx_boundary = 4096;
550 * Verify the max read request size was set to 4KB
551 * before trying the test with 4KB.
553 if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
554 pectl = pci_read_config(dev, reg + 0x8, 2);
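/* Bits 14:12 of the PCIe Device Control register encode the max read
 * request size; the encoding 5 selects 4096 bytes, hence the check below. */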
555 if ((pectl & (5 << 12)) != (5 << 12)) {
556 device_printf(dev, "Max Read Req. size != 4k (0x%x\n",
558 sc->tx_boundary = 2048;
563 * load the optimized firmware (which assumes aligned PCIe
564 * completions) in order to see if it works on this host.
566 sc->fw_name = mxge_fw_aligned;
567 status = mxge_load_firmware(sc, 1);
573 * Enable ECRC if possible
575 mxge_enable_nvidia_ecrc(sc);
578 * Run a DMA test which watches for unaligned completions and
579 * aborts on the first one seen.
582 status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
584 return 0; /* keep the aligned firmware */
587 device_printf(dev, "DMA test failed: %d\n", status);
588 if (status == ENOSYS)
589 device_printf(dev, "Falling back to ethp! "
590 "Please install up to date fw\n");
595 mxge_select_firmware(mxge_softc_t *sc)
600 if (mxge_force_firmware != 0) {
601 if (mxge_force_firmware == 1)
606 device_printf(sc->dev,
607 "Assuming %s completions (forced)\n",
608 aligned ? "aligned" : "unaligned");
612 /* if the PCIe link width is 4 or less, we can use the aligned
613 firmware and skip any checks */
614 if (sc->link_width != 0 && sc->link_width <= 4) {
615 device_printf(sc->dev,
616 "PCIe x%d Link, expect reduced performance\n",
622 if (0 == mxge_firmware_probe(sc))
627 sc->fw_name = mxge_fw_aligned;
628 sc->tx_boundary = 4096;
630 sc->fw_name = mxge_fw_unaligned;
631 sc->tx_boundary = 2048;
633 return (mxge_load_firmware(sc, 0));
643 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
647 if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
648 device_printf(sc->dev, "Bad firmware type: 0x%x\n",
649 be32toh(hdr->mcp_type));
653 /* save firmware version for sysctl */
654 strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
656 device_printf(sc->dev, "firmware id: %s\n", hdr->version);
658 ksscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
659 &sc->fw_ver_minor, &sc->fw_ver_tiny);
661 if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
662 && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
663 device_printf(sc->dev, "Found firmware version %s\n",
665 device_printf(sc->dev, "Driver needs %d.%d\n",
666 MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
674 z_alloc(void *nil, u_int items, u_int size)
678 ptr = kmalloc(items * size, M_TEMP, M_NOWAIT);
683 z_free(void *nil, void *ptr)
690 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
693 const mcp_gen_header_t *hdr;
700 fw = firmware_image_load(sc->fw_name, NULL);
702 device_printf(sc->dev, "Could not find firmware image %s\n",
707 /* setup zlib and decompress f/w */
708 bzero(&zs, sizeof (zs));
711 status = inflateInit(&zs);
712 if (status != Z_OK) {
717 /* the uncompressed size is stored as the firmware version,
718 which would otherwise go unused */
719 fw_len = (size_t) fw->version;
720 inflate_buffer = kmalloc(fw_len, M_TEMP, M_NOWAIT);
721 if (inflate_buffer == NULL)
723 zs.avail_in = fw->datasize;
724 zs.next_in = __DECONST(char *, fw->data);
725 zs.avail_out = fw_len;
726 zs.next_out = inflate_buffer;
727 status = inflate(&zs, Z_FINISH);
728 if (status != Z_STREAM_END) {
729 device_printf(sc->dev, "zlib %d\n", status);
731 goto abort_with_buffer;
735 hdr_offset = htobe32(*(const uint32_t *)
736 (fw->fw_image + MCP_HEADER_PTR_OFFSET));
737 if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
738 device_printf(sc->dev, "Bad firmware file");
742 hdr = (const void*)(fw->fw_image + hdr_offset);
744 status = mxge_validate_firmware(sc, hdr);
748 /* Copy the inflated firmware to NIC SRAM. */
749 for (i = 0; i < fw_len; i += 256) {
750 mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
752 min(256U, (unsigned)(fw_len - i)));
762 kfree(inflate_buffer, M_TEMP);
767 firmware_image_unload(fw);
772 * Enable or disable periodic RDMAs from the host to make certain
773 * chipsets resend dropped PCIe messages
777 mxge_dummy_rdma(mxge_softc_t *sc, int enable)
780 volatile uint32_t *confirm;
781 volatile char *submit;
782 uint32_t *buf, dma_low, dma_high;
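/* round the on-stack scratch buffer up to an 8-byte boundary before
 * building the command block that is PIO-copied to the NIC below */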
785 buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
787 /* clear confirmation addr */
788 confirm = (volatile uint32_t *)sc->cmd;
792 /* send an rdma command to the PCIe engine, and wait for the
793 response in the confirmation address. The firmware should
794 write a -1 there to indicate it is alive and well
797 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
798 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
799 buf[0] = htobe32(dma_high); /* confirm addr MSW */
800 buf[1] = htobe32(dma_low); /* confirm addr LSW */
801 buf[2] = htobe32(0xffffffff); /* confirm data */
802 dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
803 dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
804 buf[3] = htobe32(dma_high); /* dummy addr MSW */
805 buf[4] = htobe32(dma_low); /* dummy addr LSW */
806 buf[5] = htobe32(enable); /* enable? */
809 submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
811 mxge_pio_copy(submit, buf, 64);
816 while (*confirm != 0xffffffff && i < 20) {
820 if (*confirm != 0xffffffff) {
821 device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
822 (enable ? "enable" : "disable"), confirm,
829 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
832 char buf_bytes[sizeof(*buf) + 8];
833 volatile mcp_cmd_response_t *response = sc->cmd;
834 volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
835 uint32_t dma_low, dma_high;
836 int err, sleep_total = 0;
838 /* ensure buf is aligned to 8 bytes */
839 buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
841 buf->data0 = htobe32(data->data0);
842 buf->data1 = htobe32(data->data1);
843 buf->data2 = htobe32(data->data2);
844 buf->cmd = htobe32(cmd);
845 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
846 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
848 buf->response_addr.low = htobe32(dma_low);
849 buf->response_addr.high = htobe32(dma_high);
851 lwkt_serialize_enter(sc->ifp->if_serializer);
853 response->result = 0xffffffff;
855 mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
857 /* wait up to 20ms */
859 for (sleep_total = 0; sleep_total < 20; sleep_total++) {
860 bus_dmamap_sync(sc->cmd_dma.dmat,
861 sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
863 switch (be32toh(response->result)) {
865 data->data0 = be32toh(response->data);
871 case MXGEFW_CMD_UNKNOWN:
874 case MXGEFW_CMD_ERROR_UNALIGNED:
877 case MXGEFW_CMD_ERROR_BUSY:
881 device_printf(sc->dev,
883 "failed, result = %d\n",
884 cmd, be32toh(response->result));
892 device_printf(sc->dev, "mxge: command %d timed out"
894 cmd, be32toh(response->result));
895 lwkt_serialize_exit(sc->ifp->if_serializer);
900 mxge_adopt_running_firmware(mxge_softc_t *sc)
902 struct mcp_gen_header *hdr;
903 const size_t bytes = sizeof (struct mcp_gen_header);
907 /* find running firmware header */
908 hdr_offset = htobe32(*(volatile uint32_t *)
909 (sc->sram + MCP_HEADER_PTR_OFFSET));
911 if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
912 device_printf(sc->dev,
913 "Running firmware has bad header offset (%d)\n",
918 /* copy header of running firmware from SRAM to host memory to
919 * validate firmware */
920 hdr = kmalloc(bytes, M_DEVBUF, M_NOWAIT);
922 device_printf(sc->dev, "could not kmalloc firmware hdr\n");
925 bus_space_read_region_1(rman_get_bustag(sc->mem_res),
926 rman_get_bushandle(sc->mem_res),
927 hdr_offset, (char *)hdr, bytes);
928 status = mxge_validate_firmware(sc, hdr);
929 kfree(hdr, M_DEVBUF);
932 * check to see if adopted firmware has bug where adopting
933 * it will cause broadcasts to be filtered unless the NIC
934 * is kept in ALLMULTI mode
936 if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
937 sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
938 sc->adopted_rx_filter_bug = 1;
939 device_printf(sc->dev, "Adopting fw %d.%d.%d: "
940 "working around rx filter bug\n",
941 sc->fw_ver_major, sc->fw_ver_minor,
950 mxge_load_firmware(mxge_softc_t *sc, int adopt)
952 volatile uint32_t *confirm;
953 volatile char *submit;
955 uint32_t *buf, size, dma_low, dma_high;
958 buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
960 size = sc->sram_size;
961 status = mxge_load_firmware_helper(sc, &size);
965 /* Try to use the currently running firmware, if
967 status = mxge_adopt_running_firmware(sc);
969 device_printf(sc->dev,
970 "failed to adopt running firmware\n");
973 device_printf(sc->dev,
974 "Successfully adopted running firmware\n");
975 if (sc->tx_boundary == 4096) {
976 device_printf(sc->dev,
977 "Using firmware currently running on NIC"
979 device_printf(sc->dev,
980 "performance consider loading optimized "
983 sc->fw_name = mxge_fw_unaligned;
984 sc->tx_boundary = 2048;
987 /* clear confirmation addr */
988 confirm = (volatile uint32_t *)sc->cmd;
991 /* send a reload command to the bootstrap MCP, and wait for the
992 response in the confirmation address. The firmware should
993 write a -1 there to indicate it is alive and well
996 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
997 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
999 buf[0] = htobe32(dma_high); /* confirm addr MSW */
1000 buf[1] = htobe32(dma_low); /* confirm addr LSW */
1001 buf[2] = htobe32(0xffffffff); /* confirm data */
1003 /* FIX: All newest firmware should un-protect the bottom of
1004 the sram before handoff. However, the very first interfaces
1005 do not. Therefore the handoff copy must skip the first 8 bytes
1007 /* where the code starts*/
1008 buf[3] = htobe32(MXGE_FW_OFFSET + 8);
1009 buf[4] = htobe32(size - 8); /* length of code */
1010 buf[5] = htobe32(8); /* where to copy to */
1011 buf[6] = htobe32(0); /* where to jump to */
1013 submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
1014 mxge_pio_copy(submit, buf, 64);
1019 while (*confirm != 0xffffffff && i < 20) {
1022 bus_dmamap_sync(sc->cmd_dma.dmat,
1023 sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
1025 if (*confirm != 0xffffffff) {
1026 device_printf(sc->dev,"handoff failed (%p = 0x%x)",
1035 mxge_update_mac_address(mxge_softc_t *sc)
1038 uint8_t *addr = sc->mac_addr;
1042 cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1043 | (addr[2] << 8) | addr[3]);
1045 cmd.data1 = ((addr[4] << 8) | (addr[5]));
1047 status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
1052 mxge_change_pause(mxge_softc_t *sc, int pause)
1058 status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1061 status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1065 device_printf(sc->dev, "Failed to set flow control mode\n");
1073 mxge_change_promisc(mxge_softc_t *sc, int promisc)
1078 if (mxge_always_promisc)
1082 status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1085 status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1089 device_printf(sc->dev, "Failed to set promisc mode\n");
1094 mxge_set_multicast_list(mxge_softc_t *sc)
1097 struct ifmultiaddr *ifma;
1098 struct ifnet *ifp = sc->ifp;
1101 /* This firmware is known to not support multicast */
1102 if (!sc->fw_multicast_support)
1105 /* Disable multicast filtering while we play with the lists*/
1106 err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1108 device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
1109 " error status: %d\n", err);
1113 if (sc->adopted_rx_filter_bug)
1116 if (ifp->if_flags & IFF_ALLMULTI)
1117 /* request to disable multicast filtering, so quit here */
1120 /* Flush all the filters */
1122 err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1124 device_printf(sc->dev,
1125 "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
1126 ", error status: %d\n", err);
1130 /* Walk the multicast list, and add each address */
1132 lwkt_serialize_enter(ifp->if_serializer);
1133 LIST_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1134 if (ifma->ifma_addr->sa_family != AF_LINK)
1136 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1138 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
1140 cmd.data0 = htonl(cmd.data0);
1141 cmd.data1 = htonl(cmd.data1);
1142 err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1144 device_printf(sc->dev, "Failed "
1145 "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
1147 /* abort, leaving multicast filtering off */
1148 lwkt_serialize_exit(ifp->if_serializer);
1152 lwkt_serialize_exit(ifp->if_serializer);
1153 /* Enable multicast filtering */
1154 err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1156 device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
1157 ", error status: %d\n", err);
1162 mxge_max_mtu(mxge_softc_t *sc)
1167 if (MJUMPAGESIZE - MXGEFW_PAD > MXGEFW_MAX_MTU)
1168 return MXGEFW_MAX_MTU - MXGEFW_PAD;
1170 /* try to set nbufs to see if we can
1171 use virtually contiguous jumbos */
1173 status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1176 return MXGEFW_MAX_MTU - MXGEFW_PAD;
1178 /* otherwise, we're limited to MJUMPAGESIZE */
1179 return MJUMPAGESIZE - MXGEFW_PAD;
1183 mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1185 struct mxge_slice_state *ss;
1186 mxge_rx_done_t *rx_done;
1187 volatile uint32_t *irq_claim;
1191 /* try to send a reset command to the card to see if it
1193 memset(&cmd, 0, sizeof (cmd));
1194 status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1196 device_printf(sc->dev, "failed reset\n");
1200 mxge_dummy_rdma(sc, 1);
1203 /* set the intrq size */
1204 cmd.data0 = sc->rx_ring_size;
1205 status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1208 * Even though we already know how many slices are supported
1209 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
1210 * has magic side effects, and must be called after a reset.
1211 * It must be called prior to calling any RSS related cmds,
1212 * including assigning an interrupt queue for anything but
1213 * slice 0. It must also be called *after*
1214 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1215 * the firmware to compute offsets.
1218 if (sc->num_slices > 1) {
1219 /* ask the maximum number of slices it supports */
1220 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
1223 device_printf(sc->dev,
1224 "failed to get number of slices\n");
1228 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1229 * to setting up the interrupt queue DMA
1231 cmd.data0 = sc->num_slices;
1232 cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
1233 #ifdef IFNET_BUF_RING
1234 cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1236 status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
1239 device_printf(sc->dev,
1240 "failed to set number of slices\n");
1246 if (interrupts_setup) {
1247 /* Now exchange information about interrupts */
1248 for (slice = 0; slice < sc->num_slices; slice++) {
1249 rx_done = &sc->ss[slice].rx_done;
1250 memset(rx_done->entry, 0, sc->rx_ring_size);
1251 cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
1252 cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
1254 status |= mxge_send_cmd(sc,
1255 MXGEFW_CMD_SET_INTRQ_DMA,
1260 status |= mxge_send_cmd(sc,
1261 MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
1264 sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1266 status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1267 irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1270 status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
1272 sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1274 device_printf(sc->dev, "failed set interrupt parameters\n");
1279 *sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1282 /* run a DMA benchmark */
1283 (void) mxge_dma_test(sc, MXGEFW_DMA_TEST);
1285 for (slice = 0; slice < sc->num_slices; slice++) {
1286 ss = &sc->ss[slice];
1288 ss->irq_claim = irq_claim + (2 * slice);
1289 /* reset mcp/driver shared state back to 0 */
1290 ss->rx_done.idx = 0;
1291 ss->rx_done.cnt = 0;
1294 ss->tx.pkt_done = 0;
1295 ss->tx.queue_active = 0;
1296 ss->tx.activate = 0;
1297 ss->tx.deactivate = 0;
1302 ss->rx_small.cnt = 0;
1303 ss->lro_bad_csum = 0;
1305 ss->lro_flushed = 0;
1306 if (ss->fw_stats != NULL) {
1307 ss->fw_stats->valid = 0;
1308 ss->fw_stats->send_done_count = 0;
1311 sc->rdma_tags_available = 15;
1312 status = mxge_update_mac_address(sc);
1313 mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
1314 mxge_change_pause(sc, sc->pause);
1315 mxge_set_multicast_list(sc);
1320 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1323 unsigned int intr_coal_delay;
1327 intr_coal_delay = sc->intr_coal_delay;
1328 err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1332 if (intr_coal_delay == sc->intr_coal_delay)
1335 if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1338 lwkt_serialize_enter(sc->ifp->if_serializer);
1339 *sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1340 sc->intr_coal_delay = intr_coal_delay;
1342 lwkt_serialize_exit(sc->ifp->if_serializer);
1347 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1350 unsigned int enabled;
1354 enabled = sc->pause;
1355 err = sysctl_handle_int(oidp, &enabled, arg2, req);
1359 if (enabled == sc->pause)
1362 lwkt_serialize_enter(sc->ifp->if_serializer);
1363 err = mxge_change_pause(sc, enabled);
1364 lwkt_serialize_exit(sc->ifp->if_serializer);
1369 mxge_change_lro_locked(mxge_softc_t *sc, int lro_cnt)
1376 ifp->if_capenable &= ~IFCAP_LRO;
1378 ifp->if_capenable |= IFCAP_LRO;
1379 sc->lro_cnt = lro_cnt;
1380 if (ifp->if_flags & IFF_RUNNING) {
1382 err = mxge_open(sc);
1388 mxge_change_lro(SYSCTL_HANDLER_ARGS)
1391 unsigned int lro_cnt;
1395 lro_cnt = sc->lro_cnt;
1396 err = sysctl_handle_int(oidp, &lro_cnt, arg2, req);
1400 if (lro_cnt == sc->lro_cnt)
1406 lwkt_serialize_enter(sc->ifp->if_serializer);
1407 err = mxge_change_lro_locked(sc, lro_cnt);
1408 lwkt_serialize_exit(sc->ifp->if_serializer);
1413 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1419 arg2 = be32toh(*(int *)arg1);
1421 err = sysctl_handle_int(oidp, arg1, arg2, req);
1427 mxge_rem_sysctls(mxge_softc_t *sc)
1429 struct mxge_slice_state *ss;
1432 if (sc->sysctl_tree != NULL) {
1433 sysctl_ctx_free(&sc->sysctl_ctx);
1434 sc->sysctl_tree = NULL;
1436 if (sc->slice_sysctl_tree == NULL)
1439 for (slice = 0; slice < sc->num_slices; slice++) {
1440 ss = &sc->ss[slice];
1441 if (ss == NULL || ss->sysctl_tree == NULL)
1443 sysctl_ctx_free(&ss->sysctl_ctx);
1444 ss->sysctl_tree = NULL;
1446 sysctl_ctx_free(&sc->slice_sysctl_ctx);
1447 sc->slice_sysctl_tree = NULL;
1451 mxge_add_sysctls(mxge_softc_t *sc)
1453 struct sysctl_ctx_list *ctx;
1454 struct sysctl_oid_list *children;
1456 struct mxge_slice_state *ss;
1460 ctx = &sc->sysctl_ctx;
1461 sysctl_ctx_init(ctx);
1462 sc->sysctl_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_STATIC_CHILDREN(_hw),
1464 device_get_nameunit(sc->dev),
1466 if (sc->sysctl_tree == NULL) {
1467 device_printf(sc->dev, "can't add sysctl node\n");
1471 children = SYSCTL_CHILDREN(sc->sysctl_tree);
1472 fw = sc->ss[0].fw_stats;
1474 /* random information */
1475 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1477 CTLFLAG_RD, &sc->fw_version,
1478 0, "firmware version");
1479 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1481 CTLFLAG_RD, &sc->serial_number_string,
1482 0, "serial number");
1483 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1485 CTLFLAG_RD, &sc->product_code_string,
1487 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1489 CTLFLAG_RD, &sc->link_width,
1491 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1493 CTLFLAG_RD, &sc->tx_boundary,
1495 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1497 CTLFLAG_RD, &sc->wc,
1498 0, "write combining PIO?");
1499 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1501 CTLFLAG_RD, &sc->read_dma,
1502 0, "DMA Read speed in MB/s");
1503 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1505 CTLFLAG_RD, &sc->write_dma,
1506 0, "DMA Write speed in MB/s");
1507 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1508 "read_write_dma_MBs",
1509 CTLFLAG_RD, &sc->read_write_dma,
1510 0, "DMA concurrent Read/Write speed in MB/s");
1513 /* performance related tunables */
1514 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1516 CTLTYPE_INT|CTLFLAG_RW, sc,
1517 0, mxge_change_intr_coal,
1518 "I", "interrupt coalescing delay in usecs");
1520 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1521 "flow_control_enabled",
1522 CTLTYPE_INT|CTLFLAG_RW, sc,
1523 0, mxge_change_flow_control,
1524 "I", "interrupt coalescing delay in usecs");
1526 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1528 CTLFLAG_RW, &mxge_deassert_wait,
1529 0, "Wait for IRQ line to go low in ihandler");
1531 /* stats block from firmware is in network byte order.
1533 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1535 CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1536 0, mxge_handle_be32,
1538 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1539 "rdma_tags_available",
1540 CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1541 0, mxge_handle_be32,
1542 "I", "rdma_tags_available");
1543 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1544 "dropped_bad_crc32",
1545 CTLTYPE_INT|CTLFLAG_RD,
1546 &fw->dropped_bad_crc32,
1547 0, mxge_handle_be32,
1548 "I", "dropped_bad_crc32");
1549 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1551 CTLTYPE_INT|CTLFLAG_RD,
1552 &fw->dropped_bad_phy,
1553 0, mxge_handle_be32,
1554 "I", "dropped_bad_phy");
1555 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1556 "dropped_link_error_or_filtered",
1557 CTLTYPE_INT|CTLFLAG_RD,
1558 &fw->dropped_link_error_or_filtered,
1559 0, mxge_handle_be32,
1560 "I", "dropped_link_error_or_filtered");
1561 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1562 "dropped_link_overflow",
1563 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1564 0, mxge_handle_be32,
1565 "I", "dropped_link_overflow");
1566 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1567 "dropped_multicast_filtered",
1568 CTLTYPE_INT|CTLFLAG_RD,
1569 &fw->dropped_multicast_filtered,
1570 0, mxge_handle_be32,
1571 "I", "dropped_multicast_filtered");
1572 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1573 "dropped_no_big_buffer",
1574 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1575 0, mxge_handle_be32,
1576 "I", "dropped_no_big_buffer");
1577 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1578 "dropped_no_small_buffer",
1579 CTLTYPE_INT|CTLFLAG_RD,
1580 &fw->dropped_no_small_buffer,
1581 0, mxge_handle_be32,
1582 "I", "dropped_no_small_buffer");
1583 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1585 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1586 0, mxge_handle_be32,
1587 "I", "dropped_overrun");
1588 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1590 CTLTYPE_INT|CTLFLAG_RD,
1592 0, mxge_handle_be32,
1593 "I", "dropped_pause");
1594 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1596 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1597 0, mxge_handle_be32,
1598 "I", "dropped_runt");
1600 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1601 "dropped_unicast_filtered",
1602 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
1603 0, mxge_handle_be32,
1604 "I", "dropped_unicast_filtered");
1606 /* verbose printing? */
1607 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1609 CTLFLAG_RW, &mxge_verbose,
1610 0, "verbose printing");
1613 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1615 CTLTYPE_INT|CTLFLAG_RW, sc,
1617 "I", "number of lro merge queues");
1620 /* add counters exported for debugging from all slices */
1621 sysctl_ctx_init(&sc->slice_sysctl_ctx);
1622 sc->slice_sysctl_tree =
1623 SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1624 "slice", CTLFLAG_RD, 0, "");
1626 for (slice = 0; slice < sc->num_slices; slice++) {
1627 ss = &sc->ss[slice];
1628 sysctl_ctx_init(&ss->sysctl_ctx);
1629 ctx = &ss->sysctl_ctx;
1630 children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1631 ksprintf(slice_num, "%d", slice);
1633 SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1635 children = SYSCTL_CHILDREN(ss->sysctl_tree);
1636 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1638 CTLFLAG_RD, &ss->rx_small.cnt,
1640 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1642 CTLFLAG_RD, &ss->rx_big.cnt,
1644 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1645 "lro_flushed", CTLFLAG_RD, &ss->lro_flushed,
1646 0, "number of lro merge queues flushed");
1648 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1649 "lro_queued", CTLFLAG_RD, &ss->lro_queued,
1650 0, "number of frames appended to lro merge"
1653 #ifndef IFNET_BUF_RING
1654 /* only transmit from slice 0 for now */
1658 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1660 CTLFLAG_RD, &ss->tx.req,
1663 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1665 CTLFLAG_RD, &ss->tx.done,
1667 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1669 CTLFLAG_RD, &ss->tx.pkt_done,
1671 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1673 CTLFLAG_RD, &ss->tx.stall,
1675 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1677 CTLFLAG_RD, &ss->tx.wake,
1679 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1681 CTLFLAG_RD, &ss->tx.defrag,
1683 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1685 CTLFLAG_RD, &ss->tx.queue_active,
1686 0, "tx_queue_active");
1687 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1689 CTLFLAG_RD, &ss->tx.activate,
1691 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1693 CTLFLAG_RD, &ss->tx.deactivate,
1694 0, "tx_deactivate");
1698 /* copy an array of mcp_kreq_ether_send_t's to the mcp. Copy
1699 backwards one at a time and handle ring wraps */
1702 mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1703 mcp_kreq_ether_send_t *src, int cnt)
1705 int idx, starting_slot;
1706 starting_slot = tx->req;
1709 idx = (starting_slot + cnt) & tx->mask;
1710 mxge_pio_copy(&tx->lanai[idx],
1711 &src[cnt], sizeof(*src));
1717 * copy an array of mcp_kreq_ether_send_t's to the mcp. Copy
1718 * at most 32 bytes at a time, so as to avoid involving the software
1719 * pio handler in the nic. We re-write the first segment's flags
1720 * to mark them valid only after writing the entire chain
1724 mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
1729 volatile uint32_t *dst_ints;
1730 mcp_kreq_ether_send_t *srcp;
1731 volatile mcp_kreq_ether_send_t *dstp, *dst;
1734 idx = tx->req & tx->mask;
1736 last_flags = src->flags;
1739 dst = dstp = &tx->lanai[idx];
1742 if ((idx + cnt) < tx->mask) {
1743 for (i = 0; i < (cnt - 1); i += 2) {
1744 mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1745 wmb(); /* force write every 32 bytes */
1750 /* submit all but the first request, and ensure
1751 that it is submitted below */
1752 mxge_submit_req_backwards(tx, src, cnt);
1756 /* submit the first request */
1757 mxge_pio_copy(dstp, srcp, sizeof(*src));
1758 wmb(); /* barrier before setting valid flag */
1761 /* re-write the last 32-bits with the valid flags */
1762 src->flags = last_flags;
1763 src_ints = (uint32_t *)src;
1765 dst_ints = (volatile uint32_t *)dst;
1767 *dst_ints = *src_ints;
1775 mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1776 int busdma_seg_cnt, int ip_off)
1779 mcp_kreq_ether_send_t *req;
1780 bus_dma_segment_t *seg;
1783 uint32_t low, high_swapped;
1784 int len, seglen, cum_len, cum_len_next;
1785 int next_is_first, chop, cnt, rdma_count, small;
1786 uint16_t pseudo_hdr_offset, cksum_offset, mss;
1787 uint8_t flags, flags_next;
1790 mss = m->m_pkthdr.tso_segsz;
1792 /* negative cum_len signifies to the
1793 * send loop that we are still in the
1794 * header portion of the TSO packet.
1797 /* ensure we have the ethernet, IP and TCP
1798 header together in the first mbuf, copy
1799 it to a scratch buffer if not */
1800 if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
1801 m_copydata(m, 0, ip_off + sizeof (*ip),
1803 ip = (struct ip *)(ss->scratch + ip_off);
1805 ip = (struct ip *)(mtod(m, char *) + ip_off);
1807 if (__predict_false(m->m_len < ip_off + (ip->ip_hl << 2)
1809 m_copydata(m, 0, ip_off + (ip->ip_hl << 2)
1810 + sizeof (*tcp), ss->scratch);
1811 ip = (struct ip *)(mtod(m, char *) + ip_off);
1814 tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
1815 cum_len = -(ip_off + ((ip->ip_hl + tcp->th_off) << 2));
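/* cum_len starts at minus the total header length, so it crosses zero
 * exactly where the TCP payload begins (see the negative-cum_len note
 * above) */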
1817 /* TSO implies checksum offload on this hardware */
1818 cksum_offset = ip_off + (ip->ip_hl << 2);
1819 flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1822 /* for TSO, pseudo_hdr_offset holds mss.
1823 * The firmware figures out where to put
1824 * the checksum by parsing the header. */
1825 pseudo_hdr_offset = htobe16(mss);
1832 /* "rdma_count" is the number of RDMAs belonging to the
1833 * current packet BEFORE the current send request. For
1834 * non-TSO packets, this is equal to "count".
1835 * For TSO packets, rdma_count needs to be reset
1836 * to 0 after a segment cut.
1838 * The rdma_count field of the send request is
1839 * the number of RDMAs of the packet starting at
1840 * that request. For TSO send requests with one or more cuts
1841 * in the middle, this is the number of RDMAs starting
1842 * after the last cut in the request. All previous
1843 * segments before the last cut implicitly have 1 RDMA.
1845 * Since the number of RDMAs is not known beforehand,
1846 * it must be filled-in retroactively - after each
1847 * segmentation cut or at the end of the entire packet.
1850 while (busdma_seg_cnt) {
1851 /* Break the busdma segment up into pieces*/
1852 low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1853 high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1857 flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1859 cum_len_next = cum_len + seglen;
1860 (req-rdma_count)->rdma_count = rdma_count + 1;
1861 if (__predict_true(cum_len >= 0)) {
1863 chop = (cum_len_next > mss);
1864 cum_len_next = cum_len_next % mss;
1865 next_is_first = (cum_len_next == 0);
1866 flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1867 flags_next |= next_is_first *
1869 rdma_count |= -(chop | next_is_first);
1870 rdma_count += chop & !next_is_first;
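/* Branchless bookkeeping: -(chop | next_is_first) is all-ones (i.e. -1)
 * when either condition holds, so the two lines above reset rdma_count
 * to 0 when a cut falls mid-descriptor and to -1 when the descriptor
 * ends exactly on an MSS boundary; otherwise rdma_count keeps counting
 * descriptors since the last cut, as the block comment above describes. */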
1871 } else if (cum_len_next >= 0) {
1876 small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1877 flags_next = MXGEFW_FLAGS_TSO_PLD |
1878 MXGEFW_FLAGS_FIRST |
1879 (small * MXGEFW_FLAGS_SMALL);
1882 req->addr_high = high_swapped;
1883 req->addr_low = htobe32(low);
1884 req->pseudo_hdr_offset = pseudo_hdr_offset;
1886 req->rdma_count = 1;
1887 req->length = htobe16(seglen);
1888 req->cksum_offset = cksum_offset;
1889 req->flags = flags | ((cum_len & 1) *
1890 MXGEFW_FLAGS_ALIGN_ODD);
1893 cum_len = cum_len_next;
1898 if (__predict_false(cksum_offset > seglen))
1899 cksum_offset -= seglen;
1902 if (__predict_false(cnt > tx->max_desc))
1908 (req-rdma_count)->rdma_count = rdma_count;
1912 req->flags |= MXGEFW_FLAGS_TSO_LAST;
1913 } while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1915 tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1916 mxge_submit_req(tx, tx->req_list, cnt);
1917 #ifdef IFNET_BUF_RING
1918 if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1919 /* tell the NIC to start polling this slice */
1921 tx->queue_active = 1;
1929 bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1933 kprintf("tx->max_desc exceeded via TSO!\n");
1934 kprintf("mss = %d, %ld, %d!\n", mss,
1935 (long)seg - (long)tx->seg_list, tx->max_desc);
1942 #endif /* IFCAP_TSO4 */
1944 #ifdef MXGE_NEW_VLAN_API
1946 * We reproduce the software vlan tag insertion from
1947 * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
1948 * vlan tag insertion. We need to advertise this in order to have the
1949 * vlan interface respect our csum offload flags.
1951 static struct mbuf *
1952 mxge_vlan_tag_insert(struct mbuf *m)
1954 struct ether_vlan_header *evl;
1956 M_PREPEND(m, EVL_ENCAPLEN, MB_DONTWAIT);
1957 if (__predict_false(m == NULL))
1959 if (m->m_len < sizeof(*evl)) {
1960 m = m_pullup(m, sizeof(*evl));
1961 if (__predict_false(m == NULL))
1965 * Transform the Ethernet header into an Ethernet header
1966 * with 802.1Q encapsulation.
1968 evl = mtod(m, struct ether_vlan_header *);
1969 bcopy((char *)evl + EVL_ENCAPLEN,
1970 (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
1971 evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
1972 evl->evl_tag = htons(m->m_pkthdr.ether_vlantag);
1973 m->m_flags &= ~M_VLANTAG;
1976 #endif /* MXGE_NEW_VLAN_API */
1979 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
1982 mcp_kreq_ether_send_t *req;
1983 bus_dma_segment_t *seg;
1988 int cnt, cum_len, err, i, idx, odd_flag, ip_off;
1989 uint16_t pseudo_hdr_offset;
1990 uint8_t flags, cksum_offset;
1997 ip_off = sizeof (struct ether_header);
1998 #ifdef MXGE_NEW_VLAN_API
1999 if (m->m_flags & M_VLANTAG) {
2000 m = mxge_vlan_tag_insert(m);
2001 if (__predict_false(m == NULL))
2003 ip_off += EVL_ENCAPLEN;
2006 /* (try to) map the frame for DMA */
2007 idx = tx->req & tx->mask;
2008 err = bus_dmamap_load_mbuf_segment(tx->dmat, tx->info[idx].map,
2009 m, tx->seg_list, 1, &cnt,
2011 if (__predict_false(err == EFBIG)) {
2012 /* Too many segments in the chain. Try
2014 m_tmp = m_defrag(m, M_NOWAIT);
2015 if (m_tmp == NULL) {
2020 err = bus_dmamap_load_mbuf_segment(tx->dmat,
2022 m, tx->seg_list, 1, &cnt,
2025 if (__predict_false(err != 0)) {
2026 device_printf(sc->dev, "bus_dmamap_load_mbuf_segment returned %d"
2027 " packet len = %d\n", err, m->m_pkthdr.len);
2030 bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2031 BUS_DMASYNC_PREWRITE);
2032 tx->info[idx].m = m;
2035 /* TSO is different enough, we handle it in another routine */
2036 if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2037 mxge_encap_tso(ss, m, cnt, ip_off);
2044 pseudo_hdr_offset = 0;
2045 flags = MXGEFW_FLAGS_NO_TSO;
2047 /* checksum offloading? */
2048 if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
2049 /* ensure ip header is in first mbuf, copy
2050 it to a scratch buffer if not */
2051 if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
2052 m_copydata(m, 0, ip_off + sizeof (*ip),
2054 ip = (struct ip *)(ss->scratch + ip_off);
2056 ip = (struct ip *)(mtod(m, char *) + ip_off);
2058 cksum_offset = ip_off + (ip->ip_hl << 2);
2059 pseudo_hdr_offset = cksum_offset + m->m_pkthdr.csum_data;
2060 pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
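/* csum_data holds the checksum field's offset within the L4 header, so
 * pseudo_hdr_offset ends up as the absolute offset in the frame where
 * the firmware stores the computed checksum (this is the driver's
 * reading of the stack's csum_data convention). */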
2061 req->cksum_offset = cksum_offset;
2062 flags |= MXGEFW_FLAGS_CKSUM;
2063 odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2067 if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2068 flags |= MXGEFW_FLAGS_SMALL;
2070 /* convert segments into a request list */
2073 req->flags = MXGEFW_FLAGS_FIRST;
2074 for (i = 0; i < cnt; i++) {
2076 htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2078 htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2079 req->length = htobe16(seg->ds_len);
2080 req->cksum_offset = cksum_offset;
2081 if (cksum_offset > seg->ds_len)
2082 cksum_offset -= seg->ds_len;
2085 req->pseudo_hdr_offset = pseudo_hdr_offset;
2086 req->pad = 0; /* complete solid 16-byte block */
2087 req->rdma_count = 1;
2088 req->flags |= flags | ((cum_len & 1) * odd_flag);
2089 cum_len += seg->ds_len;
2095 /* pad runts to 60 bytes */
2099 htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2101 htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2102 req->length = htobe16(60 - cum_len);
2103 req->cksum_offset = 0;
2104 req->pseudo_hdr_offset = pseudo_hdr_offset;
2105 req->pad = 0; /* complete solid 16-byte block */
2106 req->rdma_count = 1;
2107 req->flags |= flags | ((cum_len & 1) * odd_flag);
2111 tx->req_list[0].rdma_count = cnt;
2113 /* print what the firmware will see */
2114 for (i = 0; i < cnt; i++) {
2115 kprintf("%d: addr: 0x%x 0x%x len:%d pso%d,"
2116 "cso:%d, flags:0x%x, rdma:%d\n",
2117 i, (int)ntohl(tx->req_list[i].addr_high),
2118 (int)ntohl(tx->req_list[i].addr_low),
2119 (int)ntohs(tx->req_list[i].length),
2120 (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2121 tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2122 tx->req_list[i].rdma_count);
2124 kprintf("--------------\n");
2126 tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2127 mxge_submit_req(tx, tx->req_list, cnt);
2128 #ifdef IFNET_BUF_RING
2129 if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2130 /* tell the NIC to start polling this slice */
2132 tx->queue_active = 1;
2145 #ifdef IFNET_BUF_RING
2147 mxge_qflush(struct ifnet *ifp)
2149 mxge_softc_t *sc = ifp->if_softc;
2154 for (slice = 0; slice < sc->num_slices; slice++) {
2155 tx = &sc->ss[slice].tx;
2156 lwkt_serialize_enter(sc->ifp->if_serializer);
2157 while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2159 lwkt_serialize_exit(sc->ifp->if_serializer);
2165 mxge_start_locked(struct mxge_slice_state *ss)
2176 while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2177 m = drbr_dequeue(ifp, tx->br);
2181 /* let BPF see it */
2184 /* give it to the nic */
2187 /* ran out of transmit slots */
2188 if (((ss->if_flags & IFF_OACTIVE) == 0)
2189 && (!drbr_empty(ifp, tx->br))) {
2190 ss->if_flags |= IFF_OACTIVE;
2196 mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2207 if ((ss->if_flags & (IFF_RUNNING|IFF_OACTIVE)) !=
2209 err = drbr_enqueue(ifp, tx->br, m);
2213 if (drbr_empty(ifp, tx->br) &&
2214 ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2215 /* let BPF see it */
2217 /* give it to the nic */
2219 } else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2222 if (!drbr_empty(ifp, tx->br))
2223 mxge_start_locked(ss);
2228 mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2230 mxge_softc_t *sc = ifp->if_softc;
2231 struct mxge_slice_state *ss;
2237 slice = m->m_pkthdr.flowid;
2239 slice &= (sc->num_slices - 1); /* num_slices always power of 2 */
2241 ss = &sc->ss[slice];
2244 if(lwkt_serialize_try(ifp->if_serializer)) {
2245 err = mxge_transmit_locked(ss, m);
2246 lwkt_serialize_exit(ifp->if_serializer);
2248 err = drbr_enqueue(ifp, tx->br, m);
2257 mxge_start_locked(struct mxge_slice_state *ss)
2267 while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2268 m = ifq_dequeue(&ifp->if_snd, NULL);
2272 /* let BPF see it */
2275 /* give it to the nic */
2278 /* ran out of transmit slots */
2279 if ((sc->ifp->if_flags & IFF_OACTIVE) == 0) {
2280 sc->ifp->if_flags |= IFF_OACTIVE;
2286 mxge_start(struct ifnet *ifp)
2288 mxge_softc_t *sc = ifp->if_softc;
2289 struct mxge_slice_state *ss;
2291 /* only use the first slice for now */
2293 lwkt_serialize_enter(ifp->if_serializer);
2294 mxge_start_locked(ss);
2295 lwkt_serialize_exit(ifp->if_serializer);
2299 * copy an array of mcp_kreq_ether_recv_t's to the mcp. Copy
2300 * at most 32 bytes at a time, so as to avoid involving the software
2301 * pio handler in the nic. We re-write the first segment's low
2302 * DMA address to mark it valid only after we write the entire chunk
2306 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2307 mcp_kreq_ether_recv_t *src)
2311 low = src->addr_low;
2312 src->addr_low = 0xffffffff;
2313 mxge_pio_copy(dst, src, 4 * sizeof (*src));
2315 mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2317 src->addr_low = low;
2318 dst->addr_low = low;
2323 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2325 bus_dma_segment_t seg;
2327 mxge_rx_ring_t *rx = &ss->rx_small;
2330 m = m_gethdr(MB_DONTWAIT, MT_DATA);
2337 err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2338 &seg, 1, &cnt, BUS_DMA_NOWAIT);
2343 rx->info[idx].m = m;
2344 rx->shadow[idx].addr_low =
2345 htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2346 rx->shadow[idx].addr_high =
2347 htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2351 mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2356 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2358 bus_dma_segment_t seg[3];
2360 mxge_rx_ring_t *rx = &ss->rx_big;
2363 if (rx->cl_size == MCLBYTES)
2364 m = m_getcl(MB_DONTWAIT, MT_DATA, M_PKTHDR);
2366 m = m_getjcl(MB_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2372 m->m_len = rx->mlen;
2373 err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2374 seg, 1, &cnt, BUS_DMA_NOWAIT);
2379 rx->info[idx].m = m;
2380 rx->shadow[idx].addr_low =
2381 htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2382 rx->shadow[idx].addr_high =
2383 htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2385 #if MXGE_VIRT_JUMBOS
2386 for (i = 1; i < cnt; i++) {
2387 rx->shadow[idx + i].addr_low =
2388 htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2389 rx->shadow[idx + i].addr_high =
2390 htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2395 for (i = 0; i < rx->nbufs; i++) {
2396 if ((idx & 7) == 7) {
2397 mxge_submit_8rx(&rx->lanai[idx - 7],
2398 &rx->shadow[idx - 7]);
2406 * Myri10GE hardware checksums are not valid if the sender
2407 * padded the frame with non-zero padding. This is because
2408 * the firmware just does a simple 16-bit 1s complement
2409 * checksum across the entire frame, excluding the first 14
2410 * bytes. It is best to simply check the checksum and
2411 * tell the stack about it only if the checksum is good
2414 static inline uint16_t
2415 mxge_rx_csum(struct mbuf *m, int csum)
2417 struct ether_header *eh;
2421 eh = mtod(m, struct ether_header *);
2423 /* only deal with IPv4 TCP & UDP for now */
2424 if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2426 ip = (struct ip *)(eh + 1);
2427 if (__predict_false(ip->ip_p != IPPROTO_TCP &&
2428 ip->ip_p != IPPROTO_UDP))
2431 c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2432 htonl(ntohs(csum) + ntohs(ip->ip_len) +
2433 - (ip->ip_hl << 2) + ip->ip_p));
2442 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2444 struct ether_vlan_header *evl;
2445 struct ether_header *eh;
2448 evl = mtod(m, struct ether_vlan_header *);
2449 eh = mtod(m, struct ether_header *);
2452 * fix checksum by subtracting EVL_ENCAPLEN bytes
2453 * after what the firmware thought was the end of the ethernet
2457 /* put checksum into host byte order */
2458 *csum = ntohs(*csum);
2459 partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2460 (*csum) += ~partial;
2461 (*csum) += ((*csum) < ~partial);
2462 (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2463 (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
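/* fold the 32-bit one's-complement sum back down to 16 bits; the second
 * fold absorbs any carry produced by the first */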
2465 /* restore checksum to network byte order;
2466 later consumers expect this */
2467 *csum = htons(*csum);
2470 #ifdef MXGE_NEW_VLAN_API
2471 m->m_pkthdr.ether_vlantag = ntohs(evl->evl_tag);
2475 mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2479 VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2480 m_tag_prepend(m, mtag);
2484 m->m_flags |= M_VLANTAG;
2487 * Remove the 802.1q header by copying the Ethernet
2488 * addresses over it and adjusting the beginning of
2489 * the data in the mbuf. The encapsulated Ethernet
2490 * type field is already in place.
2492 bcopy((char *)evl, (char *)evl + EVL_ENCAPLEN,
2493 ETHER_HDR_LEN - ETHER_TYPE_LEN);
2494 m_adj(m, EVL_ENCAPLEN);
2499 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2504 struct ether_header *eh;
2506 bus_dmamap_t old_map;
2508 uint16_t tcpudp_csum;
2513 idx = rx->cnt & rx->mask;
2514 rx->cnt += rx->nbufs;
2515 /* save a pointer to the received mbuf */
2516 m = rx->info[idx].m;
2517 /* try to replace the received mbuf */
2518 if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2519 /* drop the frame -- the old mbuf is re-cycled */
2524 /* unmap the received buffer */
2525 old_map = rx->info[idx].map;
2526 bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2527 bus_dmamap_unload(rx->dmat, old_map);
2529 /* swap the bus_dmamap_t's */
2530 rx->info[idx].map = rx->extra_map;
2531 rx->extra_map = old_map;
2533 /* mcp implicitly skips 1st 2 bytes so that packet is properly
2535 m->m_data += MXGEFW_PAD;
2537 m->m_pkthdr.rcvif = ifp;
2538 m->m_len = m->m_pkthdr.len = len;
2540 eh = mtod(m, struct ether_header *);
2541 if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2542 mxge_vlan_tag_remove(m, &csum);
2544 /* if the checksum is valid, mark it in the mbuf header */
2545 if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2546 if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2548 /* otherwise, it was a UDP frame, or a TCP frame which
2549 we could not do LRO on. Tell the stack that the
2551 m->m_pkthdr.csum_data = 0xffff;
2552 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2555 /* flowid only valid if RSS hashing is enabled */
2556 if (sc->num_slices > 1) {
2557 m->m_pkthdr.flowid = (ss - sc->ss);
2558 m->m_flags |= M_FLOWID;
2561 /* pass the frame up the stack */
2562 (*ifp->if_input)(ifp, m);
2566 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2570 struct ether_header *eh;
2573 bus_dmamap_t old_map;
2575 uint16_t tcpudp_csum;
2580 idx = rx->cnt & rx->mask;
2582 /* save a pointer to the received mbuf */
2583 m = rx->info[idx].m;
2584 /* try to replace the received mbuf */
2585 if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2586 /* drop the frame -- the old mbuf is re-cycled */
2591 /* unmap the received buffer */
2592 old_map = rx->info[idx].map;
2593 bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2594 bus_dmamap_unload(rx->dmat, old_map);
2596 /* swap the bus_dmamap_t's */
2597 rx->info[idx].map = rx->extra_map;
2598 rx->extra_map = old_map;
2600 /* mcp implicitly skips 1st 2 bytes so that packet is properly aligned */
2602 m->m_data += MXGEFW_PAD;
2604 m->m_pkthdr.rcvif = ifp;
2605 m->m_len = m->m_pkthdr.len = len;
2607 eh = mtod(m, struct ether_header *);
2608 if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2609 mxge_vlan_tag_remove(m, &csum);
2611 /* if the checksum is valid, mark it in the mbuf header */
2612 if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2613 if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2615 /* otherwise, it was a UDP frame, or a TCP frame which
2616 we could not do LRO on. Tell the stack that the checksum is good */
2618 m->m_pkthdr.csum_data = 0xffff;
2619 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2622 /* flowid only valid if RSS hashing is enabled */
2623 if (sc->num_slices > 1) {
2624 m->m_pkthdr.flowid = (ss - sc->ss);
2625 m->m_flags |= M_FLOWID;
2628 /* pass the frame up the stack */
2629 (*ifp->if_input)(ifp, m);
2633 mxge_clean_rx_done(struct mxge_slice_state *ss)
2635 mxge_rx_done_t *rx_done = &ss->rx_done;
2641 while (rx_done->entry[rx_done->idx].length != 0) {
2642 length = ntohs(rx_done->entry[rx_done->idx].length);
2643 rx_done->entry[rx_done->idx].length = 0;
2644 checksum = rx_done->entry[rx_done->idx].checksum;
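/*
 * Frames small enough to fit a plain mbuf (MHLEN minus the two-byte
 * firmware pad, the small-buffer size handed to the firmware in
 * mxge_open()) were received into the small ring; anything larger
 * came from the big ring.
 */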
2645 if (length <= (MHLEN - MXGEFW_PAD))
2646 mxge_rx_done_small(ss, length, checksum);
2648 mxge_rx_done_big(ss, length, checksum);
2650 rx_done->idx = rx_done->cnt & rx_done->mask;
2652 /* limit potential for livelock */
2653 if (__predict_false(++limit > rx_done->mask / 2))
2657 while (!SLIST_EMPTY(&ss->lro_active)) {
2658 struct lro_entry *lro = SLIST_FIRST(&ss->lro_active);
2659 SLIST_REMOVE_HEAD(&ss->lro_active, next);
2660 mxge_lro_flush(ss, lro);
2667 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2678 while (tx->pkt_done != mcp_idx) {
2679 idx = tx->done & tx->mask;
2681 m = tx->info[idx].m;
2682 /* mbuf and DMA map only attached to the first
2685 ss->obytes += m->m_pkthdr.len;
2686 if (m->m_flags & M_MCAST)
2689 tx->info[idx].m = NULL;
2690 map = tx->info[idx].map;
2691 bus_dmamap_unload(tx->dmat, map);
2694 if (tx->info[idx].flag) {
2695 tx->info[idx].flag = 0;
2700 /* If we have space, clear IFF_OACTIVE to tell the stack that
2701 it's OK to send packets */
2702 #ifdef IFNET_BUF_RING
2703 flags = &ss->if_flags;
2705 flags = &ifp->if_flags;
2707 lwkt_serialize_enter(ifp->if_serializer);
2708 if ((*flags) & IFF_OACTIVE &&
2709 tx->req - tx->done < (tx->mask + 1)/4) {
2710 *(flags) &= ~IFF_OACTIVE;
2712 mxge_start_locked(ss);
2714 #ifdef IFNET_BUF_RING
2715 if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2716 /* let the NIC stop polling this queue, since there
2717 * are no more transmits pending */
2718 if (tx->req == tx->done) {
2720 tx->queue_active = 0;
2726 lwkt_serialize_exit(ifp->if_serializer);
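/*
 * Each entry below pairs a bitmask of the transceiver module's
 * compliance byte, which mxge_media_probe() reads over I2C, with an
 * ifmedia type; a zero in the first field means there is no matching
 * ifmedia type for that bit.
 */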
2730 static struct mxge_media_type mxge_xfp_media_types[] =
2732 {IFM_10G_CX4, 0x7f, "10GBASE-CX4 (module)"},
2733 {IFM_10G_SR, (1 << 7), "10GBASE-SR"},
2734 {IFM_10G_LR, (1 << 6), "10GBASE-LR"},
2735 {0, (1 << 5), "10GBASE-ER"},
2736 {IFM_10G_LRM, (1 << 4), "10GBASE-LRM"},
2737 {0, (1 << 3), "10GBASE-SW"},
2738 {0, (1 << 2), "10GBASE-LW"},
2739 {0, (1 << 1), "10GBASE-EW"},
2740 {0, (1 << 0), "Reserved"}
2742 static struct mxge_media_type mxge_sfp_media_types[] =
2744 {0, (1 << 7), "Reserved"},
2745 {IFM_10G_LRM, (1 << 6), "10GBASE-LRM"},
2746 {IFM_10G_LR, (1 << 5), "10GBASE-LR"},
2747 {IFM_10G_SR, (1 << 4), "10GBASE-SR"}
2751 mxge_set_media(mxge_softc_t *sc, int type)
2753 sc->media_flags |= type;
2754 ifmedia_add(&sc->media, sc->media_flags, 0, NULL);
2755 ifmedia_set(&sc->media, sc->media_flags);
2760 * Determine the media type for a NIC. Some XFPs will identify
2761 * themselves only when their link is up, so this is initiated via a
2762 * link up interrupt. However, this can potentially take up to
2763 * several milliseconds, so it is run via the watchdog routine, rather
2764 * than in the interrupt handler itself. This need only be done
2765 * once, not each time the link is up.
2768 mxge_media_probe(mxge_softc_t *sc)
2773 struct mxge_media_type *mxge_media_types = NULL;
2774 int i, err, ms, mxge_media_type_entries;
2777 sc->need_media_probe = 0;
2779 /* if we've already set a media type, we're done */
2780 if (sc->media_flags != (IFM_ETHER | IFM_AUTO))
2784 * parse the product code to determine the interface type
2785 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2786 * after the 3rd dash in the driver's cached copy of the
2787 * EEPROM's product code string.
2789 ptr = sc->product_code_string;
2791 device_printf(sc->dev, "Missing product code\n");
2794 for (i = 0; i < 3; i++, ptr++) {
2795 ptr = index(ptr, '-');
2797 device_printf(sc->dev,
2798 "only %d dashes in PC?!?\n", i);
2804 mxge_set_media(sc, IFM_10G_CX4);
2807 else if (*ptr == 'Q') {
2808 /* -Q is Quad Ribbon Fiber */
2809 device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2810 /* FreeBSD has no media type for Quad ribbon fiber */
2816 mxge_media_types = mxge_xfp_media_types;
2817 mxge_media_type_entries =
2818 sizeof (mxge_xfp_media_types) /
2819 sizeof (mxge_xfp_media_types[0]);
2820 byte = MXGE_XFP_COMPLIANCE_BYTE;
2824 if (*ptr == 'S' || *(ptr +1) == 'S') {
2825 /* -S or -2S is SFP+ */
2826 mxge_media_types = mxge_sfp_media_types;
2827 mxge_media_type_entries =
2828 sizeof (mxge_sfp_media_types) /
2829 sizeof (mxge_sfp_media_types[0]);
2834 if (mxge_media_types == NULL) {
2835 device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2840 * At this point we know the NIC has an XFP cage, so now we
2841 * try to determine what is in the cage by using the
2842 * firmware's XFP I2C commands to read the XFP 10GbE compliance
2843 * register. We read just one byte, which may take over
2847 cmd.data0 = 0; /* just fetch 1 byte, not all 256 */
2849 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2850 if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
2851 device_printf(sc->dev, "failed to read XFP\n");
2853 if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
2854 device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
2856 if (err != MXGEFW_CMD_OK) {
2860 /* now we wait for the data to be cached */
2862 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2863 for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
2866 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2868 if (err != MXGEFW_CMD_OK) {
2869 device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
2870 cage_type, err, ms);
2874 if (cmd.data0 == mxge_media_types[0].bitmask) {
2876 device_printf(sc->dev, "%s:%s\n", cage_type,
2877 mxge_media_types[0].name);
2878 mxge_set_media(sc, IFM_10G_CX4);
2881 for (i = 1; i < mxge_media_type_entries; i++) {
2882 if (cmd.data0 & mxge_media_types[i].bitmask) {
2884 device_printf(sc->dev, "%s:%s\n",
2886 mxge_media_types[i].name);
2888 mxge_set_media(sc, mxge_media_types[i].flag);
2892 device_printf(sc->dev, "%s media 0x%x unknown\n", cage_type,
2899 mxge_intr(void *arg)
2901 struct mxge_slice_state *ss = arg;
2902 mxge_softc_t *sc = ss->sc;
2903 mcp_irq_data_t *stats = ss->fw_stats;
2904 mxge_tx_ring_t *tx = &ss->tx;
2905 mxge_rx_done_t *rx_done = &ss->rx_done;
2906 uint32_t send_done_count;
2910 #ifndef IFNET_BUF_RING
2911 /* an interrupt on a non-zero slice is implicitly valid
2912 since MSI-X irqs are not shared */
2914 mxge_clean_rx_done(ss);
2915 *ss->irq_claim = be32toh(3);
2920 /* make sure the DMA has finished */
2921 if (!stats->valid) {
2924 valid = stats->valid;
2926 if (sc->legacy_irq) {
2927 /* lower legacy IRQ */
2928 *sc->irq_deassert = 0;
2929 if (!mxge_deassert_wait)
2930 /* don't wait for confirmation that the irq is low */
2936 /* loop while waiting for legacy irq deassertion */
2938 /* check for transmit completes and receives */
2939 send_done_count = be32toh(stats->send_done_count);
2940 while ((send_done_count != tx->pkt_done) ||
2941 (rx_done->entry[rx_done->idx].length != 0)) {
2942 if (send_done_count != tx->pkt_done)
2943 mxge_tx_done(ss, (int)send_done_count);
2944 mxge_clean_rx_done(ss);
2945 send_done_count = be32toh(stats->send_done_count);
2947 if (sc->legacy_irq && mxge_deassert_wait)
2949 } while (*((volatile uint8_t *) &stats->valid));
2951 /* fw link & error stats meaningful only on the first slice */
2952 if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
2953 if (sc->link_state != stats->link_up) {
2954 sc->link_state = stats->link_up;
2955 if (sc->link_state) {
2956 sc->ifp->if_link_state = LINK_STATE_UP;
2957 if_link_state_change(sc->ifp);
2959 device_printf(sc->dev, "link up\n");
2961 sc->ifp->if_link_state = LINK_STATE_DOWN;
2962 if_link_state_change(sc->ifp);
2964 device_printf(sc->dev, "link down\n");
2966 sc->need_media_probe = 1;
2968 if (sc->rdma_tags_available !=
2969 be32toh(stats->rdma_tags_available)) {
2970 sc->rdma_tags_available =
2971 be32toh(stats->rdma_tags_available);
2972 device_printf(sc->dev, "RDMA timed out! %d tags "
2973 "left\n", sc->rdma_tags_available);
2976 if (stats->link_down) {
2977 sc->down_cnt += stats->link_down;
2979 sc->ifp->if_link_state = LINK_STATE_DOWN;
2980 if_link_state_change(sc->ifp);
2984 /* check to see if we have rx token to pass back */
2986 *ss->irq_claim = be32toh(3);
2987 *(ss->irq_claim + 1) = be32toh(3);
2991 mxge_init(void *arg)
2998 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
3000 struct lro_entry *lro_entry;
3003 while (!SLIST_EMPTY(&ss->lro_free)) {
3004 lro_entry = SLIST_FIRST(&ss->lro_free);
3005 SLIST_REMOVE_HEAD(&ss->lro_free, next);
3006 kfree(lro_entry, M_DEVBUF);
3009 for (i = 0; i <= ss->rx_big.mask; i++) {
3010 if (ss->rx_big.info[i].m == NULL)
3012 bus_dmamap_unload(ss->rx_big.dmat,
3013 ss->rx_big.info[i].map);
3014 m_freem(ss->rx_big.info[i].m);
3015 ss->rx_big.info[i].m = NULL;
3018 for (i = 0; i <= ss->rx_small.mask; i++) {
3019 if (ss->rx_small.info[i].m == NULL)
3021 bus_dmamap_unload(ss->rx_small.dmat,
3022 ss->rx_small.info[i].map);
3023 m_freem(ss->rx_small.info[i].m);
3024 ss->rx_small.info[i].m = NULL;
3027 /* transmit ring used only on the first slice */
3028 if (ss->tx.info == NULL)
3031 for (i = 0; i <= ss->tx.mask; i++) {
3032 ss->tx.info[i].flag = 0;
3033 if (ss->tx.info[i].m == NULL)
3035 bus_dmamap_unload(ss->tx.dmat,
3036 ss->tx.info[i].map);
3037 m_freem(ss->tx.info[i].m);
3038 ss->tx.info[i].m = NULL;
3043 mxge_free_mbufs(mxge_softc_t *sc)
3047 for (slice = 0; slice < sc->num_slices; slice++)
3048 mxge_free_slice_mbufs(&sc->ss[slice]);
3052 mxge_free_slice_rings(struct mxge_slice_state *ss)
3057 if (ss->rx_done.entry != NULL)
3058 mxge_dma_free(&ss->rx_done.dma);
3059 ss->rx_done.entry = NULL;
3061 if (ss->tx.req_bytes != NULL)
3062 kfree(ss->tx.req_bytes, M_DEVBUF);
3063 ss->tx.req_bytes = NULL;
3065 if (ss->tx.seg_list != NULL)
3066 kfree(ss->tx.seg_list, M_DEVBUF);
3067 ss->tx.seg_list = NULL;
3069 if (ss->rx_small.shadow != NULL)
3070 kfree(ss->rx_small.shadow, M_DEVBUF);
3071 ss->rx_small.shadow = NULL;
3073 if (ss->rx_big.shadow != NULL)
3074 kfree(ss->rx_big.shadow, M_DEVBUF);
3075 ss->rx_big.shadow = NULL;
3077 if (ss->tx.info != NULL) {
3078 if (ss->tx.dmat != NULL) {
3079 for (i = 0; i <= ss->tx.mask; i++) {
3080 bus_dmamap_destroy(ss->tx.dmat,
3081 ss->tx.info[i].map);
3083 bus_dma_tag_destroy(ss->tx.dmat);
3085 kfree(ss->tx.info, M_DEVBUF);
3089 if (ss->rx_small.info != NULL) {
3090 if (ss->rx_small.dmat != NULL) {
3091 for (i = 0; i <= ss->rx_small.mask; i++) {
3092 bus_dmamap_destroy(ss->rx_small.dmat,
3093 ss->rx_small.info[i].map);
3095 bus_dmamap_destroy(ss->rx_small.dmat,
3096 ss->rx_small.extra_map);
3097 bus_dma_tag_destroy(ss->rx_small.dmat);
3099 kfree(ss->rx_small.info, M_DEVBUF);
3101 ss->rx_small.info = NULL;
3103 if (ss->rx_big.info != NULL) {
3104 if (ss->rx_big.dmat != NULL) {
3105 for (i = 0; i <= ss->rx_big.mask; i++) {
3106 bus_dmamap_destroy(ss->rx_big.dmat,
3107 ss->rx_big.info[i].map);
3109 bus_dmamap_destroy(ss->rx_big.dmat,
3110 ss->rx_big.extra_map);
3111 bus_dma_tag_destroy(ss->rx_big.dmat);
3113 kfree(ss->rx_big.info, M_DEVBUF);
3115 ss->rx_big.info = NULL;
3119 mxge_free_rings(mxge_softc_t *sc)
3123 for (slice = 0; slice < sc->num_slices; slice++)
3124 mxge_free_slice_rings(&sc->ss[slice]);
3128 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3129 int tx_ring_entries)
3131 mxge_softc_t *sc = ss->sc;
3137 /* allocate per-slice receive resources */
3139 ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
3140 ss->rx_done.mask = (2 * rx_ring_entries) - 1;
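/*
 * The completion (rx_done) ring is twice the size of a receive ring,
 * presumably so it can hold completions from both the small and big
 * rings at once; all the "- 1" masks rely on ring sizes being powers
 * of two.
 */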
3142 /* allocate the rx shadow rings */
3143 bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3144 ss->rx_small.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3145 if (ss->rx_small.shadow == NULL)
3148 bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3149 ss->rx_big.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3150 if (ss->rx_big.shadow == NULL)
3153 /* allocate the rx host info rings */
3154 bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3155 ss->rx_small.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3156 if (ss->rx_small.info == NULL)
3159 bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3160 ss->rx_big.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3161 if (ss->rx_big.info == NULL)
3164 /* allocate the rx busdma resources */
3165 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3167 4096, /* boundary */
3168 BUS_SPACE_MAXADDR, /* low */
3169 BUS_SPACE_MAXADDR, /* high */
3170 NULL, NULL, /* filter */
3171 MHLEN, /* maxsize */
3173 MHLEN, /* maxsegsize */
3174 BUS_DMA_ALLOCNOW, /* flags */
3175 &ss->rx_small.dmat); /* tag */
3177 device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3182 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3184 #if MXGE_VIRT_JUMBOS
3185 4096, /* boundary */
3189 BUS_SPACE_MAXADDR, /* low */
3190 BUS_SPACE_MAXADDR, /* high */
3191 NULL, NULL, /* filter */
3192 3*4096, /* maxsize */
3193 #if MXGE_VIRT_JUMBOS
3195 4096, /* maxsegsize*/
3198 MJUM9BYTES, /* maxsegsize*/
3200 BUS_DMA_ALLOCNOW, /* flags */
3201 &ss->rx_big.dmat); /* tag */
3203 device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3207 for (i = 0; i <= ss->rx_small.mask; i++) {
3208 err = bus_dmamap_create(ss->rx_small.dmat, 0,
3209 &ss->rx_small.info[i].map);
3211 device_printf(sc->dev, "Err %d rx_small dmamap\n",
3216 err = bus_dmamap_create(ss->rx_small.dmat, 0,
3217 &ss->rx_small.extra_map);
3219 device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3224 for (i = 0; i <= ss->rx_big.mask; i++) {
3225 err = bus_dmamap_create(ss->rx_big.dmat, 0,
3226 &ss->rx_big.info[i].map);
3228 device_printf(sc->dev, "Err %d rx_big dmamap\n",
3233 err = bus_dmamap_create(ss->rx_big.dmat, 0,
3234 &ss->rx_big.extra_map);
3236 device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3241 /* now allocate TX resources */
3243 #ifndef IFNET_BUF_RING
3244 /* only use a single TX ring for now */
3245 if (ss != ss->sc->ss)
3249 ss->tx.mask = tx_ring_entries - 1;
3250 ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3253 /* allocate the tx request copy block */
3255 sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3256 ss->tx.req_bytes = kmalloc(bytes, M_DEVBUF, M_WAITOK);
3257 if (ss->tx.req_bytes == NULL)
3259 /* ensure req_list entries are aligned to 8 bytes */
3260 ss->tx.req_list = (mcp_kreq_ether_send_t *)
3261 ((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
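/*
 * (req_bytes + 7) & ~7 rounds the pointer up to the next 8-byte
 * boundary (an address ending in 0x...1 is advanced to 0x...8, for
 * example); the copy block allocated above includes enough slack that
 * the rounded pointer still holds max_desc requests.
 */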
3263 /* allocate the tx busdma segment list */
3264 bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3265 ss->tx.seg_list = (bus_dma_segment_t *)
3266 kmalloc(bytes, M_DEVBUF, M_WAITOK);
3267 if (ss->tx.seg_list == NULL)
3270 /* allocate the tx host info ring */
3271 bytes = tx_ring_entries * sizeof (*ss->tx.info);
3272 ss->tx.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3273 if (ss->tx.info == NULL)
3276 /* allocate the tx busdma resources */
3277 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3279 sc->tx_boundary, /* boundary */
3280 BUS_SPACE_MAXADDR, /* low */
3281 BUS_SPACE_MAXADDR, /* high */
3282 NULL, NULL, /* filter */
3283 65536 + 256, /* maxsize */
3284 ss->tx.max_desc - 2, /* num segs */
3285 sc->tx_boundary, /* maxsegsz */
3286 BUS_DMA_ALLOCNOW, /* flags */
3287 &ss->tx.dmat); /* tag */
3290 device_printf(sc->dev, "Err %d allocating tx dmat\n",
3295 /* now use these tags to set up dmamaps for each slot in the ring */
3297 for (i = 0; i <= ss->tx.mask; i++) {
3298 err = bus_dmamap_create(ss->tx.dmat, 0,
3299 &ss->tx.info[i].map);
3301 device_printf(sc->dev, "Err %d tx dmamap\n",
3311 mxge_alloc_rings(mxge_softc_t *sc)
3315 int tx_ring_entries, rx_ring_entries;
3318 /* get ring sizes */
3319 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3320 tx_ring_size = cmd.data0;
3322 device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3326 tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3327 rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3328 ifq_set_maxlen(&sc->ifp->if_snd, tx_ring_entries - 1);
3329 ifq_set_ready(&sc->ifp->if_snd);
3331 for (slice = 0; slice < sc->num_slices; slice++) {
3332 err = mxge_alloc_slice_rings(&sc->ss[slice],
3341 mxge_free_rings(sc);
3348 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3350 int bufsize = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN + MXGEFW_PAD;
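/*
 * Worked example: the standard 1500-byte MTU needs 1500 + 14
 * (Ethernet header) + 4 (802.1q) + 2 (firmware pad) = 1520 bytes,
 * which fits in a 2KB MCLBYTES cluster; a 9000-byte jumbo MTU needs
 * 9020 bytes and falls through to the MJUM9BYTES (or, with
 * MXGE_VIRT_JUMBOS, stacked 4KB buffer) cases below.
 */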
3352 if (bufsize < MCLBYTES) {
3353 /* easy, everything fits in a single buffer */
3354 *big_buf_size = MCLBYTES;
3355 *cl_size = MCLBYTES;
3360 if (bufsize < MJUMPAGESIZE) {
3361 /* still easy, everything still fits in a single buffer */
3362 *big_buf_size = MJUMPAGESIZE;
3363 *cl_size = MJUMPAGESIZE;
3367 #if MXGE_VIRT_JUMBOS
3368 /* now we need to use virtually contiguous buffers */
3369 *cl_size = MJUM9BYTES;
3370 *big_buf_size = 4096;
3371 *nbufs = mtu / 4096 + 1;
3372 /* needs to be a power of two, so round up */
3376 *cl_size = MJUM9BYTES;
3377 *big_buf_size = MJUM9BYTES;
3383 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3388 struct lro_entry *lro_entry;
3393 slice = ss - sc->ss;
3395 SLIST_INIT(&ss->lro_free);
3396 SLIST_INIT(&ss->lro_active);
3398 for (i = 0; i < sc->lro_cnt; i++) {
3399 lro_entry = (struct lro_entry *)
3400 kmalloc(sizeof (*lro_entry), M_DEVBUF,
3402 if (lro_entry == NULL) {
3406 SLIST_INSERT_HEAD(&ss->lro_free, lro_entry, next);
3408 /* get the lanai pointers to the send and receive rings */
3411 #ifndef IFNET_BUF_RING
3412 /* We currently only send from the first slice */
3416 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3418 (volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3419 ss->tx.send_go = (volatile uint32_t *)
3420 (sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3421 ss->tx.send_stop = (volatile uint32_t *)
3422 (sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3423 #ifndef IFNET_BUF_RING
3427 err |= mxge_send_cmd(sc,
3428 MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3429 ss->rx_small.lanai =
3430 (volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3432 err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3434 (volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3437 device_printf(sc->dev,
3438 "failed to get ring sizes or locations\n");
3442 /* stock receive rings */
3443 for (i = 0; i <= ss->rx_small.mask; i++) {
3444 map = ss->rx_small.info[i].map;
3445 err = mxge_get_buf_small(ss, map, i);
3447 device_printf(sc->dev, "alloced %d/%d smalls\n",
3448 i, ss->rx_small.mask + 1);
3452 for (i = 0; i <= ss->rx_big.mask; i++) {
3453 ss->rx_big.shadow[i].addr_low = 0xffffffff;
3454 ss->rx_big.shadow[i].addr_high = 0xffffffff;
3456 ss->rx_big.nbufs = nbufs;
3457 ss->rx_big.cl_size = cl_size;
3458 ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3459 EVL_ENCAPLEN + MXGEFW_PAD;
3460 for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3461 map = ss->rx_big.info[i].map;
3462 err = mxge_get_buf_big(ss, map, i);
3464 device_printf(sc->dev, "alloced %d/%d bigs\n",
3465 i, ss->rx_big.mask + 1);
3473 mxge_open(mxge_softc_t *sc)
3476 int err, big_bytes, nbufs, slice, cl_size, i;
3478 volatile uint8_t *itable;
3479 struct mxge_slice_state *ss;
3481 /* Copy the MAC address in case it was overridden */
3482 bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3484 err = mxge_reset(sc, 1);
3486 device_printf(sc->dev, "failed to reset\n");
3490 if (sc->num_slices > 1) {
3491 /* setup the indirection table */
3492 cmd.data0 = sc->num_slices;
3493 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3496 err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3499 device_printf(sc->dev,
3500 "failed to setup rss tables\n");
3504 /* just enable an identity mapping */
3505 itable = sc->sram + cmd.data0;
3506 for (i = 0; i < sc->num_slices; i++)
3507 itable[i] = (uint8_t)i;
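/*
 * With an identity table the firmware's RSS hash indexes the slices
 * directly: hash value i (modulo the table size set above) is
 * delivered to slice i.
 */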
3510 cmd.data1 = mxge_rss_hash_type;
3511 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3513 device_printf(sc->dev, "failed to enable slices\n");
3519 mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3522 err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3524 /* error is only meaningful if we're trying to set
3525 MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3526 if (err && nbufs > 1) {
3527 device_printf(sc->dev,
3528 "Failed to set alway-use-n to %d\n",
3532 /* Give the firmware the mtu and the big and small buffer
3533 sizes. The firmware wants the big buf size to be a power
3534 of two. Luckily, FreeBSD's clusters are powers of two */
3535 cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
3536 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3537 cmd.data0 = MHLEN - MXGEFW_PAD;
3538 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3540 cmd.data0 = big_bytes;
3541 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3544 device_printf(sc->dev, "failed to setup params\n");
3548 /* Now give him the pointer to the stats block */
3550 #ifdef IFNET_BUF_RING
3551 slice < sc->num_slices;
3556 ss = &sc->ss[slice];
3558 MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3560 MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3561 cmd.data2 = sizeof(struct mcp_irq_data);
3562 cmd.data2 |= (slice << 16);
3563 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3567 bus = sc->ss->fw_stats_dma.bus_addr;
3568 bus += offsetof(struct mcp_irq_data, send_done_count);
3569 cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3570 cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3571 err = mxge_send_cmd(sc,
3572 MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3574 /* Firmware cannot support multicast without STATS_DMA_V2 */
3575 sc->fw_multicast_support = 0;
3577 sc->fw_multicast_support = 1;
3581 device_printf(sc->dev, "failed to setup params\n");
3585 for (slice = 0; slice < sc->num_slices; slice++) {
3586 err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3588 device_printf(sc->dev, "couldn't open slice %d\n",
3594 /* Finally, start the firmware running */
3595 err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3597 device_printf(sc->dev, "Couldn't bring up link\n");
3600 #ifdef IFNET_BUF_RING
3601 for (slice = 0; slice < sc->num_slices; slice++) {
3602 ss = &sc->ss[slice];
3603 ss->if_flags |= IFF_RUNNING;
3604 ss->if_flags &= ~IFF_OACTIVE;
3607 sc->ifp->if_flags |= IFF_RUNNING;
3608 sc->ifp->if_flags &= ~IFF_OACTIVE;
3609 callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3615 mxge_free_mbufs(sc);
3621 mxge_close(mxge_softc_t *sc)
3624 int err, old_down_cnt;
3625 #ifdef IFNET_BUF_RING
3626 struct mxge_slice_state *ss;
3630 callout_stop(&sc->co_hdl);
3631 #ifdef IFNET_BUF_RING
3632 for (slice = 0; slice < sc->num_slices; slice++) {
3633 ss = &sc->ss[slice];
3634 ss->if_flags &= ~IFF_RUNNING;
3637 sc->ifp->if_flags &= ~IFF_RUNNING;
3638 old_down_cnt = sc->down_cnt;
3640 err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3642 device_printf(sc->dev, "Couldn't bring down link\n");
3644 if (old_down_cnt == sc->down_cnt) {
3645 /* wait for down irq */
3646 DELAY(10 * sc->intr_coal_delay);
3649 if (old_down_cnt == sc->down_cnt) {
3650 device_printf(sc->dev, "never got down irq\n");
3653 mxge_free_mbufs(sc);
3659 mxge_setup_cfg_space(mxge_softc_t *sc)
3661 device_t dev = sc->dev;
3663 uint16_t cmd, lnk, pectl;
3665 /* find the PCIe link width and set max read request to 4KB*/
3666 if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
3667 lnk = pci_read_config(dev, reg + 0x12, 2);
3668 sc->link_width = (lnk >> 4) & 0x3f;
3670 pectl = pci_read_config(dev, reg + 0x8, 2);
3671 pectl = (pectl & ~0x7000) | (5 << 12);
3672 pci_write_config(dev, reg + 0x8, pectl, 2);
3675 /* Enable DMA and Memory space access */
3676 pci_enable_busmaster(dev);
3677 cmd = pci_read_config(dev, PCIR_COMMAND, 2);
3678 cmd |= PCIM_CMD_MEMEN;
3679 pci_write_config(dev, PCIR_COMMAND, cmd, 2);
3683 mxge_read_reboot(mxge_softc_t *sc)
3685 device_t dev = sc->dev;
3688 /* find the vendor specific offset */
3689 if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
3690 device_printf(sc->dev,
3691 "could not find vendor specific offset\n");
3692 return (uint32_t)-1;
3694 /* enable read32 mode */
3695 pci_write_config(dev, vs + 0x10, 0x3, 1);
3696 /* tell NIC which register to read */
3697 pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3698 return (pci_read_config(dev, vs + 0x14, 4));
3702 mxge_watchdog_reset(mxge_softc_t *sc, int slice)
3704 struct pci_devinfo *dinfo;
3712 device_printf(sc->dev, "Watchdog reset!\n");
3715 * check to see if the NIC rebooted. If it did, then all of
3716 * PCI config space has been reset, and things like the
3717 * busmaster bit will be zero. If this is the case, then we
3718 * must restore PCI config space before the NIC can be used
3721 cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3722 if (cmd == 0xffff) {
3724 * maybe the watchdog caught the NIC rebooting; wait
3725 * up to 100ms for it to finish. If it does not come
3726 * back, then give up
3729 cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3730 if (cmd == 0xffff) {
3731 device_printf(sc->dev, "NIC disappeared!\n");
3735 if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3736 /* print the reboot status */
3737 reboot = mxge_read_reboot(sc);
3738 device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3740 /* restore PCI configuration space */
3741 dinfo = device_get_ivars(sc->dev);
3742 pci_cfg_restore(sc->dev, dinfo);
3744 /* and redo any changes we made to our config space */
3745 mxge_setup_cfg_space(sc);
3747 if (sc->ifp->if_flags & IFF_RUNNING) {
3749 err = mxge_open(sc);
3752 tx = &sc->ss[slice].tx;
3753 device_printf(sc->dev,
3754 "NIC did not reboot, slice %d ring state:\n",
3756 device_printf(sc->dev,
3757 "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3758 tx->req, tx->done, tx->queue_active);
3759 device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3760 tx->activate, tx->deactivate);
3761 device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3763 be32toh(sc->ss->fw_stats->send_done_count));
3764 device_printf(sc->dev, "not resetting\n");
3770 mxge_watchdog(mxge_softc_t *sc)
3773 uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3776 /* see if we have outstanding transmits, which
3777 have been pending for more than mxge_ticks */
3779 #ifdef IFNET_BUF_RING
3780 (i < sc->num_slices) && (err == 0);
3782 (i < 1) && (err == 0);
3786 if (tx->req != tx->done &&
3787 tx->watchdog_req != tx->watchdog_done &&
3788 tx->done == tx->watchdog_done) {
3789 /* check for pause blocking before resetting */
3790 if (tx->watchdog_rx_pause == rx_pause)
3791 err = mxge_watchdog_reset(sc, i);
3793 device_printf(sc->dev, "Flow control blocking "
3794 "xmits, check link partner\n");
3797 tx->watchdog_req = tx->req;
3798 tx->watchdog_done = tx->done;
3799 tx->watchdog_rx_pause = rx_pause;
3802 if (sc->need_media_probe)
3803 mxge_media_probe(sc);
3808 mxge_update_stats(mxge_softc_t *sc)
3810 struct mxge_slice_state *ss;
3811 u_long ipackets = 0;
3812 u_long opackets = 0;
3813 #ifdef IFNET_BUF_RING
3821 for (slice = 0; slice < sc->num_slices; slice++) {
3822 ss = &sc->ss[slice];
3823 ipackets += ss->ipackets;
3824 opackets += ss->opackets;
3825 #ifdef IFNET_BUF_RING
3826 obytes += ss->obytes;
3827 omcasts += ss->omcasts;
3828 odrops += ss->tx.br->br_drops;
3830 oerrors += ss->oerrors;
3832 sc->ifp->if_ipackets = ipackets;
3833 sc->ifp->if_opackets = opackets;
3834 #ifdef IFNET_BUF_RING
3835 sc->ifp->if_obytes = obytes;
3836 sc->ifp->if_omcasts = omcasts;
3837 sc->ifp->if_snd.ifq_drops = odrops;
3839 sc->ifp->if_oerrors = oerrors;
3843 mxge_tick(void *arg)
3845 mxge_softc_t *sc = arg;
3848 lwkt_serialize_enter(sc->ifp->if_serializer);
3849 /* aggregate stats from different slices */
3850 mxge_update_stats(sc);
3851 if (!sc->watchdog_countdown) {
3852 err = mxge_watchdog(sc);
3853 sc->watchdog_countdown = 4;
3855 sc->watchdog_countdown--;
3857 callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3858 lwkt_serialize_exit(sc->ifp->if_serializer);
3862 mxge_media_change(struct ifnet *ifp)
3868 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3870 struct ifnet *ifp = sc->ifp;
3871 int real_mtu, old_mtu;
3875 real_mtu = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
3876 if ((real_mtu > sc->max_mtu) || real_mtu < 60)
3878 lwkt_serialize_enter(ifp->if_serializer);
3879 old_mtu = ifp->if_mtu;
3881 if (ifp->if_flags & IFF_RUNNING) {
3883 err = mxge_open(sc);
3885 ifp->if_mtu = old_mtu;
3887 (void) mxge_open(sc);
3890 lwkt_serialize_exit(ifp->if_serializer);
3895 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3897 mxge_softc_t *sc = ifp->if_softc;
3902 ifmr->ifm_status = IFM_AVALID;
3903 ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
3904 ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
3905 ifmr->ifm_active |= sc->link_state ? IFM_FDX : 0;
3909 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data, struct ucred *cr)
3911 mxge_softc_t *sc = ifp->if_softc;
3912 struct ifreq *ifr = (struct ifreq *)data;
3920 err = ether_ioctl(ifp, command, data);
3924 err = mxge_change_mtu(sc, ifr->ifr_mtu);
3928 lwkt_serialize_enter(sc->ifp->if_serializer);
3930 lwkt_serialize_exit(ifp->if_serializer);
3933 if (ifp->if_flags & IFF_UP) {
3934 if (!(ifp->if_flags & IFF_RUNNING)) {
3935 err = mxge_open(sc);
3937 /* take care of promisc and allmulti flags */
3939 mxge_change_promisc(sc,
3940 ifp->if_flags & IFF_PROMISC);
3941 mxge_set_multicast_list(sc);
3944 if (ifp->if_flags & IFF_RUNNING) {
3948 lwkt_serialize_exit(ifp->if_serializer);
3953 lwkt_serialize_enter(sc->ifp->if_serializer);
3954 mxge_set_multicast_list(sc);
3955 lwkt_serialize_exit(sc->ifp->if_serializer);
3959 lwkt_serialize_enter(sc->ifp->if_serializer);
3960 mask = ifr->ifr_reqcap ^ ifp->if_capenable;
3961 if (mask & IFCAP_TXCSUM) {
3962 if (IFCAP_TXCSUM & ifp->if_capenable) {
3963 ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
3964 ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
3967 ifp->if_capenable |= IFCAP_TXCSUM;
3968 ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
3970 } else if (mask & IFCAP_RXCSUM) {
3971 if (IFCAP_RXCSUM & ifp->if_capenable) {
3972 ifp->if_capenable &= ~IFCAP_RXCSUM;
3975 ifp->if_capenable |= IFCAP_RXCSUM;
3979 if (mask & IFCAP_TSO4) {
3980 if (IFCAP_TSO4 & ifp->if_capenable) {
3981 ifp->if_capenable &= ~IFCAP_TSO4;
3982 ifp->if_hwassist &= ~CSUM_TSO;
3983 } else if (IFCAP_TXCSUM & ifp->if_capenable) {
3984 ifp->if_capenable |= IFCAP_TSO4;
3985 ifp->if_hwassist |= CSUM_TSO;
3987 kprintf("mxge requires tx checksum offload"
3988 " be enabled to use TSO\n");
3992 if (mask & IFCAP_LRO) {
3993 if (IFCAP_LRO & ifp->if_capenable)
3994 err = mxge_change_lro_locked(sc, 0);
3996 err = mxge_change_lro_locked(sc, mxge_lro_cnt);
3998 if (mask & IFCAP_VLAN_HWTAGGING)
3999 ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
4000 lwkt_serialize_exit(sc->ifp->if_serializer);
4001 VLAN_CAPABILITIES(ifp);
4006 err = ifmedia_ioctl(ifp, (struct ifreq *)data,
4007 &sc->media, command);
4017 mxge_fetch_tunables(mxge_softc_t *sc)
4020 TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4021 TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4022 &mxge_flow_control);
4023 TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4024 &mxge_intr_coal_delay);
4025 TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4026 &mxge_nvidia_ecrc_enable);
4027 TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4028 &mxge_force_firmware);
4029 TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4030 &mxge_deassert_wait);
4031 TUNABLE_INT_FETCH("hw.mxge.verbose",
4033 TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4034 TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
4035 TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
4036 TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4037 TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4038 if (sc->lro_cnt != 0)
4039 mxge_lro_cnt = sc->lro_cnt;
4043 if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4044 mxge_intr_coal_delay = 30;
4045 if (mxge_ticks == 0)
4046 mxge_ticks = hz / 2;
4047 sc->pause = mxge_flow_control;
4048 if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4049 || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4050 mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
4052 if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4053 mxge_initial_mtu < ETHER_MIN_LEN)
4054 mxge_initial_mtu = ETHERMTU_JUMBO;
4059 mxge_free_slices(mxge_softc_t *sc)
4061 struct mxge_slice_state *ss;
4068 for (i = 0; i < sc->num_slices; i++) {
4070 if (ss->fw_stats != NULL) {
4071 mxge_dma_free(&ss->fw_stats_dma);
4072 ss->fw_stats = NULL;
4073 #ifdef IFNET_BUF_RING
4074 if (ss->tx.br != NULL) {
4075 drbr_free(ss->tx.br, M_DEVBUF);
4080 if (ss->rx_done.entry != NULL) {
4081 mxge_dma_free(&ss->rx_done.dma);
4082 ss->rx_done.entry = NULL;
4085 kfree(sc->ss, M_DEVBUF);
4090 mxge_alloc_slices(mxge_softc_t *sc)
4093 struct mxge_slice_state *ss;
4095 int err, i, max_intr_slots;
4097 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4099 device_printf(sc->dev, "Cannot determine rx ring size\n");
4102 sc->rx_ring_size = cmd.data0;
4103 max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4105 bytes = sizeof (*sc->ss) * sc->num_slices;
4106 sc->ss = kmalloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4109 for (i = 0; i < sc->num_slices; i++) {
4114 /* allocate per-slice rx interrupt queues */
4116 bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4117 err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4120 ss->rx_done.entry = ss->rx_done.dma.addr;
4121 bzero(ss->rx_done.entry, bytes);
4124 * allocate the per-slice firmware stats; stats
4125 * (including tx) are used only on the first slice */
4128 #ifndef IFNET_BUF_RING
4133 bytes = sizeof (*ss->fw_stats);
4134 err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4135 sizeof (*ss->fw_stats), 64);
4138 ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4139 #ifdef IFNET_BUF_RING
4140 ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4148 mxge_free_slices(sc);
4153 mxge_slice_probe(mxge_softc_t *sc)
4157 int msix_cnt, status, max_intr_slots;
4161 * don't enable multiple slices if the tunable has disabled them,
4162 * or if this is not an SMP system
4165 if (mxge_max_slices == 0 || mxge_max_slices == 1 || ncpus < 2)
4168 /* see how many MSI-X interrupts are available */
4169 msix_cnt = pci_msix_count(sc->dev);
4173 /* now load the slice-aware firmware and see what it supports */
4174 old_fw = sc->fw_name;
4175 if (old_fw == mxge_fw_aligned)
4176 sc->fw_name = mxge_fw_rss_aligned;
4178 sc->fw_name = mxge_fw_rss_unaligned;
4179 status = mxge_load_firmware(sc, 0);
4181 device_printf(sc->dev, "Falling back to a single slice\n");
4185 /* try to send a reset command to the card to see if it is alive */
4187 memset(&cmd, 0, sizeof (cmd));
4188 status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4190 device_printf(sc->dev, "failed reset\n");
4194 /* get rx ring size */
4195 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4197 device_printf(sc->dev, "Cannot determine rx ring size\n");
4200 max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4202 /* tell it the size of the interrupt queues */
4203 cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4204 status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4206 device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4210 /* ask the maximum number of slices it supports */
4211 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4213 device_printf(sc->dev,
4214 "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4217 sc->num_slices = cmd.data0;
4218 if (sc->num_slices > msix_cnt)
4219 sc->num_slices = msix_cnt;
4221 if (mxge_max_slices == -1) {
4222 /* cap to number of CPUs in system */
4223 if (sc->num_slices > ncpus)
4224 sc->num_slices = ncpus;
4226 if (sc->num_slices > mxge_max_slices)
4227 sc->num_slices = mxge_max_slices;
4229 /* make sure it is a power of two */
4230 while (sc->num_slices & (sc->num_slices - 1))
4234 device_printf(sc->dev, "using %d slices\n",
4240 sc->fw_name = old_fw;
4241 (void) mxge_load_firmware(sc, 0);
4245 mxge_add_msix_irqs(mxge_softc_t *sc)
4248 int count, err, i, rid;
4251 sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4254 if (sc->msix_table_res == NULL) {
4255 device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4259 count = sc->num_slices;
4260 err = pci_alloc_msix(sc->dev, &count);
4262 device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d"
4263 "err = %d \n", sc->num_slices, err);
4264 goto abort_with_msix_table;
4266 if (count < sc->num_slices) {
4267 device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4268 count, sc->num_slices);
4269 device_printf(sc->dev,
4270 "Try setting hw.mxge.max_slices to %d\n",
4273 goto abort_with_msix;
4275 bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4276 sc->msix_irq_res = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4277 if (sc->msix_irq_res == NULL) {
4279 goto abort_with_msix;
4282 for (i = 0; i < sc->num_slices; i++) {
4284 sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4287 if (sc->msix_irq_res[i] == NULL) {
4288 device_printf(sc->dev, "couldn't allocate IRQ res"
4289 " for message %d\n", i);
4291 goto abort_with_res;
4295 bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4296 sc->msix_ih = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4298 for (i = 0; i < sc->num_slices; i++) {
4299 err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4301 mxge_intr, &sc->ss[i], &sc->msix_ih[i],
4302 sc->ifp->if_serializer);
4304 device_printf(sc->dev, "couldn't setup intr for "
4306 goto abort_with_intr;
4311 device_printf(sc->dev, "using %d msix IRQs:",
4313 for (i = 0; i < sc->num_slices; i++)
4314 kprintf(" %ld", rman_get_start(sc->msix_irq_res[i]));
4320 for (i = 0; i < sc->num_slices; i++) {
4321 if (sc->msix_ih[i] != NULL) {
4322 bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4324 sc->msix_ih[i] = NULL;
4327 kfree(sc->msix_ih, M_DEVBUF);
4331 for (i = 0; i < sc->num_slices; i++) {
4333 if (sc->msix_irq_res[i] != NULL)
4334 bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4335 sc->msix_irq_res[i]);
4336 sc->msix_irq_res[i] = NULL;
4338 kfree(sc->msix_irq_res, M_DEVBUF);
4342 pci_release_msi(sc->dev);
4344 abort_with_msix_table:
4345 bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4346 sc->msix_table_res);
4352 mxge_add_single_irq(mxge_softc_t *sc)
4354 int count, err, rid;
4356 count = pci_msi_count(sc->dev);
4357 if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4363 sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0,
4364 1, RF_SHAREABLE | RF_ACTIVE);
4365 if (sc->irq_res == NULL) {
4366 device_printf(sc->dev, "could not alloc interrupt\n");
4370 device_printf(sc->dev, "using %s irq %ld\n",
4371 sc->legacy_irq ? "INTx" : "MSI",
4372 rman_get_start(sc->irq_res));
4373 err = bus_setup_intr(sc->dev, sc->irq_res,
4375 mxge_intr, &sc->ss[0], &sc->ih,
4376 sc->ifp->if_serializer);
4378 bus_release_resource(sc->dev, SYS_RES_IRQ,
4379 sc->legacy_irq ? 0 : 1, sc->irq_res);
4380 if (!sc->legacy_irq)
4381 pci_release_msi(sc->dev);
4387 mxge_rem_msix_irqs(mxge_softc_t *sc)
4391 for (i = 0; i < sc->num_slices; i++) {
4392 if (sc->msix_ih[i] != NULL) {
4393 bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4395 sc->msix_ih[i] = NULL;
4398 kfree(sc->msix_ih, M_DEVBUF);
4400 for (i = 0; i < sc->num_slices; i++) {
4402 if (sc->msix_irq_res[i] != NULL)
4403 bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4404 sc->msix_irq_res[i]);
4405 sc->msix_irq_res[i] = NULL;
4407 kfree(sc->msix_irq_res, M_DEVBUF);
4409 bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4410 sc->msix_table_res);
4412 pci_release_msi(sc->dev);
4417 mxge_rem_single_irq(mxge_softc_t *sc)
4419 bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4420 bus_release_resource(sc->dev, SYS_RES_IRQ,
4421 sc->legacy_irq ? 0 : 1, sc->irq_res);
4422 if (!sc->legacy_irq)
4423 pci_release_msi(sc->dev);
4427 mxge_rem_irq(mxge_softc_t *sc)
4429 if (sc->num_slices > 1)
4430 mxge_rem_msix_irqs(sc);
4432 mxge_rem_single_irq(sc);
4436 mxge_add_irq(mxge_softc_t *sc)
4440 if (sc->num_slices > 1)
4441 err = mxge_add_msix_irqs(sc);
4443 err = mxge_add_single_irq(sc);
4445 if (0 && err == 0 && sc->num_slices > 1) {
4446 mxge_rem_msix_irqs(sc);
4447 err = mxge_add_msix_irqs(sc);
4454 mxge_attach(device_t dev)
4456 mxge_softc_t *sc = device_get_softc(dev);
4457 struct ifnet *ifp = &sc->arpcom.ac_if;
4461 * avoid rewriting half the lines in this file to use
4462 * &sc->arpcom.ac_if instead
4466 mxge_fetch_tunables(sc);
4468 err = bus_dma_tag_create(NULL, /* parent */
4471 BUS_SPACE_MAXADDR, /* low */
4472 BUS_SPACE_MAXADDR, /* high */
4473 NULL, NULL, /* filter */
4474 65536 + 256, /* maxsize */
4475 MXGE_MAX_SEND_DESC, /* num segs */
4476 65536, /* maxsegsize */
4478 &sc->parent_dmat); /* tag */
4481 device_printf(sc->dev, "Err %d allocating parent dmat\n",
4483 goto abort_with_nothing;
4487 if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4489 callout_init(&sc->co_hdl);
4491 mxge_setup_cfg_space(sc);
4493 /* Map the board into the kernel */
4495 sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
4497 if (sc->mem_res == NULL) {
4498 device_printf(dev, "could not map memory\n");
4500 goto abort_with_nothing;
4502 sc->sram = rman_get_virtual(sc->mem_res);
4503 sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
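/*
 * Usable SRAM is the 2MB BAR minus a region reserved at the top
 * (two 48KB blocks plus 32KB, and a 256-byte pad); the EEPROM
 * strings are read from just below this limit a few lines further
 * down.
 */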
4504 if (sc->sram_size > rman_get_size(sc->mem_res)) {
4505 device_printf(dev, "impossible memory region size %ld\n",
4506 rman_get_size(sc->mem_res));
4508 goto abort_with_mem_res;
4511 /* make NULL-terminated copy of the EEPROM strings section of lanai SRAM */
4513 bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4514 bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4515 rman_get_bushandle(sc->mem_res),
4516 sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4518 MXGE_EEPROM_STRINGS_SIZE - 2);
4519 err = mxge_parse_strings(sc);
4521 goto abort_with_mem_res;
4523 /* Enable write combining for efficient use of PCIe bus */
4526 /* Allocate the out of band dma memory */
4527 err = mxge_dma_alloc(sc, &sc->cmd_dma,
4528 sizeof (mxge_cmd_t), 64);
4530 goto abort_with_mem_res;
4531 sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4532 err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4534 goto abort_with_cmd_dma;
4536 err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4538 goto abort_with_zeropad_dma;
4540 /* select & load the firmware */
4541 err = mxge_select_firmware(sc);
4543 goto abort_with_dmabench;
4544 sc->intr_coal_delay = mxge_intr_coal_delay;
4546 mxge_slice_probe(sc);
4547 err = mxge_alloc_slices(sc);
4549 goto abort_with_dmabench;
4551 err = mxge_reset(sc, 0);
4553 goto abort_with_slices;
4555 err = mxge_alloc_rings(sc);
4557 device_printf(sc->dev, "failed to allocate rings\n");
4558 goto abort_with_dmabench;
4561 err = mxge_add_irq(sc);
4563 device_printf(sc->dev, "failed to add irq\n");
4564 goto abort_with_rings;
4567 ifp->if_baudrate = IF_Gbps(10UL);
4568 ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4571 ifp->if_capabilities |= IFCAP_LRO;
4574 #ifdef MXGE_NEW_VLAN_API
4575 ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4578 sc->max_mtu = mxge_max_mtu(sc);
4579 if (sc->max_mtu >= 9000)
4580 ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4582 device_printf(dev, "MTU limited to %d. Install "
4583 "latest firmware for 9000 byte jumbo support\n",
4584 sc->max_mtu - ETHER_HDR_LEN);
4585 ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4586 ifp->if_capenable = ifp->if_capabilities;
4587 if (sc->lro_cnt == 0)
4588 ifp->if_capenable &= ~IFCAP_LRO;
4590 ifp->if_init = mxge_init;
4592 ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4593 ifp->if_ioctl = mxge_ioctl;
4594 ifp->if_start = mxge_start;
4595 /* Initialise the ifmedia structure */
4596 ifmedia_init(&sc->media, 0, mxge_media_change,
4598 mxge_set_media(sc, IFM_ETHER | IFM_AUTO);
4599 mxge_media_probe(sc);
4601 ether_ifattach(ifp, sc->mac_addr, NULL);
4602 /* ether_ifattach sets mtu to ETHERMTU */
4603 if (mxge_initial_mtu != ETHERMTU)
4604 mxge_change_mtu(sc, mxge_initial_mtu);
4606 mxge_add_sysctls(sc);
4607 #ifdef IFNET_BUF_RING
4608 ifp->if_transmit = mxge_transmit;
4609 ifp->if_qflush = mxge_qflush;
4614 mxge_free_rings(sc);
4616 mxge_free_slices(sc);
4617 abort_with_dmabench:
4618 mxge_dma_free(&sc->dmabench_dma);
4619 abort_with_zeropad_dma:
4620 mxge_dma_free(&sc->zeropad_dma);
4622 mxge_dma_free(&sc->cmd_dma);
4624 bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4625 pci_disable_busmaster(dev);
4626 bus_dma_tag_destroy(sc->parent_dmat);
4632 mxge_detach(device_t dev)
4634 mxge_softc_t *sc = device_get_softc(dev);
4636 lwkt_serialize_enter(sc->ifp->if_serializer);
4638 if (sc->ifp->if_flags & IFF_RUNNING)
4640 lwkt_serialize_exit(sc->ifp->if_serializer);
4641 ether_ifdetach(sc->ifp);
4642 callout_drain(&sc->co_hdl);
4643 ifmedia_removeall(&sc->media);
4644 mxge_dummy_rdma(sc, 0);
4645 mxge_rem_sysctls(sc);
4647 mxge_free_rings(sc);
4648 mxge_free_slices(sc);
4649 mxge_dma_free(&sc->dmabench_dma);
4650 mxge_dma_free(&sc->zeropad_dma);
4651 mxge_dma_free(&sc->cmd_dma);
4652 bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4653 pci_disable_busmaster(dev);
4654 bus_dma_tag_destroy(sc->parent_dmat);
4659 mxge_shutdown(device_t dev)
4665 This file uses Myri10GE driver indentation.
4668 c-file-style:"linux"