1 /******************************************************************************
3 Copyright (c) 2006-2009, Myricom Inc.
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
9 1. Redistributions of source code must retain the above copyright notice,
10 this list of conditions and the following disclaimer.
12 2. Neither the name of the Myricom Inc, nor the names of its
13 contributors may be used to endorse or promote products derived from
14 this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
28 ***************************************************************************/
30 #include <sys/cdefs.h>
31 /*__FBSDID("$FreeBSD: src/sys/dev/mxge/if_mxge.c,v 1.63 2009/06/26 11:45:06 rwatson Exp $");*/
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/linker.h>
36 #include <sys/firmware.h>
37 #include <sys/endian.h>
38 #include <sys/in_cksum.h>
39 #include <sys/sockio.h>
41 #include <sys/malloc.h>
42 #include <sys/kernel.h>
43 #include <sys/module.h>
44 #include <sys/serialize.h>
45 #include <sys/socket.h>
46 #include <sys/sysctl.h>
48 /* count xmits ourselves, rather than via drbr */
51 #include <net/if_arp.h>
52 #include <net/ifq_var.h>
53 #include <net/ethernet.h>
54 #include <net/if_dl.h>
55 #include <net/if_media.h>
59 #include <net/if_types.h>
60 #include <net/vlan/if_vlan_var.h>
63 #include <netinet/in_systm.h>
64 #include <netinet/in.h>
65 #include <netinet/ip.h>
66 #include <netinet/tcp.h>
71 #include <bus/pci/pcireg.h>
72 #include <bus/pci/pcivar.h>
73 #include <bus/pci/pci_private.h> /* XXX for pci_cfg_restore */
75 #include <vm/vm.h> /* for pmap_mapdev() */
78 #if defined(__i386) || defined(__amd64)
79 #include <machine/specialreg.h>
82 #include <dev/netif/mxge/mxge_mcp.h>
83 #include <dev/netif/mxge/mcp_gen_header.h>
84 /*#define MXGE_FAKE_IFP*/
85 #include <dev/netif/mxge/if_mxge_var.h>
87 #include <sys/buf_ring.h>
93 static int mxge_nvidia_ecrc_enable = 1;
94 static int mxge_force_firmware = 0;
95 static int mxge_intr_coal_delay = 30;
96 static int mxge_deassert_wait = 1;
97 static int mxge_flow_control = 1;
98 static int mxge_verbose = 0;
99 static int mxge_lro_cnt = 8;
100 static int mxge_ticks;
101 static int mxge_max_slices = 1;
102 static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
103 static int mxge_always_promisc = 0;
105 /* static int mxge_initial_mtu = ETHERMTU_JUMBO; */
106 static int mxge_initial_mtu = ETHERMTU;
107 static char *mxge_fw_unaligned = "mxge_ethp_z8e";
108 static char *mxge_fw_aligned = "mxge_eth_z8e";
109 static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
110 static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
112 static int mxge_probe(device_t dev);
113 static int mxge_attach(device_t dev);
114 static int mxge_detach(device_t dev);
115 static int mxge_shutdown(device_t dev);
116 static void mxge_intr(void *arg);
118 static device_method_t mxge_methods[] =
120 /* Device interface */
121 DEVMETHOD(device_probe, mxge_probe),
122 DEVMETHOD(device_attach, mxge_attach),
123 DEVMETHOD(device_detach, mxge_detach),
124 DEVMETHOD(device_shutdown, mxge_shutdown),
128 static driver_t mxge_driver =
132 sizeof(mxge_softc_t),
135 static devclass_t mxge_devclass;
137 /* Declare ourselves to be a child of the PCI bus. */
138 DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
139 MODULE_DEPEND(mxge, firmware, 1, 1, 1);
140 MODULE_DEPEND(mxge, zlib, 1, 1, 1);
142 static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
143 static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
144 static int mxge_close(mxge_softc_t *sc);
145 static int mxge_open(mxge_softc_t *sc);
146 static void mxge_tick(void *arg);
148 /* XXX: we don't have Large Receive Offload support yet */
150 mxge_lro_rx(struct mxge_slice_state *ss, struct mbuf *m_head, uint32_t csum)
159 mxge_lro_flush(struct mxge_slice_state *ss, struct lro_entry *lro)
166 mxge_probe(device_t dev)
171 if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
172 ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
173 (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
174 rev = pci_get_revid(dev);
176 case MXGE_PCI_REV_Z8E:
177 device_set_desc(dev, "Myri10G-PCIE-8A");
179 case MXGE_PCI_REV_Z8ES:
180 device_set_desc(dev, "Myri10G-PCIE-8B");
183 device_set_desc(dev, "Myri10G-PCIE-8??");
184 device_printf(dev, "Unrecognized rev %d NIC\n",
194 mxge_enable_wc(mxge_softc_t *sc)
197 #if defined(__i386) || defined(__amd64)
202 len = rman_get_size(sc->mem_res);
203 err = pmap_change_attr((vm_offset_t) sc->sram,
204 len, PAT_WRITE_COMBINING);
206 device_printf(sc->dev, "pmap_change_attr failed, %d\n",
212 sc->wc = 0; /* TBD: PAT support */
217 /* callback to get our DMA address */
219 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
223 *(bus_addr_t *) arg = segs->ds_addr;
228 mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
229 bus_size_t alignment)
232 device_t dev = sc->dev;
233 bus_size_t boundary, maxsegsize;
235 if (bytes > 4096 && alignment == 4096) {
243 /* allocate DMAable memory tags */
244 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
245 alignment, /* alignment */
246 boundary, /* boundary */
247 BUS_SPACE_MAXADDR, /* low */
248 BUS_SPACE_MAXADDR, /* high */
249 NULL, NULL, /* filter */
252 maxsegsize, /* maxsegsize */
253 BUS_DMA_COHERENT, /* flags */
254 &dma->dmat); /* tag */
256 device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
260 /* allocate DMAable memory & map */
261 err = bus_dmamem_alloc(dma->dmat, &dma->addr,
262 (BUS_DMA_WAITOK | BUS_DMA_COHERENT
263 | BUS_DMA_ZERO), &dma->map);
265 device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
266 goto abort_with_dmat;
269 /* load the memory */
270 err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
271 mxge_dmamap_callback,
272 (void *)&dma->bus_addr, 0);
274 device_printf(dev, "couldn't load map (err = %d)\n", err);
280 bus_dmamem_free(dma->dmat, dma->addr, dma->map);
282 (void)bus_dma_tag_destroy(dma->dmat);
288 mxge_dma_free(mxge_dma_t *dma)
290 bus_dmamap_unload(dma->dmat, dma->map);
291 bus_dmamem_free(dma->dmat, dma->addr, dma->map);
292 (void)bus_dma_tag_destroy(dma->dmat);
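/*
 * Illustrative usage sketch (added, not part of the original driver):
 * mxge_dma_alloc() returns a coherent, zeroed buffer described by both a
 * kernel virtual address (dma.addr) and a bus address (dma.bus_addr);
 * the pair is released with mxge_dma_free().
 *
 *	mxge_dma_t dma;
 *
 *	if (mxge_dma_alloc(sc, &dma, 4096, 4096) == 0) {
 *		// program dma.bus_addr into the NIC, touch dma.addr from the host
 *		mxge_dma_free(&dma);
 *	}
 */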
296 * The eeprom strings on the lanaiX have the format
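 * (illustration inferred from the parser below): a packed sequence of
 * NUL-terminated "KEY=value" strings such as "MAC=xx:xx:xx:xx:xx:xx",
 * "PC=<product code>" and "SN=<serial number>", ended by an empty string.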
303 mxge_parse_strings(mxge_softc_t *sc)
305 #define MXGE_NEXT_STRING(p) while(ptr < limit && *ptr++)
310 ptr = sc->eeprom_strings;
311 limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
313 while (ptr < limit && *ptr != '\0') {
314 if (memcmp(ptr, "MAC=", 4) == 0) {
316 sc->mac_addr_string = ptr;
317 for (i = 0; i < 6; i++) {
319 if ((ptr + 2) > limit)
321 sc->mac_addr[i] = strtoul(ptr, NULL, 16);
324 } else if (memcmp(ptr, "PC=", 3) == 0) {
326 strncpy(sc->product_code_string, ptr,
327 sizeof (sc->product_code_string) - 1);
328 } else if (memcmp(ptr, "SN=", 3) == 0) {
330 strncpy(sc->serial_number_string, ptr,
331 sizeof (sc->serial_number_string) - 1);
333 MXGE_NEXT_STRING(ptr);
340 device_printf(sc->dev, "failed to parse eeprom_strings\n");
345 #if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
347 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
350 unsigned long base, off;
352 device_t pdev, mcp55;
353 uint16_t vendor_id, device_id, word;
354 uintptr_t bus, slot, func, ivend, idev;
358 if (!mxge_nvidia_ecrc_enable)
361 pdev = device_get_parent(device_get_parent(sc->dev));
363 device_printf(sc->dev, "could not find parent?\n");
366 vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
367 device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
369 if (vendor_id != 0x10de)
374 if (device_id == 0x005d) {
375 /* ck804, base address is magic */
377 } else if (device_id >= 0x0374 && device_id <= 0x378) {
378 /* mcp55, base address stored in chipset */
379 mcp55 = pci_find_bsf(0, 0, 0);
381 0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
382 0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
383 word = pci_read_config(mcp55, 0x90, 2);
384 base = ((unsigned long)word & 0x7ffeU) << 25;
391 Test below is commented because it is believed that doing
392 config read/write beyond 0xff will access the config space
393 for the next larger function. Uncomment this and remove
394 the hacky pmap_mapdev() way of accessing config space when
395 FreeBSD grows support for extended pcie config space access
398 /* See if we can, by some miracle, access the extended
400 val = pci_read_config(pdev, 0x178, 4);
401 if (val != 0xffffffff) {
403 pci_write_config(pdev, 0x178, val, 4);
407 /* Rather than using normal pci config space writes, we must
408 * map the Nvidia config space ourselves. This is because on
409 * opteron/nvidia class machines the 0xe000000 mapping is
410 * handled by the nvidia chipset, which means the internal PCI
411 * device (the on-chip northbridge), or the amd-8131 bridge
412 * and things behind them, are not visible by this method.
415 BUS_READ_IVAR(device_get_parent(pdev), pdev,
417 BUS_READ_IVAR(device_get_parent(pdev), pdev,
418 PCI_IVAR_SLOT, &slot);
419 BUS_READ_IVAR(device_get_parent(pdev), pdev,
420 PCI_IVAR_FUNCTION, &func);
421 BUS_READ_IVAR(device_get_parent(pdev), pdev,
422 PCI_IVAR_VENDOR, &ivend);
423 BUS_READ_IVAR(device_get_parent(pdev), pdev,
424 PCI_IVAR_DEVICE, &idev);
427 + 0x00100000UL * (unsigned long)bus
428 + 0x00001000UL * (unsigned long)(func
431 /* map it into the kernel */
432 va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
436 device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
439 /* get a pointer to the config space mapped into the kernel */
440 cfgptr = va + (off & PAGE_MASK);
442 /* make sure that we can really access it */
443 vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
444 device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
445 if (! (vendor_id == ivend && device_id == idev)) {
446 device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
447 vendor_id, device_id);
448 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
452 ptr32 = (uint32_t*)(cfgptr + 0x178);
455 if (val == 0xffffffff) {
456 device_printf(sc->dev, "extended mapping failed\n");
457 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
461 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
463 device_printf(sc->dev,
464 "Enabled ECRC on upstream Nvidia bridge "
466 (int)bus, (int)slot, (int)func);
471 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
473 device_printf(sc->dev,
474 "Nforce 4 chipset on non-x86/amd64!?!?!\n");
481 mxge_dma_test(mxge_softc_t *sc, int test_type)
484 bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
490 /* Run a small DMA test.
491 * The magic multipliers to the length tell the firmware
492 * to do DMA read, write, or read+write tests. The
493 * results are returned in cmd.data0. The upper 16
494 * bits of the return is the number of transfers completed.
495 * The lower 16 bits is the time in 0.5us ticks that the
496 * transfers took to complete.
499 len = sc->tx_boundary;
501 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
502 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
503 cmd.data2 = len * 0x10000;
504 status = mxge_send_cmd(sc, test_type, &cmd);
509 sc->read_dma = ((cmd.data0>>16) * len * 2) /
510 (cmd.data0 & 0xffff);
511 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
512 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
513 cmd.data2 = len * 0x1;
514 status = mxge_send_cmd(sc, test_type, &cmd);
519 sc->write_dma = ((cmd.data0>>16) * len * 2) /
520 (cmd.data0 & 0xffff);
522 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
523 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
524 cmd.data2 = len * 0x10001;
525 status = mxge_send_cmd(sc, test_type, &cmd);
530 sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
531 (cmd.data0 & 0xffff);
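	/*
	 * Illustrative note (added): cmd.data2's multipliers select the test
	 * (len * 0x10000 = read, len * 0x1 = write, len * 0x10001 = both),
	 * and cmd.data0 packs the result as (transfers << 16) | ticks, where
	 * each tick is 0.5us.  Bytes per microsecond equals MB/s, hence:
	 *
	 *	MB/s = (transfers * len) / (ticks * 0.5us)
	 *	     = ((data0 >> 16) * len * 2) / (data0 & 0xffff)
	 *
	 * The read+write figure carries an extra factor of 2 because each
	 * transfer moves len bytes in both directions.
	 */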
534 if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
535 device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
542 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
543 * when the PCI-E Completion packets are aligned on an 8-byte
544 * boundary. Some PCI-E chip sets always align Completion packets; on
545 * the ones that do not, the alignment can be enforced by enabling
546 * ECRC generation (if supported).
548 * When PCI-E Completion packets are not aligned, it is actually more
549 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
551 * If the driver can neither enable ECRC nor verify that it has
552 * already been enabled, then it must use a firmware image which works
553 * around unaligned completion packets (ethp_z8e.dat), and it should
554 * also ensure that it never gives the device a Read-DMA which is
555 * larger than 2KB by setting the tx_boundary to 2KB. If ECRC is
556 * enabled, then the driver should use the aligned (eth_z8e.dat)
557 * firmware image, and set tx_boundary to 4KB.
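 *
 * In short (illustrative summary of the rules above):
 *
 *	completions known to be aligned (ECRC enabled or verified)
 *		-> eth_z8e.dat,  tx_boundary = 4096
 *	alignment unknown or unaligned
 *		-> ethp_z8e.dat, tx_boundary = 2048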
561 mxge_firmware_probe(mxge_softc_t *sc)
563 device_t dev = sc->dev;
567 sc->tx_boundary = 4096;
569 * Verify the max read request size was set to 4KB
570 * before trying the test with 4KB.
572 if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
573 pectl = pci_read_config(dev, reg + 0x8, 2);
574 if ((pectl & (5 << 12)) != (5 << 12)) {
575 device_printf(dev, "Max Read Req. size != 4k (0x%x)\n",
577 sc->tx_boundary = 2048;
582 * load the optimized firmware (which assumes aligned PCIe
583 * completions) in order to see if it works on this host.
585 sc->fw_name = mxge_fw_aligned;
586 status = mxge_load_firmware(sc, 1);
592 * Enable ECRC if possible
594 mxge_enable_nvidia_ecrc(sc);
597 * Run a DMA test which watches for unaligned completions and
598 * aborts on the first one seen.
601 status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
603 return 0; /* keep the aligned firmware */
606 device_printf(dev, "DMA test failed: %d\n", status);
607 if (status == ENOSYS)
608 device_printf(dev, "Falling back to ethp! "
609 "Please install up to date fw\n");
614 mxge_select_firmware(mxge_softc_t *sc)
619 if (mxge_force_firmware != 0) {
620 if (mxge_force_firmware == 1)
625 device_printf(sc->dev,
626 "Assuming %s completions (forced)\n",
627 aligned ? "aligned" : "unaligned");
631 /* if the PCIe link width is 4 or less, we can use the aligned
632 firmware and skip any checks */
633 if (sc->link_width != 0 && sc->link_width <= 4) {
634 device_printf(sc->dev,
635 "PCIe x%d Link, expect reduced performance\n",
641 if (0 == mxge_firmware_probe(sc))
646 sc->fw_name = mxge_fw_aligned;
647 sc->tx_boundary = 4096;
649 sc->fw_name = mxge_fw_unaligned;
650 sc->tx_boundary = 2048;
652 return (mxge_load_firmware(sc, 0));
662 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
666 if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
667 device_printf(sc->dev, "Bad firmware type: 0x%x\n",
668 be32toh(hdr->mcp_type));
672 /* save firmware version for sysctl */
673 strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
675 device_printf(sc->dev, "firmware id: %s\n", hdr->version);
677 ksscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
678 &sc->fw_ver_minor, &sc->fw_ver_tiny);
680 if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
681 && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
682 device_printf(sc->dev, "Found firmware version %s\n",
684 device_printf(sc->dev, "Driver needs %d.%d\n",
685 MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
694 z_alloc(void *nil, u_int items, u_int size)
698 ptr = kmalloc(items * size, M_TEMP, M_NOWAIT);
703 z_free(void *nil, void *ptr)
710 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
713 const mcp_gen_header_t *hdr;
720 fw = firmware_image_load(sc->fw_name, NULL);
722 device_printf(sc->dev, "Could not find firmware image %s\n",
727 /* setup zlib and decompress f/w */
728 bzero(&zs, sizeof (zs));
731 status = inflateInit(&zs);
732 if (status != Z_OK) {
737 /* the uncompressed size is stored as the firmware version,
738 which would otherwise go unused */
739 fw_len = (size_t) fw->version;
740 inflate_buffer = kmalloc(fw_len, M_TEMP, M_NOWAIT);
741 if (inflate_buffer == NULL)
743 zs.avail_in = fw->datasize;
744 zs.next_in = __DECONST(char *, fw->data);
745 zs.avail_out = fw_len;
746 zs.next_out = inflate_buffer;
747 status = inflate(&zs, Z_FINISH);
748 if (status != Z_STREAM_END) {
749 device_printf(sc->dev, "zlib %d\n", status);
751 goto abort_with_buffer;
754 fw_len = fw->fw_imglen;
756 hdr_offset = htobe32(*(const uint32_t *)
757 (fw->fw_image + MCP_HEADER_PTR_OFFSET));
758 if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
759 device_printf(sc->dev, "Bad firmware file");
763 hdr = (const void*)(fw->fw_image + hdr_offset);
765 status = mxge_validate_firmware(sc, hdr);
769 /* Copy the inflated firmware to NIC SRAM. */
770 for (i = 0; i < fw_len; i += 256) {
771 mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
773 min(256U, (unsigned)(fw_len - i)));
783 kfree(inflate_buffer, M_TEMP);
788 firmware_image_unload(fw);
793 * Enable or disable periodic RDMAs from the host to make certain
794 * chipsets resend dropped PCIe messages
798 mxge_dummy_rdma(mxge_softc_t *sc, int enable)
801 volatile uint32_t *confirm;
802 volatile char *submit;
803 uint32_t *buf, dma_low, dma_high;
806 buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
808 /* clear confirmation addr */
809 confirm = (volatile uint32_t *)sc->cmd;
813 /* send an rdma command to the PCIe engine, and wait for the
814 response in the confirmation address. The firmware should
815 write a -1 there to indicate it is alive and well
818 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
819 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
820 buf[0] = htobe32(dma_high); /* confirm addr MSW */
821 buf[1] = htobe32(dma_low); /* confirm addr LSW */
822 buf[2] = htobe32(0xffffffff); /* confirm data */
823 dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
824 dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
825 buf[3] = htobe32(dma_high); /* dummy addr MSW */
826 buf[4] = htobe32(dma_low); /* dummy addr LSW */
827 buf[5] = htobe32(enable); /* enable? */
830 submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
832 mxge_pio_copy(submit, buf, 64);
837 while (*confirm != 0xffffffff && i < 20) {
841 if (*confirm != 0xffffffff) {
842 device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
843 (enable ? "enable" : "disable"), confirm,
850 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
853 char buf_bytes[sizeof(*buf) + 8];
854 volatile mcp_cmd_response_t *response = sc->cmd;
855 volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
856 uint32_t dma_low, dma_high;
857 int err, sleep_total = 0;
860 * We may be called during attach, before if_serializer is available.
861 * This is not a fast path, just check for NULL
864 if (sc->ifp->if_serializer)
865 ASSERT_SERIALIZED(sc->ifp->if_serializer);
867 /* ensure buf is aligned to 8 bytes */
868 buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
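	/*
	 * Illustrative note (added): adding 7 and clearing the low three bits
	 * rounds the address up to the next multiple of 8, e.g. ...0x13
	 * becomes ...0x18 while ...0x18 is left unchanged.
	 */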
870 buf->data0 = htobe32(data->data0);
871 buf->data1 = htobe32(data->data1);
872 buf->data2 = htobe32(data->data2);
873 buf->cmd = htobe32(cmd);
874 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
875 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
877 buf->response_addr.low = htobe32(dma_low);
878 buf->response_addr.high = htobe32(dma_high);
881 response->result = 0xffffffff;
883 mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
885 /* wait up to 20ms */
887 for (sleep_total = 0; sleep_total < 20; sleep_total++) {
888 bus_dmamap_sync(sc->cmd_dma.dmat,
889 sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
891 switch (be32toh(response->result)) {
893 data->data0 = be32toh(response->data);
899 case MXGEFW_CMD_UNKNOWN:
902 case MXGEFW_CMD_ERROR_UNALIGNED:
905 case MXGEFW_CMD_ERROR_BUSY:
909 device_printf(sc->dev,
911 "failed, result = %d\n",
912 cmd, be32toh(response->result));
920 device_printf(sc->dev, "mxge: command %d timed out"
922 cmd, be32toh(response->result));
927 mxge_adopt_running_firmware(mxge_softc_t *sc)
929 struct mcp_gen_header *hdr;
930 const size_t bytes = sizeof (struct mcp_gen_header);
934 /* find running firmware header */
935 hdr_offset = htobe32(*(volatile uint32_t *)
936 (sc->sram + MCP_HEADER_PTR_OFFSET));
938 if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
939 device_printf(sc->dev,
940 "Running firmware has bad header offset (%d)\n",
945 /* copy header of running firmware from SRAM to host memory to
946 * validate firmware */
947 hdr = kmalloc(bytes, M_DEVBUF, M_NOWAIT);
949 device_printf(sc->dev, "could not kmalloc firmware hdr\n");
952 bus_space_read_region_1(rman_get_bustag(sc->mem_res),
953 rman_get_bushandle(sc->mem_res),
954 hdr_offset, (char *)hdr, bytes);
955 status = mxge_validate_firmware(sc, hdr);
956 kfree(hdr, M_DEVBUF);
959 * check to see if the adopted firmware has a bug where adopting
960 * it will cause broadcasts to be filtered unless the NIC
961 * is kept in ALLMULTI mode
963 if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
964 sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
965 sc->adopted_rx_filter_bug = 1;
966 device_printf(sc->dev, "Adopting fw %d.%d.%d: "
967 "working around rx filter bug\n",
968 sc->fw_ver_major, sc->fw_ver_minor,
977 mxge_load_firmware(mxge_softc_t *sc, int adopt)
979 volatile uint32_t *confirm;
980 volatile char *submit;
982 uint32_t *buf, size, dma_low, dma_high;
985 buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
987 size = sc->sram_size;
988 status = mxge_load_firmware_helper(sc, &size);
992 /* Try to use the currently running firmware, if
994 status = mxge_adopt_running_firmware(sc);
996 device_printf(sc->dev,
997 "failed to adopt running firmware\n");
1000 device_printf(sc->dev,
1001 "Successfully adopted running firmware\n");
1002 if (sc->tx_boundary == 4096) {
1003 device_printf(sc->dev,
1004 "Using firmware currently running on NIC"
1006 device_printf(sc->dev,
1007 "performance consider loading optimized "
1010 sc->fw_name = mxge_fw_unaligned;
1011 sc->tx_boundary = 2048;
1014 /* clear confirmation addr */
1015 confirm = (volatile uint32_t *)sc->cmd;
1018 /* send a reload command to the bootstrap MCP, and wait for the
1019 response in the confirmation address. The firmware should
1020 write a -1 there to indicate it is alive and well
1023 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
1024 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
1026 buf[0] = htobe32(dma_high); /* confirm addr MSW */
1027 buf[1] = htobe32(dma_low); /* confirm addr LSW */
1028 buf[2] = htobe32(0xffffffff); /* confirm data */
1030 /* FIX: All newest firmware should un-protect the bottom of
1031 the sram before handoff. However, the very first interfaces
1032 do not. Therefore the handoff copy must skip the first 8 bytes
1034 /* where the code starts */
1035 buf[3] = htobe32(MXGE_FW_OFFSET + 8);
1036 buf[4] = htobe32(size - 8); /* length of code */
1037 buf[5] = htobe32(8); /* where to copy to */
1038 buf[6] = htobe32(0); /* where to jump to */
1040 submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
1041 mxge_pio_copy(submit, buf, 64);
1046 while (*confirm != 0xffffffff && i < 20) {
1049 bus_dmamap_sync(sc->cmd_dma.dmat,
1050 sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
1052 if (*confirm != 0xffffffff) {
1053 device_printf(sc->dev,"handoff failed (%p = 0x%x)",
1062 mxge_update_mac_address(mxge_softc_t *sc)
1065 uint8_t *addr = sc->mac_addr;
1069 cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1070 | (addr[2] << 8) | addr[3]);
1072 cmd.data1 = ((addr[4] << 8) | (addr[5]));
1074 status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
1079 mxge_change_pause(mxge_softc_t *sc, int pause)
1085 status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1088 status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1092 device_printf(sc->dev, "Failed to set flow control mode\n");
1100 mxge_change_promisc(mxge_softc_t *sc, int promisc)
1105 if (sc->ifp->if_serializer)
1106 ASSERT_SERIALIZED(sc->ifp->if_serializer);
1107 if (mxge_always_promisc)
1111 status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1114 status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1118 device_printf(sc->dev, "Failed to set promisc mode\n");
1123 mxge_set_multicast_list(mxge_softc_t *sc)
1126 struct ifmultiaddr *ifma;
1127 struct ifnet *ifp = sc->ifp;
1130 if (ifp->if_serializer)
1131 ASSERT_SERIALIZED(ifp->if_serializer);
1133 /* This firmware is known to not support multicast */
1134 if (!sc->fw_multicast_support)
1137 /* Disable multicast filtering while we play with the lists*/
1138 err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1140 device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
1141 " error status: %d\n", err);
1145 if (sc->adopted_rx_filter_bug)
1148 if (ifp->if_flags & IFF_ALLMULTI)
1149 /* request to disable multicast filtering, so quit here */
1152 /* Flush all the filters */
1154 err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1156 device_printf(sc->dev,
1157 "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
1158 ", error status: %d\n", err);
1162 /* Walk the multicast list, and add each address */
1164 LIST_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1165 if (ifma->ifma_addr->sa_family != AF_LINK)
1167 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1169 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
1171 cmd.data0 = htonl(cmd.data0);
1172 cmd.data1 = htonl(cmd.data1);
1173 err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1175 device_printf(sc->dev, "Failed "
1176 "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
1178 /* abort, leaving multicast filtering off */
1182 /* Enable multicast filtering */
1183 err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1185 device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
1186 ", error status: %d\n", err);
1191 mxge_max_mtu(mxge_softc_t *sc)
1196 if (MJUMPAGESIZE - MXGEFW_PAD > MXGEFW_MAX_MTU)
1197 return MXGEFW_MAX_MTU - MXGEFW_PAD;
1199 /* try to set nbufs to see if we can
1200 use virtually contiguous jumbos */
1202 status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1205 return MXGEFW_MAX_MTU - MXGEFW_PAD;
1207 /* otherwise, we're limited to MJUMPAGESIZE */
1208 return MJUMPAGESIZE - MXGEFW_PAD;
1212 mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1214 struct mxge_slice_state *ss;
1215 mxge_rx_done_t *rx_done;
1216 volatile uint32_t *irq_claim;
1220 /* try to send a reset command to the card to see if it
1222 memset(&cmd, 0, sizeof (cmd));
1223 status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1225 device_printf(sc->dev, "failed reset\n");
1229 mxge_dummy_rdma(sc, 1);
1232 /* set the intrq size */
1233 cmd.data0 = sc->rx_ring_size;
1234 status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1237 * Even though we already know how many slices are supported
1238 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
1239 * has magic side effects, and must be called after a reset.
1240 * It must be called prior to calling any RSS related cmds,
1241 * including assigning an interrupt queue for anything but
1242 * slice 0. It must also be called *after*
1243 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1244 * the firmware to compute offsets.
1247 if (sc->num_slices > 1) {
1248 /* ask the maximum number of slices it supports */
1249 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
1252 device_printf(sc->dev,
1253 "failed to get number of slices\n");
1257 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1258 * to setting up the interrupt queue DMA
1260 cmd.data0 = sc->num_slices;
1261 cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
1262 #ifdef IFNET_BUF_RING
1263 cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1265 status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
1268 device_printf(sc->dev,
1269 "failed to set number of slices\n");
1275 if (interrupts_setup) {
1276 /* Now exchange information about interrupts */
1277 for (slice = 0; slice < sc->num_slices; slice++) {
1278 rx_done = &sc->ss[slice].rx_done;
1279 memset(rx_done->entry, 0, sc->rx_ring_size);
1280 cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
1281 cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
1283 status |= mxge_send_cmd(sc,
1284 MXGEFW_CMD_SET_INTRQ_DMA,
1289 status |= mxge_send_cmd(sc,
1290 MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
1293 sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1295 status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1296 irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1299 status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
1301 sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1303 device_printf(sc->dev, "failed set interrupt parameters\n");
1308 *sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1311 /* run a DMA benchmark */
1312 (void) mxge_dma_test(sc, MXGEFW_DMA_TEST);
1314 for (slice = 0; slice < sc->num_slices; slice++) {
1315 ss = &sc->ss[slice];
1317 ss->irq_claim = irq_claim + (2 * slice);
1318 /* reset mcp/driver shared state back to 0 */
1319 ss->rx_done.idx = 0;
1320 ss->rx_done.cnt = 0;
1323 ss->tx.pkt_done = 0;
1324 ss->tx.queue_active = 0;
1325 ss->tx.activate = 0;
1326 ss->tx.deactivate = 0;
1331 ss->rx_small.cnt = 0;
1332 ss->lro_bad_csum = 0;
1334 ss->lro_flushed = 0;
1335 if (ss->fw_stats != NULL) {
1336 ss->fw_stats->valid = 0;
1337 ss->fw_stats->send_done_count = 0;
1340 sc->rdma_tags_available = 15;
1341 status = mxge_update_mac_address(sc);
1342 mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
1343 mxge_change_pause(sc, sc->pause);
1344 mxge_set_multicast_list(sc);
1349 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1352 unsigned int intr_coal_delay;
1356 intr_coal_delay = sc->intr_coal_delay;
1357 err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1361 if (intr_coal_delay == sc->intr_coal_delay)
1364 if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1367 lwkt_serialize_enter(sc->ifp->if_serializer);
1368 *sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1369 sc->intr_coal_delay = intr_coal_delay;
1371 lwkt_serialize_exit(sc->ifp->if_serializer);
1376 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1379 unsigned int enabled;
1383 enabled = sc->pause;
1384 err = sysctl_handle_int(oidp, &enabled, arg2, req);
1388 if (enabled == sc->pause)
1391 lwkt_serialize_enter(sc->ifp->if_serializer);
1392 err = mxge_change_pause(sc, enabled);
1393 lwkt_serialize_exit(sc->ifp->if_serializer);
1398 mxge_change_lro_locked(mxge_softc_t *sc, int lro_cnt)
1405 ifp->if_capenable &= ~IFCAP_LRO;
1407 ifp->if_capenable |= IFCAP_LRO;
1408 sc->lro_cnt = lro_cnt;
1409 if (ifp->if_flags & IFF_RUNNING) {
1411 err = mxge_open(sc);
1417 mxge_change_lro(SYSCTL_HANDLER_ARGS)
1420 unsigned int lro_cnt;
1424 lro_cnt = sc->lro_cnt;
1425 err = sysctl_handle_int(oidp, &lro_cnt, arg2, req);
1429 if (lro_cnt == sc->lro_cnt)
1435 lwkt_serialize_enter(sc->ifp->if_serializer);
1436 err = mxge_change_lro_locked(sc, lro_cnt);
1437 lwkt_serialize_exit(sc->ifp->if_serializer);
1442 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1448 arg2 = be32toh(*(int *)arg1);
1450 err = sysctl_handle_int(oidp, arg1, arg2, req);
1456 mxge_rem_sysctls(mxge_softc_t *sc)
1458 struct mxge_slice_state *ss;
1461 if (sc->slice_sysctl_tree == NULL)
1464 for (slice = 0; slice < sc->num_slices; slice++) {
1465 ss = &sc->ss[slice];
1466 if (ss == NULL || ss->sysctl_tree == NULL)
1468 sysctl_ctx_free(&ss->sysctl_ctx);
1469 ss->sysctl_tree = NULL;
1471 sysctl_ctx_free(&sc->slice_sysctl_ctx);
1472 sc->slice_sysctl_tree = NULL;
1473 sysctl_ctx_free(&sc->sysctl_ctx);
1474 sc->sysctl_tree = NULL;
1479 mxge_add_sysctls(mxge_softc_t *sc)
1481 struct sysctl_ctx_list *ctx;
1482 struct sysctl_oid_list *children;
1484 struct mxge_slice_state *ss;
1488 ctx = &sc->sysctl_ctx;
1489 sysctl_ctx_init(ctx);
1490 sc->sysctl_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_STATIC_CHILDREN(_hw),
1492 device_get_nameunit(sc->dev),
1494 if (sc->sysctl_tree == NULL) {
1495 device_printf(sc->dev, "can't add sysctl node\n");
1499 children = SYSCTL_CHILDREN(sc->sysctl_tree);
1500 fw = sc->ss[0].fw_stats;
1502 /* random information */
1503 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1505 CTLFLAG_RD, &sc->fw_version,
1506 0, "firmware version");
1507 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1509 CTLFLAG_RD, &sc->serial_number_string,
1510 0, "serial number");
1511 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1513 CTLFLAG_RD, &sc->product_code_string,
1515 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1517 CTLFLAG_RD, &sc->link_width,
1519 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1521 CTLFLAG_RD, &sc->tx_boundary,
1523 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1525 CTLFLAG_RD, &sc->wc,
1526 0, "write combining PIO?");
1527 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1529 CTLFLAG_RD, &sc->read_dma,
1530 0, "DMA Read speed in MB/s");
1531 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1533 CTLFLAG_RD, &sc->write_dma,
1534 0, "DMA Write speed in MB/s");
1535 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1536 "read_write_dma_MBs",
1537 CTLFLAG_RD, &sc->read_write_dma,
1538 0, "DMA concurrent Read/Write speed in MB/s");
1541 /* performance related tunables */
1542 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1544 CTLTYPE_INT|CTLFLAG_RW, sc,
1545 0, mxge_change_intr_coal,
1546 "I", "interrupt coalescing delay in usecs");
1548 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1549 "flow_control_enabled",
1550 CTLTYPE_INT|CTLFLAG_RW, sc,
1551 0, mxge_change_flow_control,
1552 "I", "interrupt coalescing delay in usecs");
1554 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1556 CTLFLAG_RW, &mxge_deassert_wait,
1557 0, "Wait for IRQ line to go low in ihandler");
1559 /* stats block from firmware is in network byte order.
1561 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1563 CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1564 0, mxge_handle_be32,
1566 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1567 "rdma_tags_available",
1568 CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1569 0, mxge_handle_be32,
1570 "I", "rdma_tags_available");
1571 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1572 "dropped_bad_crc32",
1573 CTLTYPE_INT|CTLFLAG_RD,
1574 &fw->dropped_bad_crc32,
1575 0, mxge_handle_be32,
1576 "I", "dropped_bad_crc32");
1577 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1579 CTLTYPE_INT|CTLFLAG_RD,
1580 &fw->dropped_bad_phy,
1581 0, mxge_handle_be32,
1582 "I", "dropped_bad_phy");
1583 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1584 "dropped_link_error_or_filtered",
1585 CTLTYPE_INT|CTLFLAG_RD,
1586 &fw->dropped_link_error_or_filtered,
1587 0, mxge_handle_be32,
1588 "I", "dropped_link_error_or_filtered");
1589 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1590 "dropped_link_overflow",
1591 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1592 0, mxge_handle_be32,
1593 "I", "dropped_link_overflow");
1594 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1595 "dropped_multicast_filtered",
1596 CTLTYPE_INT|CTLFLAG_RD,
1597 &fw->dropped_multicast_filtered,
1598 0, mxge_handle_be32,
1599 "I", "dropped_multicast_filtered");
1600 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1601 "dropped_no_big_buffer",
1602 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1603 0, mxge_handle_be32,
1604 "I", "dropped_no_big_buffer");
1605 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1606 "dropped_no_small_buffer",
1607 CTLTYPE_INT|CTLFLAG_RD,
1608 &fw->dropped_no_small_buffer,
1609 0, mxge_handle_be32,
1610 "I", "dropped_no_small_buffer");
1611 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1613 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1614 0, mxge_handle_be32,
1615 "I", "dropped_overrun");
1616 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1618 CTLTYPE_INT|CTLFLAG_RD,
1620 0, mxge_handle_be32,
1621 "I", "dropped_pause");
1622 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1624 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1625 0, mxge_handle_be32,
1626 "I", "dropped_runt");
1628 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1629 "dropped_unicast_filtered",
1630 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
1631 0, mxge_handle_be32,
1632 "I", "dropped_unicast_filtered");
1634 /* verbose printing? */
1635 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1637 CTLFLAG_RW, &mxge_verbose,
1638 0, "verbose printing");
1641 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1643 CTLTYPE_INT|CTLFLAG_RW, sc,
1645 "I", "number of lro merge queues");
1648 /* add counters exported for debugging from all slices */
1649 sysctl_ctx_init(&sc->slice_sysctl_ctx);
1650 sc->slice_sysctl_tree =
1651 SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1652 "slice", CTLFLAG_RD, 0, "");
1654 for (slice = 0; slice < sc->num_slices; slice++) {
1655 ss = &sc->ss[slice];
1656 sysctl_ctx_init(&ss->sysctl_ctx);
1657 ctx = &ss->sysctl_ctx;
1658 children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1659 ksprintf(slice_num, "%d", slice);
1661 SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1663 children = SYSCTL_CHILDREN(ss->sysctl_tree);
1664 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1666 CTLFLAG_RD, &ss->rx_small.cnt,
1668 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1670 CTLFLAG_RD, &ss->rx_big.cnt,
1672 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1673 "lro_flushed", CTLFLAG_RD, &ss->lro_flushed,
1674 0, "number of lro merge queues flushed");
1676 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1677 "lro_queued", CTLFLAG_RD, &ss->lro_queued,
1678 0, "number of frames appended to lro merge"
1681 #ifndef IFNET_BUF_RING
1682 /* only transmit from slice 0 for now */
1686 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1688 CTLFLAG_RD, &ss->tx.req,
1691 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1693 CTLFLAG_RD, &ss->tx.done,
1695 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1697 CTLFLAG_RD, &ss->tx.pkt_done,
1699 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1701 CTLFLAG_RD, &ss->tx.stall,
1703 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1705 CTLFLAG_RD, &ss->tx.wake,
1707 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1709 CTLFLAG_RD, &ss->tx.defrag,
1711 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1713 CTLFLAG_RD, &ss->tx.queue_active,
1714 0, "tx_queue_active");
1715 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1717 CTLFLAG_RD, &ss->tx.activate,
1719 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1721 CTLFLAG_RD, &ss->tx.deactivate,
1722 0, "tx_deactivate");
1726 /* copy an array of mcp_kreq_ether_send_t's to the mcp. Copy
1727 backwards one at a time and handle ring wraps */
1730 mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1731 mcp_kreq_ether_send_t *src, int cnt)
1733 int idx, starting_slot;
1734 starting_slot = tx->req;
1737 idx = (starting_slot + cnt) & tx->mask;
1738 mxge_pio_copy(&tx->lanai[idx],
1739 &src[cnt], sizeof(*src));
1745 * copy an array of mcp_kreq_ether_send_t's to the mcp. Copy
1746 * at most 32 bytes at a time, so as to avoid involving the software
1747 * pio handler in the nic. We re-write the first segment's flags
1748 * to mark them valid only after writing the entire chain
1752 mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
1757 volatile uint32_t *dst_ints;
1758 mcp_kreq_ether_send_t *srcp;
1759 volatile mcp_kreq_ether_send_t *dstp, *dst;
1762 idx = tx->req & tx->mask;
1764 last_flags = src->flags;
1767 dst = dstp = &tx->lanai[idx];
1770 if ((idx + cnt) < tx->mask) {
1771 for (i = 0; i < (cnt - 1); i += 2) {
1772 mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1773 wmb(); /* force write every 32 bytes */
1778 /* submit all but the first request, and ensure
1779 that it is submitted below */
1780 mxge_submit_req_backwards(tx, src, cnt);
1784 /* submit the first request */
1785 mxge_pio_copy(dstp, srcp, sizeof(*src));
1786 wmb(); /* barrier before setting valid flag */
1789 /* re-write the last 32-bits with the valid flags */
1790 src->flags = last_flags;
1791 src_ints = (uint32_t *)src;
1793 dst_ints = (volatile uint32_t *)dst;
1795 *dst_ints = *src_ints;
1803 mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1804 int busdma_seg_cnt, int ip_off)
1807 mcp_kreq_ether_send_t *req;
1808 bus_dma_segment_t *seg;
1811 uint32_t low, high_swapped;
1812 int len, seglen, cum_len, cum_len_next;
1813 int next_is_first, chop, cnt, rdma_count, small;
1814 uint16_t pseudo_hdr_offset, cksum_offset, mss;
1815 uint8_t flags, flags_next;
1818 mss = m->m_pkthdr.tso_segsz;
1820 /* negative cum_len signifies to the
1821 * send loop that we are still in the
1822 * header portion of the TSO packet.
1825 /* ensure we have the ethernet, IP and TCP
1826 header together in the first mbuf, copy
1827 it to a scratch buffer if not */
1828 if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
1829 m_copydata(m, 0, ip_off + sizeof (*ip),
1831 ip = (struct ip *)(ss->scratch + ip_off);
1833 ip = (struct ip *)(mtod(m, char *) + ip_off);
1835 if (__predict_false(m->m_len < ip_off + (ip->ip_hl << 2)
1837 m_copydata(m, 0, ip_off + (ip->ip_hl << 2)
1838 + sizeof (*tcp), ss->scratch);
1839 ip = (struct ip *)(mtod(m, char *) + ip_off);
1842 tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
1843 cum_len = -(ip_off + ((ip->ip_hl + tcp->th_off) << 2));
1845 /* TSO implies checksum offload on this hardware */
1846 cksum_offset = ip_off + (ip->ip_hl << 2);
1847 flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1850 /* for TSO, pseudo_hdr_offset holds mss.
1851 * The firmware figures out where to put
1852 * the checksum by parsing the header. */
1853 pseudo_hdr_offset = htobe16(mss);
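	/*
	 * Illustrative cross-reference (added): in the non-TSO checksum path
	 * of mxge_encap() below, pseudo_hdr_offset instead carries
	 * cksum_offset + m->m_pkthdr.csum_data, i.e. where the computed
	 * checksum must be stored.
	 */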
1860 /* "rdma_count" is the number of RDMAs belonging to the
1861 * current packet BEFORE the current send request. For
1862 * non-TSO packets, this is equal to "count".
1863 * For TSO packets, rdma_count needs to be reset
1864 * to 0 after a segment cut.
1866 * The rdma_count field of the send request is
1867 * the number of RDMAs of the packet starting at
1868 * that request. For TSO send requests with one or more cuts
1869 * in the middle, this is the number of RDMAs starting
1870 * after the last cut in the request. All previous
1871 * segments before the last cut implicitly have 1 RDMA.
1873 * Since the number of RDMAs is not known beforehand,
1874 * it must be filled-in retroactively - after each
1875 * segmentation cut or at the end of the entire packet.
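 *
 * Worked example (illustrative, added): if the first cut of a TSO packet
 * falls after its third send request, the requests before the cut each
 * implicitly account for one RDMA, and the count accumulated so far is
 * patched back into the request that started the run via
 * (req - rdma_count)->rdma_count when the cut (or end of packet) is seen.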
1878 while (busdma_seg_cnt) {
1879 /* Break the busdma segment up into pieces*/
1880 low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1881 high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1885 flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1887 cum_len_next = cum_len + seglen;
1888 (req-rdma_count)->rdma_count = rdma_count + 1;
1889 if (__predict_true(cum_len >= 0)) {
1891 chop = (cum_len_next > mss);
1892 cum_len_next = cum_len_next % mss;
1893 next_is_first = (cum_len_next == 0);
1894 flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1895 flags_next |= next_is_first *
1897 rdma_count |= -(chop | next_is_first);
1898 rdma_count += chop & !next_is_first;
1899 } else if (cum_len_next >= 0) {
1904 small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1905 flags_next = MXGEFW_FLAGS_TSO_PLD |
1906 MXGEFW_FLAGS_FIRST |
1907 (small * MXGEFW_FLAGS_SMALL);
1910 req->addr_high = high_swapped;
1911 req->addr_low = htobe32(low);
1912 req->pseudo_hdr_offset = pseudo_hdr_offset;
1914 req->rdma_count = 1;
1915 req->length = htobe16(seglen);
1916 req->cksum_offset = cksum_offset;
1917 req->flags = flags | ((cum_len & 1) *
1918 MXGEFW_FLAGS_ALIGN_ODD);
1921 cum_len = cum_len_next;
1926 if (__predict_false(cksum_offset > seglen))
1927 cksum_offset -= seglen;
1930 if (__predict_false(cnt > tx->max_desc))
1936 (req-rdma_count)->rdma_count = rdma_count;
1940 req->flags |= MXGEFW_FLAGS_TSO_LAST;
1941 } while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1943 tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1944 mxge_submit_req(tx, tx->req_list, cnt);
1945 #ifdef IFNET_BUF_RING
1946 if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1947 /* tell the NIC to start polling this slice */
1949 tx->queue_active = 1;
1957 bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1961 kprintf("tx->max_desc exceeded via TSO!\n");
1962 kprintf("mss = %d, %ld, %d!\n", mss,
1963 (long)seg - (long)tx->seg_list, tx->max_desc);
1970 #endif /* IFCAP_TSO4 */
1972 #ifdef MXGE_NEW_VLAN_API
1974 * We reproduce the software vlan tag insertion from
1975 * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
1976 * vlan tag insertion. We need to advertise this in order to have the
1977 * vlan interface respect our csum offload flags.
1979 static struct mbuf *
1980 mxge_vlan_tag_insert(struct mbuf *m)
1982 struct ether_vlan_header *evl;
1984 M_PREPEND(m, EVL_ENCAPLEN, MB_DONTWAIT);
1985 if (__predict_false(m == NULL))
1987 if (m->m_len < sizeof(*evl)) {
1988 m = m_pullup(m, sizeof(*evl));
1989 if (__predict_false(m == NULL))
1993 * Transform the Ethernet header into an Ethernet header
1994 * with 802.1Q encapsulation.
1996 evl = mtod(m, struct ether_vlan_header *);
1997 bcopy((char *)evl + EVL_ENCAPLEN,
1998 (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
1999 evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
2000 evl->evl_tag = htons(m->m_pkthdr.ether_vlantag);
2001 m->m_flags &= ~M_VLANTAG;
2004 #endif /* MXGE_NEW_VLAN_API */
2007 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
2010 mcp_kreq_ether_send_t *req;
2011 bus_dma_segment_t *seg;
2016 int cnt, cum_len, err, i, idx, odd_flag, ip_off;
2017 uint16_t pseudo_hdr_offset;
2018 uint8_t flags, cksum_offset;
2025 ip_off = sizeof (struct ether_header);
2026 #ifdef MXGE_NEW_VLAN_API
2027 if (m->m_flags & M_VLANTAG) {
2028 m = mxge_vlan_tag_insert(m);
2029 if (__predict_false(m == NULL))
2031 ip_off += EVL_ENCAPLEN;
2034 /* (try to) map the frame for DMA */
2035 idx = tx->req & tx->mask;
2036 err = bus_dmamap_load_mbuf_segment(tx->dmat, tx->info[idx].map,
2037 m, tx->seg_list, 1, &cnt,
2039 if (__predict_false(err == EFBIG)) {
2040 /* Too many segments in the chain. Try
2042 m_tmp = m_defrag(m, M_NOWAIT);
2043 if (m_tmp == NULL) {
2048 err = bus_dmamap_load_mbuf_segment(tx->dmat,
2050 m, tx->seg_list, 1, &cnt,
2053 if (__predict_false(err != 0)) {
2054 device_printf(sc->dev, "bus_dmamap_load_mbuf_segment returned %d"
2055 " packet len = %d\n", err, m->m_pkthdr.len);
2058 bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2059 BUS_DMASYNC_PREWRITE);
2060 tx->info[idx].m = m;
2063 /* TSO is different enough, we handle it in another routine */
2064 if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2065 mxge_encap_tso(ss, m, cnt, ip_off);
2072 pseudo_hdr_offset = 0;
2073 flags = MXGEFW_FLAGS_NO_TSO;
2075 /* checksum offloading? */
2076 if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
2077 /* ensure ip header is in first mbuf, copy
2078 it to a scratch buffer if not */
2079 if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
2080 m_copydata(m, 0, ip_off + sizeof (*ip),
2082 ip = (struct ip *)(ss->scratch + ip_off);
2084 ip = (struct ip *)(mtod(m, char *) + ip_off);
2086 cksum_offset = ip_off + (ip->ip_hl << 2);
2087 pseudo_hdr_offset = cksum_offset + m->m_pkthdr.csum_data;
2088 pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2089 req->cksum_offset = cksum_offset;
2090 flags |= MXGEFW_FLAGS_CKSUM;
2091 odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2095 if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2096 flags |= MXGEFW_FLAGS_SMALL;
2098 /* convert segments into a request list */
2101 req->flags = MXGEFW_FLAGS_FIRST;
2102 for (i = 0; i < cnt; i++) {
2104 htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2106 htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2107 req->length = htobe16(seg->ds_len);
2108 req->cksum_offset = cksum_offset;
2109 if (cksum_offset > seg->ds_len)
2110 cksum_offset -= seg->ds_len;
2113 req->pseudo_hdr_offset = pseudo_hdr_offset;
2114 req->pad = 0; /* complete solid 16-byte block */
2115 req->rdma_count = 1;
2116 req->flags |= flags | ((cum_len & 1) * odd_flag);
2117 cum_len += seg->ds_len;
2123 /* pad runts to 60 bytes */
2127 htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2129 htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2130 req->length = htobe16(60 - cum_len);
2131 req->cksum_offset = 0;
2132 req->pseudo_hdr_offset = pseudo_hdr_offset;
2133 req->pad = 0; /* complete solid 16-byte block */
2134 req->rdma_count = 1;
2135 req->flags |= flags | ((cum_len & 1) * odd_flag);
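	/*
	 * Illustrative note (added): the extra descriptor points at the
	 * shared zeropad_dma page, so the NIC pads short frames up to the
	 * 60-byte Ethernet minimum by DMA-reading zeros instead of the
	 * driver copying padding into the mbuf.
	 */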
2139 tx->req_list[0].rdma_count = cnt;
2141 /* print what the firmware will see */
2142 for (i = 0; i < cnt; i++) {
2143 kprintf("%d: addr: 0x%x 0x%x len:%d pso%d,"
2144 "cso:%d, flags:0x%x, rdma:%d\n",
2145 i, (int)ntohl(tx->req_list[i].addr_high),
2146 (int)ntohl(tx->req_list[i].addr_low),
2147 (int)ntohs(tx->req_list[i].length),
2148 (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2149 tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2150 tx->req_list[i].rdma_count);
2152 kprintf("--------------\n");
2154 tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2155 mxge_submit_req(tx, tx->req_list, cnt);
2156 #ifdef IFNET_BUF_RING
2157 if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2158 /* tell the NIC to start polling this slice */
2160 tx->queue_active = 1;
2173 #ifdef IFNET_BUF_RING
2175 mxge_qflush(struct ifnet *ifp)
2177 mxge_softc_t *sc = ifp->if_softc;
2182 for (slice = 0; slice < sc->num_slices; slice++) {
2183 tx = &sc->ss[slice].tx;
2184 lwkt_serialize_enter(sc->ifp->if_serializer);
2185 while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2187 lwkt_serialize_exit(sc->ifp->if_serializer);
2193 mxge_start_locked(struct mxge_slice_state *ss)
2204 while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2205 m = drbr_dequeue(ifp, tx->br);
2209 /* let BPF see it */
2212 /* give it to the nic */
2215 /* ran out of transmit slots */
2216 if (((ss->if_flags & IFF_OACTIVE) == 0)
2217 && (!drbr_empty(ifp, tx->br))) {
2218 ss->if_flags |= IFF_OACTIVE;
2224 mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2235 if ((ss->if_flags & (IFF_RUNNING|IFF_OACTIVE)) !=
2237 err = drbr_enqueue(ifp, tx->br, m);
2241 if (drbr_empty(ifp, tx->br) &&
2242 ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2243 /* let BPF see it */
2245 /* give it to the nic */
2247 } else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2250 if (!drbr_empty(ifp, tx->br))
2251 mxge_start_locked(ss);
2256 mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2258 mxge_softc_t *sc = ifp->if_softc;
2259 struct mxge_slice_state *ss;
2265 slice = m->m_pkthdr.flowid;
2267 slice &= (sc->num_slices - 1); /* num_slices always power of 2 */
2269 ss = &sc->ss[slice];
2272 if (lwkt_serialize_try(ifp->if_serializer)) {
2273 err = mxge_transmit_locked(ss, m);
2274 lwkt_serialize_exit(ifp->if_serializer);
2276 err = drbr_enqueue(ifp, tx->br, m);
2285 mxge_start_locked(struct mxge_slice_state *ss)
2295 while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2296 m = ifq_dequeue(&ifp->if_snd, NULL);
2300 /* let BPF see it */
2303 /* give it to the nic */
2306 /* ran out of transmit slots */
2307 if ((sc->ifp->if_flags & IFF_OACTIVE) == 0) {
2308 sc->ifp->if_flags |= IFF_OACTIVE;
2314 mxge_start(struct ifnet *ifp)
2316 mxge_softc_t *sc = ifp->if_softc;
2317 struct mxge_slice_state *ss;
2319 ASSERT_SERIALIZED(sc->ifp->if_serializer);
2320 /* only use the first slice for now */
2322 mxge_start_locked(ss);
2326 * copy an array of mcp_kreq_ether_recv_t's to the mcp. Copy
2327 * at most 32 bytes at a time, so as to avoid involving the software
2328 * pio handler in the nic. We re-write the first segment's low
2329 * DMA address to mark it valid only after we write the entire chunk
2333 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2334 mcp_kreq_ether_recv_t *src)
2338 low = src->addr_low;
2339 src->addr_low = 0xffffffff;
2340 mxge_pio_copy(dst, src, 4 * sizeof (*src));
2342 mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2344 src->addr_low = low;
2345 dst->addr_low = low;
2350 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2352 bus_dma_segment_t seg;
2354 mxge_rx_ring_t *rx = &ss->rx_small;
2357 m = m_gethdr(MB_DONTWAIT, MT_DATA);
2364 err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2365 &seg, 1, &cnt, BUS_DMA_NOWAIT);
2370 rx->info[idx].m = m;
2371 rx->shadow[idx].addr_low =
2372 htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2373 rx->shadow[idx].addr_high =
2374 htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2378 mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2384 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2386 bus_dma_segment_t seg[3];
2388 mxge_rx_ring_t *rx = &ss->rx_big;
2391 if (rx->cl_size == MCLBYTES)
2392 m = m_getcl(MB_DONTWAIT, MT_DATA, M_PKTHDR);
2395 m = m_getjcl(MB_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2398 * XXX: allocate normal sized buffers for big buffers.
2399 * We should be fine as long as we don't get any jumbo frames
2401 m = m_getcl(MB_DONTWAIT, MT_DATA, M_PKTHDR);
2409 m->m_len = rx->mlen;
2410 err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2411 seg, 1, &cnt, BUS_DMA_NOWAIT);
2416 rx->info[idx].m = m;
2417 rx->shadow[idx].addr_low =
2418 htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2419 rx->shadow[idx].addr_high =
2420 htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2422 #if MXGE_VIRT_JUMBOS
2423 for (i = 1; i < cnt; i++) {
2424 rx->shadow[idx + i].addr_low =
2425 htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2426 rx->shadow[idx + i].addr_high =
2427 htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2432 for (i = 0; i < rx->nbufs; i++) {
2433 if ((idx & 7) == 7) {
2434 mxge_submit_8rx(&rx->lanai[idx - 7],
2435 &rx->shadow[idx - 7]);
2443 * Myri10GE hardware checksums are not valid if the sender
2444 * padded the frame with non-zero padding. This is because
2445 * the firmware just does a simple 16-bit 1s complement
2446 * checksum across the entire frame, excluding the first 14
2447 bytes. It is best to simply check the checksum and
2448 * tell the stack about it only if the checksum is good
2451 static inline uint16_t
2452 mxge_rx_csum(struct mbuf *m, int csum)
2454 struct ether_header *eh;
2458 eh = mtod(m, struct ether_header *);
2460 /* only deal with IPv4 TCP & UDP for now */
2461 if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2463 ip = (struct ip *)(eh + 1);
2464 if (__predict_false(ip->ip_p != IPPROTO_TCP &&
2465 ip->ip_p != IPPROTO_UDP))
2468 c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2469 htonl(ntohs(csum) + ntohs(ip->ip_len) +
2470 - (ip->ip_hl << 2) + ip->ip_p));
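	/*
	 * Illustrative note (added): in_pseudo() folds the IPv4 pseudo-header
	 * fields into the firmware's partial ones-complement sum; the callers
	 * in mxge_rx_done_small()/mxge_rx_done_big() treat a zero return from
	 * this routine as a verified TCP/UDP checksum.
	 */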
2479 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2481 struct ether_vlan_header *evl;
2482 struct ether_header *eh;
2485 evl = mtod(m, struct ether_vlan_header *);
2486 eh = mtod(m, struct ether_header *);
2489 * fix checksum by subtracting EVL_ENCAPLEN bytes
2490 * after what the firmware thought was the end of the ethernet
2494 /* put checksum into host byte order */
2495 *csum = ntohs(*csum);
2496 partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2497 (*csum) += ~partial;
2498 (*csum) += ((*csum) < ~partial);
2499 (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2500 (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
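	/*
	 * Illustrative note (added): adding the complement of the 4-byte VLAN
	 * header word is ones-complement subtraction of those bytes from the
	 * running checksum; the conditional +1 is the end-around carry and the
	 * two fold steps collapse any carries back into the low 16 bits.
	 */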
2502 /* restore checksum to network byte order;
2503 later consumers expect this */
2504 *csum = htons(*csum);
2507 #ifdef MXGE_NEW_VLAN_API
2508 m->m_pkthdr.ether_vlantag = ntohs(evl->evl_tag);
2512 mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2516 VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2517 m_tag_prepend(m, mtag);
2521 m->m_flags |= M_VLANTAG;
2524 * Remove the 802.1q header by copying the Ethernet
2525 * addresses over it and adjusting the beginning of
2526 * the data in the mbuf. The encapsulated Ethernet
2527 * type field is already in place.
2529 bcopy((char *)evl, (char *)evl + EVL_ENCAPLEN,
2530 ETHER_HDR_LEN - ETHER_TYPE_LEN);
2531 m_adj(m, EVL_ENCAPLEN);
2536 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2541 struct ether_header *eh;
2543 bus_dmamap_t old_map;
2545 uint16_t tcpudp_csum;
2550 idx = rx->cnt & rx->mask;
2551 rx->cnt += rx->nbufs;
2552 /* save a pointer to the received mbuf */
2553 m = rx->info[idx].m;
2554 /* try to replace the received mbuf */
2555 if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2556 /* drop the frame -- the old mbuf is re-cycled */
2561 /* unmap the received buffer */
2562 old_map = rx->info[idx].map;
2563 bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2564 bus_dmamap_unload(rx->dmat, old_map);
2566 /* swap the bus_dmamap_t's */
2567 rx->info[idx].map = rx->extra_map;
2568 rx->extra_map = old_map;
2570 /* mcp implicitly skips 1st 2 bytes so that packet is properly
2572 m->m_data += MXGEFW_PAD;
2574 m->m_pkthdr.rcvif = ifp;
2575 m->m_len = m->m_pkthdr.len = len;
2577 eh = mtod(m, struct ether_header *);
2578 if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2579 mxge_vlan_tag_remove(m, &csum);
2581 /* if the checksum is valid, mark it in the mbuf header */
2582 if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2583 if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2585 /* otherwise, it was a UDP frame, or a TCP frame which
2586 we could not do LRO on. Tell the stack that the checksum is good */
2588 m->m_pkthdr.csum_data = 0xffff;
2589 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2592 /* flowid only valid if RSS hashing is enabled */
2593 if (sc->num_slices > 1) {
2594 m->m_pkthdr.flowid = (ss - sc->ss);
2595 m->m_flags |= M_FLOWID;
2598 /* pass the frame up the stack */
2599 (*ifp->if_input)(ifp, m);
2603 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2607 struct ether_header *eh;
2610 bus_dmamap_t old_map;
2612 uint16_t tcpudp_csum;
2617 idx = rx->cnt & rx->mask;
2619 /* save a pointer to the received mbuf */
2620 m = rx->info[idx].m;
2621 /* try to replace the received mbuf */
2622 if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2623 /* drop the frame -- the old mbuf is re-cycled */
2628 /* unmap the received buffer */
2629 old_map = rx->info[idx].map;
2630 bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2631 bus_dmamap_unload(rx->dmat, old_map);
2633 /* swap the bus_dmamap_t's */
2634 rx->info[idx].map = rx->extra_map;
2635 rx->extra_map = old_map;
2637 /* mcp implicitly skips 1st 2 bytes so that packet is properly aligned */
2639 m->m_data += MXGEFW_PAD;
2641 m->m_pkthdr.rcvif = ifp;
2642 m->m_len = m->m_pkthdr.len = len;
2644 eh = mtod(m, struct ether_header *);
2645 if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2646 mxge_vlan_tag_remove(m, &csum);
2648 /* if the checksum is valid, mark it in the mbuf header */
2649 if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2650 if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2652 /* otherwise, it was a UDP frame, or a TCP frame which
2653 we could not do LRO on. Tell the stack that the checksum is good */
2655 m->m_pkthdr.csum_data = 0xffff;
2656 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2659 /* flowid only valid if RSS hashing is enabled */
2660 if (sc->num_slices > 1) {
2661 m->m_pkthdr.flowid = (ss - sc->ss);
2662 m->m_flags |= M_FLOWID;
2665 /* pass the frame up the stack */
2666 (*ifp->if_input)(ifp, m);
2670 mxge_clean_rx_done(struct mxge_slice_state *ss)
2672 mxge_rx_done_t *rx_done = &ss->rx_done;
2678 while (rx_done->entry[rx_done->idx].length != 0) {
2679 length = ntohs(rx_done->entry[rx_done->idx].length);
2680 rx_done->entry[rx_done->idx].length = 0;
2681 checksum = rx_done->entry[rx_done->idx].checksum;
2682 if (length <= (MHLEN - MXGEFW_PAD))
2683 mxge_rx_done_small(ss, length, checksum);
2685 mxge_rx_done_big(ss, length, checksum);
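/*
 * The completion length tells us which ring the firmware filled:
 * anything at or below the small-buffer size the driver advertised in
 * mxge_open() (MHLEN - MXGEFW_PAD, via MXGEFW_CMD_SET_SMALL_BUFFER_SIZE)
 * came from the small ring, larger frames from the big (cluster) ring.
 */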
2687 rx_done->idx = rx_done->cnt & rx_done->mask;
2689 /* limit potential for livelock */
2690 if (__predict_false(++limit > rx_done->mask / 2))
2694 while (!SLIST_EMPTY(&ss->lro_active)) {
2695 struct lro_entry *lro = SLIST_FIRST(&ss->lro_active);
2696 SLIST_REMOVE_HEAD(&ss->lro_active, next);
2697 mxge_lro_flush(ss, lro);
2704 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2715 ASSERT_SERIALIZED(ifp->if_serializer);
2716 while (tx->pkt_done != mcp_idx) {
2717 idx = tx->done & tx->mask;
2719 m = tx->info[idx].m;
2720 /* mbuf and DMA map only attached to the first segment per-mbuf */
2723 ss->obytes += m->m_pkthdr.len;
2724 if (m->m_flags & M_MCAST)
2727 tx->info[idx].m = NULL;
2728 map = tx->info[idx].map;
2729 bus_dmamap_unload(tx->dmat, map);
2732 if (tx->info[idx].flag) {
2733 tx->info[idx].flag = 0;
2738 /* If we have space, clear IFF_OACTIVE to tell the stack that
2739 it's OK to send packets */
2740 #ifdef IFNET_BUF_RING
2741 flags = &ss->if_flags;
2743 flags = &ifp->if_flags;
2745 if ((*flags) & IFF_OACTIVE &&
2746 tx->req - tx->done < (tx->mask + 1)/4) {
2747 *(flags) &= ~IFF_OACTIVE;
2749 mxge_start_locked(ss);
2751 #ifdef IFNET_BUF_RING
2752 if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2753 /* let the NIC stop polling this queue, since there
2754 * are no more transmits pending */
2755 if (tx->req == tx->done) {
2757 tx->queue_active = 0;
2766 static struct mxge_media_type mxge_xfp_media_types[] =
2768 {IFM_10G_CX4, 0x7f, "10GBASE-CX4 (module)"},
2769 {IFM_10G_SR, (1 << 7), "10GBASE-SR"},
2770 {IFM_10G_LR, (1 << 6), "10GBASE-LR"},
2771 {0, (1 << 5), "10GBASE-ER"},
2772 {IFM_10G_LRM, (1 << 4), "10GBASE-LRM"},
2773 {0, (1 << 3), "10GBASE-SW"},
2774 {0, (1 << 2), "10GBASE-LW"},
2775 {0, (1 << 1), "10GBASE-EW"},
2776 {0, (1 << 0), "Reserved"}
2778 static struct mxge_media_type mxge_sfp_media_types[] =
2780 {0, (1 << 7), "Reserved"},
2781 {IFM_10G_LRM, (1 << 6), "10GBASE-LRM"},
2782 {IFM_10G_LR, (1 << 5), "10GBASE-LR"},
2783 {IFM_10G_SR, (1 << 4), "10GBASE-SR"}
2787 mxge_set_media(mxge_softc_t *sc, int type)
2789 sc->media_flags |= type;
2790 ifmedia_add(&sc->media, sc->media_flags, 0, NULL);
2791 ifmedia_set(&sc->media, sc->media_flags);
2796 * Determine the media type for a NIC. Some XFPs will identify
2797 * themselves only when their link is up, so this is initiated via a
2798 * link up interrupt. However, this can potentially take up to
2799 * several milliseconds, so it is run via the watchdog routine, rather
2800 * than in the interrupt handler itself. This need only be done
2801 * once, not each time the link is up.
2804 mxge_media_probe(mxge_softc_t *sc)
2809 struct mxge_media_type *mxge_media_types = NULL;
2810 int i, err, ms, mxge_media_type_entries;
2813 sc->need_media_probe = 0;
2815 /* if we've already set a media type, we're done */
2816 if (sc->media_flags != (IFM_ETHER | IFM_AUTO))
2820 * parse the product code to determine the interface type
2821 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2822 * after the 3rd dash in the driver's cached copy of the
2823 * EEPROM's product code string.
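 * For example, a product code shaped like "10G-PCIE-8B-S" (an
 * illustrative string, not an exhaustive list) would be identified as
 * SFP+ below, while a trailing -C, -R or -Q selects the CX4, XFP or
 * Quad Ribbon Fiber handling respectively.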
2825 ptr = sc->product_code_string;
2827 device_printf(sc->dev, "Missing product code\n");
2830 for (i = 0; i < 3; i++, ptr++) {
2831 ptr = index(ptr, '-');
2833 device_printf(sc->dev,
2834 "only %d dashes in PC?!?\n", i);
2840 mxge_set_media(sc, IFM_10G_CX4);
2843 else if (*ptr == 'Q') {
2844 /* -Q is Quad Ribbon Fiber */
2845 device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2846 /* FreeBSD has no media type for Quad ribbon fiber */
2852 mxge_media_types = mxge_xfp_media_types;
2853 mxge_media_type_entries =
2854 sizeof (mxge_xfp_media_types) /
2855 sizeof (mxge_xfp_media_types[0]);
2856 byte = MXGE_XFP_COMPLIANCE_BYTE;
2860 if (*ptr == 'S' || *(ptr +1) == 'S') {
2861 /* -S or -2S is SFP+ */
2862 mxge_media_types = mxge_sfp_media_types;
2863 mxge_media_type_entries =
2864 sizeof (mxge_sfp_media_types) /
2865 sizeof (mxge_sfp_media_types[0]);
2870 if (mxge_media_types == NULL) {
2871 device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2876 * At this point we know the NIC has an XFP cage, so now we
2877 * try to determine what is in the cage by using the
2878 * firmware's XFP I2C commands to read the XFP 10GbE compliance
2879 * register. We read just one byte, which may take over a hundred milliseconds. */
2883 cmd.data0 = 0; /* just fetch 1 byte, not all 256 */
2885 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2886 if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
2887 device_printf(sc->dev, "failed to read XFP\n");
2889 if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
2890 device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
2892 if (err != MXGEFW_CMD_OK) {
2896 /* now we wait for the data to be cached */
2898 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2899 for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
2902 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2904 if (err != MXGEFW_CMD_OK) {
2905 device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
2906 cage_type, err, ms);
2910 if (cmd.data0 == mxge_media_types[0].bitmask) {
2912 device_printf(sc->dev, "%s:%s\n", cage_type,
2913 mxge_media_types[0].name);
2914 mxge_set_media(sc, IFM_10G_CX4);
2917 for (i = 1; i < mxge_media_type_entries; i++) {
2918 if (cmd.data0 & mxge_media_types[i].bitmask) {
2920 device_printf(sc->dev, "%s:%s\n",
2922 mxge_media_types[i].name);
2924 mxge_set_media(sc, mxge_media_types[i].flag);
2928 device_printf(sc->dev, "%s media 0x%x unknown\n", cage_type,
2935 mxge_intr(void *arg)
2937 struct mxge_slice_state *ss = arg;
2938 mxge_softc_t *sc = ss->sc;
2939 mcp_irq_data_t *stats = ss->fw_stats;
2940 mxge_tx_ring_t *tx = &ss->tx;
2941 mxge_rx_done_t *rx_done = &ss->rx_done;
2942 uint32_t send_done_count;
2946 #ifndef IFNET_BUF_RING
2947 /* an interrupt on a non-zero slice is implicitly valid
2948 since MSI-X irqs are not shared */
2950 mxge_clean_rx_done(ss);
2951 *ss->irq_claim = be32toh(3);
2956 /* make sure the DMA has finished */
2957 if (!stats->valid) {
2960 valid = stats->valid;
2962 if (sc->legacy_irq) {
2963 /* lower legacy IRQ */
2964 *sc->irq_deassert = 0;
2965 if (!mxge_deassert_wait)
2966 /* don't wait for confirmation that the irq is low */
2972 /* loop while waiting for legacy irq deassertion */
2974 /* check for transmit completes and receives */
2975 send_done_count = be32toh(stats->send_done_count);
2976 while ((send_done_count != tx->pkt_done) ||
2977 (rx_done->entry[rx_done->idx].length != 0)) {
2978 if (send_done_count != tx->pkt_done)
2979 mxge_tx_done(ss, (int)send_done_count);
2980 mxge_clean_rx_done(ss);
2981 send_done_count = be32toh(stats->send_done_count);
2983 if (sc->legacy_irq && mxge_deassert_wait)
2985 } while (*((volatile uint8_t *) &stats->valid));
2987 /* fw link & error stats meaningful only on the first slice */
2988 if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
2989 if (sc->link_state != stats->link_up) {
2990 sc->link_state = stats->link_up;
2991 if (sc->link_state) {
2992 sc->ifp->if_link_state = LINK_STATE_UP;
2993 if_link_state_change(sc->ifp);
2995 device_printf(sc->dev, "link up\n");
2997 sc->ifp->if_link_state = LINK_STATE_DOWN;
2998 if_link_state_change(sc->ifp);
3000 device_printf(sc->dev, "link down\n");
3002 sc->need_media_probe = 1;
3004 if (sc->rdma_tags_available !=
3005 be32toh(stats->rdma_tags_available)) {
3006 sc->rdma_tags_available =
3007 be32toh(stats->rdma_tags_available);
3008 device_printf(sc->dev, "RDMA timed out! %d tags "
3009 "left\n", sc->rdma_tags_available);
3012 if (stats->link_down) {
3013 sc->down_cnt += stats->link_down;
3015 sc->ifp->if_link_state = LINK_STATE_DOWN;
3016 if_link_state_change(sc->ifp);
3020 /* check to see if we have rx token to pass back */
3022 *ss->irq_claim = be32toh(3);
3023 *(ss->irq_claim + 1) = be32toh(3);
3027 mxge_init(void *arg)
3034 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
3036 struct lro_entry *lro_entry;
3039 while (!SLIST_EMPTY(&ss->lro_free)) {
3040 lro_entry = SLIST_FIRST(&ss->lro_free);
3041 SLIST_REMOVE_HEAD(&ss->lro_free, next);
3042 kfree(lro_entry, M_DEVBUF);
3045 for (i = 0; i <= ss->rx_big.mask; i++) {
3046 if (ss->rx_big.info[i].m == NULL)
3048 bus_dmamap_unload(ss->rx_big.dmat,
3049 ss->rx_big.info[i].map);
3050 m_freem(ss->rx_big.info[i].m);
3051 ss->rx_big.info[i].m = NULL;
3054 for (i = 0; i <= ss->rx_small.mask; i++) {
3055 if (ss->rx_small.info[i].m == NULL)
3057 bus_dmamap_unload(ss->rx_small.dmat,
3058 ss->rx_small.info[i].map);
3059 m_freem(ss->rx_small.info[i].m);
3060 ss->rx_small.info[i].m = NULL;
3063 /* transmit ring used only on the first slice */
3064 if (ss->tx.info == NULL)
3067 for (i = 0; i <= ss->tx.mask; i++) {
3068 ss->tx.info[i].flag = 0;
3069 if (ss->tx.info[i].m == NULL)
3071 bus_dmamap_unload(ss->tx.dmat,
3072 ss->tx.info[i].map);
3073 m_freem(ss->tx.info[i].m);
3074 ss->tx.info[i].m = NULL;
3079 mxge_free_mbufs(mxge_softc_t *sc)
3083 for (slice = 0; slice < sc->num_slices; slice++)
3084 mxge_free_slice_mbufs(&sc->ss[slice]);
3088 mxge_free_slice_rings(struct mxge_slice_state *ss)
3093 if (ss->rx_done.entry != NULL)
3094 mxge_dma_free(&ss->rx_done.dma);
3095 ss->rx_done.entry = NULL;
3097 if (ss->tx.req_bytes != NULL)
3098 kfree(ss->tx.req_bytes, M_DEVBUF);
3099 ss->tx.req_bytes = NULL;
3101 if (ss->tx.seg_list != NULL)
3102 kfree(ss->tx.seg_list, M_DEVBUF);
3103 ss->tx.seg_list = NULL;
3105 if (ss->rx_small.shadow != NULL)
3106 kfree(ss->rx_small.shadow, M_DEVBUF);
3107 ss->rx_small.shadow = NULL;
3109 if (ss->rx_big.shadow != NULL)
3110 kfree(ss->rx_big.shadow, M_DEVBUF);
3111 ss->rx_big.shadow = NULL;
3113 if (ss->tx.info != NULL) {
3114 if (ss->tx.dmat != NULL) {
3115 for (i = 0; i <= ss->tx.mask; i++) {
3116 bus_dmamap_destroy(ss->tx.dmat,
3117 ss->tx.info[i].map);
3119 bus_dma_tag_destroy(ss->tx.dmat);
3121 kfree(ss->tx.info, M_DEVBUF);
3125 if (ss->rx_small.info != NULL) {
3126 if (ss->rx_small.dmat != NULL) {
3127 for (i = 0; i <= ss->rx_small.mask; i++) {
3128 bus_dmamap_destroy(ss->rx_small.dmat,
3129 ss->rx_small.info[i].map);
3131 bus_dmamap_destroy(ss->rx_small.dmat,
3132 ss->rx_small.extra_map);
3133 bus_dma_tag_destroy(ss->rx_small.dmat);
3135 kfree(ss->rx_small.info, M_DEVBUF);
3137 ss->rx_small.info = NULL;
3139 if (ss->rx_big.info != NULL) {
3140 if (ss->rx_big.dmat != NULL) {
3141 for (i = 0; i <= ss->rx_big.mask; i++) {
3142 bus_dmamap_destroy(ss->rx_big.dmat,
3143 ss->rx_big.info[i].map);
3145 bus_dmamap_destroy(ss->rx_big.dmat,
3146 ss->rx_big.extra_map);
3147 bus_dma_tag_destroy(ss->rx_big.dmat);
3149 kfree(ss->rx_big.info, M_DEVBUF);
3151 ss->rx_big.info = NULL;
3155 mxge_free_rings(mxge_softc_t *sc)
3159 for (slice = 0; slice < sc->num_slices; slice++)
3160 mxge_free_slice_rings(&sc->ss[slice]);
3164 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3165 int tx_ring_entries)
3167 mxge_softc_t *sc = ss->sc;
3173 /* allocate per-slice receive resources */
3175 ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
3176 ss->rx_done.mask = (2 * rx_ring_entries) - 1;
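/*
 * The completion (rx_done) ring is twice the size of an rx ring,
 * presumably so it can absorb completions from both the small and the
 * big receive rings; all of these masks rely on the ring sizes being
 * powers of two.
 */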
3178 /* allocate the rx shadow rings */
3179 bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3180 ss->rx_small.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3181 if (ss->rx_small.shadow == NULL)
3184 bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3185 ss->rx_big.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3186 if (ss->rx_big.shadow == NULL)
3189 /* allocate the rx host info rings */
3190 bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3191 ss->rx_small.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3192 if (ss->rx_small.info == NULL)
3195 bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3196 ss->rx_big.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3197 if (ss->rx_big.info == NULL)
3200 /* allocate the rx busdma resources */
3201 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3203 4096, /* boundary */
3204 BUS_SPACE_MAXADDR, /* low */
3205 BUS_SPACE_MAXADDR, /* high */
3206 NULL, NULL, /* filter */
3207 MHLEN, /* maxsize */
3209 MHLEN, /* maxsegsize */
3210 BUS_DMA_ALLOCNOW, /* flags */
3211 &ss->rx_small.dmat); /* tag */
3213 device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3218 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3220 #if MXGE_VIRT_JUMBOS
3221 4096, /* boundary */
3225 BUS_SPACE_MAXADDR, /* low */
3226 BUS_SPACE_MAXADDR, /* high */
3227 NULL, NULL, /* filter */
3228 3*4096, /* maxsize */
3229 #if MXGE_VIRT_JUMBOS
3231 4096, /* maxsegsize*/
3234 MJUM9BYTES, /* maxsegsize*/
3236 BUS_DMA_ALLOCNOW, /* flags */
3237 &ss->rx_big.dmat); /* tag */
3239 device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3243 for (i = 0; i <= ss->rx_small.mask; i++) {
3244 err = bus_dmamap_create(ss->rx_small.dmat, 0,
3245 &ss->rx_small.info[i].map);
3247 device_printf(sc->dev, "Err %d rx_small dmamap\n",
3252 err = bus_dmamap_create(ss->rx_small.dmat, 0,
3253 &ss->rx_small.extra_map);
3255 device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3260 for (i = 0; i <= ss->rx_big.mask; i++) {
3261 err = bus_dmamap_create(ss->rx_big.dmat, 0,
3262 &ss->rx_big.info[i].map);
3264 device_printf(sc->dev, "Err %d rx_big dmamap\n",
3269 err = bus_dmamap_create(ss->rx_big.dmat, 0,
3270 &ss->rx_big.extra_map);
3272 device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3277 /* now allocate TX resources */
3279 #ifndef IFNET_BUF_RING
3280 /* only use a single TX ring for now */
3281 if (ss != ss->sc->ss)
3285 ss->tx.mask = tx_ring_entries - 1;
3286 ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3289 /* allocate the tx request copy block */
3291 sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3292 ss->tx.req_bytes = kmalloc(bytes, M_DEVBUF, M_WAITOK);
3293 if (ss->tx.req_bytes == NULL)
3295 /* ensure req_list entries are aligned to 8 bytes */
3296 ss->tx.req_list = (mcp_kreq_ether_send_t *)
3297 ((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
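/*
 * The "+ 7, & ~7" idiom rounds the pointer up to the next 8-byte
 * boundary (e.g. an allocation at 0x...1001 yields a req_list at
 * 0x...1008); the extra entries allocated above presumably leave
 * slack for this round-up.
 */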
3299 /* allocate the tx busdma segment list */
3300 bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3301 ss->tx.seg_list = (bus_dma_segment_t *)
3302 kmalloc(bytes, M_DEVBUF, M_WAITOK);
3303 if (ss->tx.seg_list == NULL)
3306 /* allocate the tx host info ring */
3307 bytes = tx_ring_entries * sizeof (*ss->tx.info);
3308 ss->tx.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3309 if (ss->tx.info == NULL)
3312 /* allocate the tx busdma resources */
3313 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3315 sc->tx_boundary, /* boundary */
3316 BUS_SPACE_MAXADDR, /* low */
3317 BUS_SPACE_MAXADDR, /* high */
3318 NULL, NULL, /* filter */
3319 65536 + 256, /* maxsize */
3320 ss->tx.max_desc - 2, /* num segs */
3321 sc->tx_boundary, /* maxsegsz */
3322 BUS_DMA_ALLOCNOW, /* flags */
3323 &ss->tx.dmat); /* tag */
3326 device_printf(sc->dev, "Err %d allocating tx dmat\n",
3331 /* now use these tags to set up dmamaps for each slot in the ring */
3333 for (i = 0; i <= ss->tx.mask; i++) {
3334 err = bus_dmamap_create(ss->tx.dmat, 0,
3335 &ss->tx.info[i].map);
3337 device_printf(sc->dev, "Err %d tx dmamap\n",
3347 mxge_alloc_rings(mxge_softc_t *sc)
3351 int tx_ring_entries, rx_ring_entries;
3354 /* get ring sizes */
3355 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3356 tx_ring_size = cmd.data0;
3358 device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3362 tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3363 rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3364 ifq_set_maxlen(&sc->ifp->if_snd, tx_ring_entries - 1);
3365 ifq_set_ready(&sc->ifp->if_snd);
3367 for (slice = 0; slice < sc->num_slices; slice++) {
3368 err = mxge_alloc_slice_rings(&sc->ss[slice],
3377 mxge_free_rings(sc);
3384 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3386 int bufsize = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN + MXGEFW_PAD;
3388 if (bufsize < MCLBYTES) {
3389 /* easy, everything fits in a single buffer */
3390 *big_buf_size = MCLBYTES;
3391 *cl_size = MCLBYTES;
3396 if (bufsize < MJUMPAGESIZE) {
3397 /* still easy, everything still fits in a single buffer */
3398 *big_buf_size = MJUMPAGESIZE;
3399 *cl_size = MJUMPAGESIZE;
3403 #if MXGE_VIRT_JUMBOS
3404 /* now we need to use virtually contiguous buffers */
3405 *cl_size = MJUM9BYTES;
3406 *big_buf_size = 4096;
3407 *nbufs = mtu / 4096 + 1;
3408 /* needs to be a power of two, so round up */
3412 *cl_size = MJUM9BYTES;
3413 *big_buf_size = MJUM9BYTES;
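/*
 * A worked example of the selection above: with the default 1500-byte
 * MTU, bufsize = 1500 + 14 (ETHER_HDR_LEN) + 4 (EVL_ENCAPLEN) + 2
 * (MXGEFW_PAD) = 1520, which fits in a 2KB MCLBYTES cluster; a
 * 9000-byte jumbo MTU falls through to MJUM9BYTES clusters (or, when
 * MXGE_VIRT_JUMBOS is set, to several 4KB chunks per frame).
 */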
3419 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3424 struct lro_entry *lro_entry;
3429 slice = ss - sc->ss;
3431 SLIST_INIT(&ss->lro_free);
3432 SLIST_INIT(&ss->lro_active);
3434 for (i = 0; i < sc->lro_cnt; i++) {
3435 lro_entry = (struct lro_entry *)
3436 kmalloc(sizeof (*lro_entry), M_DEVBUF,
3438 if (lro_entry == NULL) {
3442 SLIST_INSERT_HEAD(&ss->lro_free, lro_entry, next);
3444 /* get the lanai pointers to the send and receive rings */
3447 #ifndef IFNET_BUF_RING
3448 /* We currently only send from the first slice */
3452 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3454 (volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3455 ss->tx.send_go = (volatile uint32_t *)
3456 (sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3457 ss->tx.send_stop = (volatile uint32_t *)
3458 (sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3459 #ifndef IFNET_BUF_RING
3463 err |= mxge_send_cmd(sc,
3464 MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3465 ss->rx_small.lanai =
3466 (volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3468 err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3470 (volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3473 device_printf(sc->dev,
3474 "failed to get ring sizes or locations\n");
3478 /* stock receive rings */
3479 for (i = 0; i <= ss->rx_small.mask; i++) {
3480 map = ss->rx_small.info[i].map;
3481 err = mxge_get_buf_small(ss, map, i);
3483 device_printf(sc->dev, "alloced %d/%d smalls\n",
3484 i, ss->rx_small.mask + 1);
3488 for (i = 0; i <= ss->rx_big.mask; i++) {
3489 ss->rx_big.shadow[i].addr_low = 0xffffffff;
3490 ss->rx_big.shadow[i].addr_high = 0xffffffff;
3492 ss->rx_big.nbufs = nbufs;
3493 ss->rx_big.cl_size = cl_size;
3494 ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3495 EVL_ENCAPLEN + MXGEFW_PAD;
3496 for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3497 map = ss->rx_big.info[i].map;
3498 err = mxge_get_buf_big(ss, map, i);
3500 device_printf(sc->dev, "alloced %d/%d bigs\n",
3501 i, ss->rx_big.mask + 1);
3509 mxge_open(mxge_softc_t *sc)
3512 int err, big_bytes, nbufs, slice, cl_size, i;
3514 volatile uint8_t *itable;
3515 struct mxge_slice_state *ss;
3517 ASSERT_SERIALIZED(sc->ifp->if_serializer);
3518 /* Copy the MAC address in case it was overridden */
3519 bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3521 err = mxge_reset(sc, 1);
3523 device_printf(sc->dev, "failed to reset\n");
3527 if (sc->num_slices > 1) {
3528 /* setup the indirection table */
3529 cmd.data0 = sc->num_slices;
3530 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3533 err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3536 device_printf(sc->dev,
3537 "failed to setup rss tables\n");
3541 /* just enable an identity mapping */
3542 itable = sc->sram + cmd.data0;
3543 for (i = 0; i < sc->num_slices; i++)
3544 itable[i] = (uint8_t)i;
3547 cmd.data1 = mxge_rss_hash_type;
3548 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3550 device_printf(sc->dev, "failed to enable slices\n");
3556 mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3559 err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3561 /* error is only meaningful if we're trying to set
3562 MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3563 if (err && nbufs > 1) {
3564 device_printf(sc->dev,
3565 "Failed to set alway-use-n to %d\n",
3569 /* Give the firmware the mtu and the big and small buffer
3570 sizes. The firmware wants the big buf size to be a power
3571 of two. Luckily, FreeBSD's clusters are powers of two */
3572 cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
3573 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3574 cmd.data0 = MHLEN - MXGEFW_PAD;
3575 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3577 cmd.data0 = big_bytes;
3578 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3581 device_printf(sc->dev, "failed to setup params\n");
3585 /* Now give the firmware the pointer to the stats block */
3587 #ifdef IFNET_BUF_RING
3588 slice < sc->num_slices;
3593 ss = &sc->ss[slice];
3595 MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3597 MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3598 cmd.data2 = sizeof(struct mcp_irq_data);
3599 cmd.data2 |= (slice << 16);
3600 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3604 bus = sc->ss->fw_stats_dma.bus_addr;
3605 bus += offsetof(struct mcp_irq_data, send_done_count);
3606 cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3607 cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3608 err = mxge_send_cmd(sc,
3609 MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3611 /* Firmware cannot support multicast without STATS_DMA_V2 */
3612 sc->fw_multicast_support = 0;
3614 sc->fw_multicast_support = 1;
3618 device_printf(sc->dev, "failed to setup params\n");
3622 for (slice = 0; slice < sc->num_slices; slice++) {
3623 err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3625 device_printf(sc->dev, "couldn't open slice %d\n",
3631 /* Finally, start the firmware running */
3632 err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3634 device_printf(sc->dev, "Couldn't bring up link\n");
3637 #ifdef IFNET_BUF_RING
3638 for (slice = 0; slice < sc->num_slices; slice++) {
3639 ss = &sc->ss[slice];
3640 ss->if_flags |= IFF_RUNNING;
3641 ss->if_flags &= ~IFF_OACTIVE;
3644 sc->ifp->if_flags |= IFF_RUNNING;
3645 sc->ifp->if_flags &= ~IFF_OACTIVE;
3646 callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3652 mxge_free_mbufs(sc);
3658 mxge_close(mxge_softc_t *sc)
3661 int err, old_down_cnt;
3662 #ifdef IFNET_BUF_RING
3663 struct mxge_slice_state *ss;
3667 ASSERT_SERIALIZED(sc->ifp->if_serializer);
3668 callout_stop(&sc->co_hdl);
3669 #ifdef IFNET_BUF_RING
3670 for (slice = 0; slice < sc->num_slices; slice++) {
3671 ss = &sc->ss[slice];
3672 ss->if_flags &= ~IFF_RUNNING;
3675 sc->ifp->if_flags &= ~IFF_RUNNING;
3676 old_down_cnt = sc->down_cnt;
3678 err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3680 device_printf(sc->dev, "Couldn't bring down link\n");
3682 if (old_down_cnt == sc->down_cnt) {
3683 /* wait for down irq */
3684 DELAY(10 * sc->intr_coal_delay);
3687 if (old_down_cnt == sc->down_cnt) {
3688 device_printf(sc->dev, "never got down irq\n");
3691 mxge_free_mbufs(sc);
3697 mxge_setup_cfg_space(mxge_softc_t *sc)
3699 device_t dev = sc->dev;
3701 uint16_t cmd, lnk, pectl;
3703 /* find the PCIe link width and set max read request to 4KB */
3704 if (pci_find_extcap(dev, PCIY_EXPRESS, ®) == 0) {
3705 lnk = pci_read_config(dev, reg + 0x12, 2);
3706 sc->link_width = (lnk >> 4) & 0x3f;
3708 pectl = pci_read_config(dev, reg + 0x8, 2);
3709 pectl = (pectl & ~0x7000) | (5 << 12);
3710 pci_write_config(dev, reg + 0x8, pectl, 2);
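/*
 * The (5 << 12) written above sets the Max Read Request Size field
 * (bits 14:12 of the PCIe device control register) to encoding 5,
 * i.e. 4096-byte reads, matching the comment at the top of this
 * function.
 */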
3713 /* Enable DMA and Memory space access */
3714 pci_enable_busmaster(dev);
3715 cmd = pci_read_config(dev, PCIR_COMMAND, 2);
3716 cmd |= PCIM_CMD_MEMEN;
3717 pci_write_config(dev, PCIR_COMMAND, cmd, 2);
3721 mxge_read_reboot(mxge_softc_t *sc)
3723 device_t dev = sc->dev;
3726 /* find the vendor specific offset */
3727 if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
3728 device_printf(sc->dev,
3729 "could not find vendor specific offset\n");
3730 return (uint32_t)-1;
3732 /* enable read32 mode */
3733 pci_write_config(dev, vs + 0x10, 0x3, 1);
3734 /* tell NIC which register to read */
3735 pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3736 return (pci_read_config(dev, vs + 0x14, 4));
3740 mxge_watchdog_reset(mxge_softc_t *sc, int slice)
3742 struct pci_devinfo *dinfo;
3750 device_printf(sc->dev, "Watchdog reset!\n");
3753 * check to see if the NIC rebooted. If it did, then all of
3754 * PCI config space has been reset, and things like the
3755 * busmaster bit will be zero. If this is the case, then we
3756 * must restore PCI config space before the NIC can be used
3759 cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3760 if (cmd == 0xffff) {
3762 * maybe the watchdog caught the NIC rebooting; wait
3763 * up to 100ms for it to finish. If it does not come
3764 * back, then give up
3767 cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3768 if (cmd == 0xffff) {
3769 device_printf(sc->dev, "NIC disappeared!\n");
3773 if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3774 /* print the reboot status */
3775 reboot = mxge_read_reboot(sc);
3776 device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3778 /* restore PCI configuration space */
3779 dinfo = device_get_ivars(sc->dev);
3780 pci_cfg_restore(sc->dev, dinfo);
3782 /* and redo any changes we made to our config space */
3783 mxge_setup_cfg_space(sc);
3785 if (sc->ifp->if_flags & IFF_RUNNING) {
3787 err = mxge_open(sc);
3790 tx = &sc->ss[slice].tx;
3791 device_printf(sc->dev,
3792 "NIC did not reboot, slice %d ring state:\n",
3794 device_printf(sc->dev,
3795 "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3796 tx->req, tx->done, tx->queue_active);
3797 device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3798 tx->activate, tx->deactivate);
3799 device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3801 be32toh(sc->ss->fw_stats->send_done_count));
3802 device_printf(sc->dev, "not resetting\n");
3808 mxge_watchdog(mxge_softc_t *sc)
3811 uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3814 /* see if we have outstanding transmits, which
3815 have been pending for more than mxge_ticks */
3817 #ifdef IFNET_BUF_RING
3818 (i < sc->num_slices) && (err == 0);
3820 (i < 1) && (err == 0);
3824 if (tx->req != tx->done &&
3825 tx->watchdog_req != tx->watchdog_done &&
3826 tx->done == tx->watchdog_done) {
3827 /* check for pause blocking before resetting */
3828 if (tx->watchdog_rx_pause == rx_pause)
3829 err = mxge_watchdog_reset(sc, i);
3831 device_printf(sc->dev, "Flow control blocking "
3832 "xmits, check link partner\n");
3835 tx->watchdog_req = tx->req;
3836 tx->watchdog_done = tx->done;
3837 tx->watchdog_rx_pause = rx_pause;
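/*
 * These watchdog_* fields snapshot this tick's state; on the next pass
 * a reset is considered only if transmits were already outstanding, no
 * completions have arrived since (done == watchdog_done), and the
 * pause-frame counter is unchanged, ruling out flow control as the
 * reason the ring is stalled.
 */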
3840 if (sc->need_media_probe)
3841 mxge_media_probe(sc);
3846 mxge_update_stats(mxge_softc_t *sc)
3848 struct mxge_slice_state *ss;
3849 u_long ipackets = 0;
3850 u_long opackets = 0;
3851 #ifdef IFNET_BUF_RING
3859 for (slice = 0; slice < sc->num_slices; slice++) {
3860 ss = &sc->ss[slice];
3861 ipackets += ss->ipackets;
3862 opackets += ss->opackets;
3863 #ifdef IFNET_BUF_RING
3864 obytes += ss->obytes;
3865 omcasts += ss->omcasts;
3866 odrops += ss->tx.br->br_drops;
3868 oerrors += ss->oerrors;
3870 sc->ifp->if_ipackets = ipackets;
3871 sc->ifp->if_opackets = opackets;
3872 #ifdef IFNET_BUF_RING
3873 sc->ifp->if_obytes = obytes;
3874 sc->ifp->if_omcasts = omcasts;
3875 sc->ifp->if_snd.ifq_drops = odrops;
3877 sc->ifp->if_oerrors = oerrors;
3881 mxge_tick(void *arg)
3883 mxge_softc_t *sc = arg;
3886 lwkt_serialize_enter(sc->ifp->if_serializer);
3887 /* aggregate stats from different slices */
3888 mxge_update_stats(sc);
3889 if (!sc->watchdog_countdown) {
3890 err = mxge_watchdog(sc);
3891 sc->watchdog_countdown = 4;
3893 sc->watchdog_countdown--;
3895 callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3896 lwkt_serialize_exit(sc->ifp->if_serializer);
3900 mxge_media_change(struct ifnet *ifp)
3906 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3908 struct ifnet *ifp = sc->ifp;
3909 int real_mtu, old_mtu;
3912 if (ifp->if_serializer)
3913 ASSERT_SERIALIZED(ifp->if_serializer);
3915 real_mtu = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
3916 if ((real_mtu > sc->max_mtu) || real_mtu < 60)
3918 old_mtu = ifp->if_mtu;
3920 if (ifp->if_flags & IFF_RUNNING) {
3922 err = mxge_open(sc);
3924 ifp->if_mtu = old_mtu;
3926 (void) mxge_open(sc);
3933 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3935 mxge_softc_t *sc = ifp->if_softc;
3940 ifmr->ifm_status = IFM_AVALID;
3941 ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
3942 ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
3943 ifmr->ifm_active |= sc->link_state ? IFM_FDX : 0;
3947 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data, struct ucred *cr)
3949 mxge_softc_t *sc = ifp->if_softc;
3950 struct ifreq *ifr = (struct ifreq *)data;
3955 ASSERT_SERIALIZED(ifp->if_serializer);
3959 err = ether_ioctl(ifp, command, data);
3963 err = mxge_change_mtu(sc, ifr->ifr_mtu);
3970 if (ifp->if_flags & IFF_UP) {
3971 if (!(ifp->if_flags & IFF_RUNNING)) {
3972 err = mxge_open(sc);
3974 /* take care of promisc and allmulti flags */
3976 mxge_change_promisc(sc,
3977 ifp->if_flags & IFF_PROMISC);
3978 mxge_set_multicast_list(sc);
3981 if (ifp->if_flags & IFF_RUNNING) {
3989 mxge_set_multicast_list(sc);
3993 mask = ifr->ifr_reqcap ^ ifp->if_capenable;
3994 if (mask & IFCAP_TXCSUM) {
3995 if (IFCAP_TXCSUM & ifp->if_capenable) {
3996 ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
3997 ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
4000 ifp->if_capenable |= IFCAP_TXCSUM;
4001 ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
4003 } else if (mask & IFCAP_RXCSUM) {
4004 if (IFCAP_RXCSUM & ifp->if_capenable) {
4005 ifp->if_capenable &= ~IFCAP_RXCSUM;
4008 ifp->if_capenable |= IFCAP_RXCSUM;
4012 if (mask & IFCAP_TSO4) {
4013 if (IFCAP_TSO4 & ifp->if_capenable) {
4014 ifp->if_capenable &= ~IFCAP_TSO4;
4015 ifp->if_hwassist &= ~CSUM_TSO;
4016 } else if (IFCAP_TXCSUM & ifp->if_capenable) {
4017 ifp->if_capenable |= IFCAP_TSO4;
4018 ifp->if_hwassist |= CSUM_TSO;
4020 kprintf("mxge requires tx checksum offload"
4021 " be enabled to use TSO\n");
4025 if (mask & IFCAP_LRO) {
4026 if (IFCAP_LRO & ifp->if_capenable)
4027 err = mxge_change_lro_locked(sc, 0);
4029 err = mxge_change_lro_locked(sc, mxge_lro_cnt);
4031 if (mask & IFCAP_VLAN_HWTAGGING)
4032 ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
4033 VLAN_CAPABILITIES(ifp);
4038 err = ifmedia_ioctl(ifp, (struct ifreq *)data,
4039 &sc->media, command);
4049 mxge_fetch_tunables(mxge_softc_t *sc)
4052 TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4053 TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4054 &mxge_flow_control);
4055 TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4056 &mxge_intr_coal_delay);
4057 TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4058 &mxge_nvidia_ecrc_enable);
4059 TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4060 &mxge_force_firmware);
4061 TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4062 &mxge_deassert_wait);
4063 TUNABLE_INT_FETCH("hw.mxge.verbose",
4065 TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4066 TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
4067 TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
4068 TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4069 TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4070 if (sc->lro_cnt != 0)
4071 mxge_lro_cnt = sc->lro_cnt;
4075 if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4076 mxge_intr_coal_delay = 30;
4077 if (mxge_ticks == 0)
4078 mxge_ticks = hz / 2;
4079 sc->pause = mxge_flow_control;
4080 if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4081 || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4082 mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
4084 if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4085 mxge_initial_mtu < ETHER_MIN_LEN)
4086 mxge_initial_mtu = ETHERMTU_JUMBO;
4091 mxge_free_slices(mxge_softc_t *sc)
4093 struct mxge_slice_state *ss;
4100 for (i = 0; i < sc->num_slices; i++) {
4102 if (ss->fw_stats != NULL) {
4103 mxge_dma_free(&ss->fw_stats_dma);
4104 ss->fw_stats = NULL;
4105 #ifdef IFNET_BUF_RING
4106 if (ss->tx.br != NULL) {
4107 drbr_free(ss->tx.br, M_DEVBUF);
4112 if (ss->rx_done.entry != NULL) {
4113 mxge_dma_free(&ss->rx_done.dma);
4114 ss->rx_done.entry = NULL;
4117 kfree(sc->ss, M_DEVBUF);
4122 mxge_alloc_slices(mxge_softc_t *sc)
4125 struct mxge_slice_state *ss;
4127 int err, i, max_intr_slots;
4129 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4131 device_printf(sc->dev, "Cannot determine rx ring size\n");
4134 sc->rx_ring_size = cmd.data0;
4135 max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4137 bytes = sizeof (*sc->ss) * sc->num_slices;
4138 sc->ss = kmalloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4141 for (i = 0; i < sc->num_slices; i++) {
4146 /* allocate per-slice rx interrupt queues */
4148 bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4149 err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4152 ss->rx_done.entry = ss->rx_done.dma.addr;
4153 bzero(ss->rx_done.entry, bytes);
4156 * allocate the per-slice firmware stats; stats
4157 * (including tx) are used only on the first slice for now */
4160 #ifndef IFNET_BUF_RING
4165 bytes = sizeof (*ss->fw_stats);
4166 err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4167 sizeof (*ss->fw_stats), 64);
4170 ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4171 #ifdef IFNET_BUF_RING
4172 ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4180 mxge_free_slices(sc);
4185 mxge_slice_probe(mxge_softc_t *sc)
4189 int msix_cnt, status, max_intr_slots;
4193 * don't enable multiple slices if the tunable leaves them disabled,
4194 * or if this is not an SMP system */
4197 if (mxge_max_slices == 0 || mxge_max_slices == 1 || ncpus < 2)
4200 /* see how many MSI-X interrupts are available */
4201 msix_cnt = pci_msix_count(sc->dev);
4205 /* now load the slice-aware firmware and see what it supports */
4206 old_fw = sc->fw_name;
4207 if (old_fw == mxge_fw_aligned)
4208 sc->fw_name = mxge_fw_rss_aligned;
4210 sc->fw_name = mxge_fw_rss_unaligned;
4211 status = mxge_load_firmware(sc, 0);
4213 device_printf(sc->dev, "Falling back to a single slice\n");
4217 /* try to send a reset command to the card to see if it is alive */
4219 memset(&cmd, 0, sizeof (cmd));
4220 status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4222 device_printf(sc->dev, "failed reset\n");
4226 /* get rx ring size */
4227 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4229 device_printf(sc->dev, "Cannot determine rx ring size\n");
4232 max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4234 /* tell it the size of the interrupt queues */
4235 cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4236 status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4238 device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4242 /* ask the maximum number of slices it supports */
4243 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4245 device_printf(sc->dev,
4246 "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4249 sc->num_slices = cmd.data0;
4250 if (sc->num_slices > msix_cnt)
4251 sc->num_slices = msix_cnt;
4253 if (mxge_max_slices == -1) {
4254 /* cap to number of CPUs in system */
4255 if (sc->num_slices > ncpus)
4256 sc->num_slices = ncpus;
4258 if (sc->num_slices > mxge_max_slices)
4259 sc->num_slices = mxge_max_slices;
4261 /* make sure it is a power of two */
4262 while (sc->num_slices & (sc->num_slices - 1))
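/*
 * (n & (n - 1)) is nonzero whenever n has more than one bit set, i.e.
 * n is not a power of two, so this loop rounds the slice count down to
 * the nearest power of two (e.g. 6 -> 5 -> 4).
 */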
4266 device_printf(sc->dev, "using %d slices\n",
4272 sc->fw_name = old_fw;
4273 (void) mxge_load_firmware(sc, 0);
4277 mxge_add_msix_irqs(mxge_softc_t *sc)
4280 int count, err, i, rid;
4283 sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4286 if (sc->msix_table_res == NULL) {
4287 device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4291 count = sc->num_slices;
4292 err = pci_alloc_msix(sc->dev, &count);
4294 device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d"
4295 "err = %d \n", sc->num_slices, err);
4296 goto abort_with_msix_table;
4298 if (count < sc->num_slices) {
4299 device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4300 count, sc->num_slices);
4301 device_printf(sc->dev,
4302 "Try setting hw.mxge.max_slices to %d\n",
4305 goto abort_with_msix;
4307 bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4308 sc->msix_irq_res = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4309 if (sc->msix_irq_res == NULL) {
4311 goto abort_with_msix;
4314 for (i = 0; i < sc->num_slices; i++) {
4316 sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4319 if (sc->msix_irq_res[i] == NULL) {
4320 device_printf(sc->dev, "couldn't allocate IRQ res"
4321 " for message %d\n", i);
4323 goto abort_with_res;
4327 bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4328 sc->msix_ih = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4330 for (i = 0; i < sc->num_slices; i++) {
4331 err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4333 mxge_intr, &sc->ss[i], &sc->msix_ih[i],
4334 sc->ifp->if_serializer);
4336 device_printf(sc->dev, "couldn't setup intr for "
4338 goto abort_with_intr;
4343 device_printf(sc->dev, "using %d msix IRQs:",
4345 for (i = 0; i < sc->num_slices; i++)
4346 kprintf(" %ld", rman_get_start(sc->msix_irq_res[i]));
4352 for (i = 0; i < sc->num_slices; i++) {
4353 if (sc->msix_ih[i] != NULL) {
4354 bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4356 sc->msix_ih[i] = NULL;
4359 kfree(sc->msix_ih, M_DEVBUF);
4363 for (i = 0; i < sc->num_slices; i++) {
4365 if (sc->msix_irq_res[i] != NULL)
4366 bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4367 sc->msix_irq_res[i]);
4368 sc->msix_irq_res[i] = NULL;
4370 kfree(sc->msix_irq_res, M_DEVBUF);
4374 pci_release_msi(sc->dev);
4376 abort_with_msix_table:
4377 bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4378 sc->msix_table_res);
4384 mxge_add_single_irq(mxge_softc_t *sc)
4386 int count, err, rid;
4388 count = pci_msi_count(sc->dev);
4389 if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4395 sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0,
4396 1, RF_SHAREABLE | RF_ACTIVE);
4397 if (sc->irq_res == NULL) {
4398 device_printf(sc->dev, "could not alloc interrupt\n");
4402 device_printf(sc->dev, "using %s irq %ld\n",
4403 sc->legacy_irq ? "INTx" : "MSI",
4404 rman_get_start(sc->irq_res));
4405 err = bus_setup_intr(sc->dev, sc->irq_res,
4407 mxge_intr, &sc->ss[0], &sc->ih,
4408 sc->ifp->if_serializer);
4410 bus_release_resource(sc->dev, SYS_RES_IRQ,
4411 sc->legacy_irq ? 0 : 1, sc->irq_res);
4412 if (!sc->legacy_irq)
4413 pci_release_msi(sc->dev);
4419 mxge_rem_msix_irqs(mxge_softc_t *sc)
4423 for (i = 0; i < sc->num_slices; i++) {
4424 if (sc->msix_ih[i] != NULL) {
4425 bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4427 sc->msix_ih[i] = NULL;
4430 kfree(sc->msix_ih, M_DEVBUF);
4432 for (i = 0; i < sc->num_slices; i++) {
4434 if (sc->msix_irq_res[i] != NULL)
4435 bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4436 sc->msix_irq_res[i]);
4437 sc->msix_irq_res[i] = NULL;
4439 kfree(sc->msix_irq_res, M_DEVBUF);
4441 bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4442 sc->msix_table_res);
4444 pci_release_msi(sc->dev);
4449 mxge_rem_single_irq(mxge_softc_t *sc)
4451 bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4452 bus_release_resource(sc->dev, SYS_RES_IRQ,
4453 sc->legacy_irq ? 0 : 1, sc->irq_res);
4454 if (!sc->legacy_irq)
4455 pci_release_msi(sc->dev);
4459 mxge_rem_irq(mxge_softc_t *sc)
4461 if (sc->num_slices > 1)
4462 mxge_rem_msix_irqs(sc);
4464 mxge_rem_single_irq(sc);
4468 mxge_add_irq(mxge_softc_t *sc)
4472 if (sc->num_slices > 1)
4473 err = mxge_add_msix_irqs(sc);
4475 err = mxge_add_single_irq(sc);
4477 if (0 && err == 0 && sc->num_slices > 1) {
4478 mxge_rem_msix_irqs(sc);
4479 err = mxge_add_msix_irqs(sc);
4486 mxge_attach(device_t dev)
4488 mxge_softc_t *sc = device_get_softc(dev);
4489 struct ifnet *ifp = &sc->arpcom.ac_if;
4493 * avoid rewriting half the lines in this file to use
4494 * &sc->arpcom.ac_if instead
4498 mxge_fetch_tunables(sc);
4500 err = bus_dma_tag_create(NULL, /* parent */
4503 BUS_SPACE_MAXADDR, /* low */
4504 BUS_SPACE_MAXADDR, /* high */
4505 NULL, NULL, /* filter */
4506 65536 + 256, /* maxsize */
4507 MXGE_MAX_SEND_DESC, /* num segs */
4508 65536, /* maxsegsize */
4510 &sc->parent_dmat); /* tag */
4513 device_printf(sc->dev, "Err %d allocating parent dmat\n",
4515 goto abort_with_nothing;
4519 if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4521 callout_init_mp(&sc->co_hdl);
4523 mxge_setup_cfg_space(sc);
4525 /* Map the board into the kernel */
4527 sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
4529 if (sc->mem_res == NULL) {
4530 device_printf(dev, "could not map memory\n");
4532 goto abort_with_nothing;
4534 sc->sram = rman_get_virtual(sc->mem_res);
4535 sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4536 if (sc->sram_size > rman_get_size(sc->mem_res)) {
4537 device_printf(dev, "impossible memory region size %ld\n",
4538 rman_get_size(sc->mem_res));
4540 goto abort_with_mem_res;
4543 /* make NULL-terminated copy of the EEPROM strings section of lanai SRAM */
4545 bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4546 bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4547 rman_get_bushandle(sc->mem_res),
4548 sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4550 MXGE_EEPROM_STRINGS_SIZE - 2);
4551 err = mxge_parse_strings(sc);
4553 goto abort_with_mem_res;
4555 /* Enable write combining for efficient use of PCIe bus */
4558 /* Allocate the out of band dma memory */
4559 err = mxge_dma_alloc(sc, &sc->cmd_dma,
4560 sizeof (mxge_cmd_t), 64);
4562 goto abort_with_mem_res;
4563 sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4564 err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4566 goto abort_with_cmd_dma;
4568 err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4570 goto abort_with_zeropad_dma;
4572 /* select & load the firmware */
4573 err = mxge_select_firmware(sc);
4575 goto abort_with_dmabench;
4576 sc->intr_coal_delay = mxge_intr_coal_delay;
4578 mxge_slice_probe(sc);
4579 err = mxge_alloc_slices(sc);
4581 goto abort_with_dmabench;
4583 err = mxge_reset(sc, 0);
4585 goto abort_with_slices;
4587 err = mxge_alloc_rings(sc);
4589 device_printf(sc->dev, "failed to allocate rings\n");
4590 goto abort_with_dmabench;
4593 err = mxge_add_irq(sc);
4595 device_printf(sc->dev, "failed to add irq\n");
4596 goto abort_with_rings;
4599 ifp->if_baudrate = IF_Gbps(10UL);
4600 ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4603 ifp->if_capabilities |= IFCAP_LRO;
4606 #ifdef MXGE_NEW_VLAN_API
4607 ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4610 sc->max_mtu = mxge_max_mtu(sc);
4611 if (sc->max_mtu >= 9000)
4612 ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4614 device_printf(dev, "MTU limited to %d. Install "
4615 "latest firmware for 9000 byte jumbo support\n",
4616 sc->max_mtu - ETHER_HDR_LEN);
4617 ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4618 ifp->if_capenable = ifp->if_capabilities;
4619 if (sc->lro_cnt == 0)
4620 ifp->if_capenable &= ~IFCAP_LRO;
4622 ifp->if_init = mxge_init;
4624 ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4625 ifp->if_ioctl = mxge_ioctl;
4626 ifp->if_start = mxge_start;
4627 /* Initialise the ifmedia structure */
4628 ifmedia_init(&sc->media, 0, mxge_media_change,
4630 mxge_set_media(sc, IFM_ETHER | IFM_AUTO);
4631 mxge_media_probe(sc);
4633 ether_ifattach(ifp, sc->mac_addr, NULL);
4634 /* ether_ifattach sets mtu to ETHERMTU */
4635 if (mxge_initial_mtu != ETHERMTU) {
4636 lwkt_serialize_enter(ifp->if_serializer);
4637 mxge_change_mtu(sc, mxge_initial_mtu);
4638 lwkt_serialize_exit(ifp->if_serializer);
4641 mxge_add_sysctls(sc);
4642 #ifdef IFNET_BUF_RING
4643 ifp->if_transmit = mxge_transmit;
4644 ifp->if_qflush = mxge_qflush;
4649 mxge_free_rings(sc);
4651 mxge_free_slices(sc);
4652 abort_with_dmabench:
4653 mxge_dma_free(&sc->dmabench_dma);
4654 abort_with_zeropad_dma:
4655 mxge_dma_free(&sc->zeropad_dma);
4657 mxge_dma_free(&sc->cmd_dma);
4659 bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4660 pci_disable_busmaster(dev);
4661 bus_dma_tag_destroy(sc->parent_dmat);
4667 mxge_detach(device_t dev)
4669 mxge_softc_t *sc = device_get_softc(dev);
4671 lwkt_serialize_enter(sc->ifp->if_serializer);
4673 if (sc->ifp->if_flags & IFF_RUNNING)
4676 * XXX: race: the callout callback could be spinning on
4677 * the serializer and run anyway
4679 callout_stop(&sc->co_hdl);
4680 lwkt_serialize_exit(sc->ifp->if_serializer);
4682 ether_ifdetach(sc->ifp);
4683 ifmedia_removeall(&sc->media);
4684 mxge_dummy_rdma(sc, 0);
4685 mxge_rem_sysctls(sc);
4687 mxge_free_rings(sc);
4688 mxge_free_slices(sc);
4689 mxge_dma_free(&sc->dmabench_dma);
4690 mxge_dma_free(&sc->zeropad_dma);
4691 mxge_dma_free(&sc->cmd_dma);
4692 bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4693 pci_disable_busmaster(dev);
4694 bus_dma_tag_destroy(sc->parent_dmat);
4699 mxge_shutdown(device_t dev)
4705 This file uses Myri10GE driver indentation.
4708 c-file-style:"linux"