1 /******************************************************************************
3 Copyright (c) 2006-2009, Myricom Inc.
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
9 1. Redistributions of source code must retain the above copyright notice,
10 this list of conditions and the following disclaimer.
12 2. Neither the name of the Myricom Inc, nor the names of its
13 contributors may be used to endorse or promote products derived from
14 this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
28 ***************************************************************************/
30 #include <sys/cdefs.h>
31 /*__FBSDID("$FreeBSD: src/sys/dev/mxge/if_mxge.c,v 1.63 2009/06/26 11:45:06 rwatson Exp $");*/
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/linker.h>
36 #include <sys/firmware.h>
37 #include <sys/endian.h>
38 #include <sys/sockio.h>
40 #include <sys/malloc.h>
41 #include <sys/kernel.h>
43 #include <sys/module.h>
44 #include <sys/socket.h>
45 #include <sys/sysctl.h>
47 /* count xmits ourselves, rather than via drbr */
50 #include <net/if_arp.h>
51 #include <net/ethernet.h>
52 #include <net/if_dl.h>
53 #include <net/if_media.h>
57 #include <net/if_types.h>
58 #include <net/vlan/if_vlan_var.h>
61 #include <netinet/in_systm.h>
62 #include <netinet/in.h>
63 #include <netinet/ip.h>
64 #include <netinet/tcp.h>
66 #include <machine/resource.h>
70 #include <bus/pci/pcireg.h>
71 #include <bus/pci/pcivar.h>
72 #include <bus/pci/pci_private.h> /* XXX for pci_cfg_restore */
74 #include <vm/vm.h> /* for pmap_mapdev() */
77 #if defined(__i386) || defined(__amd64)
78 #include <machine/specialreg.h>
81 #include <dev/netif/mxge/mxge_mcp.h>
82 #include <dev/netif/mxge/mcp_gen_header.h>
83 /*#define MXGE_FAKE_IFP*/
84 #include <dev/netif/mxge/if_mxge_var.h>
86 #include <sys/buf_ring.h>
/*
 * Module-level driver tunables.  Integer knobs control optional behavior
 * (ECRC enabling, firmware forcing, interrupt coalescing delay in usecs,
 * etc.); the char* entries name the four firmware images the driver can
 * load (aligned vs. unaligned PCIe-completion variants, with and without
 * RSS/multi-slice support).
 */
92 static int mxge_nvidia_ecrc_enable = 1;
93 static int mxge_force_firmware = 0;
94 static int mxge_intr_coal_delay = 30;
95 static int mxge_deassert_wait = 1;
96 static int mxge_flow_control = 1;
97 static int mxge_verbose = 0;
98 static int mxge_lro_cnt = 8;
99 static int mxge_ticks;
100 static int mxge_max_slices = 1;
101 static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
102 static int mxge_always_promisc = 0;
103 static int mxge_initial_mtu = ETHERMTU_JUMBO;
/* firmware image names: "eth" = aligned completions, "ethp" = unaligned-safe */
104 static char *mxge_fw_unaligned = "mxge_ethp_z8e";
105 static char *mxge_fw_aligned = "mxge_eth_z8e";
106 static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
107 static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
/* Forward declarations for the newbus device interface methods. */
109 static int mxge_probe(device_t dev);
110 static int mxge_attach(device_t dev);
111 static int mxge_detach(device_t dev);
112 static int mxge_detach(device_t dev);
113 static void mxge_intr(void *arg);
/* newbus method table wiring the probe/attach/detach/shutdown entry points */
115 static device_method_t mxge_methods[] =
117 /* Device interface */
118 DEVMETHOD(device_probe, mxge_probe),
119 DEVMETHOD(device_attach, mxge_attach),
120 DEVMETHOD(device_detach, mxge_detach),
121 DEVMETHOD(device_shutdown, mxge_shutdown),
/* driver_t: softc size tells the bus how much per-device state to allocate */
125 static driver_t mxge_driver =
129 sizeof(mxge_softc_t),
132 static devclass_t mxge_devclass;
134 /* Declare ourselves to be a child of the PCI bus.*/
135 DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
/* firmware(9) is needed to fetch the MCP images; zlib to inflate them */
136 MODULE_DEPEND(mxge, firmware, 1, 1, 1);
137 MODULE_DEPEND(mxge, zlib, 1, 1, 1);
/* Forward declarations for routines referenced before their definitions. */
139 static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
140 static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
141 static int mxge_close(mxge_softc_t *sc);
142 static int mxge_open(mxge_softc_t *sc);
143 static void mxge_tick(void *arg);
/*
 * device_probe method: claim Myricom Z8E / Z8E_9 PCI devices and set a
 * human-readable description based on the PCI revision id.
 * (Listing elided: the enclosing switch(rev) statement and returns are
 * not visible here.)
 */
146 mxge_probe(device_t dev)
151 if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
152 ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
153 (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
154 rev = pci_get_revid(dev);
156 case MXGE_PCI_REV_Z8E:
157 device_set_desc(dev, "Myri10G-PCIE-8A");
159 case MXGE_PCI_REV_Z8ES:
160 device_set_desc(dev, "Myri10G-PCIE-8B");
/* unknown revision: still attach, but warn */
163 device_set_desc(dev, "Myri10G-PCIE-8??");
164 device_printf(dev, "Unrecognized rev %d NIC\n",
/*
 * Enable write-combining PIO to NIC SRAM on x86/amd64 by switching the
 * mapped BAR's page attributes to PAT write-combining.  On failure the
 * driver just logs and falls back to uncached PIO.
 */
174 mxge_enable_wc(mxge_softc_t *sc)
176 #if defined(__i386) || defined(__amd64)
181 len = rman_get_size(sc->mem_res);
182 err = pmap_change_attr((vm_offset_t) sc->sram,
183 len, PAT_WRITE_COMBINING);
185 device_printf(sc->dev, "pmap_change_attr failed, %d\n",
193 /* callback to get our DMA address */
/*
 * bus_dmamap_load(9) callback: stores the (single) segment's bus address
 * into the bus_addr_t the caller passed via arg.  Only segs[0] is used;
 * the tag is created with nsegments == 1 (see mxge_dma_alloc).
 */
195 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
199 *(bus_addr_t *) arg = segs->ds_addr;
/*
 * Allocate a coherent DMA-able region of `bytes` with the requested
 * alignment: create a tag, allocate+zero the memory, and load the map to
 * learn the bus address (stashed in dma->bus_addr by the callback).
 * On failure, previously acquired resources are released via the
 * goto-style abort path at the bottom.
 */
204 mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
205 bus_size_t alignment)
208 device_t dev = sc->dev;
209 bus_size_t boundary, maxsegsize;
/* 4KB-aligned multi-page requests get page-sized boundary/segments */
211 if (bytes > 4096 && alignment == 4096) {
219 /* allocate DMAable memory tags */
220 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
221 alignment, /* alignment */
222 boundary, /* boundary */
223 BUS_SPACE_MAXADDR, /* low */
224 BUS_SPACE_MAXADDR, /* high */
225 NULL, NULL, /* filter */
228 maxsegsize, /* maxsegsize */
229 BUS_DMA_COHERENT, /* flags */
230 NULL, NULL, /* lock */
231 &dma->dmat); /* tag */
233 device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
237 /* allocate DMAable memory & map */
238 err = bus_dmamem_alloc(dma->dmat, &dma->addr,
239 (BUS_DMA_WAITOK | BUS_DMA_COHERENT
240 | BUS_DMA_ZERO), &dma->map);
242 device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
243 goto abort_with_dmat;
246 /* load the memory */
247 err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
248 mxge_dmamap_callback,
249 (void *)&dma->bus_addr, 0);
251 device_printf(dev, "couldn't load map (err = %d)\n", err);
/* error unwind: free in reverse order of acquisition */
257 bus_dmamem_free(dma->dmat, dma->addr, dma->map);
259 (void)bus_dma_tag_destroy(dma->dmat);
/*
 * Release a region allocated by mxge_dma_alloc(): unload the map, free
 * the memory, and destroy the tag (reverse of the allocation order).
 */
265 mxge_dma_free(mxge_dma_t *dma)
267 bus_dmamap_unload(dma->dmat, dma->map);
268 bus_dmamem_free(dma->dmat, dma->addr, dma->map);
269 (void)bus_dma_tag_destroy(dma->dmat);
273 * The eeprom strings on the lanaiX have the format
/*
 * Walk the NUL-separated EEPROM string block looking for "MAC=", "PC="
 * (product code) and "SN=" (serial number) records, copying each into
 * the softc.  The MAC is parsed as six hex octets via strtoul.
 */
280 mxge_parse_strings(mxge_softc_t *sc)
/* advance ptr past the current NUL-terminated string (bounded by limit) */
282 #define MXGE_NEXT_STRING(p) while(ptr < limit && *ptr++)
287 ptr = sc->eeprom_strings;
288 limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
290 while (ptr < limit && *ptr != '\0') {
291 if (memcmp(ptr, "MAC=", 4) == 0) {
293 sc->mac_addr_string = ptr;
294 for (i = 0; i < 6; i++) {
/* need two hex chars per octet; bail if we'd run off the buffer */
296 if ((ptr + 2) > limit)
298 sc->mac_addr[i] = strtoul(ptr, NULL, 16);
301 } else if (memcmp(ptr, "PC=", 3) == 0) {
303 strncpy(sc->product_code_string, ptr,
304 sizeof (sc->product_code_string) - 1);
305 } else if (memcmp(ptr, "SN=", 3) == 0) {
307 strncpy(sc->serial_number_string, ptr,
308 sizeof (sc->serial_number_string) - 1);
310 MXGE_NEXT_STRING(ptr);
/* reached only if no valid MAC record was found */
317 device_printf(sc->dev, "failed to parse eeprom_strings\n");
322 #if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
/*
 * Attempt to turn on ECRC generation in an upstream nVidia (CK804/MCP55)
 * PCIe bridge, which forces aligned completions and lets us run the
 * faster "aligned" firmware.  The ECRC bit lives in extended config
 * space (offset 0x178), which the stock config accessors cannot reach,
 * so the bridge's config space is located and mapped directly with
 * pmap_mapdev().
 */
324 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
327 unsigned long base, off;
329 device_t pdev, mcp55;
330 uint16_t vendor_id, device_id, word;
331 uintptr_t bus, slot, func, ivend, idev;
335 if (!mxge_nvidia_ecrc_enable)
/* the NIC's grandparent should be the PCIe bridge we care about */
338 pdev = device_get_parent(device_get_parent(sc->dev));
340 device_printf(sc->dev, "could not find parent?\n");
343 vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
344 device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
/* only nVidia bridges are handled */
346 if (vendor_id != 0x10de)
351 if (device_id == 0x005d) {
352 /* ck804, base address is magic */
354 } else if (device_id >= 0x0374 && device_id <= 0x378) {
355 /* mcp55, base address stored in chipset */
356 mcp55 = pci_find_bsf(0, 0, 0);
358 0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
359 0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
360 word = pci_read_config(mcp55, 0x90, 2);
361 base = ((unsigned long)word & 0x7ffeU) << 25;
368 Test below is commented because it is believed that doing
369 config read/write beyond 0xff will access the config space
370 for the next larger function. Uncomment this and remove
371 the hacky pmap_mapdev() way of accessing config space when
372 FreeBSD grows support for extended pcie config space access
375 /* See if we can, by some miracle, access the extended
377 val = pci_read_config(pdev, 0x178, 4);
378 if (val != 0xffffffff) {
380 pci_write_config(pdev, 0x178, val, 4);
384 /* Rather than using normal pci config space writes, we must
385 * map the Nvidia config space ourselves. This is because on
386 * opteron/nvidia class machine the 0xe000000 mapping is
387 * handled by the nvidia chipset, that means the internal PCI
388 * device (the on-chip northbridge), or the amd-8131 bridge
389 * and things behind them are not visible by this method.
/* gather bus/slot/function and vendor/device ids of the bridge */
392 BUS_READ_IVAR(device_get_parent(pdev), pdev,
394 BUS_READ_IVAR(device_get_parent(pdev), pdev,
395 PCI_IVAR_SLOT, &slot);
396 BUS_READ_IVAR(device_get_parent(pdev), pdev,
397 PCI_IVAR_FUNCTION, &func);
398 BUS_READ_IVAR(device_get_parent(pdev), pdev,
399 PCI_IVAR_VENDOR, &ivend);
400 BUS_READ_IVAR(device_get_parent(pdev), pdev,
401 PCI_IVAR_DEVICE, &idev);
/* compute the physical address of the bridge's config space */
404 + 0x00100000UL * (unsigned long)bus
405 + 0x00001000UL * (unsigned long)(func
408 /* map it into the kernel */
409 va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
413 device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
416 /* get a pointer to the config space mapped into the kernel */
417 cfgptr = va + (off & PAGE_MASK);
419 /* make sure that we can really access it */
420 vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
421 device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
422 if (! (vendor_id == ivend && device_id == idev)) {
423 device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
424 vendor_id, device_id);
425 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
/* set the ECRC bit at extended config offset 0x178 */
429 ptr32 = (uint32_t*)(cfgptr + 0x178);
432 if (val == 0xffffffff) {
433 device_printf(sc->dev, "extended mapping failed\n");
434 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
438 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
440 device_printf(sc->dev,
441 "Enabled ECRC on upstream Nvidia bridge "
443 (int)bus, (int)slot, (int)func);
/*
 * Non-x86 stub: an nVidia nForce4-class bridge should never be seen on
 * other architectures, so just complain.
 */
448 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
450 device_printf(sc->dev,
451 "Nforce 4 chipset on non-x86/amd64!?!?!\n");
/*
 * Ask the firmware to run read / write / read+write DMA benchmarks
 * against the dmabench buffer, and record throughput (MB/s) in
 * sc->read_dma, sc->write_dma and sc->read_write_dma.  Also used with
 * MXGEFW_CMD_UNALIGNED_TEST to detect unaligned PCIe completions.
 */
458 mxge_dma_test(mxge_softc_t *sc, int test_type)
461 bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
467 /* Run a small DMA test.
468 * The magic multipliers to the length tell the firmware
469 * to do DMA read, write, or read+write tests. The
470 * results are returned in cmd.data0. The upper 16
471 * bits of the return is the number of transfers completed.
472 * The lower 16 bits is the time in 0.5us ticks that the
473 * transfers took to complete.
476 len = sc->tx_boundary;
/* read test: multiplier 0x10000 */
478 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
479 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
480 cmd.data2 = len * 0x10000;
481 status = mxge_send_cmd(sc, test_type, &cmd);
486 sc->read_dma = ((cmd.data0>>16) * len * 2) /
487 (cmd.data0 & 0xffff);
/* write test: multiplier 0x1 */
488 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
489 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
490 cmd.data2 = len * 0x1;
491 status = mxge_send_cmd(sc, test_type, &cmd);
496 sc->write_dma = ((cmd.data0>>16) * len * 2) /
497 (cmd.data0 & 0xffff);
/* combined read+write test: multiplier 0x10001 */
499 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
500 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
501 cmd.data2 = len * 0x10001;
502 status = mxge_send_cmd(sc, test_type, &cmd);
507 sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
508 (cmd.data0 & 0xffff);
/* the unaligned test is expected to fail on some hosts; stay quiet then */
511 if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
512 device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
519 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
520 * when the PCI-E Completion packets are aligned on an 8-byte
521 * boundary. Some PCI-E chip sets always align Completion packets; on
522 * the ones that do not, the alignment can be enforced by enabling
523 * ECRC generation (if supported).
525 * When PCI-E Completion packets are not aligned, it is actually more
526 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
528 * If the driver can neither enable ECRC nor verify that it has
529 * already been enabled, then it must use a firmware image which works
530 * around unaligned completion packets (ethp_z8e.dat), and it should
531 * also ensure that it never gives the device a Read-DMA which is
532 * larger than 2KB by setting the tx_boundary to 2KB. If ECRC is
533 * enabled, then the driver should use the aligned (eth_z8e.dat)
534 * firmware image, and set tx_boundary to 4KB.
/*
 * Probe whether this host delivers aligned PCIe completions: assume a
 * 4KB tx boundary, check the PCIe Max Read Request size, load the
 * aligned firmware, try to enable ECRC on nVidia bridges, then run the
 * firmware's unaligned-completion DMA test.  Returns 0 if the aligned
 * firmware may be kept; non-zero forces fallback to the "ethp" image.
 *
 * Fixed: L549 contained the mojibake "®" where "&reg" (the int* out
 * parameter of pci_find_extcap) belongs, and the Max Read Req message
 * was missing its closing parenthesis.
 */
538 mxge_firmware_probe(mxge_softc_t *sc)
540 device_t dev = sc->dev;
544 sc->tx_boundary = 4096;
546 * Verify the max read request size was set to 4KB
547 * before trying the test with 4KB.
549 if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
/* Device Control register is at capability offset + 0x8 */
550 pectl = pci_read_config(dev, reg + 0x8, 2);
551 if ((pectl & (5 << 12)) != (5 << 12)) {
552 device_printf(dev, "Max Read Req. size != 4k (0x%x)\n",
554 sc->tx_boundary = 2048;
559 * load the optimized firmware (which assumes aligned PCIe
560 * completions) in order to see if it works on this host.
562 sc->fw_name = mxge_fw_aligned;
563 status = mxge_load_firmware(sc, 1);
569 * Enable ECRC if possible
571 mxge_enable_nvidia_ecrc(sc);
574 * Run a DMA test which watches for unaligned completions and
575 * aborts on the first one seen.
578 status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
580 return 0; /* keep the aligned firmware */
583 device_printf(dev, "DMA test failed: %d\n", status);
584 if (status == ENOSYS)
585 device_printf(dev, "Falling back to ethp! "
586 "Please install up to date fw\n");
/*
 * Choose the firmware image and tx boundary: honor the force-firmware
 * tunable, assume aligned completions on narrow (x4 or less) links,
 * otherwise run mxge_firmware_probe(); finally load the selection.
 */
591 mxge_select_firmware(mxge_softc_t *sc)
596 if (mxge_force_firmware != 0) {
597 if (mxge_force_firmware == 1)
602 device_printf(sc->dev,
603 "Assuming %s completions (forced)\n",
604 aligned ? "aligned" : "unaligned");
608 /* if the PCIe link width is 4 or less, we can use the aligned
609 firmware and skip any checks */
610 if (sc->link_width != 0 && sc->link_width <= 4) {
611 device_printf(sc->dev,
612 "PCIe x%d Link, expect reduced performance\n",
618 if (0 == mxge_firmware_probe(sc))
/* aligned: 4KB read-DMA boundary; unaligned: limit to 2KB */
623 sc->fw_name = mxge_fw_aligned;
624 sc->tx_boundary = 4096;
626 sc->fw_name = mxge_fw_unaligned;
627 sc->tx_boundary = 2048;
629 return (mxge_load_firmware(sc, 0));
/*
 * Sanity-check a firmware header: verify mcp_type is MCP_TYPE_ETH,
 * stash the version string for sysctl, and confirm the major/minor
 * version matches what this driver was built against.
 */
639 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
643 if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
644 device_printf(sc->dev, "Bad firmware type: 0x%x\n",
645 be32toh(hdr->mcp_type));
649 /* save firmware version for sysctl */
650 strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
652 device_printf(sc->dev, "firmware id: %s\n", hdr->version);
654 sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
655 &sc->fw_ver_minor, &sc->fw_ver_tiny);
/* only the tiny version may differ from the driver's expectation */
657 if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
658 && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
659 device_printf(sc->dev, "Found firmware version %s\n",
661 device_printf(sc->dev, "Driver needs %d.%d\n",
662 MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
/*
 * zlib allocator callbacks backed by kmalloc(M_TEMP): zalloc/zfree hooks
 * handed to inflateInit() in mxge_load_firmware_helper().
 */
670 z_alloc(void *nil, u_int items, u_int size)
674 ptr = kmalloc(items * size, M_TEMP, M_NOWAIT);
679 z_free(void *nil, void *ptr)
/*
 * Fetch the gzip-compressed firmware image via firmware(9), inflate it
 * into a temporary buffer, validate its embedded header, and copy the
 * image into NIC SRAM at MXGE_FW_OFFSET in 256-byte PIO chunks.
 * Resources are released through the labeled abort paths at the bottom.
 */
686 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
689 char *inflate_buffer;
690 const struct firmware *fw;
691 const mcp_gen_header_t *hdr;
698 fw = firmware_get(sc->fw_name);
700 device_printf(sc->dev, "Could not find firmware image %s\n",
707 /* setup zlib and decompress f/w */
708 bzero(&zs, sizeof (zs));
711 status = inflateInit(&zs);
712 if (status != Z_OK) {
717 /* the uncompressed size is stored as the firmware version,
718 which would otherwise go unused */
719 fw_len = (size_t) fw->version;
720 inflate_buffer = kmalloc(fw_len, M_TEMP, M_NOWAIT);
721 if (inflate_buffer == NULL)
723 zs.avail_in = fw->datasize;
724 zs.next_in = __DECONST(char *, fw->data);
725 zs.avail_out = fw_len;
726 zs.next_out = inflate_buffer;
727 status = inflate(&zs, Z_FINISH);
728 if (status != Z_STREAM_END) {
729 device_printf(sc->dev, "zlib %d\n", status);
731 goto abort_with_buffer;
/* locate the generic header inside the inflated image */
735 hdr_offset = htobe32(*(const uint32_t *)
736 (inflate_buffer + MCP_HEADER_PTR_OFFSET));
737 if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
738 device_printf(sc->dev, "Bad firmware file");
740 goto abort_with_buffer;
742 hdr = (const void*)(inflate_buffer + hdr_offset);
744 status = mxge_validate_firmware(sc, hdr);
746 goto abort_with_buffer;
748 /* Copy the inflated firmware to NIC SRAM. */
749 for (i = 0; i < fw_len; i += 256) {
750 mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
752 min(256U, (unsigned)(fw_len - i)));
761 kfree(inflate_buffer, M_TEMP);
765 firmware_put(fw, FIRMWARE_UNLOAD);
770 * Enable or disable periodic RDMAs from the host to make certain
771 * chipsets resend dropped PCIe messages
/*
 * Build an 8-byte-aligned command buffer on the stack, PIO it to the
 * bootstrap MCP's dummy-RDMA mailbox, and poll the DMA'd confirmation
 * word (written as 0xffffffff by the firmware) for up to ~20 iterations.
 */
775 mxge_dummy_rdma(mxge_softc_t *sc, int enable)
778 volatile uint32_t *confirm;
779 volatile char *submit;
780 uint32_t *buf, dma_low, dma_high;
/* round the stack buffer up to an 8-byte boundary */
783 buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
785 /* clear confirmation addr */
786 confirm = (volatile uint32_t *)sc->cmd;
790 /* send an rdma command to the PCIe engine, and wait for the
791 response in the confirmation address. The firmware should
792 write a -1 there to indicate it is alive and well
795 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
796 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
797 buf[0] = htobe32(dma_high); /* confirm addr MSW */
798 buf[1] = htobe32(dma_low); /* confirm addr LSW */
799 buf[2] = htobe32(0xffffffff); /* confirm data */
800 dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
801 dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
802 buf[3] = htobe32(dma_high); /* dummy addr MSW */
803 buf[4] = htobe32(dma_low); /* dummy addr LSW */
804 buf[5] = htobe32(enable); /* enable? */
807 submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
809 mxge_pio_copy(submit, buf, 64);
/* spin until the firmware acknowledges by writing -1 */
814 while (*confirm != 0xffffffff && i < 20) {
818 if (*confirm != 0xffffffff) {
819 device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
820 (enable ? "enable" : "disable"), confirm,
/*
 * Send one command to the running firmware through the MXGEFW_ETH_CMD
 * mailbox and busy-wait (up to ~20ms) for the DMA'd response.  Command
 * words are byte-swapped to big-endian; on success the response data is
 * copied back into data->data0.  Serialized by sc->cmd_mtx.
 */
827 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
830 char buf_bytes[sizeof(*buf) + 8];
831 volatile mcp_cmd_response_t *response = sc->cmd;
832 volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
833 uint32_t dma_low, dma_high;
834 int err, sleep_total = 0;
836 /* ensure buf is aligned to 8 bytes */
837 buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
839 buf->data0 = htobe32(data->data0);
840 buf->data1 = htobe32(data->data1);
841 buf->data2 = htobe32(data->data2);
842 buf->cmd = htobe32(cmd);
843 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
844 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
846 buf->response_addr.low = htobe32(dma_low);
847 buf->response_addr.high = htobe32(dma_high);
848 mtx_lock(&sc->cmd_mtx);
/* 0xffffffff marks "no response yet"; firmware overwrites it */
849 response->result = 0xffffffff;
851 mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
853 /* wait up to 20ms */
855 for (sleep_total = 0; sleep_total < 20; sleep_total++) {
856 bus_dmamap_sync(sc->cmd_dma.dmat,
857 sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
/* map the firmware's result code onto an errno */
859 switch (be32toh(response->result)) {
861 data->data0 = be32toh(response->data);
867 case MXGEFW_CMD_UNKNOWN:
870 case MXGEFW_CMD_ERROR_UNALIGNED:
873 case MXGEFW_CMD_ERROR_BUSY:
877 device_printf(sc->dev,
879 "failed, result = %d\n",
880 cmd, be32toh(response->result));
/* fell out of the loop: the command never completed */
888 device_printf(sc->dev, "mxge: command %d timed out"
890 cmd, be32toh(response->result));
891 mtx_unlock(&sc->cmd_mtx);
/*
 * Adopt the firmware already running on the NIC (e.g. loaded by a prior
 * driver): locate its header in SRAM, copy it to host memory, validate
 * it, and flag the known 1.4.4–1.4.11 rx-filter bug so the NIC is kept
 * in ALLMULTI mode as a workaround.
 */
896 mxge_adopt_running_firmware(mxge_softc_t *sc)
898 struct mcp_gen_header *hdr;
899 const size_t bytes = sizeof (struct mcp_gen_header);
903 /* find running firmware header */
904 hdr_offset = htobe32(*(volatile uint32_t *)
905 (sc->sram + MCP_HEADER_PTR_OFFSET));
907 if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
908 device_printf(sc->dev,
909 "Running firmware has bad header offset (%d)\n",
914 /* copy header of running firmware from SRAM to host memory to
915 * validate firmware */
916 hdr = kmalloc(bytes, M_DEVBUF, M_NOWAIT);
918 device_printf(sc->dev, "could not kmalloc firmware hdr\n");
921 bus_space_read_region_1(rman_get_bustag(sc->mem_res),
922 rman_get_bushandle(sc->mem_res),
923 hdr_offset, (char *)hdr, bytes);
924 status = mxge_validate_firmware(sc, hdr);
925 kfree(hdr, M_DEVBUF);
928 * check to see if adopted firmware has bug where adopting
929 * it will cause broadcasts to be filtered unless the NIC
930 * is kept in ALLMULTI mode
932 if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
933 sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
934 sc->adopted_rx_filter_bug = 1;
935 device_printf(sc->dev, "Adopting fw %d.%d.%d: "
936 "working around rx filter bug\n",
937 sc->fw_ver_major, sc->fw_ver_minor,
/*
 * Load firmware into the NIC: inflate+copy the image to SRAM via
 * mxge_load_firmware_helper(); if that fails and `adopt` is set, fall
 * back to adopting the firmware already running on the NIC.  Then hand
 * off execution to the new image through the bootstrap MCP mailbox and
 * poll the DMA'd confirmation word, as in mxge_dummy_rdma().
 */
946 mxge_load_firmware(mxge_softc_t *sc, int adopt)
948 volatile uint32_t *confirm;
949 volatile char *submit;
951 uint32_t *buf, size, dma_low, dma_high;
/* 8-byte align the stack command buffer */
954 buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
956 size = sc->sram_size;
957 status = mxge_load_firmware_helper(sc, &size);
961 /* Try to use the currently running firmware, if
963 status = mxge_adopt_running_firmware(sc);
965 device_printf(sc->dev,
966 "failed to adopt running firmware\n");
969 device_printf(sc->dev,
970 "Successfully adopted running firmware\n");
971 if (sc->tx_boundary == 4096) {
972 device_printf(sc->dev,
973 "Using firmware currently running on NIC"
975 device_printf(sc->dev,
976 "performance consider loading optimized "
/* adopted firmware: be conservative and assume unaligned completions */
979 sc->fw_name = mxge_fw_unaligned;
980 sc->tx_boundary = 2048;
983 /* clear confirmation addr */
984 confirm = (volatile uint32_t *)sc->cmd;
987 /* send a reload command to the bootstrap MCP, and wait for the
988 response in the confirmation address. The firmware should
989 write a -1 there to indicate it is alive and well
992 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
993 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
995 buf[0] = htobe32(dma_high); /* confirm addr MSW */
996 buf[1] = htobe32(dma_low); /* confirm addr LSW */
997 buf[2] = htobe32(0xffffffff); /* confirm data */
999 /* FIX: All newest firmware should un-protect the bottom of
1000 the sram before handoff. However, the very first interfaces
1001 do not. Therefore the handoff copy must skip the first 8 bytes
1003 /* where the code starts*/
1004 buf[3] = htobe32(MXGE_FW_OFFSET + 8);
1005 buf[4] = htobe32(size - 8); /* length of code */
1006 buf[5] = htobe32(8); /* where to copy to */
1007 buf[6] = htobe32(0); /* where to jump to */
1009 submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
1010 mxge_pio_copy(submit, buf, 64);
/* poll for the firmware's 0xffffffff acknowledgement */
1015 while (*confirm != 0xffffffff && i < 20) {
1018 bus_dmamap_sync(sc->cmd_dma.dmat,
1019 sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
1021 if (*confirm != 0xffffffff) {
1022 device_printf(sc->dev,"handoff failed (%p = 0x%x)",
/*
 * Push sc->mac_addr to the firmware: pack the six octets into
 * data0 (first four) and data1 (last two) and issue SET_MAC_ADDRESS.
 */
1031 mxge_update_mac_address(mxge_softc_t *sc)
1034 uint8_t *addr = sc->mac_addr;
1038 cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1039 | (addr[2] << 8) | addr[3]);
1041 cmd.data1 = ((addr[4] << 8) | (addr[5]));
1043 status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
/*
 * Enable or disable 802.3x flow control in the firmware according to
 * `pause`; logs on failure.
 */
1048 mxge_change_pause(mxge_softc_t *sc, int pause)
1054 status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1057 status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1061 device_printf(sc->dev, "Failed to set flow control mode\n");
/*
 * Set the firmware's promiscuous mode.  The mxge_always_promisc tunable
 * forces promiscuous on regardless of the interface flag.
 */
1069 mxge_change_promisc(mxge_softc_t *sc, int promisc)
1074 if (mxge_always_promisc)
1078 status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1081 status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1085 device_printf(sc->dev, "Failed to set promisc mode\n");
/*
 * Reprogram the firmware's multicast filter from the interface's
 * multicast list: temporarily enable ALLMULTI, flush the old filters,
 * join each AF_LINK group address, then re-enable filtering.  Stays in
 * ALLMULTI when IFF_ALLMULTI is set or the adopted-firmware rx-filter
 * bug workaround is active, or if any step fails.
 */
1090 mxge_set_multicast_list(mxge_softc_t *sc)
1093 struct ifmultiaddr *ifma;
1094 struct ifnet *ifp = sc->ifp;
1097 /* This firmware is known to not support multicast */
1098 if (!sc->fw_multicast_support)
1101 /* Disable multicast filtering while we play with the lists*/
1102 err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1104 device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
1105 " error status: %d\n", err);
/* buggy adopted firmware must stay in ALLMULTI (see adopt path) */
1109 if (sc->adopted_rx_filter_bug)
1112 if (ifp->if_flags & IFF_ALLMULTI)
1113 /* request to disable multicast filtering, so quit here */
1116 /* Flush all the filters */
1118 err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1120 device_printf(sc->dev,
1121 "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
1122 ", error status: %d\n", err);
1126 /* Walk the multicast list, and add each address */
1128 if_maddr_rlock(ifp);
1129 TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1130 if (ifma->ifma_addr->sa_family != AF_LINK)
/* split the 6-byte link-level address across data0/data1 */
1132 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1134 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
1136 cmd.data0 = htonl(cmd.data0);
1137 cmd.data1 = htonl(cmd.data1);
1138 err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1140 device_printf(sc->dev, "Failed "
1141 "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
1143 /* abort, leaving multicast filtering off */
1144 if_maddr_runlock(ifp);
1148 if_maddr_runlock(ifp);
1149 /* Enable multicast filtering */
1150 err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1152 device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
1153 ", error status: %d\n", err);
/*
 * Determine the largest MTU the receive path can support.  If a jumbo
 * page already covers MXGEFW_MAX_MTU, or the firmware accepts the
 * "always use n big buffers" mode (virtually contiguous jumbos), the
 * firmware limit applies; otherwise we are capped by MJUMPAGESIZE.
 */
1158 mxge_max_mtu(mxge_softc_t *sc)
1163 if (MJUMPAGESIZE - MXGEFW_PAD > MXGEFW_MAX_MTU)
1164 return MXGEFW_MAX_MTU - MXGEFW_PAD;
1166 /* try to set nbufs to see if it we can
1167 use virtually contiguous jumbos */
1169 status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1172 return MXGEFW_MAX_MTU - MXGEFW_PAD;
1174 /* otherwise, we're limited to MJUMPAGESIZE */
1175 return MJUMPAGESIZE - MXGEFW_PAD;
/*
 * Reset the NIC and re-establish the host/firmware shared state:
 * issue CMD_RESET, restart dummy RDMAs, size the interrupt queue,
 * (re)configure RSS slices, exchange interrupt queue DMA addresses and
 * the coalescing/ack/deassert register offsets, re-run the DMA
 * benchmark, zero all per-slice counters, and reapply MAC address,
 * promiscuity, pause and multicast settings.
 */
1179 mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1181 struct mxge_slice_state *ss;
1182 mxge_rx_done_t *rx_done;
1183 volatile uint32_t *irq_claim;
1187 /* try to send a reset command to the card to see if it
1189 memset(&cmd, 0, sizeof (cmd));
1190 status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1192 device_printf(sc->dev, "failed reset\n");
1196 mxge_dummy_rdma(sc, 1);
1199 /* set the intrq size */
1200 cmd.data0 = sc->rx_ring_size;
1201 status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1204 * Even though we already know how many slices are supported
1205 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
1206 * has magic side effects, and must be called after a reset.
1207 * It must be called prior to calling any RSS related cmds,
1208 * including assigning an interrupt queue for anything but
1209 * slice 0. It must also be called *after*
1210 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1211 * the firmware to compute offsets.
1214 if (sc->num_slices > 1) {
1215 /* ask the maximum number of slices it supports */
1216 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
1219 device_printf(sc->dev,
1220 "failed to get number of slices\n");
1224 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1225 * to setting up the interrupt queue DMA
1227 cmd.data0 = sc->num_slices;
1228 cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
1229 #ifdef IFNET_BUF_RING
1230 cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1232 status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
1235 device_printf(sc->dev,
1236 "failed to set number of slices\n");
1242 if (interrupts_setup) {
1243 /* Now exchange information about interrupts */
1244 for (slice = 0; slice < sc->num_slices; slice++) {
1245 rx_done = &sc->ss[slice].rx_done;
1246 memset(rx_done->entry, 0, sc->rx_ring_size);
1247 cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
1248 cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
1250 status |= mxge_send_cmd(sc,
1251 MXGEFW_CMD_SET_INTRQ_DMA,
/* fetch SRAM offsets of the coalescing / irq-ack / deassert registers */
1256 status |= mxge_send_cmd(sc,
1257 MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
1260 sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1262 status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1263 irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1266 status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
1268 sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1270 device_printf(sc->dev, "failed set interrupt parameters\n");
1275 *sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1278 /* run a DMA benchmark */
1279 (void) mxge_dma_test(sc, MXGEFW_DMA_TEST);
1281 for (slice = 0; slice < sc->num_slices; slice++) {
1282 ss = &sc->ss[slice];
/* each slice claims its own pair of irq-ack words */
1284 ss->irq_claim = irq_claim + (2 * slice);
1285 /* reset mcp/driver shared state back to 0 */
1286 ss->rx_done.idx = 0;
1287 ss->rx_done.cnt = 0;
1290 ss->tx.pkt_done = 0;
1291 ss->tx.queue_active = 0;
1292 ss->tx.activate = 0;
1293 ss->tx.deactivate = 0;
1298 ss->rx_small.cnt = 0;
1299 ss->lro_bad_csum = 0;
1301 ss->lro_flushed = 0;
1302 if (ss->fw_stats != NULL) {
1303 ss->fw_stats->valid = 0;
1304 ss->fw_stats->send_done_count = 0;
/* reapply host-side settings that the reset wiped */
1307 sc->rdma_tags_available = 15;
1308 status = mxge_update_mac_address(sc);
1309 mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
1310 mxge_change_pause(sc, sc->pause);
1311 mxge_set_multicast_list(sc);
/*
 * Sysctl handler for the interrupt coalescing delay: validate the new
 * value (non-zero, at most 1 second in usecs) and write it, big-endian,
 * straight into the firmware register under the driver lock.
 */
1316 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1319 unsigned int intr_coal_delay;
1323 intr_coal_delay = sc->intr_coal_delay;
1324 err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1328 if (intr_coal_delay == sc->intr_coal_delay)
1331 if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1334 mtx_lock(&sc->driver_mtx);
1335 *sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1336 sc->intr_coal_delay = intr_coal_delay;
1338 mtx_unlock(&sc->driver_mtx);
/*
 * Sysctl handler for flow control: on a changed value, forward the new
 * pause setting to the firmware via mxge_change_pause() under the
 * driver lock.
 */
1343 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1346 unsigned int enabled;
1350 enabled = sc->pause;
1351 err = sysctl_handle_int(oidp, &enabled, arg2, req);
1355 if (enabled == sc->pause)
1358 mtx_lock(&sc->driver_mtx);
1359 err = mxge_change_pause(sc, enabled);
1360 mtx_unlock(&sc->driver_mtx);
/*
 * Apply a new LRO segment count (0 disables IFCAP_LRO) and, if the
 * interface is running, bounce it so the new setting takes effect.
 * Caller holds the driver lock.
 */
1365 mxge_change_lro_locked(mxge_softc_t *sc, int lro_cnt)
1372 ifp->if_capenable &= ~IFCAP_LRO;
1374 ifp->if_capenable |= IFCAP_LRO;
1375 sc->lro_cnt = lro_cnt;
1376 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
1378 err = mxge_open(sc);
/*
 * Sysctl handler for the LRO count: validates the user's value and
 * defers the actual change to mxge_change_lro_locked() under the
 * driver lock.
 */
1384 mxge_change_lro(SYSCTL_HANDLER_ARGS)
1387 unsigned int lro_cnt;
1391 lro_cnt = sc->lro_cnt;
1392 err = sysctl_handle_int(oidp, &lro_cnt, arg2, req);
1396 if (lro_cnt == sc->lro_cnt)
1402 mtx_lock(&sc->driver_mtx);
1403 err = mxge_change_lro_locked(sc, lro_cnt);
1404 mtx_unlock(&sc->driver_mtx);
/*
 * Sysctl handler for read-only firmware counters, which live in memory
 * in network (big-endian) byte order: byte-swap before exporting.
 */
1409 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1415 arg2 = be32toh(*(int *)arg1);
1417 err = sysctl_handle_int(oidp, arg1, arg2, req);
/*
 * Tear down the per-slice sysctl trees and then the parent slice tree;
 * a NULL slice_sysctl_tree means nothing was ever registered.
 */
1423 mxge_rem_sysctls(mxge_softc_t *sc)
1425 struct mxge_slice_state *ss;
1428 if (sc->slice_sysctl_tree == NULL)
1431 for (slice = 0; slice < sc->num_slices; slice++) {
1432 ss = &sc->ss[slice];
1433 if (ss == NULL || ss->sysctl_tree == NULL)
1435 sysctl_ctx_free(&ss->sysctl_ctx);
1436 ss->sysctl_tree = NULL;
1438 sysctl_ctx_free(&sc->slice_sysctl_ctx);
1439 sc->slice_sysctl_tree = NULL;
/*
 * Register the driver's sysctl tree: static device info (firmware
 * version, serial number, DMA benchmark results), tunables (interrupt
 * coalescing, flow control, LRO), read-only firmware drop counters
 * (big-endian, exposed through mxge_handle_be32), and one subtree per
 * slice with rx/tx debug counters.
 * NOTE(review): decimated listing -- many OID name-string arguments and
 * some trailing arguments fall on missing numbered lines.
 */
1443 mxge_add_sysctls(mxge_softc_t *sc)
1445 struct sysctl_ctx_list *ctx;
1446 struct sysctl_oid_list *children;
1448 struct mxge_slice_state *ss;
1452 ctx = device_get_sysctl_ctx(sc->dev);
1453 children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
/* firmware stats block lives in slice 0 */
1454 fw = sc->ss[0].fw_stats;
1456 /* random information */
1457 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1459 CTLFLAG_RD, &sc->fw_version,
1460 0, "firmware version");
1461 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1463 CTLFLAG_RD, &sc->serial_number_string,
1464 0, "serial number");
1465 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1467 CTLFLAG_RD, &sc->product_code_string,
1469 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1471 CTLFLAG_RD, &sc->link_width,
1473 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1475 CTLFLAG_RD, &sc->tx_boundary,
1477 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1479 CTLFLAG_RD, &sc->wc,
1480 0, "write combining PIO?");
1481 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1483 CTLFLAG_RD, &sc->read_dma,
1484 0, "DMA Read speed in MB/s");
1485 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1487 CTLFLAG_RD, &sc->write_dma,
1488 0, "DMA Write speed in MB/s");
1489 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1490 "read_write_dma_MBs",
1491 CTLFLAG_RD, &sc->read_write_dma,
1492 0, "DMA concurrent Read/Write speed in MB/s");
1495 /* performance related tunables */
1496 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1498 CTLTYPE_INT|CTLFLAG_RW, sc,
1499 0, mxge_change_intr_coal,
1500 "I", "interrupt coalescing delay in usecs");
1502 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1503 "flow_control_enabled",
1504 CTLTYPE_INT|CTLFLAG_RW, sc,
1505 0, mxge_change_flow_control,
/* BUG(review): description below is a copy-paste from the coalescing
 * node -- it should describe flow control, not coalescing delay.
 * Cannot change the string literal in a documentation-only pass. */
1506 "I", "interrupt coalescing delay in usecs");
1508 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1510 CTLFLAG_RW, &mxge_deassert_wait,
1511 0, "Wait for IRQ line to go low in ihandler");
1513 /* stats block from firmware is in network byte order.
1515 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1517 CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1518 0, mxge_handle_be32,
1520 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1521 "rdma_tags_available",
1522 CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1523 0, mxge_handle_be32,
1524 "I", "rdma_tags_available");
1525 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1526 "dropped_bad_crc32",
1527 CTLTYPE_INT|CTLFLAG_RD,
1528 &fw->dropped_bad_crc32,
1529 0, mxge_handle_be32,
1530 "I", "dropped_bad_crc32");
1531 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1533 CTLTYPE_INT|CTLFLAG_RD,
1534 &fw->dropped_bad_phy,
1535 0, mxge_handle_be32,
1536 "I", "dropped_bad_phy");
1537 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1538 "dropped_link_error_or_filtered",
1539 CTLTYPE_INT|CTLFLAG_RD,
1540 &fw->dropped_link_error_or_filtered,
1541 0, mxge_handle_be32,
1542 "I", "dropped_link_error_or_filtered");
1543 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1544 "dropped_link_overflow",
1545 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1546 0, mxge_handle_be32,
1547 "I", "dropped_link_overflow");
1548 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1549 "dropped_multicast_filtered",
1550 CTLTYPE_INT|CTLFLAG_RD,
1551 &fw->dropped_multicast_filtered,
1552 0, mxge_handle_be32,
1553 "I", "dropped_multicast_filtered");
1554 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1555 "dropped_no_big_buffer",
1556 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1557 0, mxge_handle_be32,
1558 "I", "dropped_no_big_buffer");
1559 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1560 "dropped_no_small_buffer",
1561 CTLTYPE_INT|CTLFLAG_RD,
1562 &fw->dropped_no_small_buffer,
1563 0, mxge_handle_be32,
1564 "I", "dropped_no_small_buffer");
1565 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1567 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1568 0, mxge_handle_be32,
1569 "I", "dropped_overrun");
1570 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1572 CTLTYPE_INT|CTLFLAG_RD,
1574 0, mxge_handle_be32,
1575 "I", "dropped_pause");
1576 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1578 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1579 0, mxge_handle_be32,
1580 "I", "dropped_runt");
1582 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1583 "dropped_unicast_filtered",
1584 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
1585 0, mxge_handle_be32,
1586 "I", "dropped_unicast_filtered");
1588 /* verbose printing? */
1589 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1591 CTLFLAG_RW, &mxge_verbose,
1592 0, "verbose printing");
1595 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1597 CTLTYPE_INT|CTLFLAG_RW, sc,
1599 "I", "number of lro merge queues");
1602 /* add counters exported for debugging from all slices */
1603 sysctl_ctx_init(&sc->slice_sysctl_ctx);
1604 sc->slice_sysctl_tree =
1605 SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1606 "slice", CTLFLAG_RD, 0, "");
/* one numbered child node ("0", "1", ...) per slice */
1608 for (slice = 0; slice < sc->num_slices; slice++) {
1609 ss = &sc->ss[slice];
1610 sysctl_ctx_init(&ss->sysctl_ctx);
1611 ctx = &ss->sysctl_ctx;
1612 children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1613 sprintf(slice_num, "%d", slice);
1615 SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1617 children = SYSCTL_CHILDREN(ss->sysctl_tree);
1618 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1620 CTLFLAG_RD, &ss->rx_small.cnt,
1622 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1624 CTLFLAG_RD, &ss->rx_big.cnt,
1626 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1627 "lro_flushed", CTLFLAG_RD, &ss->lro_flushed,
1628 0, "number of lro merge queues flushed");
1630 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1631 "lro_queued", CTLFLAG_RD, &ss->lro_queued,
1632 0, "number of frames appended to lro merge"
/* without IFNET_BUF_RING only slice 0 transmits, so tx counters are
   registered conditionally (the guarding test is on a missing line) */
1635 #ifndef IFNET_BUF_RING
1636 /* only transmit from slice 0 for now */
1640 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1642 CTLFLAG_RD, &ss->tx.req,
1645 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1647 CTLFLAG_RD, &ss->tx.done,
1649 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1651 CTLFLAG_RD, &ss->tx.pkt_done,
1653 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1655 CTLFLAG_RD, &ss->tx.stall,
1657 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1659 CTLFLAG_RD, &ss->tx.wake,
1661 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1663 CTLFLAG_RD, &ss->tx.defrag,
1665 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1667 CTLFLAG_RD, &ss->tx.queue_active,
1668 0, "tx_queue_active");
1669 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1671 CTLFLAG_RD, &ss->tx.activate,
1673 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1675 CTLFLAG_RD, &ss->tx.deactivate,
1676 0, "tx_deactivate");
1680 /* copy an array of mcp_kreq_ether_send_t's to the mcp. Copy
1681 backwards one at a time and handle ring wraps */
/*
 * Slow path used when a request list would wrap the ring: copy the
 * send descriptors to the NIC one at a time from the highest slot
 * downward, masking each slot index so the wrap is handled correctly.
 * NOTE(review): decimated listing; the loop construct decrementing cnt
 * is on missing lines.
 */
1684 mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1685 mcp_kreq_ether_send_t *src, int cnt)
1687 int idx, starting_slot;
1688 starting_slot = tx->req;
/* mask keeps the slot index inside the ring */
1691 idx = (starting_slot + cnt) & tx->mask;
1692 mxge_pio_copy(&tx->lanai[idx],
1693 &src[cnt], sizeof(*src));
1699 * copy an array of mcp_kreq_ether_send_t's to the mcp. Copy
1700 * at most 32 bytes at a time, so as to avoid involving the software
1701 * pio handler in the nic. We re-write the first segment's flags
1702 * to mark them valid only after writing the entire chain
/*
 * Fast-path descriptor submission: PIO-copy the request list to the
 * NIC two descriptors (32 bytes) at a time when the list fits without
 * wrapping; otherwise fall back to mxge_submit_req_backwards().  The
 * first descriptor's flags are saved, written invalid, and restored
 * last so the NIC never sees a partially written chain.
 */
1706 mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
1711 volatile uint32_t *dst_ints;
1712 mcp_kreq_ether_send_t *srcp;
1713 volatile mcp_kreq_ether_send_t *dstp, *dst;
1716 idx = tx->req & tx->mask;
/* remember the valid flags; they are re-applied only at the end */
1718 last_flags = src->flags;
1721 dst = dstp = &tx->lanai[idx];
/* no ring wrap: stream pairs of descriptors with a barrier each 32B */
1724 if ((idx + cnt) < tx->mask) {
1725 for (i = 0; i < (cnt - 1); i += 2) {
1726 mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1727 wmb(); /* force write every 32 bytes */
1732 /* submit all but the first request, and ensure
1733 that it is submitted below */
1734 mxge_submit_req_backwards(tx, src, cnt);
1738 /* submit the first request */
1739 mxge_pio_copy(dstp, srcp, sizeof(*src));
1740 wmb(); /* barrier before setting valid flag */
1743 /* re-write the last 32-bits with the valid flags */
1744 src->flags = last_flags;
1745 src_ints = (uint32_t *)src;
1747 dst_ints = (volatile uint32_t *)dst;
1749 *dst_ints = *src_ints;
/*
 * Build and submit the send-descriptor chain for a TSO packet.
 * Parses the IP/TCP headers (copying them to ss->scratch if they do
 * not fit in the first mbuf), then walks the busdma segment list,
 * chopping segments at MSS boundaries and maintaining the firmware's
 * rdma_count bookkeeping.  pseudo_hdr_offset carries the MSS for TSO.
 * NOTE(review): decimated listing -- req/cnt initialization, the
 * segment-advance code, the descriptor-overflow error path and the
 * return are on missing numbered lines.
 */
1757 mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1758 int busdma_seg_cnt, int ip_off)
1761 mcp_kreq_ether_send_t *req;
1762 bus_dma_segment_t *seg;
1765 uint32_t low, high_swapped;
1766 int len, seglen, cum_len, cum_len_next;
1767 int next_is_first, chop, cnt, rdma_count, small;
1768 uint16_t pseudo_hdr_offset, cksum_offset, mss;
1769 uint8_t flags, flags_next;
1772 mss = m->m_pkthdr.tso_segsz;
1774 /* negative cum_len signifies to the
1775 * send loop that we are still in the
1776 * header portion of the TSO packet.
1779 /* ensure we have the ethernet, IP and TCP
1780 header together in the first mbuf, copy
1781 it to a scratch buffer if not */
1782 if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
1783 m_copydata(m, 0, ip_off + sizeof (*ip),
1785 ip = (struct ip *)(ss->scratch + ip_off);
1787 ip = (struct ip *)(mtod(m, char *) + ip_off);
/* same dance for the TCP header, sized by the real IP header length */
1789 if (__predict_false(m->m_len < ip_off + (ip->ip_hl << 2)
1791 m_copydata(m, 0, ip_off + (ip->ip_hl << 2)
1792 + sizeof (*tcp), ss->scratch);
1793 ip = (struct ip *)(mtod(m, char *) + ip_off);
1796 tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
/* cum_len starts at -(total header length); turns >= 0 in payload */
1797 cum_len = -(ip_off + ((ip->ip_hl + tcp->th_off) << 2));
1799 /* TSO implies checksum offload on this hardware */
1800 cksum_offset = ip_off + (ip->ip_hl << 2);
1801 flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1804 /* for TSO, pseudo_hdr_offset holds mss.
1805 * The firmware figures out where to put
1806 * the checksum by parsing the header. */
1807 pseudo_hdr_offset = htobe16(mss);
1814 /* "rdma_count" is the number of RDMAs belonging to the
1815 * current packet BEFORE the current send request. For
1816 * non-TSO packets, this is equal to "count".
1817 * For TSO packets, rdma_count needs to be reset
1818 * to 0 after a segment cut.
1820 * The rdma_count field of the send request is
1821 * the number of RDMAs of the packet starting at
1822 * that request. For TSO send requests with one ore more cuts
1823 * in the middle, this is the number of RDMAs starting
1824 * after the last cut in the request. All previous
1825 * segments before the last cut implicitly have 1 RDMA.
1827 * Since the number of RDMAs is not known beforehand,
1828 * it must be filled-in retroactively - after each
1829 * segmentation cut or at the end of the entire packet.
1832 while (busdma_seg_cnt) {
1833 /* Break the busdma segment up into pieces*/
1834 low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1835 high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1839 flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1841 cum_len_next = cum_len + seglen;
/* retroactively fix up the rdma_count of the request that started
 * the current RDMA run (branch-free bookkeeping below) */
1842 (req-rdma_count)->rdma_count = rdma_count + 1;
1843 if (__predict_true(cum_len >= 0)) {
1845 chop = (cum_len_next > mss);
1846 cum_len_next = cum_len_next % mss;
1847 next_is_first = (cum_len_next == 0);
1848 flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1849 flags_next |= next_is_first *
1851 rdma_count |= -(chop | next_is_first);
1852 rdma_count += chop & !next_is_first;
1853 } else if (cum_len_next >= 0) {
/* header just ended inside this segment: next request starts payload */
1858 small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1859 flags_next = MXGEFW_FLAGS_TSO_PLD |
1860 MXGEFW_FLAGS_FIRST |
1861 (small * MXGEFW_FLAGS_SMALL);
/* emit one send descriptor for this piece */
1864 req->addr_high = high_swapped;
1865 req->addr_low = htobe32(low);
1866 req->pseudo_hdr_offset = pseudo_hdr_offset;
1868 req->rdma_count = 1;
1869 req->length = htobe16(seglen);
1870 req->cksum_offset = cksum_offset;
1871 req->flags = flags | ((cum_len & 1) *
1872 MXGEFW_FLAGS_ALIGN_ODD);
1875 cum_len = cum_len_next;
1880 if (__predict_false(cksum_offset > seglen))
1881 cksum_offset -= seglen;
/* too many descriptors for one ring slot group: error out */
1884 if (__predict_false(cnt > tx->max_desc))
1890 (req-rdma_count)->rdma_count = rdma_count;
/* walk backwards marking trailing requests TSO_LAST until a
 * CHOP/FIRST boundary is found */
1894 req->flags |= MXGEFW_FLAGS_TSO_LAST;
1895 } while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1897 tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1898 mxge_submit_req(tx, tx->req_list, cnt);
1899 #ifdef IFNET_BUF_RING
1900 if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1901 /* tell the NIC to start polling this slice */
1903 tx->queue_active = 1;
/* error path: drop the mapping and report the overflow */
1911 bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1915 printf("tx->max_desc exceeded via TSO!\n");
1916 printf("mss = %d, %ld, %d!\n", mss,
1917 (long)seg - (long)tx->seg_list, tx->max_desc);
1924 #endif /* IFCAP_TSO4 */
1926 #ifdef MXGE_NEW_VLAN_API
1928 * We reproduce the software vlan tag insertion from
1929 * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
1930 * vlan tag insertion. We need to advertise this in order to have the
1931 * vlan interface respect our csum offload flags.
/*
 * Prepend an 802.1Q header carrying m_pkthdr.ether_vtag and clear
 * M_VLANTAG.  Returns the (possibly reallocated) mbuf, or NULL when
 * M_PREPEND/m_pullup fails (the missing lines presumably return NULL).
 */
1933 static struct mbuf *
1934 mxge_vlan_tag_insert(struct mbuf *m)
1936 struct ether_vlan_header *evl;
1938 M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_DONTWAIT);
1939 if (__predict_false(m == NULL))
/* make the whole vlan header contiguous */
1941 if (m->m_len < sizeof(*evl)) {
1942 m = m_pullup(m, sizeof(*evl));
1943 if (__predict_false(m == NULL))
1947 * Transform the Ethernet header into an Ethernet header
1948 * with 802.1Q encapsulation.
1950 evl = mtod(m, struct ether_vlan_header *);
/* slide src/dst MACs down over the freshly prepended bytes */
1951 bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
1952 (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
1953 evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
1954 evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
/* tag is now in the packet itself; drop the out-of-band flag */
1955 m->m_flags &= ~M_VLANTAG;
1958 #endif /* MXGE_NEW_VLAN_API */
/*
 * Encapsulate one outbound mbuf chain into send descriptors and submit
 * it: software-inserts the vlan tag if needed, DMA-maps the chain
 * (defragmenting once on EFBIG), branches to mxge_encap_tso() for TSO
 * packets, sets up checksum-offload fields, converts each DMA segment
 * to a descriptor, and pads runts to 60 bytes with the zeropad buffer.
 * NOTE(review): decimated listing -- drop/error paths, loop advances
 * and several assignments are on missing numbered lines.
 */
1961 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
1964 mcp_kreq_ether_send_t *req;
1965 bus_dma_segment_t *seg;
1970 int cnt, cum_len, err, i, idx, odd_flag, ip_off;
1971 uint16_t pseudo_hdr_offset;
1972 uint8_t flags, cksum_offset;
1979 ip_off = sizeof (struct ether_header);
1980 #ifdef MXGE_NEW_VLAN_API
/* software vlan insertion shifts the IP header by the encap length */
1981 if (m->m_flags & M_VLANTAG) {
1982 m = mxge_vlan_tag_insert(m);
1983 if (__predict_false(m == NULL))
1985 ip_off += ETHER_VLAN_ENCAP_LEN;
1988 /* (try to) map the frame for DMA */
1989 idx = tx->req & tx->mask;
1990 err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
1991 m, tx->seg_list, &cnt,
/* EFBIG: chain has too many segments -- defragment once and retry */
1993 if (__predict_false(err == EFBIG)) {
1994 /* Too many segments in the chain. Try
1996 m_tmp = m_defrag(m, M_NOWAIT);
1997 if (m_tmp == NULL) {
2002 err = bus_dmamap_load_mbuf_sg(tx->dmat,
2004 m, tx->seg_list, &cnt,
2007 if (__predict_false(err != 0)) {
2008 device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
2009 " packet len = %d\n", err, m->m_pkthdr.len);
2012 bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2013 BUS_DMASYNC_PREWRITE);
2014 tx->info[idx].m = m;
2017 /* TSO is different enough, we handle it in another routine */
2018 if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2019 mxge_encap_tso(ss, m, cnt, ip_off);
2026 pseudo_hdr_offset = 0;
2027 flags = MXGEFW_FLAGS_NO_TSO;
2029 /* checksum offloading? */
2030 if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
2031 /* ensure ip header is in first mbuf, copy
2032 it to a scratch buffer if not */
2033 if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
2034 m_copydata(m, 0, ip_off + sizeof (*ip),
2036 ip = (struct ip *)(ss->scratch + ip_off);
2038 ip = (struct ip *)(mtod(m, char *) + ip_off);
2040 cksum_offset = ip_off + (ip->ip_hl << 2);
2041 pseudo_hdr_offset = cksum_offset + m->m_pkthdr.csum_data;
2042 pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2043 req->cksum_offset = cksum_offset;
2044 flags |= MXGEFW_FLAGS_CKSUM;
2045 odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2049 if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2050 flags |= MXGEFW_FLAGS_SMALL;
2052 /* convert segments into a request list */
2055 req->flags = MXGEFW_FLAGS_FIRST;
2056 for (i = 0; i < cnt; i++) {
2058 htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2060 htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2061 req->length = htobe16(seg->ds_len);
2062 req->cksum_offset = cksum_offset;
/* cksum_offset decays to 0 once the header has been passed */
2063 if (cksum_offset > seg->ds_len)
2064 cksum_offset -= seg->ds_len;
2067 req->pseudo_hdr_offset = pseudo_hdr_offset;
2068 req->pad = 0; /* complete solid 16-byte block */
2069 req->rdma_count = 1;
2070 req->flags |= flags | ((cum_len & 1) * odd_flag);
2071 cum_len += seg->ds_len;
2077 /* pad runts to 60 bytes */
/* extra descriptor pointing at the shared zero-filled DMA buffer */
2081 htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2083 htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2084 req->length = htobe16(60 - cum_len);
2085 req->cksum_offset = 0;
2086 req->pseudo_hdr_offset = pseudo_hdr_offset;
2087 req->pad = 0; /* complete solid 16-byte block */
2088 req->rdma_count = 1;
2089 req->flags |= flags | ((cum_len & 1) * odd_flag);
2093 tx->req_list[0].rdma_count = cnt;
2095 /* print what the firmware will see */
2096 for (i = 0; i < cnt; i++) {
2097 printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
2098 "cso:%d, flags:0x%x, rdma:%d\n",
2099 i, (int)ntohl(tx->req_list[i].addr_high),
2100 (int)ntohl(tx->req_list[i].addr_low),
2101 (int)ntohs(tx->req_list[i].length),
2102 (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2103 tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2104 tx->req_list[i].rdma_count);
2106 printf("--------------\n");
/* mark the slot whose completion frees the mbuf, then kick the NIC */
2108 tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2109 mxge_submit_req(tx, tx->req_list, cnt);
2110 #ifdef IFNET_BUF_RING
2111 if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2112 /* tell the NIC to start polling this slice */
2114 tx->queue_active = 1;
2127 #ifdef IFNET_BUF_RING
/*
 * if_qflush method (multi-queue build): drain and free every mbuf
 * queued on each slice's buf_ring, under that slice's tx mutex.
 * NOTE(review): the m_freem() call and mtx_lock() are on missing lines.
 */
2129 mxge_qflush(struct ifnet *ifp)
2131 mxge_softc_t *sc = ifp->if_softc;
2136 for (slice = 0; slice < sc->num_slices; slice++) {
2137 tx = &sc->ss[slice].tx;
2139 while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2141 mtx_unlock(&tx->mtx);
/*
 * Multi-queue (IFNET_BUF_RING) transmit pump: while the ring has more
 * than max_desc free slots, dequeue from the per-slice drbr and encap.
 * Sets the per-slice OACTIVE flag when slots run out with work pending.
 * Caller holds the slice tx mutex (per the locked-suffix convention --
 * confirm against callers).
 */
2147 mxge_start_locked(struct mxge_slice_state *ss)
/* free slots = ring size minus outstanding (req - done) */
2158 while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2159 m = drbr_dequeue(ifp, tx->br);
2163 /* let BPF see it */
2166 /* give it to the nic */
2169 /* ran out of transmit slots */
2170 if (((ss->if_drv_flags & IFF_DRV_OACTIVE) == 0)
2171 && (!drbr_empty(ifp, tx->br))) {
2172 ss->if_drv_flags |= IFF_DRV_OACTIVE;
/*
 * Enqueue/transmit one mbuf on a slice with its tx mutex held.
 * Fast path: if the drbr is empty and descriptors are free, send the
 * packet directly; otherwise enqueue it and drain via
 * mxge_start_locked().  Not-running/OACTIVE interfaces just enqueue.
 */
2178 mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
/* interface not up (or flow-controlled): park the mbuf on the drbr */
2189 if ((ss->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
2191 err = drbr_enqueue(ifp, tx->br, m);
2195 if (drbr_empty(ifp, tx->br) &&
2196 ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2197 /* let BPF see it */
2199 /* give it to the nic */
2201 } else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2204 if (!drbr_empty(ifp, tx->br))
2205 mxge_start_locked(ss);
/*
 * if_transmit method: pick a tx slice from the mbuf's flowid (masked,
 * since num_slices is always a power of 2), then either transmit under
 * the slice lock or, if the lock is contended, just enqueue on the
 * slice's drbr for a later drain.
 */
2210 mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2212 mxge_softc_t *sc = ifp->if_softc;
2213 struct mxge_slice_state *ss;
2218 slice = m->m_pkthdr.flowid;
2219 slice &= (sc->num_slices - 1); /* num_slices always power of 2 */
2221 ss = &sc->ss[slice];
/* trylock avoids blocking the caller on a busy queue */
2224 if (mtx_trylock(&tx->mtx)) {
2225 err = mxge_transmit_locked(ss, m);
2226 mtx_unlock(&tx->mtx);
2228 err = drbr_enqueue(ifp, tx->br, m);
/*
 * Single-queue (no IFNET_BUF_RING) transmit pump: drain if_snd while
 * ring slots remain; on exhaustion set IFF_DRV_OACTIVE on the ifnet so
 * the stack stops handing us packets until mxge_tx_done() clears it.
 */
2237 mxge_start_locked(struct mxge_slice_state *ss)
2247 while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2248 IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
2252 /* let BPF see it */
2255 /* give it to the nic */
2258 /* ran out of transmit slots */
2259 if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
2260 sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
/*
 * if_start method (single-queue build): lock slice 0's tx mutex and
 * run the transmit pump.  Only slice 0 transmits in this mode.
 */
2266 mxge_start(struct ifnet *ifp)
2268 mxge_softc_t *sc = ifp->if_softc;
2269 struct mxge_slice_state *ss;
2271 /* only use the first slice for now */
2273 mtx_lock(&ss->tx.mtx);
2274 mxge_start_locked(ss);
2275 mtx_unlock(&ss->tx.mtx);
2279 * copy an array of mcp_kreq_ether_recv_t's to the mcp. Copy
2280 * at most 32 bytes at a time, so as to avoid involving the software
2281 * pio handler in the nic. We re-write the first segment's low
2282 * DMA address to mark it valid only after we write the entire chunk
/*
 * Post a batch of 8 receive descriptors: the first descriptor's
 * addr_low is temporarily set to 0xffffffff (invalid) so the NIC
 * ignores the chunk until the final single-word store of the real
 * address makes the whole batch visible atomically.
 */
2286 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2287 mcp_kreq_ether_recv_t *src)
2291 low = src->addr_low;
2292 src->addr_low = 0xffffffff;
2293 mxge_pio_copy(dst, src, 4 * sizeof (*src));
2295 mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
/* restore the saved address and write it last to validate the batch */
2297 src->addr_low = low;
2298 dst->addr_low = low;
/*
 * Allocate and DMA-map a small (MHLEN) receive mbuf for ring slot idx,
 * record it in rx->info/rx->shadow, and -- every 8th slot -- push the
 * 8-descriptor batch to the NIC with mxge_submit_8rx().
 * NOTE(review): decimated listing; allocation-failure handling and the
 * return value are on missing lines.
 */
2303 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2305 bus_dma_segment_t seg;
2307 mxge_rx_ring_t *rx = &ss->rx_small;
2310 m = m_gethdr(M_DONTWAIT, MT_DATA);
2317 err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2318 &seg, &cnt, BUS_DMA_NOWAIT);
2323 rx->info[idx].m = m;
/* shadow holds the host copy of the descriptor handed to the NIC */
2324 rx->shadow[idx].addr_low =
2325 htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2326 rx->shadow[idx].addr_high =
2327 htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
/* batch submit once a full group of 8 slots is ready */
2331 mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
/*
 * Allocate and DMA-map a big receive buffer (cluster or jumbo cluster,
 * chosen by rx->cl_size) for slot idx.  With MXGE_VIRT_JUMBOS a jumbo
 * frame spans several descriptors, so the extra segments are copied
 * into the following shadow slots; batches of 8 are then pushed to the
 * NIC as in mxge_get_buf_small().
 * NOTE(review): decimated listing; failure paths, idx increments and
 * the return are on missing lines.
 */
2338 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2340 bus_dma_segment_t seg[3];
2342 mxge_rx_ring_t *rx = &ss->rx_big;
/* standard cluster vs jumbo cluster, depending on configured size */
2343 if (rx->cl_size == MCLBYTES)
2344 m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
2346 m = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2352 m->m_len = rx->mlen;
2353 err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2354 seg, &cnt, BUS_DMA_NOWAIT);
2359 rx->info[idx].m = m;
2360 rx->shadow[idx].addr_low =
2361 htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2362 rx->shadow[idx].addr_high =
2363 htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2365 #if MXGE_VIRT_JUMBOS
/* remaining DMA segments of a multi-segment jumbo buffer */
2366 for (i = 1; i < cnt; i++) {
2367 rx->shadow[idx + i].addr_low =
2368 htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2369 rx->shadow[idx + i].addr_high =
2370 htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2375 for (i = 0; i < rx->nbufs; i++) {
2376 if ((idx & 7) == 7) {
2377 mxge_submit_8rx(&rx->lanai[idx - 7],
2378 &rx->shadow[idx - 7]);
2386 * Myri10GE hardware checksums are not valid if the sender
2387 * padded the frame with non-zero padding. This is because
2388 * the firmware just does a simple 16-bit 1s complement
2389 * checksum across the entire frame, excluding the first 14
2390 * bytes. It is best to simply to check the checksum and
2391 * tell the stack about it only if the checksum is good
/*
 * Validate the firmware's whole-frame checksum for IPv4 TCP/UDP:
 * fold in the pseudo-header with in_pseudo() so a correct packet
 * yields 0.  Non-IPv4 or non-TCP/UDP frames take the early-out path
 * (the return values live on missing lines).  Returns a uint16_t that
 * is zero when the checksum verified.
 */
2394 static inline uint16_t
2395 mxge_rx_csum(struct mbuf *m, int csum)
2397 struct ether_header *eh;
2401 eh = mtod(m, struct ether_header *);
2403 /* only deal with IPv4 TCP & UDP for now */
2404 if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2406 ip = (struct ip *)(eh + 1);
2407 if (__predict_false(ip->ip_p != IPPROTO_TCP &&
2408 ip->ip_p != IPPROTO_UDP))
/* pseudo-header folds src/dst/proto/len into the hw checksum */
2411 c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2412 htonl(ntohs(csum) + ntohs(ip->ip_len) +
2413 - (ip->ip_hl << 2) + ip->ip_p));
/*
 * Strip an 802.1Q header from a received frame: move the tag into mbuf
 * metadata (ether_vtag / M_VLANTAG with the new API, an m_tag with the
 * old one), adjust *csum so the firmware's whole-frame checksum stays
 * valid after removing the 4 encapsulation bytes, and slide the MAC
 * addresses over the removed header.
 */
2422 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2424 struct ether_vlan_header *evl;
2425 struct ether_header *eh;
2428 evl = mtod(m, struct ether_vlan_header *);
2429 eh = mtod(m, struct ether_header *);
2432 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
2433 * after what the firmware thought was the end of the ethernet
2437 /* put checksum into host byte order */
2438 *csum = ntohs(*csum);
/* one's-complement subtraction of the 4 vlan bytes, with carry folds */
2439 partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2440 (*csum) += ~partial;
2441 (*csum) += ((*csum) < ~partial);
2442 (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2443 (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2445 /* restore checksum to network byte order;
2446 later consumers expect this */
2447 *csum = htons(*csum);
2450 #ifdef MXGE_NEW_VLAN_API
/* modern stack: tag travels in the pkthdr */
2451 m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
/* legacy stack: tag travels as an mbuf tag */
2455 mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2459 VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2460 m_tag_prepend(m, mtag);
2464 m->m_flags |= M_VLANTAG;
2467 * Remove the 802.1q header by copying the Ethernet
2468 * addresses over it and adjusting the beginning of
2469 * the data in the mbuf. The encapsulated Ethernet
2470 * type field is already in place.
2472 bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2473 ETHER_HDR_LEN - ETHER_TYPE_LEN);
2474 m_adj(m, ETHER_VLAN_ENCAP_LEN);
/*
 * Deliver one completed big-ring receive of `len` bytes: swap in a
 * replacement buffer (dropping the frame and recycling the old mbuf if
 * allocation fails), unmap the old buffer, strip any vlan header,
 * verify the hardware checksum (trying LRO first for valid TCP), tag
 * the flowid for RSS, and hand the mbuf to the stack.
 * NOTE(review): nearly identical to mxge_rx_done_small() below --
 * candidate for a shared helper once the full file is in view.
 */
2479 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2484 struct ether_header *eh;
2486 bus_dmamap_t old_map;
2488 uint16_t tcpudp_csum;
2493 idx = rx->cnt & rx->mask;
/* a jumbo receive consumes nbufs ring entries */
2494 rx->cnt += rx->nbufs;
2495 /* save a pointer to the received mbuf */
2496 m = rx->info[idx].m;
2497 /* try to replace the received mbuf */
2498 if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2499 /* drop the frame -- the old mbuf is re-cycled */
2504 /* unmap the received buffer */
2505 old_map = rx->info[idx].map;
2506 bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2507 bus_dmamap_unload(rx->dmat, old_map);
2509 /* swap the bus_dmamap_t's */
2510 rx->info[idx].map = rx->extra_map;
2511 rx->extra_map = old_map;
2513 /* mcp implicitly skips 1st 2 bytes so that packet is properly
2515 m->m_data += MXGEFW_PAD;
2517 m->m_pkthdr.rcvif = ifp;
2518 m->m_len = m->m_pkthdr.len = len;
2520 eh = mtod(m, struct ether_header *);
2521 if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2522 mxge_vlan_tag_remove(m, &csum);
2524 /* if the checksum is valid, mark it in the mbuf header */
2525 if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
/* LRO consumed the mbuf: nothing more to do here */
2526 if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2528 /* otherwise, it was a UDP frame, or a TCP frame which
2529 we could not do LRO on. Tell the stack that the
2531 m->m_pkthdr.csum_data = 0xffff;
2532 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2534 /* flowid only valid if RSS hashing is enabled */
2535 if (sc->num_slices > 1) {
2536 m->m_pkthdr.flowid = (ss - sc->ss);
2537 m->m_flags |= M_FLOWID;
2539 /* pass the frame up the stack */
2540 (*ifp->if_input)(ifp, m);
/*
 * Deliver one completed small-ring receive; same flow as
 * mxge_rx_done_big() but for single-mbuf MHLEN buffers (uses
 * mxge_get_buf_small() for the replacement and advances rx->cnt by one
 * slot -- the increment is on a missing line).
 */
2544 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2548 struct ether_header *eh;
2551 bus_dmamap_t old_map;
2553 uint16_t tcpudp_csum;
2558 idx = rx->cnt & rx->mask;
2560 /* save a pointer to the received mbuf */
2561 m = rx->info[idx].m;
2562 /* try to replace the received mbuf */
2563 if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2564 /* drop the frame -- the old mbuf is re-cycled */
2569 /* unmap the received buffer */
2570 old_map = rx->info[idx].map;
2571 bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2572 bus_dmamap_unload(rx->dmat, old_map);
2574 /* swap the bus_dmamap_t's */
2575 rx->info[idx].map = rx->extra_map;
2576 rx->extra_map = old_map;
2578 /* mcp implicitly skips 1st 2 bytes so that packet is properly
2580 m->m_data += MXGEFW_PAD;
2582 m->m_pkthdr.rcvif = ifp;
2583 m->m_len = m->m_pkthdr.len = len;
2585 eh = mtod(m, struct ether_header *);
2586 if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2587 mxge_vlan_tag_remove(m, &csum);
2589 /* if the checksum is valid, mark it in the mbuf header */
2590 if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2591 if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2593 /* otherwise, it was a UDP frame, or a TCP frame which
2594 we could not do LRO on. Tell the stack that the
2596 m->m_pkthdr.csum_data = 0xffff;
2597 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2599 /* flowid only valid if RSS hashing is enabled */
2600 if (sc->num_slices > 1) {
2601 m->m_pkthdr.flowid = (ss - sc->ss);
2602 m->m_flags |= M_FLOWID;
2604 /* pass the frame up the stack */
2605 (*ifp->if_input)(ifp, m);
/*
 * Drain the slice's firmware rx-completion ring: for each non-zero
 * length entry, clear it, dispatch to the small- or big-buffer handler
 * by size, and advance the ring index.  A livelock limit caps how many
 * entries are processed in one call, and any active LRO sessions are
 * flushed afterwards so merged segments are not held indefinitely.
 */
2609 mxge_clean_rx_done(struct mxge_slice_state *ss)
2611 mxge_rx_done_t *rx_done = &ss->rx_done;
/* length == 0 marks an empty completion slot */
2617 while (rx_done->entry[rx_done->idx].length != 0) {
2618 length = ntohs(rx_done->entry[rx_done->idx].length);
2619 rx_done->entry[rx_done->idx].length = 0;
2620 checksum = rx_done->entry[rx_done->idx].checksum;
/* frames that fit an MHLEN mbuf came from the small ring */
2621 if (length <= (MHLEN - MXGEFW_PAD))
2622 mxge_rx_done_small(ss, length, checksum);
2624 mxge_rx_done_big(ss, length, checksum);
2626 rx_done->idx = rx_done->cnt & rx_done->mask;
2628 /* limit potential for livelock */
2629 if (__predict_false(++limit > rx_done->mask / 2))
/* flush LRO state accumulated during this pass */
2633 while (!SLIST_EMPTY(&ss->lro_active)) {
2634 struct lro_entry *lro = SLIST_FIRST(&ss->lro_active);
2635 SLIST_REMOVE_HEAD(&ss->lro_active, next);
2636 mxge_lro_flush(ss, lro);
/*
 * Reap completed transmits up to the firmware's reported index
 * (mcp_idx): free mbufs and DMA maps for slots whose `flag` marks the
 * end of a packet, update byte/multicast stats, clear OACTIVE and
 * restart the queue when at least a quarter of the ring is free, and
 * (multi-queue) tell the NIC to stop polling an empty queue.
 */
2643 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2654 while (tx->pkt_done != mcp_idx) {
2655 idx = tx->done & tx->mask;
2657 m = tx->info[idx].m;
2658 /* mbuf and DMA map only attached to the first
2661 ss->obytes += m->m_pkthdr.len;
2662 if (m->m_flags & M_MCAST)
2665 tx->info[idx].m = NULL;
2666 map = tx->info[idx].map;
2667 bus_dmamap_unload(tx->dmat, map);
/* flag marks the final descriptor of a packet; bump pkt_done there */
2670 if (tx->info[idx].flag) {
2671 tx->info[idx].flag = 0;
2676 /* If we have space, clear IFF_OACTIVE to tell the stack that
2677 its OK to send packets */
2678 #ifdef IFNET_BUF_RING
2679 flags = &ss->if_drv_flags;
2681 flags = &ifp->if_drv_flags;
2683 mtx_lock(&ss->tx.mtx);
2684 if ((*flags) & IFF_DRV_OACTIVE &&
2685 tx->req - tx->done < (tx->mask + 1)/4) {
2686 *(flags) &= ~IFF_DRV_OACTIVE;
2688 mxge_start_locked(ss);
2690 #ifdef IFNET_BUF_RING
/* NOTE(review): the inner `tx->req == tx->done` test repeats the
 * outer condition -- redundant but harmless as written. */
2691 if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2692 /* let the NIC stop polling this queue, since there
2693 * are no more transmits pending */
2694 if (tx->req == tx->done) {
2696 tx->queue_active = 0;
2702 mtx_unlock(&ss->tx.mtx);
/*
 * Module-compliance lookup tables used by mxge_media_probe(): each
 * entry maps a bit of the transceiver's compliance byte to an ifmedia
 * type (0 where FreeBSD has no matching IFM_* constant) and a name.
 * XFP table is indexed against MXGE_XFP_COMPLIANCE_BYTE; the SFP+
 * table against the SFP+ equivalent.
 */
2706 static struct mxge_media_type mxge_xfp_media_types[] =
2708 {IFM_10G_CX4, 0x7f, "10GBASE-CX4 (module)"},
2709 {IFM_10G_SR, (1 << 7), "10GBASE-SR"},
2710 {IFM_10G_LR, (1 << 6), "10GBASE-LR"},
2711 {0, (1 << 5), "10GBASE-ER"},
2712 {IFM_10G_LRM, (1 << 4), "10GBASE-LRM"},
2713 {0, (1 << 3), "10GBASE-SW"},
2714 {0, (1 << 2), "10GBASE-LW"},
2715 {0, (1 << 1), "10GBASE-EW"},
2716 {0, (1 << 0), "Reserved"}
2718 static struct mxge_media_type mxge_sfp_media_types[] =
2720 {0, (1 << 7), "Reserved"},
2721 {IFM_10G_LRM, (1 << 6), "10GBASE-LRM"},
2722 {IFM_10G_LR, (1 << 5), "10GBASE-LR"},
2723 {IFM_10G_SR, (1 << 4), "10GBASE-SR"}
/*
 * Record a detected media type: OR it into sc->media_flags and
 * register/select the combined flags with ifmedia.
 */
2727 mxge_set_media(mxge_softc_t *sc, int type)
2729 sc->media_flags |= type;
2730 ifmedia_add(&sc->media, sc->media_flags, 0, NULL);
2731 ifmedia_set(&sc->media, sc->media_flags);
2736 * Determine the media type for a NIC. Some XFPs will identify
2737 * themselves only when their link is up, so this is initiated via a
2738 * link up interrupt. However, this can potentially take up to
2739 * several milliseconds, so it is run via the watchdog routine, rather
2740 * than in the interrupt handler itself. This need only be done
2741 * once, not each time the link is up.
/*
 * Implementation outline: parse the EEPROM product-code string (the
 * character after the 3rd dash selects CX4 / Quad Ribbon / XFP / SFP+),
 * then for cage types, issue firmware I2C commands to read the module's
 * compliance byte (polling up to ~50ms for the cached result) and map
 * it through the tables above.
 * NOTE(review): decimated listing -- several returns, the cage_type
 * assignments and some branch bodies are on missing lines.
 */
2744 mxge_media_probe(mxge_softc_t *sc)
2749 struct mxge_media_type *mxge_media_types = NULL;
2750 int i, err, ms, mxge_media_type_entries;
2753 sc->need_media_probe = 0;
2755 /* if we've already set a media type, we're done */
2756 if (sc->media_flags != (IFM_ETHER | IFM_AUTO))
2760 * parse the product code to deterimine the interface type
2761 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2762 * after the 3rd dash in the driver's cached copy of the
2763 * EEPROM's product code string.
2765 ptr = sc->product_code_string;
2767 device_printf(sc->dev, "Missing product code\n");
/* advance past three dashes; index() returns NULL if one is missing */
2770 for (i = 0; i < 3; i++, ptr++) {
2771 ptr = index(ptr, '-');
2773 device_printf(sc->dev,
2774 "only %d dashes in PC?!?\n", i);
2780 mxge_set_media(sc, IFM_10G_CX4);
2783 else if (*ptr == 'Q') {
2784 /* -Q is Quad Ribbon Fiber */
2785 device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2786 /* FreeBSD has no media type for Quad ribbon fiber */
2792 mxge_media_types = mxge_xfp_media_types;
2793 mxge_media_type_entries =
2794 sizeof (mxge_xfp_media_types) /
2795 sizeof (mxge_xfp_media_types[0]);
2796 byte = MXGE_XFP_COMPLIANCE_BYTE;
2800 if (*ptr == 'S' || *(ptr +1) == 'S') {
2801 /* -S or -2S is SFP+ */
2802 mxge_media_types = mxge_sfp_media_types;
2803 mxge_media_type_entries =
2804 sizeof (mxge_sfp_media_types) /
2805 sizeof (mxge_sfp_media_types[0]);
2810 if (mxge_media_types == NULL) {
2811 device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2816 * At this point we know the NIC has an XFP cage, so now we
2817 * try to determine what is in the cage by using the
2818 * firmware's XFP I2C commands to read the XFP 10GbE compilance
2819 * register. We read just one byte, which may take over
2823 cmd.data0 = 0; /* just fetch 1 byte, not all 256 */
2825 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2826 if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
2827 device_printf(sc->dev, "failed to read XFP\n");
2829 if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
2830 device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
2832 if (err != MXGEFW_CMD_OK) {
2836 /* now we wait for the data to be cached */
2838 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
/* poll for up to 50ms while the firmware caches the byte */
2839 for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
2842 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2844 if (err != MXGEFW_CMD_OK) {
2845 device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
2846 cage_type, err, ms);
/* entry [0] is a full-mask special case (CX4 module in an XFP cage) */
2850 if (cmd.data0 == mxge_media_types[0].bitmask) {
2852 device_printf(sc->dev, "%s:%s\n", cage_type,
2853 mxge_media_types[0].name);
2854 mxge_set_media(sc, IFM_10G_CX4);
2857 for (i = 1; i < mxge_media_type_entries; i++) {
2858 if (cmd.data0 & mxge_media_types[i].bitmask) {
2860 device_printf(sc->dev, "%s:%s\n",
2862 mxge_media_types[i].name);
2864 mxge_set_media(sc, mxge_media_types[i].flag);
2868 device_printf(sc->dev, "%s media 0x%x unknown\n", cage_type,
/*
 * Per-slice interrupt handler (registered for both MSI-X and legacy INTx;
 * see mxge_add_msix_irqs/mxge_add_single_irq).  Drains TX completions and
 * the RX done ring until the firmware stats block reports no more work,
 * then writes the IRQ claim register(s) to return the token to the NIC.
 * Link/error statistics are processed only on slice 0 (ss == sc->ss).
 * NOTE(review): this is a gapped extract -- original source lines are
 * missing between the numbered lines below (e.g. the !IFNET_BUF_RING
 * early-return path and several closing braces); do not assume the
 * visible statements are contiguous.
 */
2875 mxge_intr(void *arg)
2877 struct mxge_slice_state *ss = arg;
2878 mxge_softc_t *sc = ss->sc;
2879 mcp_irq_data_t *stats = ss->fw_stats;
2880 mxge_tx_ring_t *tx = &ss->tx;
2881 mxge_rx_done_t *rx_done = &ss->rx_done;
2882 uint32_t send_done_count;
2886 #ifndef IFNET_BUF_RING
2887 /* an interrupt on a non-zero slice is implicitly valid
2888 since MSI-X irqs are not shared */
2890 mxge_clean_rx_done(ss);
2891 *ss->irq_claim = be32toh(3);
2896 /* make sure the DMA has finished */
2897 if (!stats->valid) {
2900 valid = stats->valid;
2902 if (sc->legacy_irq) {
2903 /* lower legacy IRQ */
2904 *sc->irq_deassert = 0;
2905 if (!mxge_deassert_wait)
2906 /* don't wait for conf. that irq is low */
2912 /* loop while waiting for legacy irq deassertion */
2914 /* check for transmit completes and receives */
2915 send_done_count = be32toh(stats->send_done_count);
2916 while ((send_done_count != tx->pkt_done) ||
2917 (rx_done->entry[rx_done->idx].length != 0)) {
2918 if (send_done_count != tx->pkt_done)
2919 mxge_tx_done(ss, (int)send_done_count);
2920 mxge_clean_rx_done(ss);
2921 send_done_count = be32toh(stats->send_done_count);
2923 if (sc->legacy_irq && mxge_deassert_wait)
/* re-read valid through a volatile alias so the compiler cannot cache it */
2925 } while (*((volatile uint8_t *) &stats->valid));
2927 /* fw link & error stats meaningful only on the first slice */
2928 if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
2929 if (sc->link_state != stats->link_up) {
2930 sc->link_state = stats->link_up;
2931 if (sc->link_state) {
2932 if_link_state_change(sc->ifp, LINK_STATE_UP);
2934 device_printf(sc->dev, "link up\n");
2936 if_link_state_change(sc->ifp, LINK_STATE_DOWN);
2938 device_printf(sc->dev, "link down\n");
2940 sc->need_media_probe = 1;
2942 if (sc->rdma_tags_available !=
2943 be32toh(stats->rdma_tags_available)) {
2944 sc->rdma_tags_available =
2945 be32toh(stats->rdma_tags_available);
2946 device_printf(sc->dev, "RDMA timed out! %d tags "
2947 "left\n", sc->rdma_tags_available);
2950 if (stats->link_down) {
2951 sc->down_cnt += stats->link_down;
2953 if_link_state_change(sc->ifp, LINK_STATE_DOWN);
2957 /* check to see if we have rx token to pass back */
2959 *ss->irq_claim = be32toh(3);
/* second claim word -- presumably the RX token; TODO confirm vs. firmware spec */
2960 *(ss->irq_claim + 1) = be32toh(3);
2964 mxge_init(void *arg)
/*
 * Release every mbuf a slice holds: free all LRO bookkeeping entries,
 * then unload the DMA maps and free the mbufs of the big RX ring, the
 * small RX ring, and (first slice only) the TX ring.
 * NOTE(review): gapped extract -- `continue` statements and closing
 * braces for the loops are among the missing original lines.
 */
2971 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
2973 struct lro_entry *lro_entry;
2976 while (!SLIST_EMPTY(&ss->lro_free)) {
2977 lro_entry = SLIST_FIRST(&ss->lro_free);
2978 SLIST_REMOVE_HEAD(&ss->lro_free, next);
2979 kfree(lro_entry, M_DEVBUF);
2982 for (i = 0; i <= ss->rx_big.mask; i++) {
2983 if (ss->rx_big.info[i].m == NULL)
2985 bus_dmamap_unload(ss->rx_big.dmat,
2986 ss->rx_big.info[i].map);
2987 m_freem(ss->rx_big.info[i].m);
2988 ss->rx_big.info[i].m = NULL;
2991 for (i = 0; i <= ss->rx_small.mask; i++) {
2992 if (ss->rx_small.info[i].m == NULL)
2994 bus_dmamap_unload(ss->rx_small.dmat,
2995 ss->rx_small.info[i].map);
2996 m_freem(ss->rx_small.info[i].m);
2997 ss->rx_small.info[i].m = NULL;
3000 /* transmit ring used only on the first slice */
3001 if (ss->tx.info == NULL)
3004 for (i = 0; i <= ss->tx.mask; i++) {
3005 ss->tx.info[i].flag = 0;
3006 if (ss->tx.info[i].m == NULL)
3008 bus_dmamap_unload(ss->tx.dmat,
3009 ss->tx.info[i].map);
3010 m_freem(ss->tx.info[i].m);
3011 ss->tx.info[i].m = NULL;
/* Free the mbufs of every slice by delegating to mxge_free_slice_mbufs(). */
3016 mxge_free_mbufs(mxge_softc_t *sc)
3020 for (slice = 0; slice < sc->num_slices; slice++)
3021 mxge_free_slice_mbufs(&sc->ss[slice]);
3025 mxge_free_slice_rings(struct mxge_slice_state *ss)
3030 if (ss->rx_done.entry != NULL)
3031 mxge_dma_free(&ss->rx_done.dma);
3032 ss->rx_done.entry = NULL;
3034 if (ss->tx.req_bytes != NULL)
3035 kfree(ss->tx.req_bytes, M_DEVBUF);
3036 ss->tx.req_bytes = NULL;
3038 if (ss->tx.seg_list != NULL)
3039 kfree(ss->tx.seg_list, M_DEVBUF);
3040 ss->tx.seg_list = NULL;
3042 if (ss->rx_small.shadow != NULL)
3043 kfree(ss->rx_small.shadow, M_DEVBUF);
3044 ss->rx_small.shadow = NULL;
3046 if (ss->rx_big.shadow != NULL)
3047 kfree(ss->rx_big.shadow, M_DEVBUF);
3048 ss->rx_big.shadow = NULL;
3050 if (ss->tx.info != NULL) {
3051 if (ss->tx.dmat != NULL) {
3052 for (i = 0; i <= ss->tx.mask; i++) {
3053 bus_dmamap_destroy(ss->tx.dmat,
3054 ss->tx.info[i].map);
3056 bus_dma_tag_destroy(ss->tx.dmat);
3058 kfree(ss->tx.info, M_DEVBUF);
3062 if (ss->rx_small.info != NULL) {
3063 if (ss->rx_small.dmat != NULL) {
3064 for (i = 0; i <= ss->rx_small.mask; i++) {
3065 bus_dmamap_destroy(ss->rx_small.dmat,
3066 ss->rx_small.info[i].map);
3068 bus_dmamap_destroy(ss->rx_small.dmat,
3069 ss->rx_small.extra_map);
3070 bus_dma_tag_destroy(ss->rx_small.dmat);
3072 kfree(ss->rx_small.info, M_DEVBUF);
3074 ss->rx_small.info = NULL;
3076 if (ss->rx_big.info != NULL) {
3077 if (ss->rx_big.dmat != NULL) {
3078 for (i = 0; i <= ss->rx_big.mask; i++) {
3079 bus_dmamap_destroy(ss->rx_big.dmat,
3080 ss->rx_big.info[i].map);
3082 bus_dmamap_destroy(ss->rx_big.dmat,
3083 ss->rx_big.extra_map);
3084 bus_dma_tag_destroy(ss->rx_big.dmat);
3086 kfree(ss->rx_big.info, M_DEVBUF);
3088 ss->rx_big.info = NULL;
/* Free the ring resources of every slice via mxge_free_slice_rings(). */
3092 mxge_free_rings(mxge_softc_t *sc)
3096 for (slice = 0; slice < sc->num_slices; slice++)
3097 mxge_free_slice_rings(&sc->ss[slice]);
/*
 * Allocate all per-slice ring state for the given RX/TX ring sizes
 * (both are powers of two; masks are size-1):
 *  - RX shadow rings and host info arrays (small + big),
 *  - busdma tags and one dmamap per RX slot plus an "extra" map each,
 *  - TX request copy block (8-byte aligned into req_bytes), segment
 *    list, info array, busdma tag and per-slot maps.
 * Without IFNET_BUF_RING only slice 0 gets TX resources.
 * Returns 0 on success; error paths (gotos to unwind labels) are among
 * the original lines missing from this extract.
 * NOTE(review): gapped extract -- #else/#endif arms of the
 * MXGE_VIRT_JUMBOS conditionals and the error-handling gotos are not
 * visible; do not assume the visible statements are contiguous.
 */
3101 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3102 int tx_ring_entries)
3104 mxge_softc_t *sc = ss->sc;
3110 /* allocate per-slice receive resources */
3112 ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
/* the rx_done ring holds events for both small and big rings */
3113 ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3115 /* allocate the rx shadow rings */
3116 bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3117 ss->rx_small.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3118 if (ss->rx_small.shadow == NULL)
3121 bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3122 ss->rx_big.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3123 if (ss->rx_big.shadow == NULL)
3126 /* allocate the rx host info rings */
3127 bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3128 ss->rx_small.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3129 if (ss->rx_small.info == NULL)
3132 bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3133 ss->rx_big.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3134 if (ss->rx_big.info == NULL)
3137 /* allocate the rx busdma resources */
3138 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3140 4096, /* boundary */
3141 BUS_SPACE_MAXADDR, /* low */
3142 BUS_SPACE_MAXADDR, /* high */
3143 NULL, NULL, /* filter */
3144 MHLEN, /* maxsize */
3146 MHLEN, /* maxsegsize */
3147 BUS_DMA_ALLOCNOW, /* flags */
3148 NULL, NULL, /* lock */
3149 &ss->rx_small.dmat); /* tag */
3151 device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3156 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3158 #if MXGE_VIRT_JUMBOS
3159 4096, /* boundary */
3163 BUS_SPACE_MAXADDR, /* low */
3164 BUS_SPACE_MAXADDR, /* high */
3165 NULL, NULL, /* filter */
3166 3*4096, /* maxsize */
3167 #if MXGE_VIRT_JUMBOS
3169 4096, /* maxsegsize*/
3172 MJUM9BYTES, /* maxsegsize*/
3174 BUS_DMA_ALLOCNOW, /* flags */
3175 NULL, NULL, /* lock */
3176 &ss->rx_big.dmat); /* tag */
3178 device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3182 for (i = 0; i <= ss->rx_small.mask; i++) {
3183 err = bus_dmamap_create(ss->rx_small.dmat, 0,
3184 &ss->rx_small.info[i].map);
3186 device_printf(sc->dev, "Err %d rx_small dmamap\n",
3191 err = bus_dmamap_create(ss->rx_small.dmat, 0,
3192 &ss->rx_small.extra_map);
3194 device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3199 for (i = 0; i <= ss->rx_big.mask; i++) {
3200 err = bus_dmamap_create(ss->rx_big.dmat, 0,
3201 &ss->rx_big.info[i].map);
3203 device_printf(sc->dev, "Err %d rx_big dmamap\n",
3208 err = bus_dmamap_create(ss->rx_big.dmat, 0,
3209 &ss->rx_big.extra_map);
3211 device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3216 /* now allocate TX resouces */
3218 #ifndef IFNET_BUF_RING
3219 /* only use a single TX ring for now */
3220 if (ss != ss->sc->ss)
3224 ss->tx.mask = tx_ring_entries - 1;
3225 ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3228 /* allocate the tx request copy block */
3230 sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3231 ss->tx.req_bytes = kmalloc(bytes, M_DEVBUF, M_WAITOK);
3232 if (ss->tx.req_bytes == NULL)
3234 /* ensure req_list entries are aligned to 8 bytes */
3235 ss->tx.req_list = (mcp_kreq_ether_send_t *)
3236 ((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
3238 /* allocate the tx busdma segment list */
3239 bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3240 ss->tx.seg_list = (bus_dma_segment_t *)
3241 kmalloc(bytes, M_DEVBUF, M_WAITOK);
3242 if (ss->tx.seg_list == NULL)
3245 /* allocate the tx host info ring */
3246 bytes = tx_ring_entries * sizeof (*ss->tx.info);
3247 ss->tx.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3248 if (ss->tx.info == NULL)
3251 /* allocate the tx busdma resources */
3252 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3254 sc->tx_boundary, /* boundary */
3255 BUS_SPACE_MAXADDR, /* low */
3256 BUS_SPACE_MAXADDR, /* high */
3257 NULL, NULL, /* filter */
3258 65536 + 256, /* maxsize */
3259 ss->tx.max_desc - 2, /* num segs */
3260 sc->tx_boundary, /* maxsegsz */
3261 BUS_DMA_ALLOCNOW, /* flags */
3262 NULL, NULL, /* lock */
3263 &ss->tx.dmat); /* tag */
3266 device_printf(sc->dev, "Err %d allocating tx dmat\n",
3271 /* now use these tags to setup dmamaps for each slot
3273 for (i = 0; i <= ss->tx.mask; i++) {
3274 err = bus_dmamap_create(ss->tx.dmat, 0,
3275 &ss->tx.info[i].map);
3277 device_printf(sc->dev, "Err %d tx dmamap\n",
/*
 * Query the firmware for the TX ring size, derive TX/RX entry counts,
 * size the ifnet send queue accordingly, and allocate rings for every
 * slice.  On slice allocation failure, unwinds with mxge_free_rings().
 * NOTE(review): gapped extract -- error checks after mxge_send_cmd and
 * the success return path are among the missing original lines.
 */
3287 mxge_alloc_rings(mxge_softc_t *sc)
3291 int tx_ring_entries, rx_ring_entries;
3294 /* get ring sizes */
3295 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3296 tx_ring_size = cmd.data0;
3298 device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3302 tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3303 rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3304 IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
3305 sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
3306 IFQ_SET_READY(&sc->ifp->if_snd);
3308 for (slice = 0; slice < sc->num_slices; slice++) {
3309 err = mxge_alloc_slice_rings(&sc->ss[slice],
3318 mxge_free_rings(sc);
/*
 * Choose RX buffering parameters for the given MTU.  Computes the full
 * frame size (MTU + Ethernet header + VLAN tag + firmware pad) and picks
 * the smallest cluster size that holds it: MCLBYTES, MJUMPAGESIZE, or
 * MJUM9BYTES.  With MXGE_VIRT_JUMBOS the 9k cluster is carved into
 * multiple 4096-byte firmware buffers (*nbufs > 1).
 * NOTE(review): gapped extract -- the `*nbufs = 1; return;` arms and the
 * power-of-two rounding after line 3349 are among the missing lines.
 */
3325 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3327 int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3329 if (bufsize < MCLBYTES) {
3330 /* easy, everything fits in a single buffer */
3331 *big_buf_size = MCLBYTES;
3332 *cl_size = MCLBYTES;
3337 if (bufsize < MJUMPAGESIZE) {
3338 /* still easy, everything still fits in a single buffer */
3339 *big_buf_size = MJUMPAGESIZE;
3340 *cl_size = MJUMPAGESIZE;
3344 #if MXGE_VIRT_JUMBOS
3345 /* now we need to use virtually contiguous buffers */
3346 *cl_size = MJUM9BYTES;
3347 *big_buf_size = 4096;
3348 *nbufs = mtu / 4096 + 1;
3349 /* needs to be a power of two, so round up */
3353 *cl_size = MJUM9BYTES;
3354 *big_buf_size = MJUM9BYTES;
/*
 * Bring one slice online: preallocate lro_cnt LRO entries, fetch the
 * lanai (NIC SRAM) pointers for the send/receive rings from the
 * firmware, then stock the small and big RX rings with mbufs.  Big-ring
 * shadow addresses are poisoned to 0xffffffff before stocking.  Big
 * buffers are posted nbufs at a time (cluster split into nbufs firmware
 * buffers).  Returns 0 on success.
 * NOTE(review): gapped extract -- the TX-offset command is only issued
 * for slice 0 without IFNET_BUF_RING; the #else arm and several error
 * returns are among the missing original lines.
 */
3360 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3365 struct lro_entry *lro_entry;
3370 slice = ss - sc->ss;
3372 SLIST_INIT(&ss->lro_free);
3373 SLIST_INIT(&ss->lro_active);
3375 for (i = 0; i < sc->lro_cnt; i++) {
3376 lro_entry = (struct lro_entry *)
3377 kmalloc(sizeof (*lro_entry), M_DEVBUF,
3379 if (lro_entry == NULL) {
3383 SLIST_INSERT_HEAD(&ss->lro_free, lro_entry, next);
3385 /* get the lanai pointers to the send and receive rings */
3388 #ifndef IFNET_BUF_RING
3389 /* We currently only send from the first slice */
3393 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3395 (volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3396 ss->tx.send_go = (volatile uint32_t *)
3397 (sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3398 ss->tx.send_stop = (volatile uint32_t *)
3399 (sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3400 #ifndef IFNET_BUF_RING
3404 err |= mxge_send_cmd(sc,
3405 MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3406 ss->rx_small.lanai =
3407 (volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3409 err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3411 (volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3414 device_printf(sc->dev,
3415 "failed to get ring sizes or locations\n");
3419 /* stock receive rings */
3420 for (i = 0; i <= ss->rx_small.mask; i++) {
3421 map = ss->rx_small.info[i].map;
3422 err = mxge_get_buf_small(ss, map, i);
3424 device_printf(sc->dev, "alloced %d/%d smalls\n",
3425 i, ss->rx_small.mask + 1);
3429 for (i = 0; i <= ss->rx_big.mask; i++) {
3430 ss->rx_big.shadow[i].addr_low = 0xffffffff;
3431 ss->rx_big.shadow[i].addr_high = 0xffffffff;
3433 ss->rx_big.nbufs = nbufs;
3434 ss->rx_big.cl_size = cl_size;
3435 ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3436 ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3437 for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3438 map = ss->rx_big.info[i].map;
3439 err = mxge_get_buf_big(ss, map, i);
3441 device_printf(sc->dev, "alloced %d/%d bigs\n",
3442 i, ss->rx_big.mask + 1);
/*
 * Bring the interface up: reset the NIC, program the RSS indirection
 * table when running multiple slices, pick buffer sizes for the current
 * MTU, push MTU/buffer-size parameters and the per-slice stats-DMA
 * addresses to the firmware (falling back to the obsolete single-block
 * stats DMA, which disables multicast support), open every slice, start
 * the firmware (ETHERNET_UP), mark the ifnet running, and arm the tick
 * callout.  On failure after rings are stocked, frees mbufs.
 * NOTE(review): gapped extract -- the loop header at original line 3526,
 * several error gotos, and the #else arms of the IFNET_BUF_RING
 * conditionals are among the missing lines.
 */
3450 mxge_open(mxge_softc_t *sc)
3453 int err, big_bytes, nbufs, slice, cl_size, i;
3455 volatile uint8_t *itable;
3456 struct mxge_slice_state *ss;
3458 /* Copy the MAC address in case it was overridden */
3459 bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3461 err = mxge_reset(sc, 1);
3463 device_printf(sc->dev, "failed to reset\n");
3467 if (sc->num_slices > 1) {
3468 /* setup the indirection table */
3469 cmd.data0 = sc->num_slices;
3470 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3473 err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3476 device_printf(sc->dev,
3477 "failed to setup rss tables\n");
3481 /* just enable an identity mapping */
3482 itable = sc->sram + cmd.data0;
3483 for (i = 0; i < sc->num_slices; i++)
3484 itable[i] = (uint8_t)i;
3487 cmd.data1 = mxge_rss_hash_type;
3488 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3490 device_printf(sc->dev, "failed to enable slices\n");
3496 mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3499 err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3501 /* error is only meaningful if we're trying to set
3502 MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3503 if (err && nbufs > 1) {
3504 device_printf(sc->dev,
3505 "Failed to set alway-use-n to %d\n",
3509 /* Give the firmware the mtu and the big and small buffer
3510 sizes. The firmware wants the big buf size to be a power
3511 of two. Luckily, FreeBSD's clusters are powers of two */
3512 cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3513 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3514 cmd.data0 = MHLEN - MXGEFW_PAD;
3515 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3517 cmd.data0 = big_bytes;
3518 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3521 device_printf(sc->dev, "failed to setup params\n");
3525 /* Now give him the pointer to the stats block */
3527 #ifdef IFNET_BUF_RING
3528 slice < sc->num_slices;
3533 ss = &sc->ss[slice];
3535 MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3537 MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3538 cmd.data2 = sizeof(struct mcp_irq_data);
/* slice index is packed into the upper 16 bits of data2 */
3539 cmd.data2 |= (slice << 16);
3540 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3544 bus = sc->ss->fw_stats_dma.bus_addr;
3545 bus += offsetof(struct mcp_irq_data, send_done_count);
3546 cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3547 cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3548 err = mxge_send_cmd(sc,
3549 MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3551 /* Firmware cannot support multicast without STATS_DMA_V2 */
3552 sc->fw_multicast_support = 0;
3554 sc->fw_multicast_support = 1;
3558 device_printf(sc->dev, "failed to setup params\n");
3562 for (slice = 0; slice < sc->num_slices; slice++) {
3563 err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3565 device_printf(sc->dev, "couldn't open slice %d\n",
3571 /* Finally, start the firmware running */
3572 err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3574 device_printf(sc->dev, "Couldn't bring up link\n");
3577 #ifdef IFNET_BUF_RING
3578 for (slice = 0; slice < sc->num_slices; slice++) {
3579 ss = &sc->ss[slice];
3580 ss->if_drv_flags |= IFF_DRV_RUNNING;
3581 ss->if_drv_flags &= ~IFF_DRV_OACTIVE;
3584 sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
3585 sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
3586 callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3592 mxge_free_mbufs(sc);
/*
 * Bring the interface down: stop the tick callout, clear RUNNING flags
 * (per-slice with IFNET_BUF_RING, otherwise on the ifnet), send
 * ETHERNET_DOWN to the firmware, then wait (up to 10x the interrupt
 * coalescing delay) for the "down" interrupt to bump sc->down_cnt
 * before freeing all ring mbufs.
 * NOTE(review): gapped extract -- the wait loop structure around lines
 * 3621-3626 is incomplete in this view; presumably a bounded retry loop.
 */
3598 mxge_close(mxge_softc_t *sc)
3601 int err, old_down_cnt;
3602 #ifdef IFNET_BUF_RING
3603 struct mxge_slice_state *ss;
3607 callout_stop(&sc->co_hdl);
3608 #ifdef IFNET_BUF_RING
3609 for (slice = 0; slice < sc->num_slices; slice++) {
3610 ss = &sc->ss[slice];
3611 ss->if_drv_flags &= ~IFF_DRV_RUNNING;
3614 sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
3615 old_down_cnt = sc->down_cnt;
3617 err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3619 device_printf(sc->dev, "Couldn't bring down link\n");
3621 if (old_down_cnt == sc->down_cnt) {
3622 /* wait for down irq */
3623 DELAY(10 * sc->intr_coal_delay);
3626 if (old_down_cnt == sc->down_cnt) {
3627 device_printf(sc->dev, "never got down irq\n");
3630 mxge_free_mbufs(sc);
/*
 * (Re)program PCI config space: read the PCIe link-width field from the
 * PCI Express capability, set Max Read Request Size to 4KB (value 5 in
 * bits 14:12 of Device Control at cap+0x8), and enable bus mastering
 * plus memory-space decoding.  Also called after a NIC reboot to redo
 * driver-specific config (see mxge_watchdog_reset).
 */
3636 mxge_setup_cfg_space(mxge_softc_t *sc)
3638 device_t dev = sc->dev;
3640 uint16_t cmd, lnk, pectl;
3642 /* find the PCIe link width and set max read request to 4KB*/
3643 if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
3644 lnk = pci_read_config(dev, reg + 0x12, 2);
3645 sc->link_width = (lnk >> 4) & 0x3f;
3647 pectl = pci_read_config(dev, reg + 0x8, 2);
3648 pectl = (pectl & ~0x7000) | (5 << 12);
3649 pci_write_config(dev, reg + 0x8, pectl, 2);
3652 /* Enable DMA and Memory space access */
3653 pci_enable_busmaster(dev);
3654 cmd = pci_read_config(dev, PCIR_COMMAND, 2);
3655 cmd |= PCIM_CMD_MEMEN;
3656 pci_write_config(dev, PCIR_COMMAND, cmd, 2);
/*
 * Read the NIC's reboot-status register through the PCI vendor-specific
 * capability window: enable read32 mode (cap+0x10), point the window at
 * register 0xfffffff0 (cap+0x18), and read the value back (cap+0x14).
 * Returns (uint32_t)-1 if the vendor-specific capability is not found.
 */
3660 mxge_read_reboot(mxge_softc_t *sc)
3662 device_t dev = sc->dev;
3665 /* find the vendor specific offset */
3666 if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
3667 device_printf(sc->dev,
3668 "could not find vendor specific offset\n");
3669 return (uint32_t)-1;
3671 /* enable read32 mode */
3672 pci_write_config(dev, vs + 0x10, 0x3, 1);
3673 /* tell NIC which register to read */
3674 pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3675 return (pci_read_config(dev, vs + 0x14, 4));
/*
 * Recover from a watchdog-detected TX hang on the given slice.  If PCI
 * config reads return 0xffff the NIC has vanished (wait briefly for it
 * to reappear).  If the busmaster bit was cleared, the NIC rebooted:
 * print the reboot status, restore the saved PCI config space, redo
 * driver config, and reopen the interface if it was running.  If the
 * NIC did NOT reboot, just dump the slice's TX ring state and do not
 * reset.  Returns an error code (paths partially hidden by the gaps).
 * NOTE(review): gapped extract -- the retry loop around the 0xffff
 * check and the close/open sequence near line 3726 are incomplete here.
 */
3679 mxge_watchdog_reset(mxge_softc_t *sc, int slice)
3681 struct pci_devinfo *dinfo;
3689 device_printf(sc->dev, "Watchdog reset!\n");
3692 * check to see if the NIC rebooted. If it did, then all of
3693 * PCI config space has been reset, and things like the
3694 * busmaster bit will be zero. If this is the case, then we
3695 * must restore PCI config space before the NIC can be used
3698 cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3699 if (cmd == 0xffff) {
3701 * maybe the watchdog caught the NIC rebooting; wait
3702 * up to 100ms for it to finish. If it does not come
3703 * back, then give up
3706 cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3707 if (cmd == 0xffff) {
3708 device_printf(sc->dev, "NIC disappeared!\n");
3712 if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3713 /* print the reboot status */
3714 reboot = mxge_read_reboot(sc);
3715 device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3717 /* restore PCI configuration space */
3718 dinfo = device_get_ivars(sc->dev);
3719 pci_cfg_restore(sc->dev, dinfo);
3721 /* and redo any changes we made to our config space */
3722 mxge_setup_cfg_space(sc);
3724 if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING) {
3726 err = mxge_open(sc);
3729 tx = &sc->ss[slice].tx;
3730 device_printf(sc->dev,
3731 "NIC did not reboot, slice %d ring state:\n",
3733 device_printf(sc->dev,
3734 "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3735 tx->req, tx->done, tx->queue_active);
3736 device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3737 tx->activate, tx->deactivate);
3738 device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3740 be32toh(sc->ss->fw_stats->send_done_count));
3741 device_printf(sc->dev, "not resetting\n");
/*
 * Periodic TX-hang detector (called from mxge_tick).  For each TX ring
 * (all slices with IFNET_BUF_RING, otherwise only slice 0): if requests
 * are outstanding and no progress was made since the last tick, reset
 * via mxge_watchdog_reset() -- unless the dropped-pause counter also
 * stalled, in which case flow control from the link partner is blamed
 * instead.  Records current req/done/rx_pause for the next tick, and
 * triggers a media re-probe when the interrupt handler requested one.
 * NOTE(review): gapped extract -- the loop header and the tx pointer
 * assignment (original lines 3755-3762) are missing from this view.
 */
3747 mxge_watchdog(mxge_softc_t *sc)
3750 uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3753 /* see if we have outstanding transmits, which
3754 have been pending for more than mxge_ticks */
3756 #ifdef IFNET_BUF_RING
3757 (i < sc->num_slices) && (err == 0);
3759 (i < 1) && (err == 0);
3763 if (tx->req != tx->done &&
3764 tx->watchdog_req != tx->watchdog_done &&
3765 tx->done == tx->watchdog_done) {
3766 /* check for pause blocking before resetting */
3767 if (tx->watchdog_rx_pause == rx_pause)
3768 err = mxge_watchdog_reset(sc, i);
3770 device_printf(sc->dev, "Flow control blocking "
3771 "xmits, check link partner\n");
3774 tx->watchdog_req = tx->req;
3775 tx->watchdog_done = tx->done;
3776 tx->watchdog_rx_pause = rx_pause;
3779 if (sc->need_media_probe)
3780 mxge_media_probe(sc);
/*
 * Aggregate per-slice packet/byte/error counters into the shared ifnet
 * statistics fields.  With IFNET_BUF_RING, output bytes, multicasts,
 * and buf_ring drops are summed as well.
 */
3785 mxge_update_stats(mxge_softc_t *sc)
3787 struct mxge_slice_state *ss;
3788 u_long ipackets = 0;
3789 u_long opackets = 0;
3790 #ifdef IFNET_BUF_RING
3798 for (slice = 0; slice < sc->num_slices; slice++) {
3799 ss = &sc->ss[slice];
3800 ipackets += ss->ipackets;
3801 opackets += ss->opackets;
3802 #ifdef IFNET_BUF_RING
3803 obytes += ss->obytes;
3804 omcasts += ss->omcasts;
3805 odrops += ss->tx.br->br_drops;
3807 oerrors += ss->oerrors;
3809 sc->ifp->if_ipackets = ipackets;
3810 sc->ifp->if_opackets = opackets;
3811 #ifdef IFNET_BUF_RING
3812 sc->ifp->if_obytes = obytes;
3813 sc->ifp->if_omcasts = omcasts;
3814 sc->ifp->if_snd.ifq_drops = odrops;
3816 sc->ifp->if_oerrors = oerrors;
/*
 * Periodic callout (every mxge_ticks): refresh aggregate statistics,
 * run the TX watchdog once every 5 ticks (countdown from 4), and
 * re-arm itself.
 * NOTE(review): presumably the callout is only re-armed when the
 * watchdog did not reset -- the error check is in the missing lines.
 */
3820 mxge_tick(void *arg)
3822 mxge_softc_t *sc = arg;
3825 /* aggregate stats from different slices */
3826 mxge_update_stats(sc);
3827 if (!sc->watchdog_countdown) {
3828 err = mxge_watchdog(sc);
3829 sc->watchdog_countdown = 4;
3831 sc->watchdog_countdown--;
3833 callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3838 mxge_media_change(struct ifnet *ifp)
/*
 * Change the interface MTU.  Rejects values whose framed size (MTU +
 * Ethernet header + VLAN tag) exceeds the NIC maximum or is below 60.
 * Under the driver mutex: if running, the interface is closed, the MTU
 * updated, and reopened; if the reopen fails the old MTU is restored
 * and the interface reopened with it (best effort).
 * NOTE(review): gapped extract -- the close call and the new-MTU
 * assignment are among the missing lines.
 */
3844 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3846 struct ifnet *ifp = sc->ifp;
3847 int real_mtu, old_mtu;
3851 real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3852 if ((real_mtu > sc->max_mtu) || real_mtu < 60)
3854 mtx_lock(&sc->driver_mtx);
3855 old_mtu = ifp->if_mtu;
3857 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3859 err = mxge_open(sc);
3861 ifp->if_mtu = old_mtu;
3863 (void) mxge_open(sc);
3866 mtx_unlock(&sc->driver_mtx);
/*
 * ifmedia status callback: report link validity, active/inactive state
 * from sc->link_state, and AUTO/FDX media flags.
 */
3871 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3873 mxge_softc_t *sc = ifp->if_softc;
3878 ifmr->ifm_status = IFM_AVALID;
3879 ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
3880 ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
3881 ifmr->ifm_active |= sc->link_state ? IFM_FDX : 0;
/*
 * Interface ioctl handler.  Visible cases: SIOCSIFMTU (delegates to
 * mxge_change_mtu), SIOCSIFFLAGS (open/close + promisc/multicast
 * update under the driver mutex), SIOCADD/DELMULTI (multicast list),
 * SIOCSIFCAP (toggle TXCSUM/RXCSUM/TSO4/LRO/VLAN_HWTAGGING -- TSO4
 * requires TXCSUM to be enabled), and SIOCGIFMEDIA (ifmedia_ioctl).
 * Unknown commands fall through to ether_ioctl.
 * NOTE(review): gapped extract -- the switch statement itself, several
 * case labels, and break statements are among the missing lines.
 */
3885 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
3887 mxge_softc_t *sc = ifp->if_softc;
3888 struct ifreq *ifr = (struct ifreq *)data;
3895 err = ether_ioctl(ifp, command, data);
3899 err = mxge_change_mtu(sc, ifr->ifr_mtu);
3903 mtx_lock(&sc->driver_mtx);
3905 mtx_unlock(&sc->driver_mtx);
3908 if (ifp->if_flags & IFF_UP) {
3909 if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
3910 err = mxge_open(sc);
3912 /* take care of promis can allmulti
3914 mxge_change_promisc(sc,
3915 ifp->if_flags & IFF_PROMISC);
3916 mxge_set_multicast_list(sc);
3919 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3923 mtx_unlock(&sc->driver_mtx);
3928 mtx_lock(&sc->driver_mtx);
3929 mxge_set_multicast_list(sc);
3930 mtx_unlock(&sc->driver_mtx);
3934 mtx_lock(&sc->driver_mtx);
3935 mask = ifr->ifr_reqcap ^ ifp->if_capenable;
3936 if (mask & IFCAP_TXCSUM) {
3937 if (IFCAP_TXCSUM & ifp->if_capenable) {
/* disabling TX csum also disables TSO4, which depends on it */
3938 ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
3939 ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
3942 ifp->if_capenable |= IFCAP_TXCSUM;
3943 ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
3945 } else if (mask & IFCAP_RXCSUM) {
3946 if (IFCAP_RXCSUM & ifp->if_capenable) {
3947 ifp->if_capenable &= ~IFCAP_RXCSUM;
3950 ifp->if_capenable |= IFCAP_RXCSUM;
3954 if (mask & IFCAP_TSO4) {
3955 if (IFCAP_TSO4 & ifp->if_capenable) {
3956 ifp->if_capenable &= ~IFCAP_TSO4;
3957 ifp->if_hwassist &= ~CSUM_TSO;
3958 } else if (IFCAP_TXCSUM & ifp->if_capenable) {
3959 ifp->if_capenable |= IFCAP_TSO4;
3960 ifp->if_hwassist |= CSUM_TSO;
3962 printf("mxge requires tx checksum offload"
3963 " be enabled to use TSO\n");
3967 if (mask & IFCAP_LRO) {
3968 if (IFCAP_LRO & ifp->if_capenable)
3969 err = mxge_change_lro_locked(sc, 0);
3971 err = mxge_change_lro_locked(sc, mxge_lro_cnt);
3973 if (mask & IFCAP_VLAN_HWTAGGING)
3974 ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
3975 mtx_unlock(&sc->driver_mtx);
3976 VLAN_CAPABILITIES(ifp);
3981 err = ifmedia_ioctl(ifp, (struct ifreq *)data,
3982 &sc->media, command);
/*
 * Fetch hw.mxge.* tunables from the kernel environment and sanitize
 * them: clamp the interrupt coalescing delay to [0, 10000] (default
 * 30), default the tick period to hz/2, force an out-of-range RSS hash
 * type to SRC_PORT, and clamp the initial MTU into
 * [ETHER_MIN_LEN, ETHERMTU_JUMBO].  A nonzero per-device lro_cnt
 * overrides the global mxge_lro_cnt.
 */
3992 mxge_fetch_tunables(mxge_softc_t *sc)
3995 TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
3996 TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
3997 &mxge_flow_control);
3998 TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
3999 &mxge_intr_coal_delay);
4000 TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4001 &mxge_nvidia_ecrc_enable);
4002 TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4003 &mxge_force_firmware);
4004 TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4005 &mxge_deassert_wait);
4006 TUNABLE_INT_FETCH("hw.mxge.verbose",
4008 TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4009 TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
4010 TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
4011 TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4012 TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4013 if (sc->lro_cnt != 0)
4014 mxge_lro_cnt = sc->lro_cnt;
4018 if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4019 mxge_intr_coal_delay = 30;
4020 if (mxge_ticks == 0)
4021 mxge_ticks = hz / 2;
4022 sc->pause = mxge_flow_control;
4023 if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4024 || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4025 mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
4027 if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4028 mxge_initial_mtu < ETHER_MIN_LEN)
4029 mxge_initial_mtu = ETHERMTU_JUMBO;
/*
 * Free everything mxge_alloc_slices built: per-slice firmware stats
 * DMA blocks, TX buf_rings (IFNET_BUF_RING), TX mutexes, RX-done DMA
 * areas, and finally the slice array itself.
 * NOTE(review): gapped extract -- the early return when sc->ss is NULL
 * and the `ss = &sc->ss[i]` assignment are among the missing lines.
 * Also note line 4061 uses free() where the rest of the file uses
 * kfree() -- worth confirming against the full source.
 */
4034 mxge_free_slices(mxge_softc_t *sc)
4036 struct mxge_slice_state *ss;
4043 for (i = 0; i < sc->num_slices; i++) {
4045 if (ss->fw_stats != NULL) {
4046 mxge_dma_free(&ss->fw_stats_dma);
4047 ss->fw_stats = NULL;
4048 #ifdef IFNET_BUF_RING
4049 if (ss->tx.br != NULL) {
4050 drbr_free(ss->tx.br, M_DEVBUF);
4054 mtx_destroy(&ss->tx.mtx);
4056 if (ss->rx_done.entry != NULL) {
4057 mxge_dma_free(&ss->rx_done.dma);
4058 ss->rx_done.entry = NULL;
4061 free(sc->ss, M_DEVBUF);
/*
 * Allocate the slice array and per-slice DMA state.  Queries the
 * firmware for the RX ring size to compute the interrupt-queue depth
 * (2 slots per RX entry), then for each slice allocates a 4KB-aligned
 * RX-done DMA area and a 64-byte-aligned firmware stats block, names
 * and initializes the TX lock, and (with IFNET_BUF_RING) a 2048-entry
 * TX buf_ring.  Without IFNET_BUF_RING the stats/TX setup is skipped
 * for all but the first slice.  Unwinds via mxge_free_slices on error.
 * NOTE(review): gapped extract -- error gotos after the dma allocs and
 * the `ss = &sc->ss[i]` assignment are among the missing lines.
 */
4066 mxge_alloc_slices(mxge_softc_t *sc)
4069 struct mxge_slice_state *ss;
4071 int err, i, max_intr_slots;
4073 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4075 device_printf(sc->dev, "Cannot determine rx ring size\n");
4078 sc->rx_ring_size = cmd.data0;
4079 max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4081 bytes = sizeof (*sc->ss) * sc->num_slices;
4082 sc->ss = kmalloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4085 for (i = 0; i < sc->num_slices; i++) {
4090 /* allocate per-slice rx interrupt queues */
4092 bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4093 err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4096 ss->rx_done.entry = ss->rx_done.dma.addr;
4097 bzero(ss->rx_done.entry, bytes);
4100 * allocate the per-slice firmware stats; stats
4101 * (including tx) are used used only on the first
4104 #ifndef IFNET_BUF_RING
4109 bytes = sizeof (*ss->fw_stats);
4110 err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4111 sizeof (*ss->fw_stats), 64);
4114 ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4115 snprintf(ss->tx.lock_name, sizeof(ss->tx.mtx_name),
4116 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
4117 lock_init(&ss->tx.lock, ss->tx.lock_name, 0, LK_CANRECURSE);
4118 #ifdef IFNET_BUF_RING
4119 ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4127 mxge_free_slices(sc);
/*
 * Decide how many slices (RSS queues) to use.  Bails out to a single
 * slice when multi-slice is disabled, the system is not SMP, or too few
 * MSI-X vectors exist.  Otherwise loads the RSS-aware firmware variant
 * matching the current one (aligned/unaligned), resets the NIC, sizes
 * the interrupt queue, asks the firmware for its maximum RSS queue
 * count, and clamps the result by the MSI-X vector count, the CPU count
 * (when mxge_max_slices == -1) or the tunable, finally rounding down to
 * a power of two.  On any failure, restores the original firmware.
 * NOTE(review): gapped extract -- the early `return` statements and the
 * power-of-two rounding body (original line after 4209) are missing.
 */
4132 mxge_slice_probe(mxge_softc_t *sc)
4136 int msix_cnt, status, max_intr_slots;
4140 * don't enable multiple slices if they are not enabled,
4141 * or if this is not an SMP system
4144 if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2)
4147 /* see how many MSI-X interrupts are available */
4148 msix_cnt = pci_msix_count(sc->dev);
4152 /* now load the slice aware firmware see what it supports */
4153 old_fw = sc->fw_name;
4154 if (old_fw == mxge_fw_aligned)
4155 sc->fw_name = mxge_fw_rss_aligned;
4157 sc->fw_name = mxge_fw_rss_unaligned;
4158 status = mxge_load_firmware(sc, 0);
4160 device_printf(sc->dev, "Falling back to a single slice\n");
4164 /* try to send a reset command to the card to see if it
4166 memset(&cmd, 0, sizeof (cmd));
4167 status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4169 device_printf(sc->dev, "failed reset\n");
4173 /* get rx ring size */
4174 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4176 device_printf(sc->dev, "Cannot determine rx ring size\n");
4179 max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4181 /* tell it the size of the interrupt queues */
4182 cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4183 status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4185 device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4189 /* ask the maximum number of slices it supports */
4190 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4192 device_printf(sc->dev,
4193 "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4196 sc->num_slices = cmd.data0;
4197 if (sc->num_slices > msix_cnt)
4198 sc->num_slices = msix_cnt;
4200 if (mxge_max_slices == -1) {
4201 /* cap to number of CPUs in system */
4202 if (sc->num_slices > mp_ncpus)
4203 sc->num_slices = mp_ncpus;
4205 if (sc->num_slices > mxge_max_slices)
4206 sc->num_slices = mxge_max_slices;
4208 /* make sure it is a power of two */
4209 while (sc->num_slices & (sc->num_slices - 1))
4213 device_printf(sc->dev, "using %d slices\n",
4219 sc->fw_name = old_fw;
4220 (void) mxge_load_firmware(sc, 0);
/*
 * Set up one MSI-X interrupt per slice: map the MSI-X table BAR
 * (PCIR_BAR(2)), allocate num_slices vectors (suggesting a tunable
 * adjustment if fewer are granted), allocate and activate one IRQ
 * resource per slice, and attach mxge_intr to each with the slice state
 * as its argument.  The goto chain below unwinds in reverse order on
 * failure: teardown handlers, release IRQ resources, release MSI,
 * release the table BAR.
 * NOTE(review): gapped extract -- rid initialization, several error
 * assignments, and the unwind-label lines are missing from this view.
 */
4224 mxge_add_msix_irqs(mxge_softc_t *sc)
4227 int count, err, i, rid;
4230 sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4233 if (sc->msix_table_res == NULL) {
4234 device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4238 count = sc->num_slices;
4239 err = pci_alloc_msix(sc->dev, &count);
4241 device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d"
4242 "err = %d \n", sc->num_slices, err);
4243 goto abort_with_msix_table;
4245 if (count < sc->num_slices) {
4246 device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4247 count, sc->num_slices);
4248 device_printf(sc->dev,
4249 "Try setting hw.mxge.max_slices to %d\n",
4252 goto abort_with_msix;
4254 bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4255 sc->msix_irq_res = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4256 if (sc->msix_irq_res == NULL) {
4258 goto abort_with_msix;
4261 for (i = 0; i < sc->num_slices; i++) {
4263 sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4266 if (sc->msix_irq_res[i] == NULL) {
4267 device_printf(sc->dev, "couldn't allocate IRQ res"
4268 " for message %d\n", i);
4270 goto abort_with_res;
4274 bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4275 sc->msix_ih = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4277 for (i = 0; i < sc->num_slices; i++) {
4278 err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4279 INTR_TYPE_NET | INTR_MPSAFE,
4280 #if __FreeBSD_version > 700030
4283 mxge_intr, &sc->ss[i], &sc->msix_ih[i]);
4285 device_printf(sc->dev, "couldn't setup intr for "
4287 goto abort_with_intr;
4292 device_printf(sc->dev, "using %d msix IRQs:",
4294 for (i = 0; i < sc->num_slices; i++)
4295 printf(" %ld", rman_get_start(sc->msix_irq_res[i]));
4301 for (i = 0; i < sc->num_slices; i++) {
4302 if (sc->msix_ih[i] != NULL) {
4303 bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4305 sc->msix_ih[i] = NULL;
4308 kfree(sc->msix_ih, M_DEVBUF);
4312 for (i = 0; i < sc->num_slices; i++) {
4314 if (sc->msix_irq_res[i] != NULL)
4315 bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4316 sc->msix_irq_res[i]);
4317 sc->msix_irq_res[i] = NULL;
4319 kfree(sc->msix_irq_res, M_DEVBUF);
4323 pci_release_msi(sc->dev);
4325 abort_with_msix_table:
4326 bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4327 sc->msix_table_res);
/*
 * mxge_add_single_irq: set up a single interrupt, preferring MSI over
 * legacy INTx.  If exactly one MSI message is available and can be
 * allocated, MSI is used (rid 1); otherwise the legacy INTx line
 * (rid 0) is used -- the rid choice is made in elided lines, and the
 * legacy/MSI distinction is remembered in sc->legacy_irq for teardown.
 */
4333 mxge_add_single_irq(mxge_softc_t *sc)
4335 int count, err, rid;
4337 count = pci_msi_count(sc->dev);
/* Use MSI only when the device exposes exactly one message. */
4338 if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4344 sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0,
4345 1, RF_SHAREABLE | RF_ACTIVE);
4346 if (sc->irq_res == NULL) {
4347 device_printf(sc->dev, "could not alloc interrupt\n");
4351 device_printf(sc->dev, "using %s irq %ld\n",
4352 sc->legacy_irq ? "INTx" : "MSI",
4353 rman_get_start(sc->irq_res));
/* Single handler services slice 0 only. */
4354 err = bus_setup_intr(sc->dev, sc->irq_res,
4355 INTR_TYPE_NET | INTR_MPSAFE,
4356 #if __FreeBSD_version > 700030
4359 mxge_intr, &sc->ss[0], &sc->ih);
/* Failure unwind: release the IRQ (rid 0 for INTx, 1 for MSI) and the MSI state. */
4361 bus_release_resource(sc->dev, SYS_RES_IRQ,
4362 sc->legacy_irq ? 0 : 1, sc->irq_res);
4363 if (!sc->legacy_irq)
4364 pci_release_msi(sc->dev);
/*
 * mxge_rem_msix_irqs: full teardown of MSI-X state, mirroring the
 * error-unwind tail of mxge_add_msix_irqs: handlers, then per-slice
 * IRQ resources, then the MSI-X table BAR, then the vectors.
 */
4370 mxge_rem_msix_irqs(mxge_softc_t *sc)
4374 for (i = 0; i < sc->num_slices; i++) {
4375 if (sc->msix_ih[i] != NULL) {
4376 bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4378 sc->msix_ih[i] = NULL;
4381 kfree(sc->msix_ih, M_DEVBUF);
4383 for (i = 0; i < sc->num_slices; i++) {
/* rid presumably recomputed per iteration in elided code (e.g. i + 1). */
4385 if (sc->msix_irq_res[i] != NULL)
4386 bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4387 sc->msix_irq_res[i]);
4388 sc->msix_irq_res[i] = NULL;
4390 kfree(sc->msix_irq_res, M_DEVBUF);
4392 bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4393 sc->msix_table_res);
4395 pci_release_msi(sc->dev);
/*
 * mxge_rem_single_irq: undo mxge_add_single_irq -- detach the handler,
 * release the IRQ resource (rid 0 for legacy INTx, rid 1 for MSI),
 * and release the MSI message if one was allocated.
 */
4400 mxge_rem_single_irq(mxge_softc_t *sc)
4402 bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4403 bus_release_resource(sc->dev, SYS_RES_IRQ,
4404 sc->legacy_irq ? 0 : 1, sc->irq_res);
4405 if (!sc->legacy_irq)
4406 pci_release_msi(sc->dev);
/*
 * mxge_rem_irq: dispatch interrupt teardown based on how interrupts
 * were set up -- MSI-X when running multi-slice, otherwise the single
 * MSI/INTx path.
 */
4410 mxge_rem_irq(mxge_softc_t *sc)
4412 if (sc->num_slices > 1)
4413 mxge_rem_msix_irqs(sc);
4415 mxge_rem_single_irq(sc);
/*
 * mxge_add_irq: dispatch interrupt setup -- MSI-X for multi-slice
 * operation, single MSI/INTx otherwise.
 */
4419 mxge_add_irq(mxge_softc_t *sc)
4423 if (sc->num_slices > 1)
4424 err = mxge_add_msix_irqs(sc);
4426 err = mxge_add_single_irq(sc);
/*
 * NOTE(review): the leading "0 &&" makes this branch unreachable --
 * looks like a deliberately disabled retry/stress path (re-doing the
 * MSI-X setup).  Either remove it or document why it is kept.
 */
4428 if (0 && err == 0 && sc->num_slices > 1) {
4429 mxge_rem_msix_irqs(sc);
4430 err = mxge_add_msix_irqs(sc);
/*
 * mxge_attach: device attach entry point.  Order of bring-up:
 * tunables -> parent DMA tag -> ifnet -> locks -> PCI config/BAR
 * mapping -> EEPROM strings -> out-of-band DMA buffers -> firmware
 * selection -> slices -> reset -> rings -> IRQs -> ifnet capabilities
 * and ether_ifattach.  Failures unwind in reverse through the
 * abort_with_* labels at the bottom.
 */
4437 mxge_attach(device_t dev)
4439 mxge_softc_t *sc = device_get_softc(dev);
4444 mxge_fetch_tunables(sc);
/* Parent tag all per-ring/per-slice DMA tags derive from; 64K+256
 * covers the largest TSO send plus header slop. */
4446 err = bus_dma_tag_create(NULL, /* parent */
4449 BUS_SPACE_MAXADDR, /* low */
4450 BUS_SPACE_MAXADDR, /* high */
4451 NULL, NULL, /* filter */
4452 65536 + 256, /* maxsize */
4453 MXGE_MAX_SEND_DESC, /* num segs */
4454 65536, /* maxsegsize */
4456 NULL, NULL, /* lock */
4457 &sc->parent_dmat); /* tag */
4460 device_printf(sc->dev, "Err %d allocating parent dmat\n",
4462 goto abort_with_nothing;
4465 ifp = sc->ifp = if_alloc(IFT_ETHER);
4467 device_printf(dev, "can not if_alloc()\n");
4469 goto abort_with_parent_dmat;
4471 if_initname(ifp, device_get_name(dev), device_get_unit(dev));
/*
 * NOTE(review): lock API usage is mixed -- lockmgr-style lock_init()
 * with LK_CANRECURSE here, but callout_init_mtx(&sc->driver_mtx) and
 * mtx_destroy() on the error path below.  Looks like a mid-port
 * between mutex and lockmgr locking; confirm which primitive
 * cmd_lock/driver_lock actually are.
 */
4473 snprintf(sc->cmd_lock_name, sizeof(sc->cmd_lock_name), "%s:cmd",
4474 device_get_nameunit(dev));
4475 lock_init(&sc->cmd_lock, sc->cmd_lock_name, 0, LK_CANRECURSE);
4476 snprintf(sc->driver_lock_name, sizeof(sc->driver_lock_name),
4477 "%s:drv", device_get_nameunit(dev));
4478 lock_init(&sc->driver_lock, sc->driver_lock_name,
4481 callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
4483 mxge_setup_cfg_space(sc);
4485 /* Map the board into the kernel */
4487 sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
4489 if (sc->mem_res == NULL) {
4490 device_printf(dev, "could not map memory\n");
4492 goto abort_with_lock;
4494 sc->sram = rman_get_virtual(sc->mem_res);
/* Usable SRAM: 2MB minus two 48K firmware slots and a 32K region,
 * minus 0x100 reserved bytes -- presumably the NIC memory map; confirm
 * against the firmware spec. */
4495 sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4496 if (sc->sram_size > rman_get_size(sc->mem_res)) {
4497 device_printf(dev, "impossible memory region size %ld\n",
4498 rman_get_size(sc->mem_res));
4500 goto abort_with_mem_res;
4503 /* make NULL terminated copy of the EEPROM strings section of
4505 bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4506 bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4507 rman_get_bushandle(sc->mem_res),
4508 sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4510 MXGE_EEPROM_STRINGS_SIZE - 2);
4511 err = mxge_parse_strings(sc);
4513 goto abort_with_mem_res;
4515 /* Enable write combining for efficient use of PCIe bus */
4518 /* Allocate the out of band dma memory */
4519 err = mxge_dma_alloc(sc, &sc->cmd_dma,
4520 sizeof (mxge_cmd_t), 64);
4522 goto abort_with_mem_res;
4523 sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
/* 64-byte zero pad the NIC reads to pad runt frames. */
4524 err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4526 goto abort_with_cmd_dma;
/* Scratch page used for the PCIe DMA benchmark. */
4528 err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4530 goto abort_with_zeropad_dma;
4532 /* select & load the firmware */
4533 err = mxge_select_firmware(sc);
4535 goto abort_with_dmabench;
4536 sc->intr_coal_delay = mxge_intr_coal_delay;
4538 mxge_slice_probe(sc);
4539 err = mxge_alloc_slices(sc);
4541 goto abort_with_dmabench;
4543 err = mxge_reset(sc, 0);
4545 goto abort_with_slices;
4547 err = mxge_alloc_rings(sc);
4549 device_printf(sc->dev, "failed to allocate rings\n");
/*
 * NOTE(review): jumping to abort_with_dmabench here skips
 * mxge_free_slices() for the slices allocated just above -- the
 * mxge_reset() failure path uses abort_with_slices.  Looks like a
 * leak; confirm and retarget to abort_with_slices.
 */
4550 goto abort_with_dmabench;
4553 err = mxge_add_irq(sc);
4555 device_printf(sc->dev, "failed to add irq\n");
4556 goto abort_with_rings;
/* Advertise hardware offload capabilities. */
4559 ifp->if_baudrate = IF_Gbps(10UL);
4560 ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4563 ifp->if_capabilities |= IFCAP_LRO;
4566 #ifdef MXGE_NEW_VLAN_API
4567 ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
/* Jumbo frames only if the loaded firmware supports >= 9000-byte MTU. */
4570 sc->max_mtu = mxge_max_mtu(sc);
4571 if (sc->max_mtu >= 9000)
4572 ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4574 device_printf(dev, "MTU limited to %d. Install "
4575 "latest firmware for 9000 byte jumbo support\n",
4576 sc->max_mtu - ETHER_HDR_LEN);
4577 ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4578 ifp->if_capenable = ifp->if_capabilities;
4579 if (sc->lro_cnt == 0)
4580 ifp->if_capenable &= ~IFCAP_LRO;
4582 ifp->if_init = mxge_init;
4584 ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4585 ifp->if_ioctl = mxge_ioctl;
4586 ifp->if_start = mxge_start;
4587 /* Initialise the ifmedia structure */
4588 ifmedia_init(&sc->media, 0, mxge_media_change,
4590 mxge_set_media(sc, IFM_ETHER | IFM_AUTO);
4591 mxge_media_probe(sc);
4593 ether_ifattach(ifp, sc->mac_addr);
4594 /* ether_ifattach sets mtu to ETHERMTU */
4595 if (mxge_initial_mtu != ETHERMTU)
4596 mxge_change_mtu(sc, mxge_initial_mtu);
4598 mxge_add_sysctls(sc);
4599 #ifdef IFNET_BUF_RING
4600 ifp->if_transmit = mxge_transmit;
4601 ifp->if_qflush = mxge_qflush;
/* Error unwind: reverse order of the bring-up above. */
4606 mxge_free_rings(sc);
4608 mxge_free_slices(sc);
4609 abort_with_dmabench:
4610 mxge_dma_free(&sc->dmabench_dma);
4611 abort_with_zeropad_dma:
4612 mxge_dma_free(&sc->zeropad_dma);
4614 mxge_dma_free(&sc->cmd_dma);
4616 bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4618 pci_disable_busmaster(dev);
4619 mtx_destroy(&sc->cmd_mtx);
4620 mtx_destroy(&sc->driver_mtx);
4622 abort_with_parent_dmat:
4623 bus_dma_tag_destroy(sc->parent_dmat);
/*
 * mxge_detach: device detach entry point.  Refuses to detach while
 * VLAN interfaces are still attached, stops the running interface
 * under the driver lock, detaches from the network stack, then frees
 * all resources in the reverse order of mxge_attach.
 */
4630 mxge_detach(device_t dev)
4632 mxge_softc_t *sc = device_get_softc(dev);
/* VLANs hold references into this ifnet; detaching underneath them would be unsafe. */
4634 if (mxge_vlans_active(sc)) {
4635 device_printf(sc->dev,
4636 "Detach vlans before removing module\n");
4639 mtx_lock(&sc->driver_mtx);
4641 if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
4643 mtx_unlock(&sc->driver_mtx);
4644 ether_ifdetach(sc->ifp);
/* Wait out the tick callout before freeing state it touches. */
4645 callout_drain(&sc->co_hdl);
4646 ifmedia_removeall(&sc->media);
/* Tell the NIC to stop its dummy RDMA engine. */
4647 mxge_dummy_rdma(sc, 0);
4648 mxge_rem_sysctls(sc);
4650 mxge_free_rings(sc);
4651 mxge_free_slices(sc);
4652 mxge_dma_free(&sc->dmabench_dma);
4653 mxge_dma_free(&sc->zeropad_dma);
4654 mxge_dma_free(&sc->cmd_dma);
4655 bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4656 pci_disable_busmaster(dev);
/* NOTE(review): mtx_destroy here vs. lock_init in attach -- same
 * mixed-locking question as in mxge_attach; confirm the lock type. */
4657 mtx_destroy(&sc->cmd_mtx);
4658 mtx_destroy(&sc->driver_mtx);
4660 bus_dma_tag_destroy(sc->parent_dmat);
4665 mxge_shutdown(device_t dev)
4671 This file uses Myri10GE driver indentation.
4674 c-file-style:"linux"