1 /******************************************************************************
3 Copyright (c) 2006-2009, Myricom Inc.
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
9 1. Redistributions of source code must retain the above copyright notice,
10 this list of conditions and the following disclaimer.
12 2. Neither the name of the Myricom Inc, nor the names of its
13 contributors may be used to endorse or promote products derived from
14 this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
28 ***************************************************************************/
30 #include <sys/cdefs.h>
31 /*__FBSDID("$FreeBSD: src/sys/dev/mxge/if_mxge.c,v 1.63 2009/06/26 11:45:06 rwatson Exp $");*/
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/linker.h>
36 #include <sys/firmware.h>
37 #include <sys/endian.h>
38 #include <sys/sockio.h>
40 #include <sys/malloc.h>
41 #include <sys/kernel.h>
43 #include <sys/module.h>
44 #include <sys/socket.h>
45 #include <sys/sysctl.h>
47 /* count xmits ourselves, rather than via drbr */
50 #include <net/if_arp.h>
51 #include <net/ethernet.h>
52 #include <net/if_dl.h>
53 #include <net/if_media.h>
57 #include <net/if_types.h>
58 #include <net/vlan/if_vlan_var.h>
61 #include <netinet/in_systm.h>
62 #include <netinet/in.h>
63 #include <netinet/ip.h>
64 #include <netinet/tcp.h>
66 #include <machine/resource.h>
70 #include <bus/pci/pcireg.h>
71 #include <bus/pci/pcivar.h>
72 #include <bus/pci/pci_private.h> /* XXX for pci_cfg_restore */
74 #include <vm/vm.h> /* for pmap_mapdev() */
77 #if defined(__i386) || defined(__amd64)
78 #include <machine/specialreg.h>
81 #include <dev/netif/mxge/mxge_mcp.h>
82 #include <dev/netif/mxge/mcp_gen_header.h>
83 /*#define MXGE_FAKE_IFP*/
84 #include <dev/netif/mxge/if_mxge_var.h>
86 #include <sys/buf_ring.h>
/*
 * Driver-wide tunables.  Defaults below; the names suggest they are
 * exported as sysctls/loader tunables elsewhere in the driver.
 */
92 static int mxge_nvidia_ecrc_enable = 1;
93 static int mxge_force_firmware = 0;	/* 0 = probe, nonzero = force aligned/unaligned fw */
94 static int mxge_intr_coal_delay = 30;	/* interrupt coalescing delay, usecs */
95 static int mxge_deassert_wait = 1;
96 static int mxge_flow_control = 1;
97 static int mxge_verbose = 0;
98 static int mxge_lro_cnt = 8;
99 static int mxge_ticks;
100 static int mxge_max_slices = 1;
101 static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
102 static int mxge_always_promisc = 0;
103 static int mxge_initial_mtu = ETHERMTU_JUMBO;
/*
 * Firmware image names: "aligned" images assume aligned PCIe read
 * completions, "unaligned" (ethp) images work around unaligned ones;
 * "rss" variants support multiple slices.
 */
104 static char *mxge_fw_unaligned = "mxge_ethp_z8e";
105 static char *mxge_fw_aligned = "mxge_eth_z8e";
106 static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
107 static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
/* newbus device interface entry points and the interrupt handler. */
109 static int mxge_probe(device_t dev);
110 static int mxge_attach(device_t dev);
111 static int mxge_detach(device_t dev);
112 static int mxge_shutdown(device_t dev);
113 static void mxge_intr(void *arg);
/* newbus glue: method table, driver descriptor, and module registration. */
115 static device_method_t mxge_methods[] =
117 /* Device interface */
118 DEVMETHOD(device_probe, mxge_probe),
119 DEVMETHOD(device_attach, mxge_attach),
120 DEVMETHOD(device_detach, mxge_detach),
121 DEVMETHOD(device_shutdown, mxge_shutdown),
125 static driver_t mxge_driver =
129 sizeof(mxge_softc_t),
132 static devclass_t mxge_devclass;
134 /* Declare ourselves to be a child of the PCI bus.*/
135 DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
/* Firmware images are loaded via firmware(9); images are zlib-compressed. */
136 MODULE_DEPEND(mxge, firmware, 1, 1, 1);
137 MODULE_DEPEND(mxge, zlib, 1, 1, 1);
/* Forward declarations for helpers used before their definitions. */
139 static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
140 static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
141 static int mxge_close(mxge_softc_t *sc);
142 static int mxge_open(mxge_softc_t *sc);
143 static void mxge_tick(void *arg);
/*
 * Device probe: match Myricom Z8E / Z8E_9 10GbE NICs and set a
 * human-readable description based on the PCI revision ID; unknown
 * revisions still attach but are flagged on the console.
 */
146 mxge_probe(device_t dev)
151 if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
152 ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
153 (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
154 rev = pci_get_revid(dev);
156 case MXGE_PCI_REV_Z8E:
157 device_set_desc(dev, "Myri10G-PCIE-8A");
159 case MXGE_PCI_REV_Z8ES:
160 device_set_desc(dev, "Myri10G-PCIE-8B");
163 device_set_desc(dev, "Myri10G-PCIE-8??");
164 device_printf(dev, "Unrecognized rev %d NIC\n",
/*
 * Mark the mapped NIC SRAM as write-combining (x86/amd64 only) to
 * speed up PIO copies; logs if pmap_change_attr() fails.
 */
174 mxge_enable_wc(mxge_softc_t *sc)
176 #if defined(__i386) || defined(__amd64)
181 len = rman_get_size(sc->mem_res);
182 err = pmap_change_attr((vm_offset_t) sc->sram,
183 len, PAT_WRITE_COMBINING);
185 device_printf(sc->dev, "pmap_change_attr failed, %d\n",
193 /* callback to get our DMA address */
/* busdma load callback: stores the first segment's bus address in *arg. */
195 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
199 *(bus_addr_t *) arg = segs->ds_addr;
/*
 * Allocate a coherent DMA buffer: create a tag with the requested
 * alignment, allocate zeroed DMAable memory, and load the map so the
 * bus address lands in dma->bus_addr (via mxge_dmamap_callback).
 * On failure, already-acquired resources are released in reverse order.
 */
204 mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
205 bus_size_t alignment)
208 device_t dev = sc->dev;
209 bus_size_t boundary, maxsegsize;
/* special-case 4KB-aligned multi-page buffers (boundary/maxsegsize differ) */
211 if (bytes > 4096 && alignment == 4096) {
219 /* allocate DMAable memory tags */
220 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
221 alignment, /* alignment */
222 boundary, /* boundary */
223 BUS_SPACE_MAXADDR, /* low */
224 BUS_SPACE_MAXADDR, /* high */
225 NULL, NULL, /* filter */
228 maxsegsize, /* maxsegsize */
229 BUS_DMA_COHERENT, /* flags */
230 NULL, NULL, /* lock */
231 &dma->dmat); /* tag */
233 device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
237 /* allocate DMAable memory & map */
238 err = bus_dmamem_alloc(dma->dmat, &dma->addr,
239 (BUS_DMA_WAITOK | BUS_DMA_COHERENT
240 | BUS_DMA_ZERO), &dma->map);
242 device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
243 goto abort_with_dmat;
246 /* load the memory */
247 err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
248 mxge_dmamap_callback,
249 (void *)&dma->bus_addr, 0);
251 device_printf(dev, "couldn't load map (err = %d)\n", err);
/* error unwind: free memory, then destroy the tag */
257 bus_dmamem_free(dma->dmat, dma->addr, dma->map);
259 (void)bus_dma_tag_destroy(dma->dmat);
/* Release a buffer allocated by mxge_dma_alloc(): unload, free, destroy tag. */
265 mxge_dma_free(mxge_dma_t *dma)
267 bus_dmamap_unload(dma->dmat, dma->map);
268 bus_dmamem_free(dma->dmat, dma->addr, dma->map);
269 (void)bus_dma_tag_destroy(dma->dmat);
273 * The eeprom strings on the lanaiX have the format
/*
 * Walk the NUL-separated EEPROM string block looking for "MAC=",
 * "PC=" (product code) and "SN=" (serial number) entries, copying
 * them into the softc.  The MAC is parsed as six hex octets.
 */
280 mxge_parse_strings(mxge_softc_t *sc)
/* advance ptr past the current NUL-terminated string (bounded by limit) */
282 #define MXGE_NEXT_STRING(p) while(ptr < limit && *ptr++)
287 ptr = sc->eeprom_strings;
288 limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
290 while (ptr < limit && *ptr != '\0') {
291 if (memcmp(ptr, "MAC=", 4) == 0) {
293 sc->mac_addr_string = ptr;
294 for (i = 0; i < 6; i++) {
/* need at least two hex digits remaining for this octet */
296 if ((ptr + 2) > limit)
298 sc->mac_addr[i] = strtoul(ptr, NULL, 16);
301 } else if (memcmp(ptr, "PC=", 3) == 0) {
/* size-1 copy; assumes the destination was zeroed so the
 * string stays NUL-terminated — TODO confirm */
303 strncpy(sc->product_code_string, ptr,
304 sizeof (sc->product_code_string) - 1);
305 } else if (memcmp(ptr, "SN=", 3) == 0) {
307 strncpy(sc->serial_number_string, ptr,
308 sizeof (sc->serial_number_string) - 1);
310 MXGE_NEXT_STRING(ptr);
317 device_printf(sc->dev, "failed to parse eeprom_strings\n");
322 #if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
/*
 * Try to enable ECRC generation on the upstream nVidia (CK804/MCP55)
 * PCIe bridge.  Because the OS PCI config accessors here cannot reach
 * extended (>0xff) config space, the bridge's config space is located
 * by physical address and mapped directly with pmap_mapdev(); the
 * ECRC enable bit lives at extended offset 0x178.
 */
324 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
327 unsigned long base, off;
329 device_t pdev, mcp55;
330 uint16_t vendor_id, device_id, word;
331 uintptr_t bus, slot, func, ivend, idev;
335 if (!mxge_nvidia_ecrc_enable)
/* the upstream bridge is the grandparent of the NIC device */
338 pdev = device_get_parent(device_get_parent(sc->dev));
340 device_printf(sc->dev, "could not find parent?\n");
343 vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
344 device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
/* only nVidia bridges (vendor 0x10de) are handled */
346 if (vendor_id != 0x10de)
351 if (device_id == 0x005d) {
352 /* ck804, base address is magic */
354 } else if (device_id >= 0x0374 && device_id <= 0x378) {
355 /* mcp55, base address stored in chipset */
356 mcp55 = pci_find_bsf(0, 0, 0);
358 0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
359 0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
360 word = pci_read_config(mcp55, 0x90, 2);
361 base = ((unsigned long)word & 0x7ffeU) << 25;
368 Test below is commented because it is believed that doing
369 config read/write beyond 0xff will access the config space
370 for the next larger function. Uncomment this and remove
371 the hacky pmap_mapdev() way of accessing config space when
372 FreeBSD grows support for extended pcie config space access
375 /* See if we can, by some miracle, access the extended
377 val = pci_read_config(pdev, 0x178, 4);
378 if (val != 0xffffffff) {
380 pci_write_config(pdev, 0x178, val, 4);
384 /* Rather than using normal pci config space writes, we must
385 * map the Nvidia config space ourselves. This is because on
386 * opteron/nvidia class machine the 0xe000000 mapping is
387 * handled by the nvidia chipset, that means the internal PCI
388 * device (the on-chip northbridge), or the amd-8131 bridge
389 * and things behind them are not visible by this method.
/* fetch the bridge's bus/slot/function and IDs from the bus layer */
392 BUS_READ_IVAR(device_get_parent(pdev), pdev,
394 BUS_READ_IVAR(device_get_parent(pdev), pdev,
395 PCI_IVAR_SLOT, &slot);
396 BUS_READ_IVAR(device_get_parent(pdev), pdev,
397 PCI_IVAR_FUNCTION, &func);
398 BUS_READ_IVAR(device_get_parent(pdev), pdev,
399 PCI_IVAR_VENDOR, &ivend);
400 BUS_READ_IVAR(device_get_parent(pdev), pdev,
401 PCI_IVAR_DEVICE, &idev);
/* compute the physical address of this function's config space */
404 + 0x00100000UL * (unsigned long)bus
405 + 0x00001000UL * (unsigned long)(func
408 /* map it into the kernel */
409 va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
413 device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
416 /* get a pointer to the config space mapped into the kernel */
417 cfgptr = va + (off & PAGE_MASK);
419 /* make sure that we can really access it */
420 vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
421 device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
422 if (! (vendor_id == ivend && device_id == idev)) {
423 device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
424 vendor_id, device_id);
425 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
/* read-modify-write the extended register at 0x178 */
429 ptr32 = (uint32_t*)(cfgptr + 0x178);
432 if (val == 0xffffffff) {
433 device_printf(sc->dev, "extended mapping failed\n");
434 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
438 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
440 device_printf(sc->dev,
441 "Enabled ECRC on upstream Nvidia bridge "
443 (int)bus, (int)slot, (int)func);
/* Non-x86 stub: the direct config-space mapping trick above is x86-only. */
448 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
450 device_printf(sc->dev,
451 "Nforce 4 chipset on non-x86/amd64!?!?!\n");
/*
 * Run the firmware DMA benchmark against the dmabench buffer and
 * record read, write, and concurrent read/write bandwidth (MB/s)
 * into sc->read_dma / sc->write_dma / sc->read_write_dma.
 */
458 mxge_dma_test(mxge_softc_t *sc, int test_type)
461 bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
467 /* Run a small DMA test.
468 * The magic multipliers to the length tell the firmware
469 * to do DMA read, write, or read+write tests. The
470 * results are returned in cmd.data0. The upper 16
471 * bits of the return is the number of transfers completed.
472 * The lower 16 bits is the time in 0.5us ticks that the
473 * transfers took to complete.
476 len = sc->tx_boundary;
/* read test (len * 0x10000) */
478 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
479 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
480 cmd.data2 = len * 0x10000;
481 status = mxge_send_cmd(sc, test_type, &cmd);
486 sc->read_dma = ((cmd.data0>>16) * len * 2) /
487 (cmd.data0 & 0xffff);
/* write test (len * 0x1) */
488 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
489 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
490 cmd.data2 = len * 0x1;
491 status = mxge_send_cmd(sc, test_type, &cmd);
496 sc->write_dma = ((cmd.data0>>16) * len * 2) /
497 (cmd.data0 & 0xffff);
/* combined read+write test (len * 0x10001) */
499 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
500 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
501 cmd.data2 = len * 0x10001;
502 status = mxge_send_cmd(sc, test_type, &cmd);
507 sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
508 (cmd.data0 & 0xffff);
/* failures are expected for the unaligned probe test; stay quiet then */
511 if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
512 device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
519 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
520 * when the PCI-E Completion packets are aligned on an 8-byte
521 * boundary. Some PCI-E chip sets always align Completion packets; on
522 * the ones that do not, the alignment can be enforced by enabling
523 * ECRC generation (if supported).
525 * When PCI-E Completion packets are not aligned, it is actually more
526 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
528 * If the driver can neither enable ECRC nor verify that it has
529 * already been enabled, then it must use a firmware image which works
530 * around unaligned completion packets (ethp_z8e.dat), and it should
531 * also ensure that it never gives the device a Read-DMA which is
532 * larger than 2KB by setting the tx_boundary to 2KB. If ECRC is
533 * enabled, then the driver should use the aligned (eth_z8e.dat)
534 * firmware image, and set tx_boundary to 4KB.
/*
 * Probe whether aligned-completion firmware works on this host:
 * verify the Max Read Request size, load the aligned image, attempt
 * to enable ECRC on nVidia bridges, then run the firmware's
 * unaligned-completion DMA test.  Returns 0 to keep the aligned
 * firmware; nonzero means the caller should fall back to ethp.
 */
538 mxge_firmware_probe(mxge_softc_t *sc)
540 device_t dev = sc->dev;
544 sc->tx_boundary = 4096;
546 * Verify the max read request size was set to 4KB
547 * before trying the test with 4KB.
549 if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
550 pectl = pci_read_config(dev, reg + 0x8, 2);
551 if ((pectl & (5 << 12)) != (5 << 12)) {
552 device_printf(dev, "Max Read Req. size != 4k (0x%x\n",
554 sc->tx_boundary = 2048;
559 * load the optimized firmware (which assumes aligned PCIe
560 * completions) in order to see if it works on this host.
562 sc->fw_name = mxge_fw_aligned;
563 status = mxge_load_firmware(sc, 1);
569 * Enable ECRC if possible
571 mxge_enable_nvidia_ecrc(sc);
574 * Run a DMA test which watches for unaligned completions and
575 * aborts on the first one seen.
578 status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
580 return 0; /* keep the aligned firmware */
583 device_printf(dev, "DMA test failed: %d\n", status);
584 if (status == ENOSYS)
585 device_printf(dev, "Falling back to ethp! "
586 "Please install up to date fw\n");
/*
 * Decide between aligned and unaligned firmware: honor the
 * mxge_force_firmware tunable, use aligned firmware on narrow
 * (<= x4) links without probing, otherwise run mxge_firmware_probe();
 * finally set fw_name/tx_boundary and load the chosen image.
 */
591 mxge_select_firmware(mxge_softc_t *sc)
596 if (mxge_force_firmware != 0) {
597 if (mxge_force_firmware == 1)
602 device_printf(sc->dev,
603 "Assuming %s completions (forced)\n",
604 aligned ? "aligned" : "unaligned");
608 /* if the PCIe link width is 4 or less, we can use the aligned
609 firmware and skip any checks */
610 if (sc->link_width != 0 && sc->link_width <= 4) {
611 device_printf(sc->dev,
612 "PCIe x%d Link, expect reduced performance\n",
618 if (0 == mxge_firmware_probe(sc))
/* aligned path: 4KB read-DMA boundary; unaligned: limit to 2KB */
623 sc->fw_name = mxge_fw_aligned;
624 sc->tx_boundary = 4096;
626 sc->fw_name = mxge_fw_unaligned;
627 sc->tx_boundary = 2048;
629 return (mxge_load_firmware(sc, 0));
/*
 * Sanity-check a firmware image header: verify the MCP type is
 * ethernet, stash the version string (for sysctl) and parse it into
 * major/minor/tiny, and reject images whose major.minor does not
 * match what this driver was built against.
 */
639 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
643 if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
644 device_printf(sc->dev, "Bad firmware type: 0x%x\n",
645 be32toh(hdr->mcp_type));
649 /* save firmware version for sysctl */
/* NOTE(review): strncpy may leave fw_version unterminated if the
 * version string fills the buffer — assumed short enough in practice */
650 strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
652 device_printf(sc->dev, "firmware id: %s\n", hdr->version);
654 ksscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
655 &sc->fw_ver_minor, &sc->fw_ver_tiny);
657 if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
658 && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
659 device_printf(sc->dev, "Found firmware version %s\n",
661 device_printf(sc->dev, "Driver needs %d.%d\n",
662 MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
/* zlib allocator hook: kmalloc-backed, no-wait (may return NULL). */
670 z_alloc(void *nil, u_int items, u_int size)
674 ptr = kmalloc(items * size, M_TEMP, M_NOWAIT);
/* zlib deallocator hook, pairs with z_alloc(). */
679 z_free(void *nil, void *ptr)
/*
 * Fetch the firmware image via firmware(9), inflate it with zlib
 * (using z_alloc/z_free), validate its embedded MCP header, and PIO-
 * copy the decompressed image into NIC SRAM at MXGE_FW_OFFSET.
 * Resources are released via the goto-cleanup labels at the bottom.
 */
686 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
689 char *inflate_buffer;
690 const struct firmware *fw;
691 const mcp_gen_header_t *hdr;
698 fw = firmware_get(sc->fw_name);
700 device_printf(sc->dev, "Could not find firmware image %s\n",
707 /* setup zlib and decompress f/w */
708 bzero(&zs, sizeof (zs));
711 status = inflateInit(&zs);
712 if (status != Z_OK) {
717 /* the uncompressed size is stored as the firmware version,
718 which would otherwise go unused */
719 fw_len = (size_t) fw->version;
720 inflate_buffer = kmalloc(fw_len, M_TEMP, M_NOWAIT);
721 if (inflate_buffer == NULL)
723 zs.avail_in = fw->datasize;
724 zs.next_in = __DECONST(char *, fw->data);
725 zs.avail_out = fw_len;
726 zs.next_out = inflate_buffer;
727 status = inflate(&zs, Z_FINISH);
728 if (status != Z_STREAM_END) {
729 device_printf(sc->dev, "zlib %d\n", status);
731 goto abort_with_buffer;
/* locate and bounds-check the MCP header inside the inflated image */
735 hdr_offset = htobe32(*(const uint32_t *)
736 (inflate_buffer + MCP_HEADER_PTR_OFFSET));
737 if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
738 device_printf(sc->dev, "Bad firmware file");
740 goto abort_with_buffer;
742 hdr = (const void*)(inflate_buffer + hdr_offset);
744 status = mxge_validate_firmware(sc, hdr);
746 goto abort_with_buffer;
748 /* Copy the inflated firmware to NIC SRAM. */
749 for (i = 0; i < fw_len; i += 256) {
750 mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
752 min(256U, (unsigned)(fw_len - i)));
/* cleanup labels: buffer, zlib state, firmware reference */
761 kfree(inflate_buffer, M_TEMP);
765 firmware_put(fw, FIRMWARE_UNLOAD);
770 * Enable or disable periodic RDMAs from the host to make certain
771 * chipsets resend dropped PCIe messages
/*
 * Build a 6-word boot command (confirm addr, confirm data, dummy
 * RDMA addr, enable flag), PIO it to MXGEFW_BOOT_DUMMY_RDMA, and
 * poll the confirmation word for the firmware's 0xffffffff ack.
 */
775 mxge_dummy_rdma(mxge_softc_t *sc, int enable)
778 volatile uint32_t *confirm;
779 volatile char *submit;
780 uint32_t *buf, dma_low, dma_high;
/* align buf to 8 bytes within the on-stack buf_bytes scratch area */
783 buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
785 /* clear confirmation addr */
786 confirm = (volatile uint32_t *)sc->cmd;
790 /* send an rdma command to the PCIe engine, and wait for the
791 response in the confirmation address. The firmware should
792 write a -1 there to indicate it is alive and well
795 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
796 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
797 buf[0] = htobe32(dma_high); /* confirm addr MSW */
798 buf[1] = htobe32(dma_low); /* confirm addr LSW */
799 buf[2] = htobe32(0xffffffff); /* confirm data */
800 dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
801 dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
802 buf[3] = htobe32(dma_high); /* dummy addr MSW */
803 buf[4] = htobe32(dma_low); /* dummy addr LSW */
804 buf[5] = htobe32(enable); /* enable? */
807 submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
809 mxge_pio_copy(submit, buf, 64);
/* poll up to 20 iterations for the firmware's acknowledgment */
814 while (*confirm != 0xffffffff && i < 20) {
818 if (*confirm != 0xffffffff) {
819 device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
820 (enable ? "enable" : "disable"), confirm,
/*
 * Issue a command to the firmware mailbox (MXGEFW_ETH_CMD in SRAM)
 * and poll, under cmd_lock, for the DMA'd response in sc->cmd.
 * The result value is translated to an errno-style return; waits up
 * to ~20ms before declaring a timeout.
 */
827 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
830 char buf_bytes[sizeof(*buf) + 8];
831 volatile mcp_cmd_response_t *response = sc->cmd;
832 volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
833 uint32_t dma_low, dma_high;
834 int err, sleep_total = 0;
836 /* ensure buf is aligned to 8 bytes */
837 buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
/* command and payload are sent big-endian */
839 buf->data0 = htobe32(data->data0);
840 buf->data1 = htobe32(data->data1);
841 buf->data2 = htobe32(data->data2);
842 buf->cmd = htobe32(cmd);
843 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
844 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
846 buf->response_addr.low = htobe32(dma_low);
847 buf->response_addr.high = htobe32(dma_high);
848 lockmgr(&sc->cmd_lock, LK_EXCLUSIVE);
/* 0xffffffff sentinel: firmware overwrites it when it responds */
849 response->result = 0xffffffff;
851 mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
853 /* wait up to 20ms */
855 for (sleep_total = 0; sleep_total < 20; sleep_total++) {
856 bus_dmamap_sync(sc->cmd_dma.dmat,
857 sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
859 switch (be32toh(response->result)) {
861 data->data0 = be32toh(response->data);
867 case MXGEFW_CMD_UNKNOWN:
870 case MXGEFW_CMD_ERROR_UNALIGNED:
873 case MXGEFW_CMD_ERROR_BUSY:
877 device_printf(sc->dev,
879 "failed, result = %d\n",
880 cmd, be32toh(response->result));
/* fell out of the poll loop: report timeout and release the lock */
888 device_printf(sc->dev, "mxge: command %d timed out"
890 cmd, be32toh(response->result));
891 lockmgr(&sc->cmd_lock, LK_RELEASE);
/*
 * Validate the firmware already running on the NIC so it can be
 * adopted instead of reloaded: read its header pointer from SRAM,
 * copy the header to host memory, validate it, and flag the known
 * 1.4.4–1.4.11 rx-filter (broadcast) bug for later workaround.
 */
896 mxge_adopt_running_firmware(mxge_softc_t *sc)
898 struct mcp_gen_header *hdr;
899 const size_t bytes = sizeof (struct mcp_gen_header);
903 /* find running firmware header */
904 hdr_offset = htobe32(*(volatile uint32_t *)
905 (sc->sram + MCP_HEADER_PTR_OFFSET));
907 if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
908 device_printf(sc->dev,
909 "Running firmware has bad header offset (%d)\n",
914 /* copy header of running firmware from SRAM to host memory to
915 * validate firmware */
916 hdr = kmalloc(bytes, M_DEVBUF, M_NOWAIT);
918 device_printf(sc->dev, "could not kmalloc firmware hdr\n");
921 bus_space_read_region_1(rman_get_bustag(sc->mem_res),
922 rman_get_bushandle(sc->mem_res),
923 hdr_offset, (char *)hdr, bytes);
924 status = mxge_validate_firmware(sc, hdr);
925 kfree(hdr, M_DEVBUF);
928 * check to see if adopted firmware has bug where adopting
929 * it will cause broadcasts to be filtered unless the NIC
930 * is kept in ALLMULTI mode
932 if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
933 sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
934 sc->adopted_rx_filter_bug = 1;
935 device_printf(sc->dev, "Adopting fw %d.%d.%d: "
936 "working around rx filter bug\n",
937 sc->fw_ver_major, sc->fw_ver_minor,
/*
 * Load firmware onto the NIC.  First try mxge_load_firmware_helper()
 * (fetch + inflate + copy to SRAM); if that fails and 'adopt' is set,
 * fall back to adopting the firmware already running (downgrading
 * tx_boundary to 2KB since its alignment is unknown).  Then hand off
 * to the bootstrap MCP and poll the confirmation word for its ack.
 */
946 mxge_load_firmware(mxge_softc_t *sc, int adopt)
948 volatile uint32_t *confirm;
949 volatile char *submit;
951 uint32_t *buf, size, dma_low, dma_high;
/* align the on-stack command buffer to 8 bytes */
954 buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
956 size = sc->sram_size;
957 status = mxge_load_firmware_helper(sc, &size);
961 /* Try to use the currently running firmware, if
963 status = mxge_adopt_running_firmware(sc);
965 device_printf(sc->dev,
966 "failed to adopt running firmware\n");
969 device_printf(sc->dev,
970 "Successfully adopted running firmware\n");
971 if (sc->tx_boundary == 4096) {
972 device_printf(sc->dev,
973 "Using firmware currently running on NIC"
975 device_printf(sc->dev,
976 "performance consider loading optimized "
979 sc->fw_name = mxge_fw_unaligned;
980 sc->tx_boundary = 2048;
983 /* clear confirmation addr */
984 confirm = (volatile uint32_t *)sc->cmd;
987 /* send a reload command to the bootstrap MCP, and wait for the
988 response in the confirmation address. The firmware should
989 write a -1 there to indicate it is alive and well
992 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
993 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
995 buf[0] = htobe32(dma_high); /* confirm addr MSW */
996 buf[1] = htobe32(dma_low); /* confirm addr LSW */
997 buf[2] = htobe32(0xffffffff); /* confirm data */
999 /* FIX: All newest firmware should un-protect the bottom of
1000 the sram before handoff. However, the very first interfaces
1001 do not. Therefore the handoff copy must skip the first 8 bytes
1003 /* where the code starts*/
1004 buf[3] = htobe32(MXGE_FW_OFFSET + 8);
1005 buf[4] = htobe32(size - 8); /* length of code */
1006 buf[5] = htobe32(8); /* where to copy to */
1007 buf[6] = htobe32(0); /* where to jump to */
1009 submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
1010 mxge_pio_copy(submit, buf, 64);
/* poll up to 20 iterations for the firmware's 0xffffffff ack */
1015 while (*confirm != 0xffffffff && i < 20) {
1018 bus_dmamap_sync(sc->cmd_dma.dmat,
1019 sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
1021 if (*confirm != 0xffffffff) {
1022 device_printf(sc->dev,"handoff failed (%p = 0x%x)",
/*
 * Push sc->mac_addr to the firmware: pack the six octets into
 * data0 (first four) and data1 (last two) and send SET_MAC_ADDRESS.
 */
1031 mxge_update_mac_address(mxge_softc_t *sc)
1034 uint8_t *addr = sc->mac_addr;
1038 cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1039 | (addr[2] << 8) | addr[3]);
1041 cmd.data1 = ((addr[4] << 8) | (addr[5]));
1043 status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
/* Enable or disable firmware flow control (pause frames); logs on failure. */
1048 mxge_change_pause(mxge_softc_t *sc, int pause)
1054 status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1057 status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1061 device_printf(sc->dev, "Failed to set flow control mode\n");
/*
 * Set firmware promiscuous mode; the mxge_always_promisc tunable
 * forces it on regardless of the requested state.
 */
1069 mxge_change_promisc(mxge_softc_t *sc, int promisc)
1074 if (mxge_always_promisc)
1078 status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1081 status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1085 device_printf(sc->dev, "Failed to set promisc mode\n");
/*
 * Sync the interface multicast list to the firmware: go ALLMULTI,
 * flush the firmware's filters, walk if_multiaddrs joining each
 * AF_LINK group, then re-enable filtering.  Stays in ALLMULTI when
 * IFF_ALLMULTI is set, the adopted-fw rx filter bug is present, or
 * any firmware command fails.
 */
1090 mxge_set_multicast_list(mxge_softc_t *sc)
1093 struct ifmultiaddr *ifma;
1094 struct ifnet *ifp = sc->ifp;
1097 /* This firmware is known to not support multicast */
1098 if (!sc->fw_multicast_support)
1101 /* Disable multicast filtering while we play with the lists*/
1102 err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1104 device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
1105 " error status: %d\n", err);
1109 if (sc->adopted_rx_filter_bug)
1112 if (ifp->if_flags & IFF_ALLMULTI)
1113 /* request to disable multicast filtering, so quit here */
1116 /* Flush all the filters */
1118 err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1120 device_printf(sc->dev,
1121 "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
1122 ", error status: %d\n", err);
1126 /* Walk the multicast list, and add each address */
1128 if_maddr_rlock(ifp);
1129 TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1130 if (ifma->ifma_addr->sa_family != AF_LINK)
/* split the 6-byte link-level address across data0/data1 */
1132 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1134 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
1136 cmd.data0 = htonl(cmd.data0);
1137 cmd.data1 = htonl(cmd.data1);
1138 err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1140 device_printf(sc->dev, "Failed "
1141 "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
1143 /* abort, leaving multicast filtering off */
1144 if_maddr_runlock(ifp);
1148 if_maddr_runlock(ifp);
1149 /* Enable multicast filtering */
1150 err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1152 device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
1153 ", error status: %d\n", err);
/*
 * Determine the largest usable MTU: the firmware maximum when page-
 * sized jumbo clusters suffice or the firmware accepts the
 * always-use-N-big-buffers mode, otherwise limited by MJUMPAGESIZE.
 */
1158 mxge_max_mtu(mxge_softc_t *sc)
1163 if (MJUMPAGESIZE - MXGEFW_PAD > MXGEFW_MAX_MTU)
1164 return MXGEFW_MAX_MTU - MXGEFW_PAD;
1166 /* try to set nbufs to see if it we can
1167 use virtually contiguous jumbos */
1169 status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1172 return MXGEFW_MAX_MTU - MXGEFW_PAD;
1174 /* otherwise, we're limited to MJUMPAGESIZE */
1175 return MJUMPAGESIZE - MXGEFW_PAD;
/*
 * Reset the NIC and rebuild all host/firmware shared state: issue
 * CMD_RESET, re-enable dummy RDMAs, size the interrupt queues,
 * (re)negotiate RSS slices, exchange interrupt queue DMA addresses
 * and the coalescing/ack/deassert register offsets, run the DMA
 * benchmark, zero per-slice counters, and reapply MAC address,
 * promiscuity, pause, and multicast settings.
 */
1179 mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1181 struct mxge_slice_state *ss;
1182 mxge_rx_done_t *rx_done;
1183 volatile uint32_t *irq_claim;
1187 /* try to send a reset command to the card to see if it
1189 memset(&cmd, 0, sizeof (cmd));
1190 status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1192 device_printf(sc->dev, "failed reset\n");
1196 mxge_dummy_rdma(sc, 1);
1199 /* set the intrq size */
1200 cmd.data0 = sc->rx_ring_size;
1201 status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1204 * Even though we already know how many slices are supported
1205 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
1206 * has magic side effects, and must be called after a reset.
1207 * It must be called prior to calling any RSS related cmds,
1208 * including assigning an interrupt queue for anything but
1209 * slice 0. It must also be called *after*
1210 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1211 * the firmware to compute offsets.
1214 if (sc->num_slices > 1) {
1215 /* ask the maximum number of slices it supports */
1216 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
1219 device_printf(sc->dev,
1220 "failed to get number of slices\n");
1224 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1225 * to setting up the interrupt queue DMA
1227 cmd.data0 = sc->num_slices;
1228 cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
1229 #ifdef IFNET_BUF_RING
1230 cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1232 status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
1235 device_printf(sc->dev,
1236 "failed to set number of slices\n");
1242 if (interrupts_setup) {
1243 /* Now exchange information about interrupts */
1244 for (slice = 0; slice < sc->num_slices; slice++) {
1245 rx_done = &sc->ss[slice].rx_done;
1246 memset(rx_done->entry, 0, sc->rx_ring_size);
1247 cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
1248 cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
1250 status |= mxge_send_cmd(sc,
1251 MXGEFW_CMD_SET_INTRQ_DMA,
/* fetch SRAM offsets of the coalescing, irq-ack and deassert registers */
1256 status |= mxge_send_cmd(sc,
1257 MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
1260 sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1262 status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1263 irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1266 status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
1268 sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1270 device_printf(sc->dev, "failed set interrupt parameters\n");
1275 *sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1278 /* run a DMA benchmark */
1279 (void) mxge_dma_test(sc, MXGEFW_DMA_TEST);
1281 for (slice = 0; slice < sc->num_slices; slice++) {
1282 ss = &sc->ss[slice];
/* each slice's irq-ack register is 2 words apart in SRAM */
1284 ss->irq_claim = irq_claim + (2 * slice);
1285 /* reset mcp/driver shared state back to 0 */
1286 ss->rx_done.idx = 0;
1287 ss->rx_done.cnt = 0;
1290 ss->tx.pkt_done = 0;
1291 ss->tx.queue_active = 0;
1292 ss->tx.activate = 0;
1293 ss->tx.deactivate = 0;
1298 ss->rx_small.cnt = 0;
1299 ss->lro_bad_csum = 0;
1301 ss->lro_flushed = 0;
1302 if (ss->fw_stats != NULL) {
1303 ss->fw_stats->valid = 0;
1304 ss->fw_stats->send_done_count = 0;
1307 sc->rdma_tags_available = 15;
/* reapply host-side configuration lost across the reset */
1308 status = mxge_update_mac_address(sc);
1309 mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
1310 mxge_change_pause(sc, sc->pause);
1311 mxge_set_multicast_list(sc);
/*
 * sysctl handler for the interrupt coalescing delay: validates the
 * range (1..1,000,000 usecs) and, under driver_lock, writes the new
 * value (big-endian) to the firmware register and caches it.
 */
1316 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1319 unsigned int intr_coal_delay;
1323 intr_coal_delay = sc->intr_coal_delay;
1324 err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1328 if (intr_coal_delay == sc->intr_coal_delay)
1331 if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1334 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
1335 *sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1336 sc->intr_coal_delay = intr_coal_delay;
1338 lockmgr(&sc->driver_lock, LK_RELEASE);
/*
 * sysctl handler for flow control: on change, applies the new pause
 * setting via mxge_change_pause() under driver_lock.
 */
1343 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1346 unsigned int enabled;
1350 enabled = sc->pause;
1351 err = sysctl_handle_int(oidp, &enabled, arg2, req);
1355 if (enabled == sc->pause)
1358 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
1359 err = mxge_change_pause(sc, enabled);
1360 lockmgr(&sc->driver_lock, LK_RELEASE);
/*
 * Apply a new LRO segment count (caller holds the driver lock):
 * toggles IFCAP_LRO and, if the interface is running, reopens it so
 * the change takes effect.
 */
1365 mxge_change_lro_locked(mxge_softc_t *sc, int lro_cnt)
1372 ifp->if_capenable &= ~IFCAP_LRO;
1374 ifp->if_capenable |= IFCAP_LRO;
1375 sc->lro_cnt = lro_cnt;
1376 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
1378 err = mxge_open(sc);
/*
 * sysctl handler for the LRO count: validates input and delegates to
 * mxge_change_lro_locked() under driver_lock.
 */
1384 mxge_change_lro(SYSCTL_HANDLER_ARGS)
1387 unsigned int lro_cnt;
1391 lro_cnt = sc->lro_cnt;
1392 err = sysctl_handle_int(oidp, &lro_cnt, arg2, req);
1396 if (lro_cnt == sc->lro_cnt)
1402 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
1403 err = mxge_change_lro_locked(sc, lro_cnt);
1404 lockmgr(&sc->driver_lock, LK_RELEASE);
/*
 * sysctl handler that exposes a big-endian 32-bit firmware counter
 * (arg1) as a host-order integer.
 */
1409 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1415 arg2 = be32toh(*(int *)arg1);
1417 err = sysctl_handle_int(oidp, arg1, arg2, req);
/*
 * Tear down the per-slice sysctl subtrees and the slice parent tree;
 * no-op if they were never created.
 */
1423 mxge_rem_sysctls(mxge_softc_t *sc)
1425 struct mxge_slice_state *ss;
1428 if (sc->slice_sysctl_tree == NULL)
1431 for (slice = 0; slice < sc->num_slices; slice++) {
1432 ss = &sc->ss[slice];
1433 if (ss == NULL || ss->sysctl_tree == NULL)
1435 sysctl_ctx_free(&ss->sysctl_ctx);
1436 ss->sysctl_tree = NULL;
1438 sysctl_ctx_free(&sc->slice_sysctl_ctx);
1439 sc->slice_sysctl_tree = NULL;
/*
 * Register the driver's sysctl tree under hw.<devname>: static device
 * info (firmware version, serial number, link width, DMA benchmark
 * results), tunables (interrupt coalescing, flow control, LRO), the
 * firmware's big-endian drop counters (exported via mxge_handle_be32),
 * and one sub-tree per slice with RX/TX debug counters.
 * NOTE(review): extraction omits intervening source lines (several name
 * arguments, returns, closing braces); only comments were added.
 */
1443 mxge_add_sysctls(mxge_softc_t *sc)
1445 struct sysctl_ctx_list *ctx;
1446 struct sysctl_oid_list *children;
1448 struct mxge_slice_state *ss;
1452 ctx = &sc->sysctl_ctx;
1453 sysctl_ctx_init(ctx);
1454 sc->sysctl_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_STATIC_CHILDREN(_hw),
1456 device_get_nameunit(sc->dev),
1458 if (sc->sysctl_tree == NULL) {
1459 device_printf(sc->dev, "can't add sysctl node\n");
1463 children = SYSCTL_CHILDREN(sc->sysctl_tree);
/* firmware stats block lives in slice 0 */
1464 fw = sc->ss[0].fw_stats;
1466 /* random information */
1467 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1469 CTLFLAG_RD, &sc->fw_version,
1470 0, "firmware version");
1471 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1473 CTLFLAG_RD, &sc->serial_number_string,
1474 0, "serial number");
1475 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1477 CTLFLAG_RD, &sc->product_code_string,
1479 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1481 CTLFLAG_RD, &sc->link_width,
1483 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1485 CTLFLAG_RD, &sc->tx_boundary,
1487 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1489 CTLFLAG_RD, &sc->wc,
1490 0, "write combining PIO?");
1491 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1493 CTLFLAG_RD, &sc->read_dma,
1494 0, "DMA Read speed in MB/s");
1495 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1497 CTLFLAG_RD, &sc->write_dma,
1498 0, "DMA Write speed in MB/s");
1499 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1500 "read_write_dma_MBs",
1501 CTLFLAG_RD, &sc->read_write_dma,
1502 0, "DMA concurrent Read/Write speed in MB/s");
1505 /* performance related tunables */
1506 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1508 CTLTYPE_INT|CTLFLAG_RW, sc,
1509 0, mxge_change_intr_coal,
1510 "I", "interrupt coalescing delay in usecs");
1512 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1513 "flow_control_enabled",
1514 CTLTYPE_INT|CTLFLAG_RW, sc,
1515 0, mxge_change_flow_control,
/* NOTE(review): description below is copy-pasted from the coalescing
 * entry above; it should read something like "enable/disable link-level
 * flow control" — fix when the full source is available. */
1516 "I", "interrupt coalescing delay in usecs");
1518 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1520 CTLFLAG_RW, &mxge_deassert_wait,
1521 0, "Wait for IRQ line to go low in ihandler");
1523 /* stats block from firmware is in network byte order.
1525 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1527 CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1528 0, mxge_handle_be32,
1530 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1531 "rdma_tags_available",
1532 CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1533 0, mxge_handle_be32,
1534 "I", "rdma_tags_available");
1535 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1536 "dropped_bad_crc32",
1537 CTLTYPE_INT|CTLFLAG_RD,
1538 &fw->dropped_bad_crc32,
1539 0, mxge_handle_be32,
1540 "I", "dropped_bad_crc32");
1541 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1543 CTLTYPE_INT|CTLFLAG_RD,
1544 &fw->dropped_bad_phy,
1545 0, mxge_handle_be32,
1546 "I", "dropped_bad_phy");
1547 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1548 "dropped_link_error_or_filtered",
1549 CTLTYPE_INT|CTLFLAG_RD,
1550 &fw->dropped_link_error_or_filtered,
1551 0, mxge_handle_be32,
1552 "I", "dropped_link_error_or_filtered");
1553 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1554 "dropped_link_overflow",
1555 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1556 0, mxge_handle_be32,
1557 "I", "dropped_link_overflow");
1558 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1559 "dropped_multicast_filtered",
1560 CTLTYPE_INT|CTLFLAG_RD,
1561 &fw->dropped_multicast_filtered,
1562 0, mxge_handle_be32,
1563 "I", "dropped_multicast_filtered");
1564 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1565 "dropped_no_big_buffer",
1566 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1567 0, mxge_handle_be32,
1568 "I", "dropped_no_big_buffer");
1569 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1570 "dropped_no_small_buffer",
1571 CTLTYPE_INT|CTLFLAG_RD,
1572 &fw->dropped_no_small_buffer,
1573 0, mxge_handle_be32,
1574 "I", "dropped_no_small_buffer");
1575 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1577 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1578 0, mxge_handle_be32,
1579 "I", "dropped_overrun");
1580 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1582 CTLTYPE_INT|CTLFLAG_RD,
1584 0, mxge_handle_be32,
1585 "I", "dropped_pause");
1586 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1588 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1589 0, mxge_handle_be32,
1590 "I", "dropped_runt");
1592 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1593 "dropped_unicast_filtered",
1594 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
1595 0, mxge_handle_be32,
1596 "I", "dropped_unicast_filtered");
1598 /* verbose printing? */
1599 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1601 CTLFLAG_RW, &mxge_verbose,
1602 0, "verbose printing");
1605 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1607 CTLTYPE_INT|CTLFLAG_RW, sc,
1609 "I", "number of lro merge queues");
1612 /* add counters exported for debugging from all slices */
1613 sysctl_ctx_init(&sc->slice_sysctl_ctx);
1614 sc->slice_sysctl_tree =
1615 SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1616 "slice", CTLFLAG_RD, 0, "");
/* one numbered sub-node per slice, each with its own ctx so teardown
 * in mxge_rem_sysctls() can free them independently */
1618 for (slice = 0; slice < sc->num_slices; slice++) {
1619 ss = &sc->ss[slice];
1620 sysctl_ctx_init(&ss->sysctl_ctx);
1621 ctx = &ss->sysctl_ctx;
1622 children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1623 ksprintf(slice_num, "%d", slice);
1625 SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1627 children = SYSCTL_CHILDREN(ss->sysctl_tree);
1628 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1630 CTLFLAG_RD, &ss->rx_small.cnt,
1632 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1634 CTLFLAG_RD, &ss->rx_big.cnt,
1636 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1637 "lro_flushed", CTLFLAG_RD, &ss->lro_flushed,
1638 0, "number of lro merge queues flushed");
1640 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1641 "lro_queued", CTLFLAG_RD, &ss->lro_queued,
1642 0, "number of frames appended to lro merge"
/* without IFNET_BUF_RING only slice 0 transmits, so the per-slice
 * TX counters below are added just once */
1645 #ifndef IFNET_BUF_RING
1646 /* only transmit from slice 0 for now */
1650 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1652 CTLFLAG_RD, &ss->tx.req,
1655 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1657 CTLFLAG_RD, &ss->tx.done,
1659 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1661 CTLFLAG_RD, &ss->tx.pkt_done,
1663 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1665 CTLFLAG_RD, &ss->tx.stall,
1667 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1669 CTLFLAG_RD, &ss->tx.wake,
1671 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1673 CTLFLAG_RD, &ss->tx.defrag,
1675 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1677 CTLFLAG_RD, &ss->tx.queue_active,
1678 0, "tx_queue_active");
1679 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1681 CTLFLAG_RD, &ss->tx.activate,
1683 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1685 CTLFLAG_RD, &ss->tx.deactivate,
1686 0, "tx_deactivate");
1690 /* copy an array of mcp_kreq_ether_send_t's to the mcp. Copy
1691 backwards one at a time and handle ring wraps */
/*
 * Copy 'cnt' send requests into the NIC's lanai window one at a time,
 * walking backwards from the last descriptor so that the first (valid)
 * request is written only after all the others — handles ring wrap.
 * NOTE(review): extraction omits the loop header that decrements cnt.
 */
1694 mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1695 mcp_kreq_ether_send_t *src, int cnt)
1697 int idx, starting_slot;
1698 starting_slot = tx->req;
/* mask keeps idx inside the ring when (starting_slot + cnt) wraps */
1701 idx = (starting_slot + cnt) & tx->mask;
1702 mxge_pio_copy(&tx->lanai[idx],
1703 &src[cnt], sizeof(*src));
1709 * copy an array of mcp_kreq_ether_send_t's to the mcp. Copy
1710 * at most 32 bytes at a time, so as to avoid involving the software
1711 * pio handler in the nic. We re-write the first segment's flags
1712 * to mark them valid only after writing the entire chain
/*
 * Push an array of send requests to the NIC via PIO, at most 32 bytes
 * (two requests) per burst to avoid the NIC's software PIO handler.
 * The first request's flags are withheld and rewritten last so the
 * firmware never sees a partially-written chain as valid.
 * NOTE(review): extraction omits intervening lines (srcp/dstp advance,
 * odd-count tail handling, closing braces).
 */
1716 mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
1721 volatile uint32_t *dst_ints;
1722 mcp_kreq_ether_send_t *srcp;
1723 volatile mcp_kreq_ether_send_t *dstp, *dst;
1726 idx = tx->req & tx->mask;
/* stash the valid flags; they are restored only at the very end */
1728 last_flags = src->flags;
1731 dst = dstp = &tx->lanai[idx];
/* fast path: the chain fits without wrapping the ring */
1734 if ((idx + cnt) < tx->mask) {
1735 for (i = 0; i < (cnt - 1); i += 2) {
1736 mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1737 wmb(); /* force write every 32 bytes */
/* slow path: ring wraps — write everything after the first request */
1742 /* submit all but the first request, and ensure
1743 that it is submitted below */
1744 mxge_submit_req_backwards(tx, src, cnt);
1748 /* submit the first request */
1749 mxge_pio_copy(dstp, srcp, sizeof(*src));
1750 wmb(); /* barrier before setting valid flag */
1753 /* re-write the last 32-bits with the valid flags */
1754 src->flags = last_flags;
1755 src_ints = (uint32_t *)src;
1757 dst_ints = (volatile uint32_t *)dst;
1759 *dst_ints = *src_ints;
/*
 * Build and submit the send-request chain for a TSO packet. Walks the
 * busdma segment list, chopping segments at MSS boundaries and tracking
 * how many RDMAs belong to each firmware request (rdma_count is patched
 * retroactively once a cut or the end of the packet is reached).
 * Negative cum_len means we are still inside the protocol headers.
 * NOTE(review): extraction omits many intervening lines (declarations,
 * loop bodies, the drop/error epilogue); only comments were added.
 */
1767 mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1768 int busdma_seg_cnt, int ip_off)
1771 mcp_kreq_ether_send_t *req;
1772 bus_dma_segment_t *seg;
1775 uint32_t low, high_swapped;
1776 int len, seglen, cum_len, cum_len_next;
1777 int next_is_first, chop, cnt, rdma_count, small;
1778 uint16_t pseudo_hdr_offset, cksum_offset, mss;
1779 uint8_t flags, flags_next;
1782 mss = m->m_pkthdr.tso_segsz;
1784 /* negative cum_len signifies to the
1785 * send loop that we are still in the
1786 * header portion of the TSO packet.
1789 /* ensure we have the ethernet, IP and TCP
1790 header together in the first mbuf, copy
1791 it to a scratch buffer if not */
1792 if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
1793 m_copydata(m, 0, ip_off + sizeof (*ip),
1795 ip = (struct ip *)(ss->scratch + ip_off);
1797 ip = (struct ip *)(mtod(m, char *) + ip_off);
1799 if (__predict_false(m->m_len < ip_off + (ip->ip_hl << 2)
1801 m_copydata(m, 0, ip_off + (ip->ip_hl << 2)
1802 + sizeof (*tcp), ss->scratch);
1803 ip = (struct ip *)(mtod(m, char *) + ip_off);
1806 tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
/* start cum_len at minus the total header length */
1807 cum_len = -(ip_off + ((ip->ip_hl + tcp->th_off) << 2));
1809 /* TSO implies checksum offload on this hardware */
1810 cksum_offset = ip_off + (ip->ip_hl << 2);
1811 flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1814 /* for TSO, pseudo_hdr_offset holds mss.
1815 * The firmware figures out where to put
1816 * the checksum by parsing the header. */
1817 pseudo_hdr_offset = htobe16(mss);
1824 /* "rdma_count" is the number of RDMAs belonging to the
1825 * current packet BEFORE the current send request. For
1826 * non-TSO packets, this is equal to "count".
1827 * For TSO packets, rdma_count needs to be reset
1828 * to 0 after a segment cut.
1830 * The rdma_count field of the send request is
1831 * the number of RDMAs of the packet starting at
1832 * that request. For TSO send requests with one or more cuts
1833 * in the middle, this is the number of RDMAs starting
1834 * after the last cut in the request. All previous
1835 * segments before the last cut implicitly have 1 RDMA.
1837 * Since the number of RDMAs is not known beforehand,
1838 * it must be filled-in retroactively - after each
1839 * segmentation cut or at the end of the entire packet.
1842 while (busdma_seg_cnt) {
1843 /* Break the busdma segment up into pieces*/
1844 low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1845 high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1849 flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1851 cum_len_next = cum_len + seglen;
/* back-patch the rdma_count of the request that started this run */
1852 (req-rdma_count)->rdma_count = rdma_count + 1;
1853 if (__predict_true(cum_len >= 0)) {
/* in payload: chop at MSS boundaries */
1855 chop = (cum_len_next > mss);
1856 cum_len_next = cum_len_next % mss;
1857 next_is_first = (cum_len_next == 0);
/* branch-free flag math: chop/next_is_first are 0 or 1 */
1858 flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1859 flags_next |= next_is_first *
1861 rdma_count |= -(chop | next_is_first);
1862 rdma_count += chop & !next_is_first;
1863 } else if (cum_len_next >= 0) {
/* header/payload boundary falls inside this segment */
1868 small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1869 flags_next = MXGEFW_FLAGS_TSO_PLD |
1870 MXGEFW_FLAGS_FIRST |
1871 (small * MXGEFW_FLAGS_SMALL);
1874 req->addr_high = high_swapped;
1875 req->addr_low = htobe32(low);
1876 req->pseudo_hdr_offset = pseudo_hdr_offset;
1878 req->rdma_count = 1;
1879 req->length = htobe16(seglen);
1880 req->cksum_offset = cksum_offset;
1881 req->flags = flags | ((cum_len & 1) *
1882 MXGEFW_FLAGS_ALIGN_ODD);
1885 cum_len = cum_len_next;
1890 if (__predict_false(cksum_offset > seglen))
1891 cksum_offset -= seglen;
/* bail to the drop path if the chain outgrew the descriptor budget */
1894 if (__predict_false(cnt > tx->max_desc))
1900 (req-rdma_count)->rdma_count = rdma_count;
/* walk back over trailing requests, marking the last real one */
1904 req->flags |= MXGEFW_FLAGS_TSO_LAST;
1905 } while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1907 tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1908 mxge_submit_req(tx, tx->req_list, cnt);
1909 #ifdef IFNET_BUF_RING
1910 if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1911 /* tell the NIC to start polling this slice */
1913 tx->queue_active = 1;
/* drop path: too many descriptors — unload the DMA map and complain */
1921 bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1925 kprintf("tx->max_desc exceeded via TSO!\n");
1926 kprintf("mss = %d, %ld, %d!\n", mss,
1927 (long)seg - (long)tx->seg_list, tx->max_desc);
1934 #endif /* IFCAP_TSO4 */
1936 #ifdef MXGE_NEW_VLAN_API
1938 * We reproduce the software vlan tag insertion from
1939 * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
1940 * vlan tag insertion. We need to advertise this in order to have the
1941 * vlan interface respect our csum offload flags.
/*
 * Prepend an 802.1Q header carrying m_pkthdr.ether_vtag and clear
 * M_VLANTAG. Returns the (possibly reallocated) mbuf, or NULL if
 * M_PREPEND/m_pullup failed — caller must treat NULL as a dropped frame.
 */
1943 static struct mbuf *
1944 mxge_vlan_tag_insert(struct mbuf *m)
1946 struct ether_vlan_header *evl;
1948 M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, MB_DONTWAIT);
1949 if (__predict_false(m == NULL))
1951 if (m->m_len < sizeof(*evl)) {
1952 m = m_pullup(m, sizeof(*evl));
1953 if (__predict_false(m == NULL))
1957 * Transform the Ethernet header into an Ethernet header
1958 * with 802.1Q encapsulation.
/* slide the src/dst MACs forward over the freshly prepended bytes */
1960 evl = mtod(m, struct ether_vlan_header *);
1961 bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
1962 (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
1963 evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
1964 evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
1965 m->m_flags &= ~M_VLANTAG;
1968 #endif /* MXGE_NEW_VLAN_API */
/*
 * Encapsulate one outbound mbuf chain: software-insert the VLAN tag if
 * needed, DMA-map the chain (defragmenting once on EFBIG), hand TSO
 * frames to mxge_encap_tso(), otherwise build one send request per DMA
 * segment (with optional checksum offload), pad runts to 60 bytes with
 * the shared zero page, and submit the chain to the NIC.
 * NOTE(review): extraction omits many intervening lines (error paths,
 * req advancing, closing braces); only comments were added.
 */
1971 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
1974 mcp_kreq_ether_send_t *req;
1975 bus_dma_segment_t *seg;
1980 int cnt, cum_len, err, i, idx, odd_flag, ip_off;
1981 uint16_t pseudo_hdr_offset;
1982 uint8_t flags, cksum_offset;
1989 ip_off = sizeof (struct ether_header);
1990 #ifdef MXGE_NEW_VLAN_API
1991 if (m->m_flags & M_VLANTAG) {
1992 m = mxge_vlan_tag_insert(m);
1993 if (__predict_false(m == NULL))
/* IP header moves back by the 4-byte 802.1Q encapsulation */
1995 ip_off += ETHER_VLAN_ENCAP_LEN;
1998 /* (try to) map the frame for DMA */
1999 idx = tx->req & tx->mask;
2000 err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
2001 m, tx->seg_list, &cnt,
2003 if (__predict_false(err == EFBIG)) {
2004 /* Too many segments in the chain. Try
2006 m_tmp = m_defrag(m, M_NOWAIT);
2007 if (m_tmp == NULL) {
2012 err = bus_dmamap_load_mbuf_sg(tx->dmat,
2014 m, tx->seg_list, &cnt,
2017 if (__predict_false(err != 0)) {
2018 device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
2019 " packet len = %d\n", err, m->m_pkthdr.len);
2022 bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2023 BUS_DMASYNC_PREWRITE);
2024 tx->info[idx].m = m;
2027 /* TSO is different enough, we handle it in another routine */
2028 if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2029 mxge_encap_tso(ss, m, cnt, ip_off);
2036 pseudo_hdr_offset = 0;
2037 flags = MXGEFW_FLAGS_NO_TSO;
2039 /* checksum offloading? */
2040 if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
2041 /* ensure ip header is in first mbuf, copy
2042 it to a scratch buffer if not */
2043 if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
2044 m_copydata(m, 0, ip_off + sizeof (*ip),
2046 ip = (struct ip *)(ss->scratch + ip_off);
2048 ip = (struct ip *)(mtod(m, char *) + ip_off);
2050 cksum_offset = ip_off + (ip->ip_hl << 2);
2051 pseudo_hdr_offset = cksum_offset + m->m_pkthdr.csum_data;
2052 pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2053 req->cksum_offset = cksum_offset;
2054 flags |= MXGEFW_FLAGS_CKSUM;
2055 odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2059 if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2060 flags |= MXGEFW_FLAGS_SMALL;
2062 /* convert segments into a request list */
2065 req->flags = MXGEFW_FLAGS_FIRST;
2066 for (i = 0; i < cnt; i++) {
2068 htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2070 htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2071 req->length = htobe16(seg->ds_len);
2072 req->cksum_offset = cksum_offset;
/* cksum_offset only applies to the request containing the header */
2073 if (cksum_offset > seg->ds_len)
2074 cksum_offset -= seg->ds_len;
2077 req->pseudo_hdr_offset = pseudo_hdr_offset;
2078 req->pad = 0; /* complete solid 16-byte block */
2079 req->rdma_count = 1;
2080 req->flags |= flags | ((cum_len & 1) * odd_flag);
2081 cum_len += seg->ds_len;
2087 /* pad runts to 60 bytes */
/* extra descriptor pointing at the driver's DMA-able zero page */
2091 htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2093 htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2094 req->length = htobe16(60 - cum_len);
2095 req->cksum_offset = 0;
2096 req->pseudo_hdr_offset = pseudo_hdr_offset;
2097 req->pad = 0; /* complete solid 16-byte block */
2098 req->rdma_count = 1;
2099 req->flags |= flags | ((cum_len & 1) * odd_flag);
2103 tx->req_list[0].rdma_count = cnt;
2105 /* print what the firmware will see */
2106 for (i = 0; i < cnt; i++) {
2107 kprintf("%d: addr: 0x%x 0x%x len:%d pso%d,"
2108 "cso:%d, flags:0x%x, rdma:%d\n",
2109 i, (int)ntohl(tx->req_list[i].addr_high),
2110 (int)ntohl(tx->req_list[i].addr_low),
2111 (int)ntohs(tx->req_list[i].length),
2112 (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2113 tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2114 tx->req_list[i].rdma_count);
2116 kprintf("--------------\n");
/* flag the last descriptor so mxge_tx_done knows a packet completed */
2118 tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2119 mxge_submit_req(tx, tx->req_list, cnt);
2120 #ifdef IFNET_BUF_RING
2121 if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2122 /* tell the NIC to start polling this slice */
2124 tx->queue_active = 1;
2137 #ifdef IFNET_BUF_RING
/*
 * if_qflush handler (multi-queue build): drain and free every mbuf
 * queued in each slice's buf_ring, under that slice's TX lock.
 */
2139 mxge_qflush(struct ifnet *ifp)
2141 mxge_softc_t *sc = ifp->if_softc;
2146 for (slice = 0; slice < sc->num_slices; slice++) {
2147 tx = &sc->ss[slice].tx;
2148 lockmgr(&tx->lock, LK_EXCLUSIVE);
2149 while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2151 lockmgr(&tx->lock, LK_RELEASE);
/*
 * Drain the slice's buf_ring into the NIC while transmit descriptors
 * remain (caller holds the TX lock). Sets the per-slice OACTIVE flag
 * when the ring fills with work still pending.
 * NOTE(review): extraction omits the BPF tap and mxge_encap() calls
 * referenced by the comments below.
 */
2157 mxge_start_locked(struct mxge_slice_state *ss)
/* (mask - outstanding) is the free descriptor count */
2168 while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2169 m = drbr_dequeue(ifp, tx->br);
2173 /* let BPF see it */
2176 /* give it to the nic */
2179 /* ran out of transmit slots */
2180 if (((ss->if_drv_flags & IFF_DRV_OACTIVE) == 0)
2181 && (!drbr_empty(ifp, tx->br))) {
2182 ss->if_drv_flags |= IFF_DRV_OACTIVE;
/*
 * Transmit one mbuf on this slice with the TX lock held. Fast path:
 * if the buf_ring is empty and descriptors are free, send directly;
 * otherwise enqueue to the ring and kick mxge_start_locked().
 * If the slice is not RUNNING (or is OACTIVE), just enqueue.
 */
2188 mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2199 if ((ss->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
2201 err = drbr_enqueue(ifp, tx->br, m);
/* direct-dispatch path: nothing queued ahead of us and room in ring */
2205 if (drbr_empty(ifp, tx->br) &&
2206 ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2207 /* let BPF see it */
2209 /* give it to the nic */
2211 } else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2214 if (!drbr_empty(ifp, tx->br))
2215 mxge_start_locked(ss);
/*
 * if_transmit entry point (multi-queue build): pick a slice from the
 * mbuf's flowid, then either transmit under the slice lock or, if the
 * lock is contended, just enqueue to that slice's buf_ring.
 * NOTE(review): lockmgr() conventionally returns 0 on success, which
 * would make this condition inverted (transmitting when the trylock
 * FAILED). The upstream FreeBSD code used mtx_trylock(), which returns
 * non-zero on success — verify the DragonFly lockmgr semantics here.
 */
2220 mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2222 mxge_softc_t *sc = ifp->if_softc;
2223 struct mxge_slice_state *ss;
2228 slice = m->m_pkthdr.flowid;
2229 slice &= (sc->num_slices - 1); /* num_slices always power of 2 */
2231 ss = &sc->ss[slice];
2234 if (lockmgr(&tx->lock, LK_EXCLUSIVE|LK_NOWAIT)) {
2235 err = mxge_transmit_locked(ss, m);
2236 lockmgr(&tx->lock, LK_RELEASE);
2238 err = drbr_enqueue(ifp, tx->br, m);
/*
 * Legacy (non-buf-ring) start routine: drain the interface send queue
 * into the NIC while descriptors remain; set IFF_DRV_OACTIVE when the
 * ring fills. Caller holds the slice TX lock.
 * NOTE(review): extraction omits the BPF tap and mxge_encap() calls.
 */
2247 mxge_start_locked(struct mxge_slice_state *ss)
2257 while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2258 IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
2262 /* let BPF see it */
2265 /* give it to the nic */
2268 /* ran out of transmit slots */
2269 if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
2270 sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
/*
 * if_start entry point (legacy build): serialize on slice 0's TX lock
 * and run the locked start routine.
 */
2276 mxge_start(struct ifnet *ifp)
2278 mxge_softc_t *sc = ifp->if_softc;
2279 struct mxge_slice_state *ss;
2281 /* only use the first slice for now */
2283 lockmgr(&ss->tx.lock, LK_EXCLUSIVE);
2284 mxge_start_locked(ss);
2285 lockmgr(&ss->tx.lock, LK_RELEASE);
2289 * copy an array of mcp_kreq_ether_recv_t's to the mcp. Copy
2290 * at most 32 bytes at a time, so as to avoid involving the software
2291 * pio handler in the nic. We re-write the first segment's low
2292 * DMA address to mark it valid only after we write the entire chunk
/*
 * Submit 8 receive descriptors to the NIC in two 32-byte PIO bursts.
 * The first descriptor's addr_low is temporarily poisoned (0xffffffff =
 * invalid) so the firmware cannot consume a half-written group; it is
 * restored last, making the whole group valid atomically.
 */
2296 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2297 mcp_kreq_ether_recv_t *src)
2301 low = src->addr_low;
2302 src->addr_low = 0xffffffff;
2303 mxge_pio_copy(dst, src, 4 * sizeof (*src));
2305 mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2307 src->addr_low = low;
/* final write flips the group valid */
2308 dst->addr_low = low;
/*
 * Allocate and DMA-map a small (MHLEN) receive mbuf for ring slot idx,
 * record its address in the shadow ring, and — every 8th slot — push
 * the completed group of 8 descriptors to the NIC via mxge_submit_8rx().
 * Returns non-zero on allocation/mapping failure (caller recycles the
 * old buffer).
 * NOTE(review): extraction omits the failure paths and return.
 */
2313 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2315 bus_dma_segment_t seg;
2317 mxge_rx_ring_t *rx = &ss->rx_small;
2320 m = m_gethdr(MB_DONTWAIT, MT_DATA);
2327 err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2328 &seg, &cnt, BUS_DMA_NOWAIT);
2333 rx->info[idx].m = m;
2334 rx->shadow[idx].addr_low =
2335 htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2336 rx->shadow[idx].addr_high =
2337 htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
/* batch descriptors to the NIC in groups of 8 */
2341 mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
/*
 * Allocate and DMA-map a big (cluster or jumbo-cluster) receive mbuf
 * for ring slot idx. With MXGE_VIRT_JUMBOS a jumbo buffer may span
 * several segments, each getting its own shadow descriptor; every 8th
 * slot pushes a group of 8 descriptors to the NIC.
 * NOTE(review): extraction omits failure paths and the idx increment
 * inside the submit loop.
 */
2346 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2348 bus_dma_segment_t seg[3];
2350 mxge_rx_ring_t *rx = &ss->rx_big;
/* plain 2K cluster vs jumbo cluster, depending on configured size */
2353 if (rx->cl_size == MCLBYTES)
2354 m = m_getcl(MB_DONTWAIT, MT_DATA, M_PKTHDR);
2356 m = m_getjcl(MB_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2362 m->m_len = rx->mlen;
2363 err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2364 seg, &cnt, BUS_DMA_NOWAIT);
2369 rx->info[idx].m = m;
2370 rx->shadow[idx].addr_low =
2371 htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2372 rx->shadow[idx].addr_high =
2373 htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2375 #if MXGE_VIRT_JUMBOS
2376 for (i = 1; i < cnt; i++) {
2377 rx->shadow[idx + i].addr_low =
2378 htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2379 rx->shadow[idx + i].addr_high =
2380 htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2385 for (i = 0; i < rx->nbufs; i++) {
2386 if ((idx & 7) == 7) {
2387 mxge_submit_8rx(&rx->lanai[idx - 7],
2388 &rx->shadow[idx - 7]);
2396 * Myri10GE hardware checksums are not valid if the sender
2397 * padded the frame with non-zero padding. This is because
2398 * the firmware just does a simple 16-bit 1s complement
2399 * checksum across the entire frame, excluding the first 14
2400 * bytes. It is best to simply to check the checksum and
2401 * tell the stack about it only if the checksum is good
/*
 * Validate the firmware's raw frame checksum for IPv4 TCP/UDP by
 * folding in the pseudo-header; a result of 0 means the checksum is
 * good. Non-IPv4 or non-TCP/UDP frames are rejected (handled by the
 * elided early-return paths).
 */
2404 static inline uint16_t
2405 mxge_rx_csum(struct mbuf *m, int csum)
2407 struct ether_header *eh;
2411 eh = mtod(m, struct ether_header *);
2413 /* only deal with IPv4 TCP & UDP for now */
2414 if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2416 ip = (struct ip *)(eh + 1);
2417 if (__predict_false(ip->ip_p != IPPROTO_TCP &&
2418 ip->ip_p != IPPROTO_UDP))
/* add the pseudo-header sum; a correct frame folds to zero */
2421 c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2422 htonl(ntohs(csum) + ntohs(ip->ip_len) +
2423 - (ip->ip_hl << 2) + ip->ip_p));
/*
 * Strip the 802.1Q header from a received frame, adjust the firmware's
 * frame checksum to account for the 4 removed bytes (one's-complement
 * subtraction with carry folding), and record the tag either in
 * m_pkthdr.ether_vtag (new VLAN API) or as an m_tag (old API).
 */
2432 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2434 struct ether_vlan_header *evl;
2435 struct ether_header *eh;
2438 evl = mtod(m, struct ether_vlan_header *);
2439 eh = mtod(m, struct ether_header *);
2442 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
2443 * after what the firmware thought was the end of the ethernet
2447 /* put checksum into host byte order */
2448 *csum = ntohs(*csum);
/* 'partial' covers the 4 tag bytes being removed from the frame */
2449 partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2450 (*csum) += ~partial;
2451 (*csum) += ((*csum) < ~partial);
/* fold twice: the first fold can itself produce a carry */
2452 (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2453 (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2455 /* restore checksum to network byte order;
2456 later consumers expect this */
2457 *csum = htons(*csum);
2460 #ifdef MXGE_NEW_VLAN_API
2461 m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
2465 mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2469 VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2470 m_tag_prepend(m, mtag);
2474 m->m_flags |= M_VLANTAG;
2477 * Remove the 802.1q header by copying the Ethernet
2478 * addresses over it and adjusting the beginning of
2479 * the data in the mbuf. The encapsulated Ethernet
2480 * type field is already in place.
2482 bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2483 ETHER_HDR_LEN - ETHER_TYPE_LEN);
2484 m_adj(m, ETHER_VLAN_ENCAP_LEN);
/*
 * Process one completed big-ring receive: replace the mbuf in the ring
 * (dropping the frame if refill fails), fix up the mbuf header, strip
 * any VLAN tag, validate the hardware checksum, try LRO, and finally
 * hand the frame to the stack via if_input.
 * NOTE(review): this is nearly identical to mxge_rx_done_small() below;
 * the duplicated tail could be factored into a shared helper when the
 * full source is available.
 */
2489 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2494 struct ether_header *eh;
2496 bus_dmamap_t old_map;
2498 uint16_t tcpudp_csum;
2503 idx = rx->cnt & rx->mask;
/* big buffers may occupy several ring slots (virtual jumbos) */
2504 rx->cnt += rx->nbufs;
2505 /* save a pointer to the received mbuf */
2506 m = rx->info[idx].m;
2507 /* try to replace the received mbuf */
2508 if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2509 /* drop the frame -- the old mbuf is re-cycled */
2514 /* unmap the received buffer */
2515 old_map = rx->info[idx].map;
2516 bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2517 bus_dmamap_unload(rx->dmat, old_map);
2519 /* swap the bus_dmamap_t's */
2520 rx->info[idx].map = rx->extra_map;
2521 rx->extra_map = old_map;
2523 /* mcp implicitly skips 1st 2 bytes so that packet is properly
2525 m->m_data += MXGEFW_PAD;
2527 m->m_pkthdr.rcvif = ifp;
2528 m->m_len = m->m_pkthdr.len = len;
2530 eh = mtod(m, struct ether_header *);
2531 if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2532 mxge_vlan_tag_remove(m, &csum);
2534 /* if the checksum is valid, mark it in the mbuf header */
2535 if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2536 if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2538 /* otherwise, it was a UDP frame, or a TCP frame which
2539 we could not do LRO on. Tell the stack that the
2541 m->m_pkthdr.csum_data = 0xffff;
2542 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2544 /* flowid only valid if RSS hashing is enabled */
2545 if (sc->num_slices > 1) {
2546 m->m_pkthdr.flowid = (ss - sc->ss);
2547 m->m_flags |= M_FLOWID;
2549 /* pass the frame up the stack */
2550 (*ifp->if_input)(ifp, m);
/*
 * Process one completed small-ring receive. Identical flow to
 * mxge_rx_done_big() except a single-slot small buffer is refilled via
 * mxge_get_buf_small(): refill, remap, strip VLAN, checksum, LRO,
 * if_input.
 */
2554 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2558 struct ether_header *eh;
2561 bus_dmamap_t old_map;
2563 uint16_t tcpudp_csum;
2568 idx = rx->cnt & rx->mask;
2570 /* save a pointer to the received mbuf */
2571 m = rx->info[idx].m;
2572 /* try to replace the received mbuf */
2573 if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2574 /* drop the frame -- the old mbuf is re-cycled */
2579 /* unmap the received buffer */
2580 old_map = rx->info[idx].map;
2581 bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2582 bus_dmamap_unload(rx->dmat, old_map);
2584 /* swap the bus_dmamap_t's */
2585 rx->info[idx].map = rx->extra_map;
2586 rx->extra_map = old_map;
2588 /* mcp implicitly skips 1st 2 bytes so that packet is properly
2590 m->m_data += MXGEFW_PAD;
2592 m->m_pkthdr.rcvif = ifp;
2593 m->m_len = m->m_pkthdr.len = len;
2595 eh = mtod(m, struct ether_header *);
2596 if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2597 mxge_vlan_tag_remove(m, &csum);
2599 /* if the checksum is valid, mark it in the mbuf header */
2600 if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2601 if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2603 /* otherwise, it was a UDP frame, or a TCP frame which
2604 we could not do LRO on. Tell the stack that the
2606 m->m_pkthdr.csum_data = 0xffff;
2607 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2609 /* flowid only valid if RSS hashing is enabled */
2610 if (sc->num_slices > 1) {
2611 m->m_pkthdr.flowid = (ss - sc->ss);
2612 m->m_flags |= M_FLOWID;
2614 /* pass the frame up the stack */
2615 (*ifp->if_input)(ifp, m);
/*
 * Drain the slice's RX completion ring: dispatch each entry to the
 * small- or big-buffer handler based on length, with a half-ring cap
 * per call to avoid RX livelock, then flush any LRO sessions that were
 * left active.
 * NOTE(review): extraction omits the rx_done->cnt increment implied by
 * the idx recomputation at 2636.
 */
2619 mxge_clean_rx_done(struct mxge_slice_state *ss)
2621 mxge_rx_done_t *rx_done = &ss->rx_done;
/* a zero length marks an unconsumed (empty) completion slot */
2627 while (rx_done->entry[rx_done->idx].length != 0) {
2628 length = ntohs(rx_done->entry[rx_done->idx].length);
2629 rx_done->entry[rx_done->idx].length = 0;
2630 checksum = rx_done->entry[rx_done->idx].checksum;
2631 if (length <= (MHLEN - MXGEFW_PAD))
2632 mxge_rx_done_small(ss, length, checksum);
2634 mxge_rx_done_big(ss, length, checksum);
2636 rx_done->idx = rx_done->cnt & rx_done->mask;
2638 /* limit potential for livelock */
2639 if (__predict_false(++limit > rx_done->mask / 2))
/* flush every LRO session accumulated during this pass */
2643 while (!SLIST_EMPTY(&ss->lro_active)) {
2644 struct lro_entry *lro = SLIST_FIRST(&ss->lro_active);
2645 SLIST_REMOVE_HEAD(&ss->lro_active, next);
2646 mxge_lro_flush(ss, lro);
/*
 * Reap completed transmit descriptors up to the firmware's reported
 * index (mcp_idx): free mbufs, unload DMA maps, bump byte/mcast stats,
 * then clear OACTIVE and restart transmission if enough ring space
 * freed up. With IFNET_BUF_RING, also tell the NIC to stop polling a
 * slice whose ring has fully drained.
 * NOTE(review): extraction omits tx->done/pkt_done advancement lines.
 */
2653 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2664 while (tx->pkt_done != mcp_idx) {
2665 idx = tx->done & tx->mask;
2667 m = tx->info[idx].m;
2668 /* mbuf and DMA map only attached to the first
2671 ss->obytes += m->m_pkthdr.len;
2672 if (m->m_flags & M_MCAST)
2675 tx->info[idx].m = NULL;
2676 map = tx->info[idx].map;
2677 bus_dmamap_unload(tx->dmat, map);
/* .flag marks the last descriptor of a packet (set in mxge_encap) */
2680 if (tx->info[idx].flag) {
2681 tx->info[idx].flag = 0;
2686 /* If we have space, clear IFF_OACTIVE to tell the stack that
2687 its OK to send packets */
2688 #ifdef IFNET_BUF_RING
2689 flags = &ss->if_drv_flags;
2691 flags = &ifp->if_drv_flags;
2693 lockmgr(&ss->tx.lock, LK_EXCLUSIVE);
/* resume once at least 3/4 of the ring is free */
2694 if ((*flags) & IFF_DRV_OACTIVE &&
2695 tx->req - tx->done < (tx->mask + 1)/4) {
2696 *(flags) &= ~IFF_DRV_OACTIVE;
2698 mxge_start_locked(ss);
2700 #ifdef IFNET_BUF_RING
2701 if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2702 /* let the NIC stop polling this queue, since there
2703 * are no more transmits pending */
/* NOTE(review): this inner test repeats the condition already
 * checked at 2701 and is redundant as written. */
2704 if (tx->req == tx->done) {
2706 tx->queue_active = 0;
2712 lockmgr(&ss->tx.lock, LK_RELEASE);
/*
 * Media lookup tables: map transceiver compliance-register bits to
 * ifmedia types. Entries with flag 0 are media the stack has no
 * constant for; the XFP table's first entry (mask 0x7f) is the CX4
 * catch-all checked specially in mxge_media_probe().
 */
2716 static struct mxge_media_type mxge_xfp_media_types[] =
2718 {IFM_10G_CX4, 0x7f, "10GBASE-CX4 (module)"},
2719 {IFM_10G_SR, (1 << 7), "10GBASE-SR"},
2720 {IFM_10G_LR, (1 << 6), "10GBASE-LR"},
2721 {0, (1 << 5), "10GBASE-ER"},
2722 {IFM_10G_LRM, (1 << 4), "10GBASE-LRM"},
2723 {0, (1 << 3), "10GBASE-SW"},
2724 {0, (1 << 2), "10GBASE-LW"},
2725 {0, (1 << 1), "10GBASE-EW"},
2726 {0, (1 << 0), "Reserved"}
2728 static struct mxge_media_type mxge_sfp_media_types[] =
2730 {0, (1 << 7), "Reserved"},
2731 {IFM_10G_LRM, (1 << 6), "10GBASE-LRM"},
2732 {IFM_10G_LR, (1 << 5), "10GBASE-LR"},
2733 {IFM_10G_SR, (1 << 4), "10GBASE-SR"}
/*
 * OR the detected media type into the cached flags and register it as
 * the current ifmedia selection.
 */
2737 mxge_set_media(mxge_softc_t *sc, int type)
2739 sc->media_flags |= type;
2740 ifmedia_add(&sc->media, sc->media_flags, 0, NULL);
2741 ifmedia_set(&sc->media, sc->media_flags);
2746 * Determine the media type for a NIC. Some XFPs will identify
2747 * themselves only when their link is up, so this is initiated via a
2748 * link up interrupt. However, this can potentially take up to
2749 * several milliseconds, so it is run via the watchdog routine, rather
2750 * than in the interrupt handler itself. This need only be done
2751 * once, not each time the link is up.
/*
 * NOTE(review): extraction omits many intervening lines (returns, the
 * cage_type/byte assignments, cmd setup); only comments were added.
 */
2754 mxge_media_probe(mxge_softc_t *sc)
2759 struct mxge_media_type *mxge_media_types = NULL;
2760 int i, err, ms, mxge_media_type_entries;
2763 sc->need_media_probe = 0;
2765 /* if we've already set a media type, we're done */
2766 if (sc->media_flags != (IFM_ETHER | IFM_AUTO))
2770 * parse the product code to determine the interface type
2771 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2772 * after the 3rd dash in the driver's cached copy of the
2773 * EEPROM's product code string.
2775 ptr = sc->product_code_string;
2777 device_printf(sc->dev, "Missing product code\n");
2780 for (i = 0; i < 3; i++, ptr++) {
2781 ptr = index(ptr, '-');
2783 device_printf(sc->dev,
2784 "only %d dashes in PC?!?\n", i);
/* -C suffix: CX4, no transceiver cage to interrogate */
2790 mxge_set_media(sc, IFM_10G_CX4);
2793 else if (*ptr == 'Q') {
2794 /* -Q is Quad Ribbon Fiber */
2795 device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2796 /* FreeBSD has no media type for Quad ribbon fiber */
/* -R suffix: XFP cage — identify module via I2C below */
2802 mxge_media_types = mxge_xfp_media_types;
2803 mxge_media_type_entries =
2804 sizeof (mxge_xfp_media_types) /
2805 sizeof (mxge_xfp_media_types[0]);
2806 byte = MXGE_XFP_COMPLIANCE_BYTE;
2810 if (*ptr == 'S' || *(ptr +1) == 'S') {
2811 /* -S or -2S is SFP+ */
2812 mxge_media_types = mxge_sfp_media_types;
2813 mxge_media_type_entries =
2814 sizeof (mxge_sfp_media_types) /
2815 sizeof (mxge_sfp_media_types[0]);
2820 if (mxge_media_types == NULL) {
2821 device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2826 * At this point we know the NIC has an XFP cage, so now we
2827 * try to determine what is in the cage by using the
2828 * firmware's XFP I2C commands to read the XFP 10GbE compliance
2829 * register. We read just one byte, which may take over
2833 cmd.data0 = 0; /* just fetch 1 byte, not all 256 */
2835 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2836 if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
2837 device_printf(sc->dev, "failed to read XFP\n");
2839 if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
2840 device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
2842 if (err != MXGEFW_CMD_OK) {
2846 /* now we wait for the data to be cached */
2848 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
/* poll up to ~50ms while the firmware caches the I2C byte */
2849 for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
2852 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2854 if (err != MXGEFW_CMD_OK) {
2855 device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
2856 cage_type, err, ms);
/* first table entry is a full-mask equality match (CX4 module) */
2860 if (cmd.data0 == mxge_media_types[0].bitmask) {
2862 device_printf(sc->dev, "%s:%s\n", cage_type,
2863 mxge_media_types[0].name);
2864 mxge_set_media(sc, IFM_10G_CX4);
/* remaining entries are single-bit tests */
2867 for (i = 1; i < mxge_media_type_entries; i++) {
2868 if (cmd.data0 & mxge_media_types[i].bitmask) {
2870 device_printf(sc->dev, "%s:%s\n",
2872 mxge_media_types[i].name);
2874 mxge_set_media(sc, mxge_media_types[i].flag);
2878 device_printf(sc->dev, "%s media 0x%x unknown\n", cage_type,
/*
 * mxge_intr() -- per-slice interrupt handler (arg is the slice state).
 * Drains TX completions and RX events until the firmware's DMA'd
 * interrupt-status block goes quiet, then returns the IRQ claim tokens.
 * Firmware link-state and error statistics are processed only when this
 * is the first slice (ss == sc->ss).
 * NOTE(review): the source text in this excerpt is partially elided
 * (some braces/statements missing); comments describe only what is
 * visible.
 */
2885 mxge_intr(void *arg)
2887 struct mxge_slice_state *ss = arg;
2888 mxge_softc_t *sc = ss->sc;
2889 mcp_irq_data_t *stats = ss->fw_stats;
2890 mxge_tx_ring_t *tx = &ss->tx;
2891 mxge_rx_done_t *rx_done = &ss->rx_done;
2892 uint32_t send_done_count;
2896 #ifndef IFNET_BUF_RING
2897 /* an interrupt on a non-zero slice is implicitly valid
2898 since MSI-X irqs are not shared */
2900 mxge_clean_rx_done(ss);
2901 *ss->irq_claim = be32toh(3);
2906 /* make sure the DMA has finished */
2907 if (!stats->valid) {
2910 valid = stats->valid;
2912 if (sc->legacy_irq) {
2913 /* lower legacy IRQ */
2914 *sc->irq_deassert = 0;
2915 if (!mxge_deassert_wait)
2916 /* don't wait for conf. that irq is low */
2922 /* loop while waiting for legacy irq deassertion */
2924 /* check for transmit completes and receives */
2925 send_done_count = be32toh(stats->send_done_count);
2926 while ((send_done_count != tx->pkt_done) ||
2927 (rx_done->entry[rx_done->idx].length != 0)) {
2928 if (send_done_count != tx->pkt_done)
2929 mxge_tx_done(ss, (int)send_done_count);
2930 mxge_clean_rx_done(ss);
2931 send_done_count = be32toh(stats->send_done_count);
2933 if (sc->legacy_irq && mxge_deassert_wait)
/* volatile read: stats->valid is written by NIC DMA behind our back */
2935 } while (*((volatile uint8_t *) &stats->valid));
2937 /* fw link & error stats meaningful only on the first slice */
2938 if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
2939 if (sc->link_state != stats->link_up) {
2940 sc->link_state = stats->link_up;
2941 if (sc->link_state) {
2942 sc->ifp->if_link_state = LINK_STATE_UP;
2943 if_link_state_change(sc->ifp);
2945 device_printf(sc->dev, "link up\n");
2947 sc->ifp->if_link_state = LINK_STATE_DOWN;
2948 if_link_state_change(sc->ifp);
2950 device_printf(sc->dev, "link down\n");
2952 sc->need_media_probe = 1;
2954 if (sc->rdma_tags_available !=
2955 be32toh(stats->rdma_tags_available)) {
2956 sc->rdma_tags_available =
2957 be32toh(stats->rdma_tags_available);
2958 device_printf(sc->dev, "RDMA timed out! %d tags "
2959 "left\n", sc->rdma_tags_available);
2962 if (stats->link_down) {
2963 sc->down_cnt += stats->link_down;
2965 sc->ifp->if_link_state = LINK_STATE_DOWN;
2966 if_link_state_change(sc->ifp);
2970 /* check to see if we have rx token to pass back */
2972 *ss->irq_claim = be32toh(3);
2973 *(ss->irq_claim + 1) = be32toh(3);
/* ifnet if_init hook; body not visible in this excerpt. */
2977 mxge_init(void *arg)
/*
 * mxge_free_slice_mbufs() -- release a slice's mbuf resources:
 * drain and kfree the LRO free-list, unload the DMA maps and free the
 * mbufs of the big and small RX rings, and (only where tx.info is
 * non-NULL, i.e. the slice that owns a transmit ring) do the same for
 * the TX ring.
 */
2984 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
2986 struct lro_entry *lro_entry;
2989 while (!SLIST_EMPTY(&ss->lro_free)) {
2990 lro_entry = SLIST_FIRST(&ss->lro_free);
2991 SLIST_REMOVE_HEAD(&ss->lro_free, next);
2992 kfree(lro_entry, M_DEVBUF);
2995 for (i = 0; i <= ss->rx_big.mask; i++) {
2996 if (ss->rx_big.info[i].m == NULL)
2998 bus_dmamap_unload(ss->rx_big.dmat,
2999 ss->rx_big.info[i].map);
3000 m_freem(ss->rx_big.info[i].m);
3001 ss->rx_big.info[i].m = NULL;
3004 for (i = 0; i <= ss->rx_small.mask; i++) {
3005 if (ss->rx_small.info[i].m == NULL)
3007 bus_dmamap_unload(ss->rx_small.dmat,
3008 ss->rx_small.info[i].map);
3009 m_freem(ss->rx_small.info[i].m);
3010 ss->rx_small.info[i].m = NULL;
3013 /* transmit ring used only on the first slice */
3014 if (ss->tx.info == NULL)
3017 for (i = 0; i <= ss->tx.mask; i++) {
3018 ss->tx.info[i].flag = 0;
3019 if (ss->tx.info[i].m == NULL)
3021 bus_dmamap_unload(ss->tx.dmat,
3022 ss->tx.info[i].map);
3023 m_freem(ss->tx.info[i].m);
3024 ss->tx.info[i].m = NULL;
/* mxge_free_mbufs() -- free the mbufs of every slice. */
3029 mxge_free_mbufs(mxge_softc_t *sc)
3033 for (slice = 0; slice < sc->num_slices; slice++)
3034 mxge_free_slice_mbufs(&sc->ss[slice]);
/*
 * mxge_free_slice_rings() -- tear down a slice's ring bookkeeping:
 * the rx_done DMA block, the TX request-copy block and segment list,
 * the RX shadow rings, and then each host-info array together with its
 * per-entry DMA maps, extra maps, and DMA tags.  Pointers are NULLed
 * after freeing so the function is safe to call on a partially
 * initialized slice.
 */
3038 mxge_free_slice_rings(struct mxge_slice_state *ss)
3043 if (ss->rx_done.entry != NULL)
3044 mxge_dma_free(&ss->rx_done.dma);
3045 ss->rx_done.entry = NULL;
3047 if (ss->tx.req_bytes != NULL)
3048 kfree(ss->tx.req_bytes, M_DEVBUF);
3049 ss->tx.req_bytes = NULL;
3051 if (ss->tx.seg_list != NULL)
3052 kfree(ss->tx.seg_list, M_DEVBUF);
3053 ss->tx.seg_list = NULL;
3055 if (ss->rx_small.shadow != NULL)
3056 kfree(ss->rx_small.shadow, M_DEVBUF);
3057 ss->rx_small.shadow = NULL;
3059 if (ss->rx_big.shadow != NULL)
3060 kfree(ss->rx_big.shadow, M_DEVBUF);
3061 ss->rx_big.shadow = NULL;
3063 if (ss->tx.info != NULL) {
3064 if (ss->tx.dmat != NULL) {
3065 for (i = 0; i <= ss->tx.mask; i++) {
3066 bus_dmamap_destroy(ss->tx.dmat,
3067 ss->tx.info[i].map);
3069 bus_dma_tag_destroy(ss->tx.dmat);
3071 kfree(ss->tx.info, M_DEVBUF);
3075 if (ss->rx_small.info != NULL) {
3076 if (ss->rx_small.dmat != NULL) {
3077 for (i = 0; i <= ss->rx_small.mask; i++) {
3078 bus_dmamap_destroy(ss->rx_small.dmat,
3079 ss->rx_small.info[i].map);
3081 bus_dmamap_destroy(ss->rx_small.dmat,
3082 ss->rx_small.extra_map);
3083 bus_dma_tag_destroy(ss->rx_small.dmat);
3085 kfree(ss->rx_small.info, M_DEVBUF);
3087 ss->rx_small.info = NULL;
3089 if (ss->rx_big.info != NULL) {
3090 if (ss->rx_big.dmat != NULL) {
3091 for (i = 0; i <= ss->rx_big.mask; i++) {
3092 bus_dmamap_destroy(ss->rx_big.dmat,
3093 ss->rx_big.info[i].map);
3095 bus_dmamap_destroy(ss->rx_big.dmat,
3096 ss->rx_big.extra_map);
3097 bus_dma_tag_destroy(ss->rx_big.dmat);
3099 kfree(ss->rx_big.info, M_DEVBUF);
3101 ss->rx_big.info = NULL;
/* mxge_free_rings() -- free the rings of every slice. */
3105 mxge_free_rings(mxge_softc_t *sc)
3109 for (slice = 0; slice < sc->num_slices; slice++)
3110 mxge_free_slice_rings(&sc->ss[slice]);
/*
 * mxge_alloc_slice_rings() -- allocate one slice's host-side ring
 * state: RX shadow rings, RX host-info arrays, busdma tags and per-slot
 * maps for small and big RX buffers, and (first slice only when
 * IFNET_BUF_RING is not defined) the TX request block, segment list,
 * info ring, tag, and per-slot maps.
 * NOTE(review): source text is partially elided; error-unwind labels
 * and some size arguments are not visible in this excerpt.
 */
3114 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3115 int tx_ring_entries)
3117 mxge_softc_t *sc = ss->sc;
3123 /* allocate per-slice receive resources */
3125 ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
3126 ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3128 /* allocate the rx shadow rings */
3129 bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3130 ss->rx_small.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3131 if (ss->rx_small.shadow == NULL)
3134 bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3135 ss->rx_big.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3136 if (ss->rx_big.shadow == NULL)
3139 /* allocate the rx host info rings */
3140 bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3141 ss->rx_small.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3142 if (ss->rx_small.info == NULL)
3145 bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3146 ss->rx_big.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3147 if (ss->rx_big.info == NULL)
3150 /* allocate the rx busdma resources */
3151 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3153 4096, /* boundary */
3154 BUS_SPACE_MAXADDR, /* low */
3155 BUS_SPACE_MAXADDR, /* high */
3156 NULL, NULL, /* filter */
3157 MHLEN, /* maxsize */
3159 MHLEN, /* maxsegsize */
3160 BUS_DMA_ALLOCNOW, /* flags */
3161 NULL, NULL, /* lock */
3162 &ss->rx_small.dmat); /* tag */
3164 device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3169 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3171 #if MXGE_VIRT_JUMBOS
3172 4096, /* boundary */
3176 BUS_SPACE_MAXADDR, /* low */
3177 BUS_SPACE_MAXADDR, /* high */
3178 NULL, NULL, /* filter */
3179 3*4096, /* maxsize */
3180 #if MXGE_VIRT_JUMBOS
3182 4096, /* maxsegsize*/
3185 MJUM9BYTES, /* maxsegsize*/
3187 BUS_DMA_ALLOCNOW, /* flags */
3188 NULL, NULL, /* lock */
3189 &ss->rx_big.dmat); /* tag */
3191 device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3195 for (i = 0; i <= ss->rx_small.mask; i++) {
3196 err = bus_dmamap_create(ss->rx_small.dmat, 0,
3197 &ss->rx_small.info[i].map);
3199 device_printf(sc->dev, "Err %d rx_small dmamap\n",
3204 err = bus_dmamap_create(ss->rx_small.dmat, 0,
3205 &ss->rx_small.extra_map);
3207 device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3212 for (i = 0; i <= ss->rx_big.mask; i++) {
3213 err = bus_dmamap_create(ss->rx_big.dmat, 0,
3214 &ss->rx_big.info[i].map);
3216 device_printf(sc->dev, "Err %d rx_big dmamap\n",
3221 err = bus_dmamap_create(ss->rx_big.dmat, 0,
3222 &ss->rx_big.extra_map);
3224 device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3229 /* now allocate TX resouces */
3231 #ifndef IFNET_BUF_RING
3232 /* only use a single TX ring for now */
3233 if (ss != ss->sc->ss)
3237 ss->tx.mask = tx_ring_entries - 1;
3238 ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3241 /* allocate the tx request copy block */
3243 sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3244 ss->tx.req_bytes = kmalloc(bytes, M_DEVBUF, M_WAITOK);
3245 if (ss->tx.req_bytes == NULL)
3247 /* ensure req_list entries are aligned to 8 bytes */
3248 ss->tx.req_list = (mcp_kreq_ether_send_t *)
3249 ((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
3251 /* allocate the tx busdma segment list */
3252 bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3253 ss->tx.seg_list = (bus_dma_segment_t *)
3254 kmalloc(bytes, M_DEVBUF, M_WAITOK);
3255 if (ss->tx.seg_list == NULL)
3258 /* allocate the tx host info ring */
3259 bytes = tx_ring_entries * sizeof (*ss->tx.info);
3260 ss->tx.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3261 if (ss->tx.info == NULL)
3264 /* allocate the tx busdma resources */
3265 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3267 sc->tx_boundary, /* boundary */
3268 BUS_SPACE_MAXADDR, /* low */
3269 BUS_SPACE_MAXADDR, /* high */
3270 NULL, NULL, /* filter */
3271 65536 + 256, /* maxsize */
3272 ss->tx.max_desc - 2, /* num segs */
3273 sc->tx_boundary, /* maxsegsz */
3274 BUS_DMA_ALLOCNOW, /* flags */
3275 NULL, NULL, /* lock */
3276 &ss->tx.dmat); /* tag */
3279 device_printf(sc->dev, "Err %d allocating tx dmat\n",
3284 /* now use these tags to setup dmamaps for each slot
3286 for (i = 0; i <= ss->tx.mask; i++) {
3287 err = bus_dmamap_create(ss->tx.dmat, 0,
3288 &ss->tx.info[i].map);
3290 device_printf(sc->dev, "Err %d tx dmamap\n",
/*
 * mxge_alloc_rings() -- ask the firmware for the send ring size,
 * derive TX/RX entry counts, size the ifnet send queue accordingly,
 * and allocate rings on every slice (freeing everything on failure).
 */
3300 mxge_alloc_rings(mxge_softc_t *sc)
3304 int tx_ring_entries, rx_ring_entries;
3307 /* get ring sizes */
3308 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3309 tx_ring_size = cmd.data0;
3311 device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3315 tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3316 rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3317 IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
3318 sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
3319 IFQ_SET_READY(&sc->ifp->if_snd);
3321 for (slice = 0; slice < sc->num_slices; slice++) {
3322 err = mxge_alloc_slice_rings(&sc->ss[slice],
3331 mxge_free_rings(sc);
/*
 * mxge_choose_params() -- from the MTU, pick the big-RX-buffer size,
 * the mbuf cluster size, and the number of buffers per frame.  The
 * frame budget includes the Ethernet header, a VLAN tag, and the
 * firmware pad.  Single-cluster configurations are preferred; only
 * with MXGE_VIRT_JUMBOS does a frame span multiple 4KB buffers.
 */
3338 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3340 int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3342 if (bufsize < MCLBYTES) {
3343 /* easy, everything fits in a single buffer */
3344 *big_buf_size = MCLBYTES;
3345 *cl_size = MCLBYTES;
3350 if (bufsize < MJUMPAGESIZE) {
3351 /* still easy, everything still fits in a single buffer */
3352 *big_buf_size = MJUMPAGESIZE;
3353 *cl_size = MJUMPAGESIZE;
3357 #if MXGE_VIRT_JUMBOS
3358 /* now we need to use virtually contiguous buffers */
3359 *cl_size = MJUM9BYTES;
3360 *big_buf_size = 4096;
3361 *nbufs = mtu / 4096 + 1;
3362 /* needs to be a power of two, so round up */
3366 *cl_size = MJUM9BYTES;
3367 *big_buf_size = MJUM9BYTES;
/*
 * mxge_slice_open() -- bring one slice up: build the LRO free list,
 * fetch the lanai (NIC SRAM) pointers for the send and receive rings
 * from the firmware, then stock the small and big receive rings with
 * mbufs.  Big-ring shadow entries are poisoned with 0xffffffff before
 * being stocked in nbufs-sized strides.
 * NOTE(review): source text is partially elided; some early-exit and
 * error paths are not visible in this excerpt.
 */
3373 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3378 struct lro_entry *lro_entry;
3383 slice = ss - sc->ss;
3385 SLIST_INIT(&ss->lro_free);
3386 SLIST_INIT(&ss->lro_active);
3388 for (i = 0; i < sc->lro_cnt; i++) {
3389 lro_entry = (struct lro_entry *)
3390 kmalloc(sizeof (*lro_entry), M_DEVBUF,
3392 if (lro_entry == NULL) {
3396 SLIST_INSERT_HEAD(&ss->lro_free, lro_entry, next);
3398 /* get the lanai pointers to the send and receive rings */
3401 #ifndef IFNET_BUF_RING
3402 /* We currently only send from the first slice */
3406 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3408 (volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3409 ss->tx.send_go = (volatile uint32_t *)
3410 (sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3411 ss->tx.send_stop = (volatile uint32_t *)
3412 (sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3413 #ifndef IFNET_BUF_RING
3417 err |= mxge_send_cmd(sc,
3418 MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3419 ss->rx_small.lanai =
3420 (volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3422 err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3424 (volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3427 device_printf(sc->dev,
3428 "failed to get ring sizes or locations\n");
3432 /* stock receive rings */
3433 for (i = 0; i <= ss->rx_small.mask; i++) {
3434 map = ss->rx_small.info[i].map;
3435 err = mxge_get_buf_small(ss, map, i);
3437 device_printf(sc->dev, "alloced %d/%d smalls\n",
3438 i, ss->rx_small.mask + 1);
3442 for (i = 0; i <= ss->rx_big.mask; i++) {
3443 ss->rx_big.shadow[i].addr_low = 0xffffffff;
3444 ss->rx_big.shadow[i].addr_high = 0xffffffff;
3446 ss->rx_big.nbufs = nbufs;
3447 ss->rx_big.cl_size = cl_size;
3448 ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3449 ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3450 for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3451 map = ss->rx_big.info[i].map;
3452 err = mxge_get_buf_big(ss, map, i);
3454 device_printf(sc->dev, "alloced %d/%d bigs\n",
3455 i, ss->rx_big.mask + 1);
/*
 * mxge_open() -- bring the interface up: reset the NIC, program the
 * RSS indirection table when running multiple slices, choose and push
 * MTU/buffer-size parameters to the firmware, point the firmware at
 * each slice's stats DMA block (falling back to the obsolete
 * single-block command, which also disables multicast support), open
 * every slice, issue ETHERNET_UP, and finally mark the ifnet running
 * and start the watchdog callout.
 * NOTE(review): source text is partially elided; error-unwind labels
 * are not visible in this excerpt.
 */
3463 mxge_open(mxge_softc_t *sc)
3466 int err, big_bytes, nbufs, slice, cl_size, i;
3468 volatile uint8_t *itable;
3469 struct mxge_slice_state *ss;
3471 /* Copy the MAC address in case it was overridden */
3472 bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3474 err = mxge_reset(sc, 1);
3476 device_printf(sc->dev, "failed to reset\n");
3480 if (sc->num_slices > 1) {
3481 /* setup the indirection table */
3482 cmd.data0 = sc->num_slices;
3483 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3486 err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3489 device_printf(sc->dev,
3490 "failed to setup rss tables\n");
3494 /* just enable an identity mapping */
3495 itable = sc->sram + cmd.data0;
3496 for (i = 0; i < sc->num_slices; i++)
3497 itable[i] = (uint8_t)i;
3500 cmd.data1 = mxge_rss_hash_type;
3501 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3503 device_printf(sc->dev, "failed to enable slices\n");
3509 mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3512 err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3514 /* error is only meaningful if we're trying to set
3515 MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3516 if (err && nbufs > 1) {
3517 device_printf(sc->dev,
3518 "Failed to set alway-use-n to %d\n",
3522 /* Give the firmware the mtu and the big and small buffer
3523 sizes. The firmware wants the big buf size to be a power
3524 of two. Luckily, FreeBSD's clusters are powers of two */
3525 cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3526 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3527 cmd.data0 = MHLEN - MXGEFW_PAD;
3528 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3530 cmd.data0 = big_bytes;
3531 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3534 device_printf(sc->dev, "failed to setup params\n");
3538 /* Now give him the pointer to the stats block */
3540 #ifdef IFNET_BUF_RING
3541 slice < sc->num_slices;
3546 ss = &sc->ss[slice];
3548 MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3550 MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3551 cmd.data2 = sizeof(struct mcp_irq_data);
3552 cmd.data2 |= (slice << 16);
3553 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3557 bus = sc->ss->fw_stats_dma.bus_addr;
3558 bus += offsetof(struct mcp_irq_data, send_done_count);
3559 cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3560 cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3561 err = mxge_send_cmd(sc,
3562 MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3564 /* Firmware cannot support multicast without STATS_DMA_V2 */
3565 sc->fw_multicast_support = 0;
3567 sc->fw_multicast_support = 1;
3571 device_printf(sc->dev, "failed to setup params\n");
3575 for (slice = 0; slice < sc->num_slices; slice++) {
3576 err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3578 device_printf(sc->dev, "couldn't open slice %d\n",
3584 /* Finally, start the firmware running */
3585 err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3587 device_printf(sc->dev, "Couldn't bring up link\n");
3590 #ifdef IFNET_BUF_RING
3591 for (slice = 0; slice < sc->num_slices; slice++) {
3592 ss = &sc->ss[slice];
3593 ss->if_drv_flags |= IFF_DRV_RUNNING;
3594 ss->if_drv_flags &= ~IFF_DRV_OACTIVE;
3597 sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
3598 sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
3599 callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3605 mxge_free_mbufs(sc);
/*
 * mxge_close() -- take the interface down: stop the watchdog callout,
 * clear IFF_DRV_RUNNING, send ETHERNET_DOWN to the firmware, wait
 * (via the down counter bumped by the interrupt handler) for the
 * down-confirmation interrupt, and free all mbufs.
 */
3611 mxge_close(mxge_softc_t *sc)
3614 int err, old_down_cnt;
3615 #ifdef IFNET_BUF_RING
3616 struct mxge_slice_state *ss;
3620 callout_stop(&sc->co_hdl);
3621 #ifdef IFNET_BUF_RING
3622 for (slice = 0; slice < sc->num_slices; slice++) {
3623 ss = &sc->ss[slice];
3624 ss->if_drv_flags &= ~IFF_DRV_RUNNING;
3627 sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
3628 old_down_cnt = sc->down_cnt;
3630 err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3632 device_printf(sc->dev, "Couldn't bring down link\n");
3634 if (old_down_cnt == sc->down_cnt) {
3635 /* wait for down irq */
3636 DELAY(10 * sc->intr_coal_delay);
3639 if (old_down_cnt == sc->down_cnt) {
3640 device_printf(sc->dev, "never got down irq\n");
3643 mxge_free_mbufs(sc);
/*
 * mxge_setup_cfg_space() -- program the NIC's PCI configuration space:
 * record the negotiated PCIe link width, raise the PCIe max read
 * request size to 4KB ((5 << 12) in the device-control register), and
 * enable bus mastering plus memory-space decoding.
 *
 * Fix: the third argument of pci_find_extcap() had been mangled by a
 * character-encoding error ("®" in place of "&reg"); restored so the
 * capability offset is written into 'reg'.
 * NOTE(review): source text appears partially elided (the declaration
 * of 'reg' and some closing braces are not visible in this excerpt).
 */
3649 mxge_setup_cfg_space(mxge_softc_t *sc)
3651 device_t dev = sc->dev;
3653 uint16_t cmd, lnk, pectl;
3655 /* find the PCIe link width and set max read request to 4KB*/
3656 if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
3657 lnk = pci_read_config(dev, reg + 0x12, 2);
3658 sc->link_width = (lnk >> 4) & 0x3f;
3660 pectl = pci_read_config(dev, reg + 0x8, 2);
3661 pectl = (pectl & ~0x7000) | (5 << 12);
3662 pci_write_config(dev, reg + 0x8, pectl, 2);
3665 /* Enable DMA and Memory space access */
3666 pci_enable_busmaster(dev);
3667 cmd = pci_read_config(dev, PCIR_COMMAND, 2);
3668 cmd |= PCIM_CMD_MEMEN;
3669 pci_write_config(dev, PCIR_COMMAND, cmd, 2);
/*
 * mxge_read_reboot() -- read the NIC's reboot-status register through
 * the vendor-specific PCI capability: enable read32 mode, select the
 * register address (0xfffffff0), and read the data window.  Returns
 * (uint32_t)-1 when the vendor-specific capability cannot be found.
 */
3673 mxge_read_reboot(mxge_softc_t *sc)
3675 device_t dev = sc->dev;
3678 /* find the vendor specific offset */
3679 if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
3680 device_printf(sc->dev,
3681 "could not find vendor specific offset\n");
3682 return (uint32_t)-1;
3684 /* enable read32 mode */
3685 pci_write_config(dev, vs + 0x10, 0x3, 1);
3686 /* tell NIC which register to read */
3687 pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3688 return (pci_read_config(dev, vs + 0x14, 4));
/*
 * mxge_watchdog_reset() -- recover from a TX hang on 'slice'.  Detects
 * whether the NIC rebooted (PCI config space reads back 0xffff or the
 * busmaster bit dropped); if so, restores config space and reopens the
 * interface.  Otherwise it only dumps the slice's TX ring state and
 * declines to reset.
 * NOTE(review): source text is partially elided; the retry loop around
 * the second config read and the function's return paths are not fully
 * visible in this excerpt.
 */
3692 mxge_watchdog_reset(mxge_softc_t *sc, int slice)
3694 struct pci_devinfo *dinfo;
3702 device_printf(sc->dev, "Watchdog reset!\n");
3705 * check to see if the NIC rebooted. If it did, then all of
3706 * PCI config space has been reset, and things like the
3707 * busmaster bit will be zero. If this is the case, then we
3708 * must restore PCI config space before the NIC can be used
3711 cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3712 if (cmd == 0xffff) {
3714 * maybe the watchdog caught the NIC rebooting; wait
3715 * up to 100ms for it to finish. If it does not come
3716 * back, then give up
3719 cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3720 if (cmd == 0xffff) {
3721 device_printf(sc->dev, "NIC disappeared!\n");
3725 if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3726 /* print the reboot status */
3727 reboot = mxge_read_reboot(sc);
3728 device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3730 /* restore PCI configuration space */
3731 dinfo = device_get_ivars(sc->dev);
3732 pci_cfg_restore(sc->dev, dinfo);
3734 /* and redo any changes we made to our config space */
3735 mxge_setup_cfg_space(sc);
3737 if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING) {
3739 err = mxge_open(sc);
3742 tx = &sc->ss[slice].tx;
3743 device_printf(sc->dev,
3744 "NIC did not reboot, slice %d ring state:\n",
3746 device_printf(sc->dev,
3747 "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3748 tx->req, tx->done, tx->queue_active);
3749 device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3750 tx->activate, tx->deactivate);
3751 device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3753 be32toh(sc->ss->fw_stats->send_done_count));
3754 device_printf(sc->dev, "not resetting\n");
/*
 * mxge_watchdog() -- periodic TX-hang check.  A slice is considered
 * hung when transmits are outstanding (req != done) and no progress
 * was made since the last tick (done == watchdog_done while requests
 * advanced).  If the firmware's dropped-pause counter also stalled,
 * the hang is real and the slice is reset; otherwise flow control from
 * the link partner is blamed.  Also triggers a deferred media probe.
 */
3760 mxge_watchdog(mxge_softc_t *sc)
3763 uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3766 /* see if we have outstanding transmits, which
3767 have been pending for more than mxge_ticks */
3769 #ifdef IFNET_BUF_RING
3770 (i < sc->num_slices) && (err == 0);
3772 (i < 1) && (err == 0);
3776 if (tx->req != tx->done &&
3777 tx->watchdog_req != tx->watchdog_done &&
3778 tx->done == tx->watchdog_done) {
3779 /* check for pause blocking before resetting */
3780 if (tx->watchdog_rx_pause == rx_pause)
3781 err = mxge_watchdog_reset(sc, i);
3783 device_printf(sc->dev, "Flow control blocking "
3784 "xmits, check link partner\n");
3787 tx->watchdog_req = tx->req;
3788 tx->watchdog_done = tx->done;
3789 tx->watchdog_rx_pause = rx_pause;
3792 if (sc->need_media_probe)
3793 mxge_media_probe(sc);
/*
 * mxge_update_stats() -- sum the per-slice packet/error counters into
 * the shared ifnet statistics (plus byte, multicast, and drop counts
 * when IFNET_BUF_RING is enabled).
 */
3798 mxge_update_stats(mxge_softc_t *sc)
3800 struct mxge_slice_state *ss;
3801 u_long ipackets = 0;
3802 u_long opackets = 0;
3803 #ifdef IFNET_BUF_RING
3811 for (slice = 0; slice < sc->num_slices; slice++) {
3812 ss = &sc->ss[slice];
3813 ipackets += ss->ipackets;
3814 opackets += ss->opackets;
3815 #ifdef IFNET_BUF_RING
3816 obytes += ss->obytes;
3817 omcasts += ss->omcasts;
3818 odrops += ss->tx.br->br_drops;
3820 oerrors += ss->oerrors;
3822 sc->ifp->if_ipackets = ipackets;
3823 sc->ifp->if_opackets = opackets;
3824 #ifdef IFNET_BUF_RING
3825 sc->ifp->if_obytes = obytes;
3826 sc->ifp->if_omcasts = omcasts;
3827 sc->ifp->if_snd.ifq_drops = odrops;
3829 sc->ifp->if_oerrors = oerrors;
/*
 * mxge_tick() -- periodic callout: under the driver lock, aggregate
 * per-slice statistics and run the watchdog every 5th tick (the
 * countdown is reloaded with 4), then rearm itself.
 */
3833 mxge_tick(void *arg)
3835 mxge_softc_t *sc = arg;
3838 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
3839 /* aggregate stats from different slices */
3840 mxge_update_stats(sc);
3841 if (!sc->watchdog_countdown) {
3842 err = mxge_watchdog(sc);
3843 sc->watchdog_countdown = 4;
3845 sc->watchdog_countdown--;
3847 callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3848 lockmgr(&sc->driver_lock, LK_RELEASE);
/* ifmedia change hook; body not visible in this excerpt. */
3852 mxge_media_change(struct ifnet *ifp)
/*
 * mxge_change_mtu() -- validate and apply a new MTU.  The wire size
 * (MTU + Ethernet header + VLAN tag) must fit within the NIC's max_mtu
 * and be at least 60 bytes.  A running interface is closed and
 * reopened; on reopen failure the old MTU is restored and the
 * interface reopened with it.
 */
3858 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3860 struct ifnet *ifp = sc->ifp;
3861 int real_mtu, old_mtu;
3865 real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3866 if ((real_mtu > sc->max_mtu) || real_mtu < 60)
3868 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
3869 old_mtu = ifp->if_mtu;
3871 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3873 err = mxge_open(sc);
3875 ifp->if_mtu = old_mtu;
3877 (void) mxge_open(sc);
3880 lockmgr(&sc->driver_lock, LK_RELEASE);
/*
 * mxge_media_status() -- ifmedia status hook: report a valid status,
 * active/full-duplex when the cached link state is up, and advertise
 * the media as autoselected Ethernet.
 */
3885 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3887 mxge_softc_t *sc = ifp->if_softc;
3892 ifmr->ifm_status = IFM_AVALID;
3893 ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
3894 ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
3895 ifmr->ifm_active |= sc->link_state ? IFM_FDX : 0;
/*
 * mxge_ioctl() -- ifnet ioctl handler.  Handles MTU changes, interface
 * up/down (SIOCSIFFLAGS), multicast list updates, capability toggles
 * (TX/RX checksum, TSO4 -- which requires TX checksum -- LRO, and VLAN
 * hardware tagging), and media queries; everything else is delegated
 * to ether_ioctl().  Driver state changes are done under driver_lock.
 * NOTE(review): source text is partially elided; the switch/case
 * labels and some closing braces are not visible in this excerpt.
 */
3899 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data, struct ucred *cr)
3901 mxge_softc_t *sc = ifp->if_softc;
3902 struct ifreq *ifr = (struct ifreq *)data;
3910 err = ether_ioctl(ifp, command, data);
3914 err = mxge_change_mtu(sc, ifr->ifr_mtu);
3918 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
3920 lockmgr(&sc->driver_lock, LK_RELEASE);
3923 if (ifp->if_flags & IFF_UP) {
3924 if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
3925 err = mxge_open(sc);
3927 /* take care of promis can allmulti
3929 mxge_change_promisc(sc,
3930 ifp->if_flags & IFF_PROMISC);
3931 mxge_set_multicast_list(sc);
3934 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3938 lockmgr(&sc->driver_lock, LK_RELEASE);
3943 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
3944 mxge_set_multicast_list(sc);
3945 lockmgr(&sc->driver_lock, LK_RELEASE);
3949 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
3950 mask = ifr->ifr_reqcap ^ ifp->if_capenable;
3951 if (mask & IFCAP_TXCSUM) {
3952 if (IFCAP_TXCSUM & ifp->if_capenable) {
/* disabling TX csum also disables TSO4, which depends on it */
3953 ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
3954 ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
3957 ifp->if_capenable |= IFCAP_TXCSUM;
3958 ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
3960 } else if (mask & IFCAP_RXCSUM) {
3961 if (IFCAP_RXCSUM & ifp->if_capenable) {
3962 ifp->if_capenable &= ~IFCAP_RXCSUM;
3965 ifp->if_capenable |= IFCAP_RXCSUM;
3969 if (mask & IFCAP_TSO4) {
3970 if (IFCAP_TSO4 & ifp->if_capenable) {
3971 ifp->if_capenable &= ~IFCAP_TSO4;
3972 ifp->if_hwassist &= ~CSUM_TSO;
3973 } else if (IFCAP_TXCSUM & ifp->if_capenable) {
3974 ifp->if_capenable |= IFCAP_TSO4;
3975 ifp->if_hwassist |= CSUM_TSO;
3977 kprintf("mxge requires tx checksum offload"
3978 " be enabled to use TSO\n");
3982 if (mask & IFCAP_LRO) {
3983 if (IFCAP_LRO & ifp->if_capenable)
3984 err = mxge_change_lro_locked(sc, 0);
3986 err = mxge_change_lro_locked(sc, mxge_lro_cnt);
3988 if (mask & IFCAP_VLAN_HWTAGGING)
3989 ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
3990 lockmgr(&sc->driver_lock, LK_RELEASE);
3991 VLAN_CAPABILITIES(ifp);
3996 err = ifmedia_ioctl(ifp, (struct ifreq *)data,
3997 &sc->media, command);
/*
 * mxge_fetch_tunables() -- read the hw.mxge.* tunables and clamp them
 * to sane ranges: interrupt coalescing delay within [0, 10000] us
 * (else 30), tick rate defaulting to hz/2, RSS hash type within the
 * firmware's supported range (else source-port hashing), and initial
 * MTU within [ETHER_MIN_LEN, ETHERMTU_JUMBO] (else jumbo).
 */
4007 mxge_fetch_tunables(mxge_softc_t *sc)
4010 TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4011 TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4012 &mxge_flow_control);
4013 TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4014 &mxge_intr_coal_delay);
4015 TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4016 &mxge_nvidia_ecrc_enable);
4017 TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4018 &mxge_force_firmware);
4019 TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4020 &mxge_deassert_wait);
4021 TUNABLE_INT_FETCH("hw.mxge.verbose",
4023 TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4024 TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
4025 TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
4026 TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4027 TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4028 if (sc->lro_cnt != 0)
4029 mxge_lro_cnt = sc->lro_cnt;
4033 if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4034 mxge_intr_coal_delay = 30;
4035 if (mxge_ticks == 0)
4036 mxge_ticks = hz / 2;
4037 sc->pause = mxge_flow_control;
4038 if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4039 || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4040 mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
4042 if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4043 mxge_initial_mtu < ETHER_MIN_LEN)
4044 mxge_initial_mtu = ETHERMTU_JUMBO;
/*
 * mxge_free_slices() -- free per-slice resources: the firmware stats
 * DMA block, the buf_ring and TX lock (IFNET_BUF_RING builds), and the
 * rx_done DMA block, then the slice array itself.
 */
4049 mxge_free_slices(mxge_softc_t *sc)
4051 struct mxge_slice_state *ss;
4058 for (i = 0; i < sc->num_slices; i++) {
4060 if (ss->fw_stats != NULL) {
4061 mxge_dma_free(&ss->fw_stats_dma);
4062 ss->fw_stats = NULL;
4063 #ifdef IFNET_BUF_RING
4064 if (ss->tx.br != NULL) {
4065 drbr_free(ss->tx.br, M_DEVBUF);
4069 lockuninit(&ss->tx.lock);
4071 if (ss->rx_done.entry != NULL) {
4072 mxge_dma_free(&ss->rx_done.dma);
4073 ss->rx_done.entry = NULL;
4076 kfree(sc->ss, M_DEVBUF);
/*
 * mxge_alloc_slices() -- allocate the per-slice state array and, for
 * each slice, the rx_done interrupt queue DMA block and the firmware
 * stats DMA block, plus the TX lock (and a 2048-entry buf_ring under
 * IFNET_BUF_RING).  On any failure everything is unwound via
 * mxge_free_slices().
 * NOTE(review): source text is partially elided; the per-iteration
 * 'ss' assignment and some error branches are not visible here.
 */
4081 mxge_alloc_slices(mxge_softc_t *sc)
4084 struct mxge_slice_state *ss;
4086 int err, i, max_intr_slots;
4088 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4090 device_printf(sc->dev, "Cannot determine rx ring size\n");
4093 sc->rx_ring_size = cmd.data0;
4094 max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4096 bytes = sizeof (*sc->ss) * sc->num_slices;
4097 sc->ss = kmalloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4100 for (i = 0; i < sc->num_slices; i++) {
4105 /* allocate per-slice rx interrupt queues */
4107 bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4108 err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4111 ss->rx_done.entry = ss->rx_done.dma.addr;
4112 bzero(ss->rx_done.entry, bytes);
4115 * allocate the per-slice firmware stats; stats
4116 * (including tx) are used used only on the first
4119 #ifndef IFNET_BUF_RING
4124 bytes = sizeof (*ss->fw_stats);
4125 err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4126 sizeof (*ss->fw_stats), 64);
4129 ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4130 ksnprintf(ss->tx.lock_name, sizeof(ss->tx.lock_name),
4131 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
4132 lockinit(&ss->tx.lock, ss->tx.lock_name, 0, LK_CANRECURSE);
4133 #ifdef IFNET_BUF_RING
4134 ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4142 mxge_free_slices(sc);
/*
 * mxge_slice_probe() -- decide how many slices (RSS queues) to use.
 * Skips multi-slice entirely when disabled by tunable or on a
 * uniprocessor.  Otherwise it loads the RSS-capable firmware variant,
 * resets the NIC, sizes the interrupt queue, queries the firmware's
 * maximum RSS queue count, and caps the result by the available MSI-X
 * vectors, the CPU count (or the hw.mxge.max_slices tunable), and
 * rounds down to a power of two.  On any failure it falls back to a
 * single slice and restores the original firmware.
 */
4147 mxge_slice_probe(mxge_softc_t *sc)
4151 int msix_cnt, status, max_intr_slots;
4155 * don't enable multiple slices if they are not enabled,
4156 * or if this is not an SMP system
4159 if (mxge_max_slices == 0 || mxge_max_slices == 1 || ncpus < 2)
4162 /* see how many MSI-X interrupts are available */
4163 msix_cnt = pci_msix_count(sc->dev);
4167 /* now load the slice aware firmware see what it supports */
4168 old_fw = sc->fw_name;
4169 if (old_fw == mxge_fw_aligned)
4170 sc->fw_name = mxge_fw_rss_aligned;
4172 sc->fw_name = mxge_fw_rss_unaligned;
4173 status = mxge_load_firmware(sc, 0);
4175 device_printf(sc->dev, "Falling back to a single slice\n");
4179 /* try to send a reset command to the card to see if it
4181 memset(&cmd, 0, sizeof (cmd));
4182 status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4184 device_printf(sc->dev, "failed reset\n");
4188 /* get rx ring size */
4189 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4191 device_printf(sc->dev, "Cannot determine rx ring size\n");
4194 max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4196 /* tell it the size of the interrupt queues */
4197 cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4198 status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4200 device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4204 /* ask the maximum number of slices it supports */
4205 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4207 device_printf(sc->dev,
4208 "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4211 sc->num_slices = cmd.data0;
4212 if (sc->num_slices > msix_cnt)
4213 sc->num_slices = msix_cnt;
4215 if (mxge_max_slices == -1) {
4216 /* cap to number of CPUs in system */
4217 if (sc->num_slices > ncpus)
4218 sc->num_slices = ncpus;
4220 if (sc->num_slices > mxge_max_slices)
4221 sc->num_slices = mxge_max_slices;
4223 /* make sure it is a power of two */
4224 while (sc->num_slices & (sc->num_slices - 1))
4228 device_printf(sc->dev, "using %d slices\n",
4234 sc->fw_name = old_fw;
4235 (void) mxge_load_firmware(sc, 0);
/*
 * mxge_add_msix_irqs() -- set up one MSI-X vector per slice: map the
 * MSI-X table BAR, allocate num_slices messages, allocate and activate
 * an IRQ resource per slice, and attach mxge_intr to each with the
 * slice state as its argument.  Uses goto-based unwinding: intr ->
 * irq resources -> MSI allocation -> table BAR.
 * NOTE(review): source text is partially elided; rid computations and
 * some error branches are not visible in this excerpt.
 */
4239 mxge_add_msix_irqs(mxge_softc_t *sc)
4242 int count, err, i, rid;
4245 sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4248 if (sc->msix_table_res == NULL) {
4249 device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4253 count = sc->num_slices;
4254 err = pci_alloc_msix(sc->dev, &count);
4256 device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d"
4257 "err = %d \n", sc->num_slices, err);
4258 goto abort_with_msix_table;
4260 if (count < sc->num_slices) {
4261 device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4262 count, sc->num_slices);
4263 device_printf(sc->dev,
4264 "Try setting hw.mxge.max_slices to %d\n",
4267 goto abort_with_msix;
4269 bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4270 sc->msix_irq_res = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4271 if (sc->msix_irq_res == NULL) {
4273 goto abort_with_msix;
4276 for (i = 0; i < sc->num_slices; i++) {
4278 sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4281 if (sc->msix_irq_res[i] == NULL) {
4282 device_printf(sc->dev, "couldn't allocate IRQ res"
4283 " for message %d\n", i);
4285 goto abort_with_res;
4289 bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4290 sc->msix_ih = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4292 for (i = 0; i < sc->num_slices; i++) {
4293 err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4294 INTR_TYPE_NET | INTR_MPSAFE,
4295 #if __FreeBSD_version > 700030
4298 mxge_intr, &sc->ss[i], &sc->msix_ih[i]);
4300 device_printf(sc->dev, "couldn't setup intr for "
4302 goto abort_with_intr;
4307 device_printf(sc->dev, "using %d msix IRQs:",
4309 for (i = 0; i < sc->num_slices; i++)
4310 kprintf(" %ld", rman_get_start(sc->msix_irq_res[i]));
4316 for (i = 0; i < sc->num_slices; i++) {
4317 if (sc->msix_ih[i] != NULL) {
4318 bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4320 sc->msix_ih[i] = NULL;
4323 kfree(sc->msix_ih, M_DEVBUF);
4327 for (i = 0; i < sc->num_slices; i++) {
4329 if (sc->msix_irq_res[i] != NULL)
4330 bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4331 sc->msix_irq_res[i]);
4332 sc->msix_irq_res[i] = NULL;
4334 kfree(sc->msix_irq_res, M_DEVBUF);
4338 pci_release_msi(sc->dev);
4340 abort_with_msix_table:
4341 bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4342 sc->msix_table_res);
/*
 * Set up a single interrupt: MSI when the device exposes exactly one
 * message and allocation succeeds, otherwise legacy INTx
 * (sc->legacy_irq is presumably set accordingly in the branch whose
 * body is not visible here -- TODO confirm).  The release paths show
 * the resource id convention: 1 for MSI, 0 for INTx.
 */
4348 mxge_add_single_irq(mxge_softc_t *sc)
4350 	int count, err, rid;
/* prefer MSI when the device advertises exactly one message */
4352 	count = pci_msi_count(sc->dev);
4353 	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
/* NOTE(review): no assignment to "rid" is visible before this call -- presumably set (1 for MSI, 0 for INTx) above; confirm */
4359 	sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0,
4360 					 1, RF_SHAREABLE | RF_ACTIVE);
4361 	if (sc->irq_res == NULL) {
4362 		device_printf(sc->dev, "could not alloc interrupt\n");
4366 	device_printf(sc->dev, "using %s irq %ld\n",
4367 		      sc->legacy_irq ? "INTx" : "MSI",
4368 		      rman_get_start(sc->irq_res));
/* the single handler services slice 0 only */
4369 	err = bus_setup_intr(sc->dev, sc->irq_res,
4370 			     INTR_TYPE_NET | INTR_MPSAFE,
4371 #if __FreeBSD_version > 700030
4374 			     mxge_intr, &sc->ss[0], &sc->ih);
/* on handler-setup failure: release the IRQ resource and any MSI allocation */
4376 		bus_release_resource(sc->dev, SYS_RES_IRQ,
4377 				     sc->legacy_irq ? 0 : 1, sc->irq_res);
4378 		if (!sc->legacy_irq)
4379 			pci_release_msi(sc->dev);
/*
 * Tear down everything mxge_add_msix_irqs() set up: the per-slice
 * interrupt handlers, the per-slice IRQ resources, the MSI-X table
 * BAR mapping, and the MSI-X message allocation -- in that order.
 */
4385 mxge_rem_msix_irqs(mxge_softc_t *sc)
/* detach every installed handler and clear its cookie */
4389 	for (i = 0; i < sc->num_slices; i++) {
4390 		if (sc->msix_ih[i] != NULL) {
4391 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4393 			sc->msix_ih[i] = NULL;
4396 	kfree(sc->msix_ih, M_DEVBUF);
/* release each slice's IRQ resource */
/* NOTE(review): no assignment to "rid" is visible in this function -- presumably derived from the loop index above; confirm */
4398 	for (i = 0; i < sc->num_slices; i++) {
4400 		if (sc->msix_irq_res[i] != NULL)
4401 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4402 					     sc->msix_irq_res[i]);
4403 		sc->msix_irq_res[i] = NULL;
4405 	kfree(sc->msix_irq_res, M_DEVBUF);
/* unmap the MSI-X table BAR and return the vectors to the PCI layer */
4407 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4408 			     sc->msix_table_res);
4410 	pci_release_msi(sc->dev);
/*
 * Undo mxge_add_single_irq(): detach the handler, release the IRQ
 * resource (rid 1 for MSI, 0 for legacy INTx), and for MSI give the
 * message back to the PCI layer.
 */
4415 mxge_rem_single_irq(mxge_softc_t *sc)
4417 	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4418 	bus_release_resource(sc->dev, SYS_RES_IRQ,
4419 			     sc->legacy_irq ? 0 : 1, sc->irq_res);
4420 	if (!sc->legacy_irq)
4421 		pci_release_msi(sc->dev);
/*
 * Release whichever interrupt scheme mxge_add_irq() chose:
 * per-slice MSI-X when multi-slice, otherwise the single MSI/INTx.
 */
4425 mxge_rem_irq(mxge_softc_t *sc)
4427 	if (sc->num_slices > 1)
4428 		mxge_rem_msix_irqs(sc);
4430 		mxge_rem_single_irq(sc);
/*
 * Choose and install the interrupt scheme: per-slice MSI-X when more
 * than one slice is configured, otherwise a single MSI or INTx.
 */
4434 mxge_add_irq(mxge_softc_t *sc)
4438 	if (sc->num_slices > 1)
4439 		err = mxge_add_msix_irqs(sc);
4441 		err = mxge_add_single_irq(sc);
/* NOTE(review): the "0 &&" below makes this re-setup branch unreachable -- looks deliberately disabled, confirm before removing */
4443 	if (0 && err == 0 && sc->num_slices > 1) {
4444 		mxge_rem_msix_irqs(sc);
4445 		err = mxge_add_msix_irqs(sc);
/*
 * Device attach entry point: bring the NIC from bare PCI device to a
 * registered ethernet interface.
 *
 * Visible sequence: fetch tunables, create the parent DMA tag, init
 * the command/driver locks and tick callout, map the board's BAR,
 * copy out the EEPROM strings, allocate the out-of-band DMA areas
 * (command response, zero pad, dma benchmark), select and load
 * firmware, probe/allocate slices, reset, allocate rings, install
 * interrupts, then fill in the ifnet, attach it, and add sysctls.
 * Each failure jumps to an abort_with_* label that unwinds the steps
 * completed so far in reverse order.
 */
4452 mxge_attach(device_t dev)
4454 	mxge_softc_t *sc = device_get_softc(dev);
4455 	struct ifnet *ifp = &sc->arpcom.ac_if;
4459 	 * avoid rewriting half the lines in this file to use
4460 	 * &sc->arpcom.ac_if instead
4464 	mxge_fetch_tunables(sc);
/* parent DMA tag: everything else derives from this tag */
4466 	err = bus_dma_tag_create(NULL,	/* parent */
4469 				 BUS_SPACE_MAXADDR,	/* low */
4470 				 BUS_SPACE_MAXADDR,	/* high */
4471 				 NULL, NULL,	/* filter */
4472 				 65536 + 256,	/* maxsize */
4473 				 MXGE_MAX_SEND_DESC,	/* num segs */
4474 				 65536,	/* maxsegsize */
4476 				 NULL, NULL,	/* lock */
4477 				 &sc->parent_dmat);	/* tag */
4480 		device_printf(sc->dev, "Err %d allocating parent dmat\n",
4482 		goto abort_with_nothing;
4486 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
/* per-device named locks: one for firmware commands, one for the driver */
4488 	ksnprintf(sc->cmd_lock_name, sizeof(sc->cmd_lock_name), "%s:cmd",
4489 		  device_get_nameunit(dev));
4490 	lockinit(&sc->cmd_lock, sc->cmd_lock_name, 0, LK_CANRECURSE);
4491 	ksnprintf(sc->driver_lock_name, sizeof(sc->driver_lock_name),
4492 		  "%s:drv", device_get_nameunit(dev));
4493 	lockinit(&sc->driver_lock, sc->driver_lock_name,
4496 	callout_init(&sc->co_hdl);
4498 	mxge_setup_cfg_space(sc);
4500 	/* Map the board into the kernel */
4502 	sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
4504 	if (sc->mem_res == NULL) {
4505 		device_printf(dev, "could not map memory\n");
4507 		goto abort_with_lock;
4509 	sc->sram = rman_get_virtual(sc->mem_res);
/* usable SRAM: 2MB minus reserved regions -- presumably the NIC's firmware/scratch layout; confirm against firmware docs */
4510 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4511 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4512 		device_printf(dev, "impossible memory region size %ld\n",
4513 			      rman_get_size(sc->mem_res));
4515 		goto abort_with_mem_res;
4518 	/* make NULL terminated copy of the EEPROM strings section of
4520 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
/* EEPROM strings live at the top of SRAM */
4521 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4522 				rman_get_bushandle(sc->mem_res),
4523 				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4525 				MXGE_EEPROM_STRINGS_SIZE - 2);
4526 	err = mxge_parse_strings(sc);
4528 		goto abort_with_mem_res;
4530 	/* Enable write combining for efficient use of PCIe bus */
4533 	/* Allocate the out of band dma memory */
4534 	err = mxge_dma_alloc(sc, &sc->cmd_dma,
4535 			     sizeof (mxge_cmd_t), 64);
4537 		goto abort_with_mem_res;
4538 	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4539 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4541 		goto abort_with_cmd_dma;
4543 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4545 		goto abort_with_zeropad_dma;
4547 	/* select & load the firmware */
4548 	err = mxge_select_firmware(sc);
4550 		goto abort_with_dmabench;
4551 	sc->intr_coal_delay = mxge_intr_coal_delay;
4553 	mxge_slice_probe(sc);
4554 	err = mxge_alloc_slices(sc);
4556 		goto abort_with_dmabench;
4558 	err = mxge_reset(sc, 0);
4560 		goto abort_with_slices;
4562 	err = mxge_alloc_rings(sc);
4564 		device_printf(sc->dev, "failed to allocate rings\n");
/* NOTE(review): this jumps past mxge_free_slices (abort_with_dmabench, not abort_with_slices) even though slices were allocated above -- possible leak; confirm */
4565 		goto abort_with_dmabench;
4568 	err = mxge_add_irq(sc);
4570 		device_printf(sc->dev, "failed to add irq\n");
4571 		goto abort_with_rings;
/* describe the interface's capabilities to the network stack */
4574 	ifp->if_baudrate = IF_Gbps(10UL);
4575 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4578 	ifp->if_capabilities |= IFCAP_LRO;
4581 #ifdef MXGE_NEW_VLAN_API
4582 	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
/* jumbo frames require sufficiently new firmware */
4585 	sc->max_mtu = mxge_max_mtu(sc);
4586 	if (sc->max_mtu >= 9000)
4587 		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4589 		device_printf(dev, "MTU limited to %d.  Install "
4590 			      "latest firmware for 9000 byte jumbo support\n",
4591 			      sc->max_mtu - ETHER_HDR_LEN);
4592 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4593 	ifp->if_capenable = ifp->if_capabilities;
4594 	if (sc->lro_cnt == 0)
4595 		ifp->if_capenable &= ~IFCAP_LRO;
4597 	ifp->if_init = mxge_init;
4599 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4600 	ifp->if_ioctl = mxge_ioctl;
4601 	ifp->if_start = mxge_start;
4602 	/* Initialise the ifmedia structure */
4603 	ifmedia_init(&sc->media, 0, mxge_media_change,
4605 	mxge_set_media(sc, IFM_ETHER | IFM_AUTO);
4606 	mxge_media_probe(sc);
4608 	ether_ifattach(ifp, sc->mac_addr);
4609 	/* ether_ifattach sets mtu to ETHERMTU */
4610 	if (mxge_initial_mtu != ETHERMTU)
4611 		mxge_change_mtu(sc, mxge_initial_mtu);
4613 	mxge_add_sysctls(sc);
4614 #ifdef IFNET_BUF_RING
4615 	ifp->if_transmit = mxge_transmit;
4616 	ifp->if_qflush = mxge_qflush;
/* error unwind: each label releases one attach step, falling through to the earlier ones */
4621 	mxge_free_rings(sc);
4623 	mxge_free_slices(sc);
4624 abort_with_dmabench:
4625 	mxge_dma_free(&sc->dmabench_dma);
4626 abort_with_zeropad_dma:
4627 	mxge_dma_free(&sc->zeropad_dma);
4629 	mxge_dma_free(&sc->cmd_dma);
4631 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4633 	pci_disable_busmaster(dev);
4634 	lockuninit(&sc->cmd_lock);
4635 	lockuninit(&sc->driver_lock);
4637 abort_with_parent_dmat:
4638 	bus_dma_tag_destroy(sc->parent_dmat);
/*
 * Device detach entry point: refuse while vlans are still configured,
 * bring the interface down under the driver lock, then release every
 * resource acquired in mxge_attach() in reverse order.
 */
4645 mxge_detach(device_t dev)
4647 	mxge_softc_t *sc = device_get_softc(dev);
/* cannot detach while vlan interfaces still reference us */
4649 	if (mxge_vlans_active(sc)) {
4650 		device_printf(sc->dev,
4651 			      "Detach vlans before removing module\n");
/* stop the interface if it is running (the close call presumably follows this test -- confirm) */
4654 	lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
4656 	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
4658 	lockmgr(&sc->driver_lock, LK_RELEASE);
/* detach from the network stack before freeing driver state */
4659 	ether_ifdetach(sc->ifp);
4660 	callout_drain(&sc->co_hdl);
4661 	ifmedia_removeall(&sc->media);
/* quiesce the NIC's dma engine, then remove sysctl nodes */
4662 	mxge_dummy_rdma(sc, 0);
4663 	mxge_rem_sysctls(sc);
/* mirror of the attach unwind: rings, slices, DMA areas, BAR, locks, tag */
4665 	mxge_free_rings(sc);
4666 	mxge_free_slices(sc);
4667 	mxge_dma_free(&sc->dmabench_dma);
4668 	mxge_dma_free(&sc->zeropad_dma);
4669 	mxge_dma_free(&sc->cmd_dma);
4670 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4671 	pci_disable_busmaster(dev);
4672 	lockuninit(&sc->cmd_lock);
4673 	lockuninit(&sc->driver_lock);
4675 	bus_dma_tag_destroy(sc->parent_dmat);
4680 mxge_shutdown(device_t dev)
4686 This file uses Myri10GE driver indentation.
4689 c-file-style:"linux"