1 /******************************************************************************
3 Copyright (c) 2006-2013, Myricom Inc.
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
9 1. Redistributions of source code must retain the above copyright notice,
10 this list of conditions and the following disclaimer.
12 2. Neither the name of the Myricom Inc, nor the names of its
13 contributors may be used to endorse or promote products derived from
14 this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
28 $FreeBSD: head/sys/dev/mxge/if_mxge.c 254263 2013-08-12 23:30:01Z scottl $
30 ***************************************************************************/
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/linker.h>
37 #include <sys/firmware.h>
38 #include <sys/endian.h>
39 #include <sys/in_cksum.h>
40 #include <sys/sockio.h>
42 #include <sys/malloc.h>
43 #include <sys/kernel.h>
44 #include <sys/module.h>
45 #include <sys/serialize.h>
46 #include <sys/socket.h>
47 #include <sys/sysctl.h>
50 #include <net/if_arp.h>
51 #include <net/ifq_var.h>
52 #include <net/ethernet.h>
53 #include <net/if_dl.h>
54 #include <net/if_media.h>
58 #include <net/if_types.h>
59 #include <net/vlan/if_vlan_var.h>
62 #include <netinet/in_systm.h>
63 #include <netinet/in.h>
64 #include <netinet/ip.h>
65 #include <netinet/tcp.h>
70 #include <bus/pci/pcireg.h>
71 #include <bus/pci/pcivar.h>
72 #include <bus/pci/pci_private.h> /* XXX for pci_cfg_restore */
74 #include <vm/vm.h> /* for pmap_mapdev() */
77 #if defined(__i386__) || defined(__x86_64__)
78 #include <machine/specialreg.h>
81 #include <dev/netif/mxge/mxge_mcp.h>
82 #include <dev/netif/mxge/mcp_gen_header.h>
83 #include <dev/netif/mxge/if_mxge_var.h>
/*
 * Driver-wide knobs.  Defaults are set here and may be overridden at
 * boot via the hw.mxge.* loader tunables registered below.
 * NOTE(review): this excerpt has original line numbers fused into the
 * text and some lines missing; code left byte-identical.
 */
86 static int mxge_nvidia_ecrc_enable = 1;
87 static int mxge_force_firmware = 0;
88 static int mxge_intr_coal_delay = MXGE_INTR_COAL_DELAY;
89 static int mxge_deassert_wait = 1;
90 static int mxge_flow_control = 1;
91 static int mxge_ticks;
92 static int mxge_max_slices = 1;
93 static int mxge_always_promisc = 0;
94 static int mxge_throttle = 0;
95 static int mxge_msi_enable = 1;
/*
 * Firmware image names.  Per the alignment discussion later in this
 * file, the "ethp" images work around unaligned PCIe completions while
 * the plain "eth" images assume aligned completions; the "rss" variants
 * are presumably the multi-slice builds — confirm against Myricom docs.
 */
97 static const char *mxge_fw_unaligned = "mxge_ethp_z8e";
98 static const char *mxge_fw_aligned = "mxge_eth_z8e";
99 static const char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
100 static const char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
/* Boot-time tunables mapped onto the static variables above. */
102 TUNABLE_INT("hw.mxge.max_slices", &mxge_max_slices);
103 TUNABLE_INT("hw.mxge.flow_control_enabled", &mxge_flow_control);
104 TUNABLE_INT("hw.mxge.intr_coal_delay", &mxge_intr_coal_delay);
105 TUNABLE_INT("hw.mxge.nvidia_ecrc_enable", &mxge_nvidia_ecrc_enable);
106 TUNABLE_INT("hw.mxge.force_firmware", &mxge_force_firmware);
107 TUNABLE_INT("hw.mxge.deassert_wait", &mxge_deassert_wait);
108 TUNABLE_INT("hw.mxge.ticks", &mxge_ticks);
109 TUNABLE_INT("hw.mxge.always_promisc", &mxge_always_promisc);
110 TUNABLE_INT("hw.mxge.throttle", &mxge_throttle);
111 TUNABLE_INT("hw.mxge.msi.enable", &mxge_msi_enable);
/* Forward declarations for the newbus device methods below. */
113 static int mxge_probe(device_t dev);
114 static int mxge_attach(device_t dev);
115 static int mxge_detach(device_t dev);
116 static int mxge_shutdown(device_t dev);
117 static void mxge_intr(void *arg);
/*
 * newbus method dispatch table.
 * NOTE(review): DEVMETHOD_END and the closing brace are missing from
 * this excerpt.
 */
119 static device_method_t mxge_methods[] = {
120 /* Device interface */
121 DEVMETHOD(device_probe, mxge_probe),
122 DEVMETHOD(device_attach, mxge_attach),
123 DEVMETHOD(device_detach, mxge_detach),
124 DEVMETHOD(device_shutdown, mxge_shutdown),
/*
 * Driver description; softc size tells the bus how much per-device
 * state to allocate.  NOTE(review): the name/methods initializers are
 * missing from this excerpt.
 */
128 static driver_t mxge_driver = {
131 sizeof(mxge_softc_t),
134 static devclass_t mxge_devclass;
136 /* Declare ourselves to be a child of the PCI bus.*/
137 DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, NULL, NULL);
/* The driver loads its MCP image via firmware(9) and inflates it with zlib. */
138 MODULE_DEPEND(mxge, firmware, 1, 1, 1);
139 MODULE_DEPEND(mxge, zlib, 1, 1, 1);
/* Forward declarations for helpers used before their definitions. */
141 static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
142 static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
143 static void mxge_close(mxge_softc_t *sc, int down);
144 static int mxge_open(mxge_softc_t *sc);
145 static void mxge_tick(void *arg);
/*
 * Device probe: match Myricom Z8E / Z8E_9 parts and set the device
 * description from the PCI revision ID (8A for Z8E, 8B for Z8ES,
 * "8??" plus a console warning for unknown revisions).
 * NOTE(review): braces, the switch header and return statements are
 * missing from this excerpt.
 */
148 mxge_probe(device_t dev)
150 if (pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM &&
151 (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E ||
152 pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9)) {
153 int rev = pci_get_revid(dev);
156 case MXGE_PCI_REV_Z8E:
157 device_set_desc(dev, "Myri10G-PCIE-8A");
159 case MXGE_PCI_REV_Z8ES:
160 device_set_desc(dev, "Myri10G-PCIE-8B");
163 device_set_desc(dev, "Myri10G-PCIE-8??");
164 device_printf(dev, "Unrecognized rev %d NIC\n", rev);
/*
 * Map the NIC SRAM aperture write-combining (x86/x86_64 only) via
 * pmap_change_attr()/PAT to speed up PIO copies into the card.
 */
173 mxge_enable_wc(mxge_softc_t *sc)
175 #if defined(__i386__) || defined(__x86_64__)
179 len = rman_get_size(sc->mem_res);
180 pmap_change_attr((vm_offset_t) sc->sram, len / PAGE_SIZE,
181 PAT_WRITE_COMBINING);
/*
 * Allocate a coherent DMA region of 'bytes' with the given alignment
 * under the softc's parent DMA tag; logs on failure.  The 4KB check
 * presumably selects a boundary constraint — the assignment line is
 * missing from this excerpt, so confirm against the full source.
 */
186 mxge_dma_alloc(mxge_softc_t *sc, bus_dmamem_t *dma, size_t bytes,
187 bus_size_t alignment)
192 if (bytes > 4096 && alignment == 4096)
197 err = bus_dmamem_coherent(sc->parent_dmat, alignment, boundary,
198 BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, bytes,
199 BUS_DMA_WAITOK | BUS_DMA_ZERO, dma);
201 device_printf(sc->dev, "bus_dmamem_coherent failed: %d\n", err);
/*
 * Tear down a region from mxge_dma_alloc(): unload the map, free the
 * memory, then destroy the tag — in that order.
 */
208 mxge_dma_free(bus_dmamem_t *dma)
210 bus_dmamap_unload(dma->dmem_tag, dma->dmem_map);
211 bus_dmamem_free(dma->dmem_tag, dma->dmem_addr, dma->dmem_map);
212 bus_dma_tag_destroy(dma->dmem_tag);
/*
 * Parse the NUL-separated key=value strings from the NIC EEPROM:
 * MAC= (two hex digits per byte), PC= (product code), SN= and SN2=
 * (serial number, SN2 preferred).  Prints a diagnostic if parsing
 * fails.  NOTE(review): several lines of the loop body (increments,
 * braces, the abort path) are missing from this excerpt.
 */
216 * The eeprom strings on the lanaiX have the format
222 mxge_parse_strings(mxge_softc_t *sc)
225 int i, found_mac, found_sn2;
228 ptr = sc->eeprom_strings;
231 while (*ptr != '\0') {
232 if (strncmp(ptr, "MAC=", 4) == 0) {
235 sc->mac_addr[i] = strtoul(ptr, &endptr, 16);
/* each MAC octet must be exactly two hex digits */
236 if (endptr - ptr != 2)
245 } else if (strncmp(ptr, "PC=", 3) == 0) {
247 strlcpy(sc->product_code_string, ptr,
248 sizeof(sc->product_code_string));
249 } else if (!found_sn2 && (strncmp(ptr, "SN=", 3) == 0)) {
251 strlcpy(sc->serial_number_string, ptr,
252 sizeof(sc->serial_number_string));
253 } else if (strncmp(ptr, "SN2=", 4) == 0) {
254 /* SN2 takes precedence over SN */
257 strlcpy(sc->serial_number_string, ptr,
258 sizeof(sc->serial_number_string));
/* advance past the current NUL-terminated string */
260 while (*ptr++ != '\0') {}
267 device_printf(sc->dev, "failed to parse eeprom_strings\n");
/*
 * Attempt to enable ECRC generation on an upstream Nvidia (CK804 or
 * MCP55) PCIe bridge so that PCIe completions arrive 8-byte aligned.
 * Because the chipset hides extended config space from normal config
 * accesses, the bridge's config space is located by physical address
 * and mapped directly with pmap_mapdev().  x86/x86_64 only; the #else
 * arm just complains.  NOTE(review): many lines (returns, the bit-set
 * on register 0x178, some declarations) are missing from this excerpt.
 */
271 #if defined(__i386__) || defined(__x86_64__)
274 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
277 unsigned long base, off;
279 device_t pdev, mcp55;
280 uint16_t vendor_id, device_id, word;
281 uintptr_t bus, slot, func, ivend, idev;
/* honor the hw.mxge.nvidia_ecrc_enable tunable */
284 if (!mxge_nvidia_ecrc_enable)
287 pdev = device_get_parent(device_get_parent(sc->dev));
289 device_printf(sc->dev, "could not find parent?\n");
292 vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
293 device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
/* only Nvidia bridges (vendor 0x10de) need this workaround */
295 if (vendor_id != 0x10de)
300 if (device_id == 0x005d) {
301 /* ck804, base address is magic */
303 } else if (device_id >= 0x0374 && device_id <= 0x378) {
304 /* mcp55, base address stored in chipset */
305 mcp55 = pci_find_bsf(0, 0, 0);
307 0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
308 0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
309 word = pci_read_config(mcp55, 0x90, 2);
310 base = ((unsigned long)word & 0x7ffeU) << 25;
318 * Test below is commented because it is believed that doing
319 * config read/write beyond 0xff will access the config space
320 * for the next larger function. Uncomment this and remove
321 * the hacky pmap_mapdev() way of accessing config space when
322 * FreeBSD grows support for extended pcie config space access
326 * See if we can, by some miracle, access the extended
329 val = pci_read_config(pdev, 0x178, 4);
330 if (val != 0xffffffff) {
332 pci_write_config(pdev, 0x178, val, 4);
337 * Rather than using normal pci config space writes, we must
338 * map the Nvidia config space ourselves. This is because on
339 * opteron/nvidia class machine the 0xe000000 mapping is
340 * handled by the nvidia chipset, that means the internal PCI
341 * device (the on-chip northbridge), or the amd-8131 bridge
342 * and things behind them are not visible by this method.
345 BUS_READ_IVAR(device_get_parent(pdev), pdev,
347 BUS_READ_IVAR(device_get_parent(pdev), pdev,
348 PCI_IVAR_SLOT, &slot);
349 BUS_READ_IVAR(device_get_parent(pdev), pdev,
350 PCI_IVAR_FUNCTION, &func);
351 BUS_READ_IVAR(device_get_parent(pdev), pdev,
352 PCI_IVAR_VENDOR, &ivend);
353 BUS_READ_IVAR(device_get_parent(pdev), pdev,
354 PCI_IVAR_DEVICE, &idev);
/* compute physical address of the bridge's config space */
356 off = base + 0x00100000UL * (unsigned long)bus +
357 0x00001000UL * (unsigned long)(func + 8 * slot);
359 /* map it into the kernel */
360 va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
362 device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
365 /* get a pointer to the config space mapped into the kernel */
366 cfgptr = va + (off & PAGE_MASK);
368 /* make sure that we can really access it */
369 vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
370 device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
371 if (!(vendor_id == ivend && device_id == idev)) {
372 device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
373 vendor_id, device_id);
374 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
378 ptr32 = (uint32_t*)(cfgptr + 0x178);
381 if (val == 0xffffffff) {
382 device_printf(sc->dev, "extended mapping failed\n");
383 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
387 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
389 device_printf(sc->dev, "Enabled ECRC on upstream "
390 "Nvidia bridge at %d:%d:%d\n",
391 (int)bus, (int)slot, (int)func);
395 #else /* __i386__ || __x86_64__ */
398 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
400 device_printf(sc->dev, "Nforce 4 chipset on non-x86/x86_64!?!?!\n");
/*
 * Run firmware DMA benchmarks against the dmabench buffer and store
 * read/write/read+write throughput (MB/s) in the softc.  The length
 * multiplier in cmd.data2 selects the test: 0x10000 = read,
 * 0x1 = write, 0x10001 = read+write.  Throughput is derived from
 * transfers-completed (high 16 bits of data0) over 0.5us ticks
 * (low 16 bits).  Failures are only reported for the benchmark mode,
 * not for MXGEFW_CMD_UNALIGNED_TEST.
 */
406 mxge_dma_test(mxge_softc_t *sc, int test_type)
409 bus_addr_t dmatest_bus = sc->dmabench_dma.dmem_busaddr;
412 const char *test = " ";
415 * Run a small DMA test.
416 * The magic multipliers to the length tell the firmware
417 * to do DMA read, write, or read+write tests. The
418 * results are returned in cmd.data0. The upper 16
419 * bits of the return is the number of transfers completed.
420 * The lower 16 bits is the time in 0.5us ticks that the
421 * transfers took to complete.
424 len = sc->tx_boundary;
426 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
427 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
428 cmd.data2 = len * 0x10000;
429 status = mxge_send_cmd(sc, test_type, &cmd);
434 sc->read_dma = ((cmd.data0>>16) * len * 2) / (cmd.data0 & 0xffff);
436 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
437 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
438 cmd.data2 = len * 0x1;
439 status = mxge_send_cmd(sc, test_type, &cmd);
444 sc->write_dma = ((cmd.data0>>16) * len * 2) / (cmd.data0 & 0xffff);
446 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
447 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
448 cmd.data2 = len * 0x10001;
449 status = mxge_send_cmd(sc, test_type, &cmd);
/* read+write moves data twice, hence the extra factor of 2 */
454 sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
455 (cmd.data0 & 0xffff);
458 if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST) {
459 device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
466 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
467 * when the PCI-E Completion packets are aligned on an 8-byte
468 * boundary. Some PCI-E chip sets always align Completion packets; on
469 * the ones that do not, the alignment can be enforced by enabling
470 * ECRC generation (if supported).
472 * When PCI-E Completion packets are not aligned, it is actually more
473 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
475 * If the driver can neither enable ECRC nor verify that it has
476 * already been enabled, then it must use a firmware image which works
477 * around unaligned completion packets (ethp_z8e.dat), and it should
478 * also ensure that it never gives the device a Read-DMA which is
479 * larger than 2KB by setting the tx_boundary to 2KB. If ECRC is
480 * enabled, then the driver should use the aligned (eth_z8e.dat)
481 * firmware image, and set tx_boundary to 4KB.
/*
 * Probe whether the aligned firmware works on this host: verify the
 * PCIe Max Read Request Size, load the aligned image, try to enable
 * ECRC on an Nvidia bridge, then run the firmware's unaligned-
 * completion test (skipped on Z8ES and newer, which don't need it).
 * Returns 0 to keep the aligned firmware.  NOTE(review): several
 * original lines (returns, braces) are missing from this excerpt.
 */
484 mxge_firmware_probe(mxge_softc_t *sc)
486 device_t dev = sc->dev;
490 sc->tx_boundary = 4096;
493 * Verify the max read request size was set to 4KB
494 * before trying the test with 4KB.
/* Fixed mojibake: "&reg" had been mangled into the (R) sign by an
 * HTML-entity round trip; pci_find_extcap() takes the capability
 * register offset by reference. */
496 if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
497 pectl = pci_read_config(dev, reg + 0x8, 2);
/* MRRS field (bits 14:12 of Device Control) must read 5 == 4KB */
498 if ((pectl & (5 << 12)) != (5 << 12)) {
499 device_printf(dev, "Max Read Req. size != 4k (0x%x)\n",
501 sc->tx_boundary = 2048;
506 * Load the optimized firmware (which assumes aligned PCIe
507 * completions) in order to see if it works on this host.
509 sc->fw_name = mxge_fw_aligned;
510 status = mxge_load_firmware(sc, 1);
515 * Enable ECRC if possible
517 mxge_enable_nvidia_ecrc(sc);
520 * Run a DMA test which watches for unaligned completions and
521 * aborts on the first one seen. Not required on Z8ES or newer.
523 if (pci_get_revid(sc->dev) >= MXGE_PCI_REV_Z8ES)
526 status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
528 return 0; /* keep the aligned firmware */
531 device_printf(dev, "DMA test failed: %d\n", status);
532 if (status == ENOSYS) {
533 device_printf(dev, "Falling back to ethp! "
534 "Please install up to date fw\n");
/*
 * Choose the firmware image and tx_boundary: honor the force_firmware
 * tunable (or throttling, which forces a choice), assume aligned
 * completions on narrow (<= x4) links, otherwise probe with
 * mxge_firmware_probe().  Falls back to the unaligned image with a
 * 2KB tx_boundary, then loads the chosen firmware.
 * NOTE(review): the branch bodies setting 'aligned' are missing from
 * this excerpt.
 */
540 mxge_select_firmware(mxge_softc_t *sc)
543 int force_firmware = mxge_force_firmware;
/* throttling requires a specific firmware — treat it as forced */
546 force_firmware = sc->throttle;
548 if (force_firmware != 0) {
549 if (force_firmware == 1)
554 device_printf(sc->dev,
555 "Assuming %s completions (forced)\n",
556 aligned ? "aligned" : "unaligned");
562 * If the PCIe link width is 4 or less, we can use the aligned
563 * firmware and skip any checks
565 if (sc->link_width != 0 && sc->link_width <= 4) {
566 device_printf(sc->dev, "PCIe x%d Link, "
567 "expect reduced performance\n", sc->link_width);
572 if (mxge_firmware_probe(sc) == 0)
577 sc->fw_name = mxge_fw_aligned;
578 sc->tx_boundary = 4096;
580 sc->fw_name = mxge_fw_unaligned;
581 sc->tx_boundary = 2048;
583 return mxge_load_firmware(sc, 0);
/*
 * Validate an MCP image header: must be of type MCP_TYPE_ETH, and its
 * "major.minor" version must match what this driver was built against.
 * Also records the version string and parsed version numbers in the
 * softc for sysctl reporting.
 */
587 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
589 if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
590 device_printf(sc->dev, "Bad firmware type: 0x%x\n",
591 be32toh(hdr->mcp_type));
595 /* Save firmware version for sysctl */
596 strlcpy(sc->fw_version, hdr->version, sizeof(sc->fw_version));
598 device_printf(sc->dev, "firmware id: %s\n", hdr->version);
600 ksscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
601 &sc->fw_ver_minor, &sc->fw_ver_tiny);
/* tiny version is allowed to differ; major.minor must match exactly */
603 if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR &&
604 sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
605 device_printf(sc->dev, "Found firmware version %s\n",
607 device_printf(sc->dev, "Driver needs %d.%d\n",
608 MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
/*
 * zlib allocator callbacks, backed by kmalloc(M_TEMP).  The opaque
 * first argument is unused.  NOTE(review): z_free's body is missing
 * from this excerpt.
 */
615 z_alloc(void *nil, u_int items, u_int size)
617 return kmalloc(items * size, M_TEMP, M_WAITOK);
621 z_free(void *nil, void *ptr)
/*
 * Fetch the compressed MCP image via firmware(9), inflate it with
 * zlib, validate its embedded header, and PIO-copy it into NIC SRAM
 * at MXGE_FW_OFFSET in 256-byte chunks.  Cleanup is via goto labels
 * that free the inflate buffer and release the firmware image.
 */
627 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
630 char *inflate_buffer;
631 const struct firmware *fw;
632 const mcp_gen_header_t *hdr;
639 fw = firmware_get(sc->fw_name);
641 device_printf(sc->dev, "Could not find firmware image %s\n",
646 /* Setup zlib and decompress f/w */
647 bzero(&zs, sizeof(zs));
650 status = inflateInit(&zs);
651 if (status != Z_OK) {
657 * The uncompressed size is stored as the firmware version,
658 * which would otherwise go unused
660 fw_len = (size_t)fw->version;
661 inflate_buffer = kmalloc(fw_len, M_TEMP, M_WAITOK);
662 zs.avail_in = fw->datasize;
663 zs.next_in = __DECONST(char *, fw->data);
664 zs.avail_out = fw_len;
665 zs.next_out = inflate_buffer;
666 status = inflate(&zs, Z_FINISH);
667 if (status != Z_STREAM_END) {
668 device_printf(sc->dev, "zlib %d\n", status);
670 goto abort_with_buffer;
/* header pointer is stored big-endian inside the image */
675 htobe32(*(const uint32_t *)(inflate_buffer + MCP_HEADER_PTR_OFFSET));
/* header must be 4-byte aligned and fully inside the image */
676 if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
677 device_printf(sc->dev, "Bad firmware file");
679 goto abort_with_buffer;
681 hdr = (const void*)(inflate_buffer + hdr_offset);
683 status = mxge_validate_firmware(sc, hdr);
685 goto abort_with_buffer;
687 /* Copy the inflated firmware to NIC SRAM. */
688 for (i = 0; i < fw_len; i += 256) {
689 mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i, inflate_buffer + i,
690 min(256U, (unsigned)(fw_len - i)));
699 kfree(inflate_buffer, M_TEMP);
702 firmware_put(fw, FIRMWARE_UNLOAD);
/*
 * Toggle the firmware's periodic dummy RDMA engine.  Builds a 6-word
 * big-endian request (confirmation address, confirm pattern, dummy
 * target address, enable flag) in an 8-byte-aligned stack buffer,
 * PIO-copies it to MXGEFW_BOOT_DUMMY_RDMA, then polls the confirmation
 * word (up to ~20 iterations) for the firmware to write 0xffffffff.
 */
707 * Enable or disable periodic RDMAs from the host to make certain
708 * chipsets resend dropped PCIe messages
711 mxge_dummy_rdma(mxge_softc_t *sc, int enable)
714 volatile uint32_t *confirm;
715 volatile char *submit;
716 uint32_t *buf, dma_low, dma_high;
/* round buf_bytes up to an 8-byte boundary */
719 buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
721 /* Clear confirmation addr */
722 confirm = (volatile uint32_t *)sc->cmd;
727 * Send an rdma command to the PCIe engine, and wait for the
728 * response in the confirmation address. The firmware should
729 * write a -1 there to indicate it is alive and well
731 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.dmem_busaddr);
732 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.dmem_busaddr);
733 buf[0] = htobe32(dma_high); /* confirm addr MSW */
734 buf[1] = htobe32(dma_low); /* confirm addr LSW */
735 buf[2] = htobe32(0xffffffff); /* confirm data */
736 dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.dmem_busaddr);
737 dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.dmem_busaddr);
738 buf[3] = htobe32(dma_high); /* dummy addr MSW */
739 buf[4] = htobe32(dma_low); /* dummy addr LSW */
740 buf[5] = htobe32(enable); /* enable? */
742 submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
744 mxge_pio_copy(submit, buf, 64);
749 while (*confirm != 0xffffffff && i < 20) {
753 if (*confirm != 0xffffffff) {
754 if_printf(sc->ifp, "dummy rdma %s failed (%p = 0x%x)",
755 (enable ? "enable" : "disable"), confirm, *confirm);
/*
 * Send one command to the running MCP and wait for its reply.  The
 * command block (big-endian data0-2, command id, and the bus address
 * of the response buffer) is PIO-copied to MXGEFW_ETH_CMD; the
 * firmware DMAs its result back into sc->cmd, which is polled for up
 * to ~20 iterations.  On success, response->data is returned in
 * data->data0.  Firmware error codes are mapped to errnos in the
 * switch (the mapping lines are missing from this excerpt).
 */
760 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
763 char buf_bytes[sizeof(*buf) + 8];
764 volatile mcp_cmd_response_t *response = sc->cmd;
765 volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
766 uint32_t dma_low, dma_high;
767 int err, sleep_total = 0;
769 /* Ensure buf is aligned to 8 bytes */
770 buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
772 buf->data0 = htobe32(data->data0);
773 buf->data1 = htobe32(data->data1);
774 buf->data2 = htobe32(data->data2);
775 buf->cmd = htobe32(cmd);
776 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.dmem_busaddr);
777 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.dmem_busaddr);
779 buf->response_addr.low = htobe32(dma_low);
780 buf->response_addr.high = htobe32(dma_high);
/* sentinel: firmware overwrites this when the command completes */
782 response->result = 0xffffffff;
784 mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
790 for (sleep_total = 0; sleep_total < 20; sleep_total++) {
792 switch (be32toh(response->result)) {
794 data->data0 = be32toh(response->data);
800 case MXGEFW_CMD_UNKNOWN:
803 case MXGEFW_CMD_ERROR_UNALIGNED:
806 case MXGEFW_CMD_ERROR_BUSY:
809 case MXGEFW_CMD_ERROR_I2C_ABSENT:
813 if_printf(sc->ifp, "command %d failed, result = %d\n",
814 cmd, be32toh(response->result));
822 if_printf(sc->ifp, "command %d timed out result = %d\n",
823 cmd, be32toh(response->result));
/*
 * Adopt the firmware already running on the NIC instead of loading a
 * new image: locate its header via the big-endian pointer at
 * MCP_HEADER_PTR_OFFSET, sanity-check the offset against SRAM size,
 * copy the header to host memory and validate it.  Also flags the
 * known rx-filter bug in adopted firmware 1.4.4 - 1.4.11, which
 * requires keeping the NIC in ALLMULTI mode.
 */
829 mxge_adopt_running_firmware(mxge_softc_t *sc)
831 struct mcp_gen_header *hdr;
832 const size_t bytes = sizeof(struct mcp_gen_header);
837 * Find running firmware header
840 htobe32(*(volatile uint32_t *)(sc->sram + MCP_HEADER_PTR_OFFSET));
842 if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
843 device_printf(sc->dev,
844 "Running firmware has bad header offset (%zu)\n",
850 * Copy header of running firmware from SRAM to host memory to
853 hdr = kmalloc(bytes, M_DEVBUF, M_WAITOK);
854 bus_space_read_region_1(rman_get_bustag(sc->mem_res),
855 rman_get_bushandle(sc->mem_res), hdr_offset, (char *)hdr, bytes);
856 status = mxge_validate_firmware(sc, hdr);
857 kfree(hdr, M_DEVBUF);
860 * Check to see if adopted firmware has bug where adopting
861 * it will cause broadcasts to be filtered unless the NIC
862 * is kept in ALLMULTI mode
864 if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
865 sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
866 sc->adopted_rx_filter_bug = 1;
867 device_printf(sc->dev, "Adopting fw %d.%d.%d: "
868 "working around rx filter bug\n",
869 sc->fw_ver_major, sc->fw_ver_minor, sc->fw_ver_tiny);
/*
 * Load (or, if that fails and 'adopt' is set, adopt) firmware, then
 * hand off control to it: build a 7-word big-endian handoff request
 * (confirmation address/pattern plus source offset, length, copy
 * destination and entry point), PIO-copy it to MXGEFW_BOOT_HANDOFF,
 * and poll the confirmation word for 0xffffffff.  When adopting an
 * unoptimized image, falls back to the unaligned firmware name and a
 * 2KB tx_boundary.
 */
876 mxge_load_firmware(mxge_softc_t *sc, int adopt)
878 volatile uint32_t *confirm;
879 volatile char *submit;
881 uint32_t *buf, size, dma_low, dma_high;
/* align the stack request buffer to 8 bytes */
884 buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
886 size = sc->sram_size;
887 status = mxge_load_firmware_helper(sc, &size);
893 * Try to use the currently running firmware, if
896 status = mxge_adopt_running_firmware(sc);
898 device_printf(sc->dev,
899 "failed to adopt running firmware\n");
902 device_printf(sc->dev,
903 "Successfully adopted running firmware\n");
905 if (sc->tx_boundary == 4096) {
906 device_printf(sc->dev,
907 "Using firmware currently running on NIC. "
909 device_printf(sc->dev,
910 "performance consider loading "
911 "optimized firmware\n");
913 sc->fw_name = mxge_fw_unaligned;
914 sc->tx_boundary = 2048;
918 /* Clear confirmation addr */
919 confirm = (volatile uint32_t *)sc->cmd;
924 * Send a reload command to the bootstrap MCP, and wait for the
925 * response in the confirmation address. The firmware should
926 * write a -1 there to indicate it is alive and well
929 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.dmem_busaddr);
930 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.dmem_busaddr);
932 buf[0] = htobe32(dma_high); /* confirm addr MSW */
933 buf[1] = htobe32(dma_low); /* confirm addr LSW */
934 buf[2] = htobe32(0xffffffff); /* confirm data */
937 * FIX: All newest firmware should un-protect the bottom of
938 * the sram before handoff. However, the very first interfaces
939 * do not. Therefore the handoff copy must skip the first 8 bytes
941 /* where the code starts*/
942 buf[3] = htobe32(MXGE_FW_OFFSET + 8);
943 buf[4] = htobe32(size - 8); /* length of code */
944 buf[5] = htobe32(8); /* where to copy to */
945 buf[6] = htobe32(0); /* where to jump to */
947 submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
948 mxge_pio_copy(submit, buf, 64);
953 while (*confirm != 0xffffffff && i < 20) {
957 if (*confirm != 0xffffffff) {
958 device_printf(sc->dev,"handoff failed (%p = 0x%x)",
/*
 * Push the softc's MAC address to the firmware: first four octets
 * packed big-endian into data0, last two into data1.
 */
966 mxge_update_mac_address(mxge_softc_t *sc)
969 uint8_t *addr = sc->mac_addr;
971 cmd.data0 = (addr[0] << 24) | (addr[1] << 16) |
972 (addr[2] << 8) | addr[3];
973 cmd.data1 = (addr[4] << 8) | (addr[5]);
974 return mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
/*
 * Enable or disable pause-frame flow control in firmware, logging a
 * diagnostic on failure.  The softc's pause state update is on a line
 * missing from this excerpt.
 */
978 mxge_change_pause(mxge_softc_t *sc, int pause)
984 status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL, &cmd);
986 status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL, &cmd);
988 device_printf(sc->dev, "Failed to set flow control mode\n");
/*
 * Enable or disable promiscuous mode in firmware; the always_promisc
 * tunable forces it on regardless of the caller's request.
 */
996 mxge_change_promisc(mxge_softc_t *sc, int promisc)
1001 if (mxge_always_promisc)
1005 status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC, &cmd);
1007 status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC, &cmd);
1009 device_printf(sc->dev, "Failed to set promisc mode\n");
/*
 * Program the firmware's multicast filter from the interface's
 * multicast list.  Sequence: enable ALLMULTI while editing; stay in
 * ALLMULTI if the interface requested it or the adopted-firmware
 * rx-filter bug is present; otherwise flush all filters, join each
 * AF_LINK group address, and finally turn ALLMULTI back off.  On any
 * command failure, multicast filtering is left off (ALLMULTI).
 */
1013 mxge_set_multicast_list(mxge_softc_t *sc)
1016 struct ifmultiaddr *ifma;
1017 struct ifnet *ifp = sc->ifp;
1020 /* This firmware is known to not support multicast */
1021 if (!sc->fw_multicast_support)
1024 /* Disable multicast filtering while we play with the lists*/
1025 err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1027 device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
1028 " error status: %d\n", err);
/* see mxge_adopt_running_firmware(): buggy fw must stay in ALLMULTI */
1032 if (sc->adopted_rx_filter_bug)
1035 if (ifp->if_flags & IFF_ALLMULTI) {
1036 /* Request to disable multicast filtering, so quit here */
1040 /* Flush all the filters */
1041 err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1043 device_printf(sc->dev,
1044 "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, "
1045 "error status: %d\n", err);
1050 * Walk the multicast list, and add each address
1052 TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1053 if (ifma->ifma_addr->sa_family != AF_LINK)
/* group address split across data0 (first 4 bytes) and data1 */
1056 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1058 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
1060 cmd.data0 = htonl(cmd.data0);
1061 cmd.data1 = htonl(cmd.data1);
1062 err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1064 device_printf(sc->dev, "Failed "
1065 "MXGEFW_JOIN_MULTICAST_GROUP, "
1066 "error status: %d\n", err);
1067 /* Abort, leaving multicast filtering off */
1072 /* Enable multicast filtering */
1073 err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1075 device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI, "
1076 "error status: %d\n", err);
/*
 * Report the maximum MTU: the firmware limit (MXGEFW_MAX_MTU minus
 * the firmware pad) when a single jumbo page suffices or the firmware
 * accepts virtually-contiguous jumbo buffers; otherwise we are capped
 * at one MJUMPAGESIZE cluster.
 */
1082 mxge_max_mtu(mxge_softc_t *sc)
1087 if (MJUMPAGESIZE - MXGEFW_PAD > MXGEFW_MAX_MTU)
1088 return MXGEFW_MAX_MTU - MXGEFW_PAD;
1090 /* try to set nbufs to see if it we can
1091 use virtually contiguous jumbos */
1093 status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1096 return MXGEFW_MAX_MTU - MXGEFW_PAD;
1098 /* otherwise, we're limited to MJUMPAGESIZE */
1099 return MJUMPAGESIZE - MXGEFW_PAD;
/*
 * Reset the NIC and re-establish driver/firmware shared state:
 * send MXGEFW_CMD_RESET, restart dummy RDMAs, set the interrupt
 * queue size, negotiate slices (order of the RSS commands matters —
 * see the inline comments), exchange interrupt queue DMA addresses
 * and claim/deassert/coalescing offsets, rerun the DMA benchmark,
 * zero per-slice counters, and re-apply MAC address, promiscuity,
 * pause and multicast settings plus the optional throttle factor.
 */
1104 mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1106 struct mxge_slice_state *ss;
1107 mxge_rx_done_t *rx_done;
1108 volatile uint32_t *irq_claim;
1113 * Try to send a reset command to the card to see if it
1116 memset(&cmd, 0, sizeof (cmd));
1117 status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1119 if_printf(sc->ifp, "failed reset\n");
1123 mxge_dummy_rdma(sc, 1);
1125 /* Set the intrq size */
1126 cmd.data0 = sc->rx_ring_size;
1127 status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1130 * Even though we already know how many slices are supported
1131 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
1132 * has magic side effects, and must be called after a reset.
1133 * It must be called prior to calling any RSS related cmds,
1134 * including assigning an interrupt queue for anything but
1135 * slice 0. It must also be called *after*
1136 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1137 * the firmware to compute offsets.
1139 if (sc->num_slices > 1) {
1140 /* Ask the maximum number of slices it supports */
1141 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
1143 if_printf(sc->ifp, "failed to get number of slices\n");
1148 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1149 * to setting up the interrupt queue DMA
1151 cmd.data0 = sc->num_slices;
1152 cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
1153 #ifdef IFNET_BUF_RING
1154 cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1156 status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES, &cmd);
1158 if_printf(sc->ifp, "failed to set number of slices\n");
1163 if (interrupts_setup) {
1164 /* Now exchange information about interrupts */
1165 for (slice = 0; slice < sc->num_slices; slice++) {
1166 rx_done = &sc->ss[slice].rx_done;
1167 memset(rx_done->entry, 0, sc->rx_ring_size);
1169 MXGE_LOWPART_TO_U32(rx_done->dma.dmem_busaddr);
1171 MXGE_HIGHPART_TO_U32(rx_done->dma.dmem_busaddr);
1173 status |= mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_DMA,
/* firmware returns SRAM offsets for the pointers queried below */
1178 status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET,
1180 sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1182 status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1183 irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1185 status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET, &cmd);
1186 sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1189 if_printf(sc->ifp, "failed set interrupt parameters\n");
1193 *sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1195 /* Run a DMA benchmark */
1196 mxge_dma_test(sc, MXGEFW_DMA_TEST);
1198 for (slice = 0; slice < sc->num_slices; slice++) {
1199 ss = &sc->ss[slice];
/* each slice owns a pair of claim words at its offset */
1201 ss->irq_claim = irq_claim + (2 * slice);
1203 /* Reset mcp/driver shared state back to 0 */
1204 ss->rx_done.idx = 0;
1205 ss->rx_done.cnt = 0;
1208 ss->tx.pkt_done = 0;
1209 ss->tx.queue_active = 0;
1210 ss->tx.activate = 0;
1211 ss->tx.deactivate = 0;
1216 ss->rx_small.cnt = 0;
1217 if (ss->fw_stats != NULL)
1218 bzero(ss->fw_stats, sizeof(*ss->fw_stats));
1220 sc->rdma_tags_available = 15;
/* re-apply host-side settings lost across the reset */
1222 status = mxge_update_mac_address(sc);
1223 mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
1224 mxge_change_pause(sc, sc->pause);
1225 mxge_set_multicast_list(sc);
1228 cmd.data0 = sc->throttle;
1229 if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd))
1230 if_printf(sc->ifp, "can't enable throttle\n");
/*
 * sysctl handler for hw.<dev>.throttle: validate the requested value
 * against [MXGE_MIN_THROTTLE, MXGE_MAX_THROTTLE], then push it to
 * firmware under the interface serializer, updating sc->throttle only
 * on success.
 */
1236 mxge_change_throttle(SYSCTL_HANDLER_ARGS)
1241 unsigned int throttle;
1244 throttle = sc->throttle;
1245 err = sysctl_handle_int(oidp, &throttle, arg2, req);
/* no-op when the value did not change */
1250 if (throttle == sc->throttle)
1253 if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
1257 lwkt_serialize_enter(sc->ifp->if_serializer);
1259 cmd.data0 = throttle;
1260 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
1262 sc->throttle = throttle;
1264 lwkt_serialize_exit(sc->ifp->if_serializer);
/*
 * sysctl handler for the interrupt coalescing delay (usecs): accepts
 * 1..1000000, writes the new big-endian value directly into the
 * firmware's SRAM location (intr_coal_delay_ptr) under the interface
 * serializer, then caches it in the softc.
 */
1269 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1272 unsigned int intr_coal_delay;
1276 intr_coal_delay = sc->intr_coal_delay;
1277 err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1281 if (intr_coal_delay == sc->intr_coal_delay)
1284 if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1288 lwkt_serialize_enter(sc->ifp->if_serializer);
1290 *sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1291 sc->intr_coal_delay = intr_coal_delay;
1293 lwkt_serialize_exit(sc->ifp->if_serializer);
/*
 * sysctl handler toggling pause-frame flow control; delegates the
 * firmware update to mxge_change_pause() under the serializer.
 */
1298 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1301 unsigned int enabled;
1305 enabled = sc->pause;
1306 err = sysctl_handle_int(oidp, &enabled, arg2, req);
1310 if (enabled == sc->pause)
1314 lwkt_serialize_enter(sc->ifp->if_serializer);
1315 err = mxge_change_pause(sc, enabled);
1316 lwkt_serialize_exit(sc->ifp->if_serializer);
/*
 * Read-only sysctl shim for big-endian firmware statistics words:
 * byte-swaps the value pointed to by arg1 before handing it to
 * sysctl_handle_int().
 */
1322 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1328 arg2 = be32toh(*(int *)arg1);
1330 err = sysctl_handle_int(oidp, arg1, arg2, req);
/*
 * Tear down all sysctl state: per-slice contexts first, then the
 * slice container node, then the device's top-level node.  Each tree
 * pointer is NULLed after its context is freed so the teardown is
 * safe to invoke on a partially initialized softc.
 */
1336 mxge_rem_sysctls(mxge_softc_t *sc)
1338 if (sc->ss != NULL) {
1339 struct mxge_slice_state *ss;
1342 for (slice = 0; slice < sc->num_slices; slice++) {
1343 ss = &sc->ss[slice];
1344 if (ss->sysctl_tree != NULL) {
1345 sysctl_ctx_free(&ss->sysctl_ctx);
1346 ss->sysctl_tree = NULL;
1351 if (sc->slice_sysctl_tree != NULL) {
1352 sysctl_ctx_free(&sc->slice_sysctl_ctx);
1353 sc->slice_sysctl_tree = NULL;
1356 if (sc->sysctl_tree != NULL) {
1357 sysctl_ctx_free(&sc->sysctl_ctx);
1358 sc->sysctl_tree = NULL;
/*
 * Create the driver's sysctl tree under hw.<devname>:
 *   - read-only identification nodes (firmware version, serial, etc.)
 *   - read/write tunables (interrupt coalescing, throttle, pause)
 *   - read-only firmware statistics (exported big-endian, so they go
 *     through the mxge_handle_be32 handler)
 *   - a per-slice "slice.N" subtree of ring counters for debugging
 * Undone by mxge_rem_sysctls().
 * NOTE(review): this copy of the file has lines elided; early returns
 * after the device_printf() failure messages are not visible here.
 */
1363 mxge_add_sysctls(mxge_softc_t *sc)
1365 struct sysctl_ctx_list *ctx;
1366 struct sysctl_oid_list *children;
1368 struct mxge_slice_state *ss;
1372 ctx = &sc->sysctl_ctx;
1373 sysctl_ctx_init(ctx);
1374 sc->sysctl_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_STATIC_CHILDREN(_hw),
1375 OID_AUTO, device_get_nameunit(sc->dev), CTLFLAG_RD, 0, "");
1376 if (sc->sysctl_tree == NULL) {
1377 device_printf(sc->dev, "can't add sysctl node\n");
1381 children = SYSCTL_CHILDREN(sc->sysctl_tree);
/* firmware stats block of slice 0 backs the read-only counters below */
1382 fw = sc->ss[0].fw_stats;
1385 * Random information
1387 SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "firmware_version",
1388 CTLFLAG_RD, &sc->fw_version, 0, "firmware version");
1390 SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "serial_number",
1391 CTLFLAG_RD, &sc->serial_number_string, 0, "serial number");
1393 SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "product_code",
1394 CTLFLAG_RD, &sc->product_code_string, 0, "product code");
1396 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "pcie_link_width",
1397 CTLFLAG_RD, &sc->link_width, 0, "link width");
1399 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_boundary",
1400 CTLFLAG_RD, &sc->tx_boundary, 0, "tx boundary");
1402 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "write_combine",
1403 CTLFLAG_RD, &sc->wc, 0, "write combining PIO");
1405 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "read_dma_MBs",
1406 CTLFLAG_RD, &sc->read_dma, 0, "DMA Read speed in MB/s");
1408 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "write_dma_MBs",
1409 CTLFLAG_RD, &sc->write_dma, 0, "DMA Write speed in MB/s");
1411 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "read_write_dma_MBs",
1412 CTLFLAG_RD, &sc->read_write_dma, 0,
1413 "DMA concurrent Read/Write speed in MB/s");
1415 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "watchdog_resets",
1416 CTLFLAG_RD, &sc->watchdog_resets, 0,
1417 "Number of times NIC was reset");
1420 * Performance related tunables
1422 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "intr_coal_delay",
1423 CTLTYPE_INT|CTLFLAG_RW, sc, 0, mxge_change_intr_coal, "I",
1424 "Interrupt coalescing delay in usecs");
1426 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "throttle",
1427 CTLTYPE_INT|CTLFLAG_RW, sc, 0, mxge_change_throttle, "I",
1428 "Transmit throttling");
1430 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "flow_control_enabled",
1431 CTLTYPE_INT|CTLFLAG_RW, sc, 0, mxge_change_flow_control, "I",
/* BUG(review): copy-paste description -- should describe flow
 * control, not "Interrupt coalescing delay in usecs" */
1432 "Interrupt coalescing delay in usecs");
1434 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "deassert_wait",
1435 CTLFLAG_RW, &mxge_deassert_wait, 0,
1436 "Wait for IRQ line to go low in ihandler");
1439 * Stats block from firmware is in network byte order.
1442 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "link_up",
1443 CTLTYPE_INT|CTLFLAG_RD, &fw->link_up, 0,
1444 mxge_handle_be32, "I", "link up");
1446 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rdma_tags_available",
1447 CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available, 0,
1448 mxge_handle_be32, "I", "rdma_tags_available");
1450 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_bad_crc32",
1451 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_bad_crc32, 0,
1452 mxge_handle_be32, "I", "dropped_bad_crc32");
1454 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_bad_phy",
1455 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_bad_phy, 0,
1456 mxge_handle_be32, "I", "dropped_bad_phy");
1458 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_link_error_or_filtered",
1459 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_error_or_filtered, 0,
1460 mxge_handle_be32, "I", "dropped_link_error_or_filtered");
1462 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_link_overflow",
1463 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow, 0,
1464 mxge_handle_be32, "I", "dropped_link_overflow");
1466 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_multicast_filtered",
1467 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_multicast_filtered, 0,
1468 mxge_handle_be32, "I", "dropped_multicast_filtered");
1470 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_no_big_buffer",
1471 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer, 0,
1472 mxge_handle_be32, "I", "dropped_no_big_buffer");
1474 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_no_small_buffer",
1475 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_small_buffer, 0,
1476 mxge_handle_be32, "I", "dropped_no_small_buffer");
1478 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_overrun",
1479 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun, 0,
1480 mxge_handle_be32, "I", "dropped_overrun");
1482 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_pause",
1483 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_pause, 0,
1484 mxge_handle_be32, "I", "dropped_pause");
1486 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_runt",
1487 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt, 0,
1488 mxge_handle_be32, "I", "dropped_runt");
1490 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_unicast_filtered",
1491 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered, 0,
1492 mxge_handle_be32, "I", "dropped_unicast_filtered");
1494 /* add counters exported for debugging from all slices */
1495 sysctl_ctx_init(&sc->slice_sysctl_ctx);
1496 sc->slice_sysctl_tree = SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx,
1497 children, OID_AUTO, "slice", CTLFLAG_RD, 0, "");
1498 if (sc->slice_sysctl_tree == NULL) {
1499 device_printf(sc->dev, "can't add slice sysctl node\n");
/* one "slice.N" subtree per slice, each with its own context so that
 * mxge_rem_sysctls() can free them independently */
1503 for (slice = 0; slice < sc->num_slices; slice++) {
1504 ss = &sc->ss[slice];
1505 sysctl_ctx_init(&ss->sysctl_ctx);
1506 ctx = &ss->sysctl_ctx;
1507 children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1508 ksprintf(slice_num, "%d", slice);
1509 ss->sysctl_tree = SYSCTL_ADD_NODE(ctx, children, OID_AUTO,
1510 slice_num, CTLFLAG_RD, 0, "");
1511 if (ss->sysctl_tree == NULL) {
1512 device_printf(sc->dev,
1513 "can't add %d slice sysctl node\n", slice);
1514 return; /* XXX continue? */
1516 children = SYSCTL_CHILDREN(ss->sysctl_tree);
1519 * XXX change to ULONG
1522 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_small_cnt",
1523 CTLFLAG_RD, &ss->rx_small.cnt, 0, "rx_small_cnt");
1525 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_big_cnt",
/* BUG(review): copy-paste description -- should read "rx_big_cnt" */
1526 CTLFLAG_RD, &ss->rx_big.cnt, 0, "rx_small_cnt");
1528 #ifndef IFNET_BUF_RING
1529 /* only transmit from slice 0 for now */
1534 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_req",
1535 CTLFLAG_RD, &ss->tx.req, 0, "tx_req");
1537 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_done",
1538 CTLFLAG_RD, &ss->tx.done, 0, "tx_done");
1540 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_pkt_done",
/* BUG(review): copy-paste description -- should read "tx_pkt_done" */
1541 CTLFLAG_RD, &ss->tx.pkt_done, 0, "tx_done");
1543 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_stall",
1544 CTLFLAG_RD, &ss->tx.stall, 0, "tx_stall");
1546 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_wake",
1547 CTLFLAG_RD, &ss->tx.wake, 0, "tx_wake");
1549 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_defrag",
1550 CTLFLAG_RD, &ss->tx.defrag, 0, "tx_defrag");
1552 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_queue_active",
1553 CTLFLAG_RD, &ss->tx.queue_active, 0, "tx_queue_active");
1555 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_activate",
1556 CTLFLAG_RD, &ss->tx.activate, 0, "tx_activate");
1558 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_deactivate",
1559 CTLFLAG_RD, &ss->tx.deactivate, 0, "tx_deactivate");
1564 * Copy an array of mcp_kreq_ether_send_t's to the mcp. Copy
1565 * backwards one at a time and handle ring wraps
1568 mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1569 mcp_kreq_ether_send_t *src, int cnt)
1571 int idx, starting_slot;
1572 starting_slot = tx->req;
/* NOTE(review): the loop header decrementing cnt is elided from this
 * copy of the file -- each descriptor is PIO-copied individually so a
 * ring wrap (the mask below) never splits a single copy */
1575 idx = (starting_slot + cnt) & tx->mask;
1576 mxge_pio_copy(&tx->lanai[idx],
1577 &src[cnt], sizeof(*src));
1583 * Copy an array of mcp_kreq_ether_send_t's to the mcp. Copy
1584 * at most 32 bytes at a time, so as to avoid involving the software
1585 * pio handler in the nic. We re-write the first segment's flags
1586 * to mark them valid only after writing the entire chain
1589 mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src, int cnt)
1593 volatile uint32_t *dst_ints;
1594 mcp_kreq_ether_send_t *srcp;
1595 volatile mcp_kreq_ether_send_t *dstp, *dst;
1598 idx = tx->req & tx->mask;
/* remember the real flags; the first descriptor is written with its
 * valid flag clear and only flipped valid at the very end below */
1600 last_flags = src->flags;
1603 dst = dstp = &tx->lanai[idx];
/* fast path: the chain fits without wrapping the ring */
1606 if ((idx + cnt) < tx->mask) {
1607 for (i = 0; i < (cnt - 1); i += 2) {
1608 mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1609 wmb(); /* force write every 32 bytes */
1614 /* submit all but the first request, and ensure
1615 that it is submitted below */
1616 mxge_submit_req_backwards(tx, src, cnt);
1620 /* submit the first request */
1621 mxge_pio_copy(dstp, srcp, sizeof(*src));
1622 wmb(); /* barrier before setting valid flag */
1625 /* re-write the last 32-bits with the valid flags */
1626 src->flags = last_flags;
1627 src_ints = (uint32_t *)src;
1629 dst_ints = (volatile uint32_t *)dst;
/* single 32-bit store makes the whole chain visible to the NIC */
1631 *dst_ints = *src_ints;
/*
 * Ensure the first mbuf of a TSO packet contains the complete
 * Ethernet + IP + TCP headers (lengths taken from the csum_* fields
 * the stack filled in), pulling up via m_pullup() if needed so the
 * encap path can parse them contiguously.  Updates *mp in place.
 * NOTE(review): the failure handling after m_pullup() is elided from
 * this copy of the file.
 */
1637 mxge_pullup_tso(struct mbuf **mp)
1639 int hoff, iphlen, thoff;
1643 KASSERT(M_WRITABLE(m), ("TSO mbuf not writable"));
1645 iphlen = m->m_pkthdr.csum_iphlen;
1646 thoff = m->m_pkthdr.csum_thlen;
1647 hoff = m->m_pkthdr.csum_lhlen;
1649 KASSERT(iphlen > 0, ("invalid ip hlen"));
1650 KASSERT(thoff > 0, ("invalid tcp hlen"));
1651 KASSERT(hoff > 0, ("invalid ether hlen"));
1653 if (__predict_false(m->m_len < hoff + iphlen + thoff)) {
1654 m = m_pullup(m, hoff + iphlen + thoff);
/*
 * Build the firmware send-request chain for a TSO packet: walk the
 * busdma segment list, slicing segments at MSS boundaries ("cuts")
 * and fixing up per-request rdma_count retroactively, then hand the
 * chain to mxge_submit_req().  On overflow of tx->max_desc the
 * request is dropped and the DMA map unloaded (error path at bottom).
 * NOTE(review): this copy of the file has many lines elided (loop
 * bodies, error labels, req/seg advancement), so the control flow
 * here is only partially visible.
 */
1665 mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1669 mcp_kreq_ether_send_t *req;
1670 bus_dma_segment_t *seg;
1671 uint32_t low, high_swapped;
1672 int len, seglen, cum_len, cum_len_next;
1673 int next_is_first, chop, cnt, rdma_count, small;
1674 uint16_t pseudo_hdr_offset, cksum_offset, mss;
1675 uint8_t flags, flags_next;
1678 mss = m->m_pkthdr.tso_segsz;
1680 /* negative cum_len signifies to the
1681 * send loop that we are still in the
1682 * header portion of the TSO packet.
1684 cum_len = -(m->m_pkthdr.csum_lhlen + m->m_pkthdr.csum_iphlen +
1685 m->m_pkthdr.csum_thlen);
1687 /* TSO implies checksum offload on this hardware */
1688 cksum_offset = m->m_pkthdr.csum_lhlen + m->m_pkthdr.csum_iphlen;
1689 flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1691 /* for TSO, pseudo_hdr_offset holds mss.
1692 * The firmware figures out where to put
1693 * the checksum by parsing the header. */
1694 pseudo_hdr_offset = htobe16(mss);
1701 /* "rdma_count" is the number of RDMAs belonging to the
1702 * current packet BEFORE the current send request. For
1703 * non-TSO packets, this is equal to "count".
1704 * For TSO packets, rdma_count needs to be reset
1705 * to 0 after a segment cut.
1707 * The rdma_count field of the send request is
1708 * the number of RDMAs of the packet starting at
1709 * that request. For TSO send requests with one or more cuts
1710 * in the middle, this is the number of RDMAs starting
1711 * after the last cut in the request. All previous
1712 * segments before the last cut implicitly have 1 RDMA.
1714 * Since the number of RDMAs is not known beforehand,
1715 * it must be filled-in retroactively - after each
1716 * segmentation cut or at the end of the entire packet.
1719 while (busdma_seg_cnt) {
1720 /* Break the busdma segment up into pieces*/
1721 low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1722 high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1726 flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1728 cum_len_next = cum_len + seglen;
/* retroactive rdma_count fixup on the request rdma_count slots back */
1729 (req-rdma_count)->rdma_count = rdma_count + 1;
1730 if (__predict_true(cum_len >= 0)) {
/* in payload: chop at MSS boundary if this piece crosses one */
1732 chop = (cum_len_next > mss);
1733 cum_len_next = cum_len_next % mss;
1734 next_is_first = (cum_len_next == 0);
/* branch-free flag math: chop/next_is_first are 0 or 1 */
1735 flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1736 flags_next |= next_is_first *
1738 rdma_count |= -(chop | next_is_first);
1739 rdma_count += chop & !next_is_first;
1740 } else if (cum_len_next >= 0) {
/* header just ended: next request starts the first payload piece */
1745 small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1746 flags_next = MXGEFW_FLAGS_TSO_PLD |
1747 MXGEFW_FLAGS_FIRST |
1748 (small * MXGEFW_FLAGS_SMALL);
/* fill in the current send request */
1751 req->addr_high = high_swapped;
1752 req->addr_low = htobe32(low);
1753 req->pseudo_hdr_offset = pseudo_hdr_offset;
1755 req->rdma_count = 1;
1756 req->length = htobe16(seglen);
1757 req->cksum_offset = cksum_offset;
1758 req->flags = flags | ((cum_len & 1) *
1759 MXGEFW_FLAGS_ALIGN_ODD);
1762 cum_len = cum_len_next;
1767 if (__predict_false(cksum_offset > seglen))
1768 cksum_offset -= seglen;
/* chain longer than the ring allows -- bail to the drop path */
1771 if (__predict_false(cnt > tx->max_desc))
1777 (req-rdma_count)->rdma_count = rdma_count;
/* walk back and mark the tail requests of the final segment */
1781 req->flags |= MXGEFW_FLAGS_TSO_LAST;
1782 } while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
/* flag the last slot so mxge_tx_done() knows where the mbuf is */
1784 tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1785 mxge_submit_req(tx, tx->req_list, cnt);
1786 #ifdef IFNET_BUF_RING
1787 if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1788 /* tell the NIC to start polling this slice */
1790 tx->queue_active = 1;
/* drop path: release the DMA mapping of the failed request */
1798 bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1802 kprintf("tx->max_desc exceeded via TSO!\n");
1803 kprintf("mss = %d, %ld, %d!\n", mss,
1804 (long)seg - (long)tx->seg_list, tx->max_desc);
/*
 * Map one outgoing mbuf chain for DMA and turn it into firmware send
 * requests.  TSO frames are pulled up first and then diverted to
 * mxge_encap_tso(); everything else gets the simple one-request-per-
 * DMA-segment treatment here, including optional TCP/UDP checksum
 * offload and padding of runt frames to the 60-byte minimum using a
 * pre-allocated zero page.
 * NOTE(review): this copy of the file has lines elided (error paths,
 * req/seg advancement, the debug-print guard around the kprintf loop),
 * so control flow is only partially visible.
 */
1810 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
1813 mcp_kreq_ether_send_t *req;
1814 bus_dma_segment_t *seg;
1816 int cnt, cum_len, err, i, idx, odd_flag;
1817 uint16_t pseudo_hdr_offset;
1818 uint8_t flags, cksum_offset;
/* make TSO headers contiguous before mapping */
1823 if (m->m_pkthdr.csum_flags & CSUM_TSO) {
1824 if (mxge_pullup_tso(&m))
1828 /* (try to) map the frame for DMA */
1829 idx = tx->req & tx->mask;
/* max_desc - 2: leave room for a possible runt-pad descriptor */
1830 err = bus_dmamap_load_mbuf_defrag(tx->dmat, tx->info[idx].map, &m,
1831 tx->seg_list, tx->max_desc - 2, &cnt, BUS_DMA_NOWAIT);
1832 if (__predict_false(err != 0))
1834 bus_dmamap_sync(tx->dmat, tx->info[idx].map, BUS_DMASYNC_PREWRITE);
1835 tx->info[idx].m = m;
1837 /* TSO is different enough, we handle it in another routine */
1838 if (m->m_pkthdr.csum_flags & CSUM_TSO) {
1839 mxge_encap_tso(ss, m, cnt);
1845 pseudo_hdr_offset = 0;
1846 flags = MXGEFW_FLAGS_NO_TSO;
1848 /* checksum offloading? */
1849 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
1850 cksum_offset = m->m_pkthdr.csum_lhlen + m->m_pkthdr.csum_iphlen;
1851 pseudo_hdr_offset = cksum_offset + m->m_pkthdr.csum_data;
1852 pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
1853 req->cksum_offset = cksum_offset;
1854 flags |= MXGEFW_FLAGS_CKSUM;
1855 odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
1859 if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
1860 flags |= MXGEFW_FLAGS_SMALL;
1862 /* convert segments into a request list */
1865 req->flags = MXGEFW_FLAGS_FIRST;
1866 for (i = 0; i < cnt; i++) {
1868 htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
1870 htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1871 req->length = htobe16(seg->ds_len);
1872 req->cksum_offset = cksum_offset;
/* cksum_offset counts down until the segment containing the csum */
1873 if (cksum_offset > seg->ds_len)
1874 cksum_offset -= seg->ds_len;
1877 req->pseudo_hdr_offset = pseudo_hdr_offset;
1878 req->pad = 0; /* complete solid 16-byte block */
1879 req->rdma_count = 1;
1880 req->flags |= flags | ((cum_len & 1) * odd_flag);
1881 cum_len += seg->ds_len;
1887 /* pad runts to 60 bytes */
1891 htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.dmem_busaddr));
1893 htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.dmem_busaddr));
1894 req->length = htobe16(60 - cum_len);
1895 req->cksum_offset = 0;
1896 req->pseudo_hdr_offset = pseudo_hdr_offset;
1897 req->pad = 0; /* complete solid 16-byte block */
1898 req->rdma_count = 1;
1899 req->flags |= flags | ((cum_len & 1) * odd_flag);
/* first request carries the total RDMA count for the packet */
1903 tx->req_list[0].rdma_count = cnt;
1905 /* print what the firmware will see */
1906 for (i = 0; i < cnt; i++) {
1907 kprintf("%d: addr: 0x%x 0x%x len:%d pso%d,"
1908 "cso:%d, flags:0x%x, rdma:%d\n",
1909 i, (int)ntohl(tx->req_list[i].addr_high),
1910 (int)ntohl(tx->req_list[i].addr_low),
1911 (int)ntohs(tx->req_list[i].length),
1912 (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
1913 tx->req_list[i].cksum_offset, tx->req_list[i].flags,
1914 tx->req_list[i].rdma_count);
1916 kprintf("--------------\n");
1918 tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1919 mxge_submit_req(tx, tx->req_list, cnt);
1920 #ifdef IFNET_BUF_RING
1921 if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1922 /* tell the NIC to start polling this slice */
1924 tx->queue_active = 1;
/*
 * Drain the interface send queue into the transmit ring while there
 * is room (at least max_desc free slots).  Caller holds the ifnet
 * serializer.  When the ring fills, mark the queue OACTIVE so the
 * stack stops handing us packets until mxge_tx_done() clears it.
 */
1937 mxge_start_locked(struct mxge_slice_state *ss)
1947 while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
1948 m = ifq_dequeue(&ifp->if_snd);
1952 /* let BPF see it */
1955 /* give it to the nic */
1959 /* ran out of transmit slots */
1960 ifq_set_oactive(&ifp->if_snd);
/*
 * ifnet if_start entry point: dispatch queued transmits.  Transmit is
 * confined to slice 0 in this driver (see the comment below), so this
 * simply forwards to mxge_start_locked() for the first slice.
 */
1964 mxge_start(struct ifnet *ifp, struct ifaltq_subque *ifsq)
1966 mxge_softc_t *sc = ifp->if_softc;
1967 struct mxge_slice_state *ss;
1969 ASSERT_ALTQ_SQ_DEFAULT(ifp, ifsq);
1970 ASSERT_SERIALIZED(sc->ifp->if_serializer);
1971 /* only use the first slice for now */
1973 mxge_start_locked(ss);
1977 * copy an array of mcp_kreq_ether_recv_t's to the mcp. Copy
1978 * at most 32 bytes at a time, so as to avoid involving the software
1979 * pio handler in the nic. We re-write the first segment's low
1980 * DMA address to mark it valid only after we write the entire chunk
1984 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
1985 mcp_kreq_ether_recv_t *src)
/* poison the first descriptor's low address so the NIC ignores the
 * batch until the final store below makes it valid */
1989 low = src->addr_low;
1990 src->addr_low = 0xffffffff;
1991 mxge_pio_copy(dst, src, 4 * sizeof (*src));
1993 mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
/* restore the shadow copy, then flip the NIC-visible word valid */
1995 src->addr_low = low;
1996 dst->addr_low = low;
/*
 * Allocate and DMA-map a small (MHLEN) receive mbuf for ring slot
 * idx, record it in the shadow ring, and push descriptors to the NIC
 * in batches of 8 via mxge_submit_8rx().  'init' selects blocking
 * allocation during initialization; at init time allocation/mapping
 * failures bail out early since there is no old buffer to preserve.
 * NOTE(review): lines are elided from this copy (returns, the mflag
 * override for init, the batching test before the submit).
 */
2001 mxge_get_buf_small(mxge_rx_ring_t *rx, bus_dmamap_t map, int idx,
2004 bus_dma_segment_t seg;
2006 int cnt, err, mflag;
2008 mflag = MB_DONTWAIT;
2009 if (__predict_false(init))
2012 m = m_gethdr(mflag, MT_DATA);
2016 if (__predict_false(init)) {
2018 * During initialization, there
2019 * is nothing to setup; bail out
2025 m->m_len = m->m_pkthdr.len = MHLEN;
2027 err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2028 &seg, 1, &cnt, BUS_DMA_NOWAIT);
2031 if (__predict_false(init)) {
2033 * During initialization, there
2034 * is nothing to setup; bail out
2041 rx->info[idx].m = m;
/* shadow ring keeps a host copy of what the NIC descriptor holds */
2042 rx->shadow[idx].addr_low = htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2043 rx->shadow[idx].addr_high = htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
/* every 8th slot, hand the previous 8 descriptors to the NIC */
2047 mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
/*
 * Big-buffer analogue of mxge_get_buf_small(): allocate a cluster
 * mbuf (MCLBYTES or MJUMPAGESIZE depending on the configured cluster
 * size), DMA-map it into ring slot idx, and batch descriptors to the
 * NIC 8 at a time.  Same 'init' semantics as the small variant.
 * NOTE(review): lines are elided from this copy (returns, the mflag
 * override for init, the batching test before the submit).
 */
2052 mxge_get_buf_big(mxge_rx_ring_t *rx, bus_dmamap_t map, int idx,
2055 bus_dma_segment_t seg;
2057 int cnt, err, mflag;
2059 mflag = MB_DONTWAIT;
2060 if (__predict_false(init))
/* pick cluster allocator by the ring's configured buffer size */
2063 if (rx->cl_size == MCLBYTES)
2064 m = m_getcl(mflag, MT_DATA, M_PKTHDR);
2066 m = m_getjcl(mflag, MT_DATA, M_PKTHDR, MJUMPAGESIZE);
2070 if (__predict_false(init)) {
2072 * During initialization, there
2073 * is nothing to setup; bail out
2079 m->m_len = m->m_pkthdr.len = rx->mlen;
2081 err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2082 &seg, 1, &cnt, BUS_DMA_NOWAIT);
2085 if (__predict_false(init)) {
2087 * During initialization, there
2088 * is nothing to setup; bail out
2095 rx->info[idx].m = m;
2096 rx->shadow[idx].addr_low = htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2097 rx->shadow[idx].addr_high = htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
/* every 8th slot, hand the previous 8 descriptors to the NIC */
2101 mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2106 * Myri10GE hardware checksums are not valid if the sender
2107 * padded the frame with non-zero padding. This is because
2108 * the firmware just does a simple 16-bit 1s complement
2109 * checksum across the entire frame, excluding the first 14
2110 * bytes. It is best to simply to check the checksum and
2111 * tell the stack about it only if the checksum is good
2113 static inline uint16_t
2114 mxge_rx_csum(struct mbuf *m, int csum)
2116 struct ether_header *eh;
2120 eh = mtod(m, struct ether_header *);
2122 /* only deal with IPv4 TCP & UDP for now */
2123 if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2125 ip = (struct ip *)(eh + 1);
2126 if (__predict_false(ip->ip_p != IPPROTO_TCP &&
2127 ip->ip_p != IPPROTO_UDP))
/* validate by folding in the pseudo-header; result 0 == checksum ok
 * NOTE(review): assumes the IP header sits right after the Ethernet
 * header, i.e. any VLAN tag was already stripped by the caller */
2130 c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2131 htonl(ntohs(csum) + ntohs(ip->ip_len) +
2132 - (ip->ip_hl << 2) + ip->ip_p));
/*
 * Strip an 802.1q header from a received frame: transfer the tag into
 * the mbuf packet header (M_VLANTAG), adjust the firmware's partial
 * checksum to account for the removed EVL_ENCAPLEN bytes, and slide
 * the Ethernet addresses forward over the tag.
 */
2141 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2143 struct ether_vlan_header *evl;
2146 evl = mtod(m, struct ether_vlan_header *);
2149 * fix checksum by subtracting EVL_ENCAPLEN bytes
2150 * after what the firmware thought was the end of the ethernet
2154 /* put checksum into host byte order */
2155 *csum = ntohs(*csum);
2156 partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
/* 1s-complement subtract 'partial', then fold carries twice */
2157 (*csum) += ~partial;
2158 (*csum) += ((*csum) < ~partial);
2159 (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2160 (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2162 /* restore checksum to network byte order;
2163 later consumers expect this */
2164 *csum = htons(*csum);
2167 m->m_pkthdr.ether_vlantag = ntohs(evl->evl_tag);
2168 m->m_flags |= M_VLANTAG;
2171 * Remove the 802.1q header by copying the Ethernet
2172 * addresses over it and adjusting the beginning of
2173 * the data in the mbuf. The encapsulated Ethernet
2174 * type field is already in place.
2176 bcopy((char *)evl, (char *)evl + EVL_ENCAPLEN,
2177 ETHER_HDR_LEN - ETHER_TYPE_LEN);
2178 m_adj(m, EVL_ENCAPLEN);
/*
 * Process one completed receive on the big-buffer ring: replace the
 * ring mbuf (dropping the frame if replacement fails), unload the old
 * DMA map, fix up the mbuf header, strip VLAN tags, validate the
 * firmware checksum, and hand the packet to the stack.
 * NOTE(review): near-duplicate of mxge_rx_done_small() below -- only
 * the ring and the replacement allocator differ; keep the two in sync.
 */
2183 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2188 struct ether_header *eh;
2190 bus_dmamap_t old_map;
2196 idx = rx->cnt & rx->mask;
2198 /* save a pointer to the received mbuf */
2199 m = rx->info[idx].m;
2200 /* try to replace the received mbuf */
2201 if (mxge_get_buf_big(rx, rx->extra_map, idx, FALSE)) {
2202 /* drop the frame -- the old mbuf is re-cycled */
2203 IFNET_STAT_INC(ifp, ierrors, 1);
2207 /* unmap the received buffer */
2208 old_map = rx->info[idx].map;
2209 bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2210 bus_dmamap_unload(rx->dmat, old_map);
2212 /* swap the bus_dmamap_t's */
2213 rx->info[idx].map = rx->extra_map;
2214 rx->extra_map = old_map;
2216 /* mcp implicitly skips 1st 2 bytes so that packet is properly
2218 m->m_data += MXGEFW_PAD;
2220 m->m_pkthdr.rcvif = ifp;
2221 m->m_len = m->m_pkthdr.len = len;
2223 eh = mtod(m, struct ether_header *);
2224 if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2225 mxge_vlan_tag_remove(m, &csum);
2227 /* if the checksum is valid, mark it in the mbuf header */
2228 if ((ifp->if_capenable & IFCAP_RXCSUM) &&
2229 0 == mxge_rx_csum(m, csum)) {
2230 /* Tell the stack that the checksum is good */
2231 m->m_pkthdr.csum_data = 0xffff;
2232 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2236 /* flowid only valid if RSS hashing is enabled */
2237 if (sc->num_slices > 1) {
2238 m->m_pkthdr.flowid = (ss - sc->ss);
2239 m->m_flags |= M_FLOWID;
2242 ifp->if_input(ifp, m);
/*
 * Process one completed receive on the small-buffer ring.  Identical
 * logic to mxge_rx_done_big() but operating on the small ring with
 * mxge_get_buf_small() as the replacement allocator; keep in sync.
 */
2246 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2250 struct ether_header *eh;
2253 bus_dmamap_t old_map;
2259 idx = rx->cnt & rx->mask;
2261 /* save a pointer to the received mbuf */
2262 m = rx->info[idx].m;
2263 /* try to replace the received mbuf */
2264 if (mxge_get_buf_small(rx, rx->extra_map, idx, FALSE)) {
2265 /* drop the frame -- the old mbuf is re-cycled */
2266 IFNET_STAT_INC(ifp, ierrors, 1);
2270 /* unmap the received buffer */
2271 old_map = rx->info[idx].map;
2272 bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2273 bus_dmamap_unload(rx->dmat, old_map);
2275 /* swap the bus_dmamap_t's */
2276 rx->info[idx].map = rx->extra_map;
2277 rx->extra_map = old_map;
2279 /* mcp implicitly skips 1st 2 bytes so that packet is properly
2281 m->m_data += MXGEFW_PAD;
2283 m->m_pkthdr.rcvif = ifp;
2284 m->m_len = m->m_pkthdr.len = len;
2286 eh = mtod(m, struct ether_header *);
2287 if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2288 mxge_vlan_tag_remove(m, &csum);
2290 /* if the checksum is valid, mark it in the mbuf header */
2291 if ((ifp->if_capenable & IFCAP_RXCSUM) &&
2292 0 == mxge_rx_csum(m, csum)) {
2293 /* Tell the stack that the checksum is good */
2294 m->m_pkthdr.csum_data = 0xffff;
2295 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2299 /* flowid only valid if RSS hashing is enabled */
2300 if (sc->num_slices > 1) {
2301 m->m_pkthdr.flowid = (ss - sc->ss);
2302 m->m_flags |= M_FLOWID;
2305 ifp->if_input(ifp, m);
/*
 * Drain the slice's receive-completion ring: a non-zero length marks
 * a valid entry (firmware writes it last).  Dispatch each completion
 * to the small or big handler by size, clearing the length so the
 * slot can be reused, and cap the number processed per call to avoid
 * livelock under load.
 */
2309 mxge_clean_rx_done(struct mxge_slice_state *ss)
2311 mxge_rx_done_t *rx_done = &ss->rx_done;
2316 while (rx_done->entry[rx_done->idx].length != 0) {
2317 length = ntohs(rx_done->entry[rx_done->idx].length);
/* zero the length to mark the slot consumed */
2318 rx_done->entry[rx_done->idx].length = 0;
2319 checksum = rx_done->entry[rx_done->idx].checksum;
2320 if (length <= (MHLEN - MXGEFW_PAD))
2321 mxge_rx_done_small(ss, length, checksum);
2323 mxge_rx_done_big(ss, length, checksum);
2325 rx_done->idx = rx_done->cnt & rx_done->mask;
2327 /* limit potential for livelock */
2328 if (__predict_false(++limit > rx_done->mask / 2))
/*
 * Reap transmit completions up to the firmware's reported packet
 * index (mcp_idx): free mbufs and unload DMA maps for completed
 * slots, update byte/multicast accounting, then un-throttle the
 * stack (clear OACTIVE, restart if_start) once a quarter of the ring
 * is free again.
 */
2334 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2344 ASSERT_SERIALIZED(ifp->if_serializer);
2345 while (tx->pkt_done != mcp_idx) {
2346 idx = tx->done & tx->mask;
2348 m = tx->info[idx].m;
2349 /* mbuf and DMA map only attached to the first
2352 ss->obytes += m->m_pkthdr.len;
2353 if (m->m_flags & M_MCAST)
2356 tx->info[idx].m = NULL;
2357 map = tx->info[idx].map;
2358 bus_dmamap_unload(tx->dmat, map);
/* flag==1 marks the last slot of a request chain (set by encap) */
2361 if (tx->info[idx].flag) {
2362 tx->info[idx].flag = 0;
2367 /* If we have space, clear OACTIVE to tell the stack that
2368 its OK to send packets */
2369 if (tx->req - tx->done < (tx->mask + 1)/4)
2370 ifq_clr_oactive(&ifp->if_snd);
2372 if (!ifq_is_empty(&ifp->if_snd))
2375 #ifdef IFNET_BUF_RING
2376 if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2377 /* let the NIC stop polling this queue, since there
2378 * are no more transmits pending */
/* NOTE(review): this inner test re-checks the tx->req == tx->done
 * condition already established by the #ifdef block's outer if --
 * redundant but harmless */
2379 if (tx->req == tx->done) {
2381 tx->queue_active = 0;
/*
 * XFP 10GbE compliance-byte decode table: maps each bit of the XFP
 * module's compliance register to an ifmedia type and a printable
 * name.  flag==0 entries are recognized but have no ifmedia mapping.
 * First entry (mask 0x7f) matches any of the lower seven bits (CX4).
 */
2389 static struct mxge_media_type mxge_xfp_media_types[] = {
2390 {IFM_10G_CX4, 0x7f, "10GBASE-CX4 (module)"},
2391 {IFM_10G_SR, (1 << 7), "10GBASE-SR"},
2392 {IFM_10G_LR, (1 << 6), "10GBASE-LR"},
2393 {0, (1 << 5), "10GBASE-ER"},
2394 {IFM_10G_LRM, (1 << 4), "10GBASE-LRM"},
2395 {0, (1 << 3), "10GBASE-SW"},
2396 {0, (1 << 2), "10GBASE-LW"},
2397 {0, (1 << 1), "10GBASE-EW"},
2398 {0, (1 << 0), "Reserved"}
/*
 * SFP+ compliance-byte decode table (same layout as the XFP table).
 * The bitmask-0 first entry is the fallback match for passive
 * Twinax cables, which report no compliance bits at all.
 */
2401 static struct mxge_media_type mxge_sfp_media_types[] = {
2402 {IFM_10G_TWINAX, 0, "10GBASE-Twinax"},
2403 {0, (1 << 7), "Reserved"},
2404 {IFM_10G_LRM, (1 << 6), "10GBASE-LRM"},
2405 {IFM_10G_LR, (1 << 5), "10GBASE-LR"},
2406 {IFM_10G_SR, (1 << 4), "10GBASE-SR"},
2407 {IFM_10G_TWINAX,(1 << 0), "10GBASE-Twinax"}
/*
 * Register media_type (always full-duplex Ethernet) with ifmedia,
 * make it the current selection, and cache it in sc->current_media
 * so mxge_media_probe() can detect module changes.
 */
2411 mxge_media_set(mxge_softc_t *sc, int media_type)
2413 ifmedia_add(&sc->media, IFM_ETHER | IFM_FDX | media_type, 0, NULL);
2414 ifmedia_set(&sc->media, IFM_ETHER | IFM_FDX | media_type);
2415 sc->current_media = media_type;
2416 sc->media.ifm_media = sc->media.ifm_cur->ifm_media;
/*
 * Classify the NIC's connector (CX4 / XFP / SFP+ / Quad Ribbon Fiber)
 * from the EEPROM product-code string and set an initial media type.
 * The connector kind drives mxge_media_probe()'s module detection.
 */
2420 mxge_media_init(mxge_softc_t *sc)
2425 ifmedia_removeall(&sc->media);
2426 mxge_media_set(sc, IFM_AUTO);
2429 * parse the product code to determine the interface type
2430 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2431 * after the 3rd dash in the driver's cached copy of the
2432 * EEPROM's product code string.
2434 ptr = sc->product_code_string;
2436 device_printf(sc->dev, "Missing product code\n");
/* advance ptr past the third '-' in the product code */
2440 for (i = 0; i < 3; i++, ptr++) {
2441 ptr = strchr(ptr, '-');
2443 device_printf(sc->dev, "only %d dashes in PC?!?\n", i);
2447 if (*ptr == 'C' || *(ptr +1) == 'C') {
2449 sc->connector = MXGE_CX4;
2450 mxge_media_set(sc, IFM_10G_CX4);
2451 } else if (*ptr == 'Q') {
2452 /* -Q is Quad Ribbon Fiber */
2453 sc->connector = MXGE_QRF;
2454 device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2455 /* FreeBSD has no media type for Quad ribbon fiber */
2456 } else if (*ptr == 'R') {
2458 sc->connector = MXGE_XFP;
2459 } else if (*ptr == 'S' || *(ptr +1) == 'S') {
2460 /* -S or -2S is SFP+ */
2461 sc->connector = MXGE_SFP;
2463 device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2468 * Determine the media type for a NIC. Some XFPs will identify
2469 * themselves only when their link is up, so this is initiated via a
2470 * link up interrupt. However, this can potentially take up to
2471 * several milliseconds, so it is run via the watchdog routine, rather
2472 * than in the interrupt handler itself.
2475 mxge_media_probe(mxge_softc_t *sc)
2478 const char *cage_type;
2479 struct mxge_media_type *mxge_media_types = NULL;
2480 int i, err, ms, mxge_media_type_entries;
2483 sc->need_media_probe = 0;
/* pick the decode table matching the connector classified at init */
2485 if (sc->connector == MXGE_XFP) {
2487 mxge_media_types = mxge_xfp_media_types;
2488 mxge_media_type_entries = sizeof(mxge_xfp_media_types) /
2489 sizeof(mxge_xfp_media_types[0]);
2490 byte = MXGE_XFP_COMPLIANCE_BYTE;
2492 } else if (sc->connector == MXGE_SFP) {
2493 /* -S or -2S is SFP+ */
2494 mxge_media_types = mxge_sfp_media_types;
2495 mxge_media_type_entries = sizeof(mxge_sfp_media_types) /
2496 sizeof(mxge_sfp_media_types[0]);
2500 /* nothing to do; media type cannot change */
2505 * At this point we know the NIC has an XFP cage, so now we
2506 * try to determine what is in the cage by using the
2507 * firmware's XFP I2C commands to read the XFP 10GbE compliance
2508 * register. We read just one byte, which may take over
2512 cmd.data0 = 0; /* just fetch 1 byte, not all 256 */
2514 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2515 if (err == MXGEFW_CMD_ERROR_I2C_FAILURE)
2516 device_printf(sc->dev, "failed to read XFP\n");
2517 if (err == MXGEFW_CMD_ERROR_I2C_ABSENT)
2518 device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
2519 if (err != MXGEFW_CMD_OK)
2522 /* Now we wait for the data to be cached */
2524 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
/* poll up to ~50ms for the firmware to cache the I2C byte */
2525 for (ms = 0; err == EBUSY && ms < 50; ms++) {
2528 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2530 if (err != MXGEFW_CMD_OK) {
2531 device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
2532 cage_type, err, ms);
/* entry 0 is compared by full equality (CX4 mask / Twinax zero),
 * later entries are matched bit-by-bit */
2536 if (cmd.data0 == mxge_media_types[0].bitmask) {
2538 device_printf(sc->dev, "%s:%s\n", cage_type,
2539 mxge_media_types[0].name);
2541 if (sc->current_media != mxge_media_types[0].flag) {
2542 mxge_media_init(sc);
2543 mxge_media_set(sc, mxge_media_types[0].flag);
2547 for (i = 1; i < mxge_media_type_entries; i++) {
2548 if (cmd.data0 & mxge_media_types[i].bitmask) {
2550 device_printf(sc->dev, "%s:%s\n", cage_type,
2551 mxge_media_types[i].name);
2554 if (sc->current_media != mxge_media_types[i].flag) {
2555 mxge_media_init(sc);
2556 mxge_media_set(sc, mxge_media_types[i].flag);
2562 device_printf(sc->dev, "%s media 0x%x unknown\n", cage_type,
/*
 * Per-slice interrupt handler.  Non-zero slices (MSI-X) just drain
 * their receive ring and return the interrupt token.  Slice 0 also:
 * de-asserts legacy IRQs, loops reaping TX completions and RX packets
 * until the firmware clears the 'valid' byte, and processes firmware
 * link-state / RDMA-timeout events (only meaningful on slice 0).
 * NOTE(review): lines are elided from this copy (loop/brace structure,
 * the claim-token condition near the end), so the exact nesting of the
 * deassert-wait loop is only partially visible.
 */
2568 mxge_intr(void *arg)
2570 struct mxge_slice_state *ss = arg;
2571 mxge_softc_t *sc = ss->sc;
2572 mcp_irq_data_t *stats = ss->fw_stats;
2573 mxge_tx_ring_t *tx = &ss->tx;
2574 mxge_rx_done_t *rx_done = &ss->rx_done;
2575 uint32_t send_done_count;
2579 #ifndef IFNET_BUF_RING
2580 /* an interrupt on a non-zero slice is implicitly valid
2581 since MSI-X irqs are not shared */
2583 mxge_clean_rx_done(ss);
/* writing 3 to irq_claim returns the interrupt token to the NIC */
2584 *ss->irq_claim = be32toh(3);
2589 /* make sure the DMA has finished */
2590 if (!stats->valid) {
2593 valid = stats->valid;
2595 if (sc->irq_type == PCI_INTR_TYPE_LEGACY) {
2596 /* lower legacy IRQ */
2597 *sc->irq_deassert = 0;
2598 if (!mxge_deassert_wait)
2599 /* don't wait for conf. that irq is low */
2605 /* loop while waiting for legacy irq deassertion */
2607 /* check for transmit completes and receives */
2608 send_done_count = be32toh(stats->send_done_count);
2609 while ((send_done_count != tx->pkt_done) ||
2610 (rx_done->entry[rx_done->idx].length != 0)) {
2611 if (send_done_count != tx->pkt_done)
2612 mxge_tx_done(ss, (int)send_done_count);
2613 mxge_clean_rx_done(ss);
2614 send_done_count = be32toh(stats->send_done_count);
2616 if (sc->irq_type == PCI_INTR_TYPE_LEGACY && mxge_deassert_wait)
2618 } while (*((volatile uint8_t *) &stats->valid));
2620 /* fw link & error stats meaningful only on the first slice */
2621 if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
2622 if (sc->link_state != stats->link_up) {
2623 sc->link_state = stats->link_up;
2624 if (sc->link_state) {
2625 sc->ifp->if_link_state = LINK_STATE_UP;
2626 if_link_state_change(sc->ifp);
2628 device_printf(sc->dev, "link up\n");
2630 sc->ifp->if_link_state = LINK_STATE_DOWN;
2631 if_link_state_change(sc->ifp);
2633 device_printf(sc->dev, "link down\n");
/* schedule a media re-probe from the watchdog (see mxge_media_probe) */
2635 sc->need_media_probe = 1;
2637 if (sc->rdma_tags_available !=
2638 be32toh(stats->rdma_tags_available)) {
2639 sc->rdma_tags_available =
2640 be32toh(stats->rdma_tags_available);
2641 device_printf(sc->dev, "RDMA timed out! %d tags "
2642 "left\n", sc->rdma_tags_available);
2645 if (stats->link_down) {
2646 sc->down_cnt += stats->link_down;
2648 sc->ifp->if_link_state = LINK_STATE_DOWN;
2649 if_link_state_change(sc->ifp);
2653 /* check to see if we have rx token to pass back */
2655 *ss->irq_claim = be32toh(3);
2656 *(ss->irq_claim + 1) = be32toh(3);
/*
 * if_init entry point: (re)open the interface if it is not already
 * running.  Caller holds the ifnet serializer.
 * NOTE(review): the call made when IFF_RUNNING is clear (presumably
 * mxge_open()) is elided from this copy of the file.
 */
2660 mxge_init(void *arg)
2662 struct mxge_softc *sc = arg;
2664 ASSERT_SERIALIZED(sc->ifp->if_serializer);
2665 if ((sc->ifp->if_flags & IFF_RUNNING) == 0)
/*
 * Release every mbuf still attached to this slice's rings (big RX,
 * small RX, and TX), unloading each DMA map first.  Used on shutdown
 * and reset.  The TX ring exists only on slice 0, hence the NULL
 * check before walking it.
 */
2670 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
2674 for (i = 0; i <= ss->rx_big.mask; i++) {
2675 if (ss->rx_big.info[i].m == NULL)
2677 bus_dmamap_unload(ss->rx_big.dmat,
2678 ss->rx_big.info[i].map);
2679 m_freem(ss->rx_big.info[i].m);
2680 ss->rx_big.info[i].m = NULL;
2683 for (i = 0; i <= ss->rx_small.mask; i++) {
2684 if (ss->rx_small.info[i].m == NULL)
2686 bus_dmamap_unload(ss->rx_small.dmat,
2687 ss->rx_small.info[i].map);
2688 m_freem(ss->rx_small.info[i].m);
2689 ss->rx_small.info[i].m = NULL;
2692 /* transmit ring used only on the first slice */
2693 if (ss->tx.info == NULL)
2696 for (i = 0; i <= ss->tx.mask; i++) {
2697 ss->tx.info[i].flag = 0;
2698 if (ss->tx.info[i].m == NULL)
2700 bus_dmamap_unload(ss->tx.dmat,
2701 ss->tx.info[i].map);
2702 m_freem(ss->tx.info[i].m);
2703 ss->tx.info[i].m = NULL;
/*
 * mxge_free_mbufs: free the mbufs held by every slice by delegating
 * to mxge_free_slice_mbufs() for each configured slice.
 */
2708 mxge_free_mbufs(mxge_softc_t *sc)
2712 for (slice = 0; slice < sc->num_slices; slice++)
2713 mxge_free_slice_mbufs(&sc->ss[slice]);
/*
 * mxge_free_slice_rings: tear down all per-slice ring resources —
 * the rx_done DMA area, the tx request/segment scratch buffers, the
 * rx shadow rings, and the per-slot busdma maps plus their tags for
 * the tx, rx_small and rx_big rings.  Each pointer is NULLed after
 * release so the function is safe to call on partially-built slices.
 * NOTE(review): some lines are elided in this extract (e.g. the
 * ss->tx.info = NULL assignment after its kfree is not visible).
 */
2717 mxge_free_slice_rings(struct mxge_slice_state *ss)
2721 if (ss->rx_done.entry != NULL) {
2722 mxge_dma_free(&ss->rx_done.dma);
2723 ss->rx_done.entry = NULL;
2726 if (ss->tx.req_bytes != NULL) {
2727 kfree(ss->tx.req_bytes, M_DEVBUF);
2728 ss->tx.req_bytes = NULL;
2731 if (ss->tx.seg_list != NULL) {
2732 kfree(ss->tx.seg_list, M_DEVBUF);
2733 ss->tx.seg_list = NULL;
2736 if (ss->rx_small.shadow != NULL) {
2737 kfree(ss->rx_small.shadow, M_DEVBUF);
2738 ss->rx_small.shadow = NULL;
2741 if (ss->rx_big.shadow != NULL) {
2742 kfree(ss->rx_big.shadow, M_DEVBUF);
2743 ss->rx_big.shadow = NULL;
/* Destroy per-slot tx DMA maps before the tag, then the info array. */
2746 if (ss->tx.info != NULL) {
2747 if (ss->tx.dmat != NULL) {
2748 for (i = 0; i <= ss->tx.mask; i++) {
2749 bus_dmamap_destroy(ss->tx.dmat,
2750 ss->tx.info[i].map);
2752 bus_dma_tag_destroy(ss->tx.dmat);
2754 kfree(ss->tx.info, M_DEVBUF);
/* rx_small: per-slot maps, the extra (spare) map, tag, info array. */
2758 if (ss->rx_small.info != NULL) {
2759 if (ss->rx_small.dmat != NULL) {
2760 for (i = 0; i <= ss->rx_small.mask; i++) {
2761 bus_dmamap_destroy(ss->rx_small.dmat,
2762 ss->rx_small.info[i].map);
2764 bus_dmamap_destroy(ss->rx_small.dmat,
2765 ss->rx_small.extra_map);
2766 bus_dma_tag_destroy(ss->rx_small.dmat);
2768 kfree(ss->rx_small.info, M_DEVBUF);
2769 ss->rx_small.info = NULL;
/* rx_big: same teardown order as rx_small. */
2772 if (ss->rx_big.info != NULL) {
2773 if (ss->rx_big.dmat != NULL) {
2774 for (i = 0; i <= ss->rx_big.mask; i++) {
2775 bus_dmamap_destroy(ss->rx_big.dmat,
2776 ss->rx_big.info[i].map);
2778 bus_dmamap_destroy(ss->rx_big.dmat,
2779 ss->rx_big.extra_map);
2780 bus_dma_tag_destroy(ss->rx_big.dmat);
2782 kfree(ss->rx_big.info, M_DEVBUF);
2783 ss->rx_big.info = NULL;
/*
 * mxge_free_rings: release the ring resources of every slice via
 * mxge_free_slice_rings().
 */
2788 mxge_free_rings(mxge_softc_t *sc)
2795 for (slice = 0; slice < sc->num_slices; slice++)
2796 mxge_free_slice_rings(&sc->ss[slice]);
/*
 * mxge_alloc_slice_rings: allocate all host-side ring state for one
 * slice — shadow and info arrays for the small/big rx rings, busdma
 * tags plus per-slot maps (and a spare "extra" map per rx ring), and,
 * for the tx-capable slice, the tx request copy block, segment list,
 * info array, tag and per-slot maps.  Error paths unwind whatever was
 * created before the failure.
 * NOTE(review): this extract elides lines (error-exit gotos/returns,
 * some bus_dma_tag_create() arguments) — confirm upstream before
 * relying on the exact unwind structure shown.
 */
2800 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
2801 int tx_ring_entries)
2803 mxge_softc_t *sc = ss->sc;
2808 * Allocate per-slice receive resources
/* rx rings are rx_ring_entries deep; rx_done holds 2x entries. */
2811 ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
2812 ss->rx_done.mask = (2 * rx_ring_entries) - 1;
2814 /* Allocate the rx shadow rings */
2815 bytes = rx_ring_entries * sizeof(*ss->rx_small.shadow);
2816 ss->rx_small.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2818 bytes = rx_ring_entries * sizeof(*ss->rx_big.shadow);
2819 ss->rx_big.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2821 /* Allocate the rx host info rings */
2822 bytes = rx_ring_entries * sizeof(*ss->rx_small.info);
2823 ss->rx_small.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2825 bytes = rx_ring_entries * sizeof(*ss->rx_big.info);
2826 ss->rx_big.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2828 /* Allocate the rx busdma resources */
/* Small-rx tag: MHLEN-sized buffers, 4KB boundary. */
2829 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
2831 4096, /* boundary */
2832 BUS_SPACE_MAXADDR, /* low */
2833 BUS_SPACE_MAXADDR, /* high */
2834 NULL, NULL, /* filter */
2835 MHLEN, /* maxsize */
2837 MHLEN, /* maxsegsize */
2838 BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW,
2840 &ss->rx_small.dmat); /* tag */
2842 device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
/* Spare map used when swapping a freshly-loaded buffer into a slot. */
2847 err = bus_dmamap_create(ss->rx_small.dmat, BUS_DMA_WAITOK,
2848 &ss->rx_small.extra_map);
2850 device_printf(sc->dev, "Err %d extra rx_small dmamap\n", err);
2851 bus_dma_tag_destroy(ss->rx_small.dmat);
2852 ss->rx_small.dmat = NULL;
2855 for (i = 0; i <= ss->rx_small.mask; i++) {
2856 err = bus_dmamap_create(ss->rx_small.dmat, BUS_DMA_WAITOK,
2857 &ss->rx_small.info[i].map);
2861 device_printf(sc->dev, "Err %d rx_small dmamap\n", err);
/* Unwind: destroy the maps created so far, the spare, then the tag. */
2863 for (j = 0; j < i; ++j) {
2864 bus_dmamap_destroy(ss->rx_small.dmat,
2865 ss->rx_small.info[j].map);
2867 bus_dmamap_destroy(ss->rx_small.dmat,
2868 ss->rx_small.extra_map);
2869 bus_dma_tag_destroy(ss->rx_small.dmat);
2870 ss->rx_small.dmat = NULL;
/* Big-rx tag: 4KB segments, 4KB boundary. */
2875 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
2877 4096, /* boundary */
2878 BUS_SPACE_MAXADDR, /* low */
2879 BUS_SPACE_MAXADDR, /* high */
2880 NULL, NULL, /* filter */
2883 4096, /* maxsegsize*/
2884 BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW,
2886 &ss->rx_big.dmat); /* tag */
2888 device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
2893 err = bus_dmamap_create(ss->rx_big.dmat, BUS_DMA_WAITOK,
2894 &ss->rx_big.extra_map);
2896 device_printf(sc->dev, "Err %d extra rx_big dmamap\n", err);
2897 bus_dma_tag_destroy(ss->rx_big.dmat);
2898 ss->rx_big.dmat = NULL;
2901 for (i = 0; i <= ss->rx_big.mask; i++) {
2902 err = bus_dmamap_create(ss->rx_big.dmat, BUS_DMA_WAITOK,
2903 &ss->rx_big.info[i].map);
2907 device_printf(sc->dev, "Err %d rx_big dmamap\n", err);
2908 for (j = 0; j < i; ++j) {
2909 bus_dmamap_destroy(ss->rx_big.dmat,
2910 ss->rx_big.info[j].map);
2912 bus_dmamap_destroy(ss->rx_big.dmat,
2913 ss->rx_big.extra_map);
2914 bus_dma_tag_destroy(ss->rx_big.dmat);
2915 ss->rx_big.dmat = NULL;
2921 * Now allocate TX resources
2924 #ifndef IFNET_BUF_RING
2925 /* only use a single TX ring for now */
2926 if (ss != ss->sc->ss)
2930 ss->tx.mask = tx_ring_entries - 1;
2931 ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
2933 /* Allocate the tx request copy block XXX */
2934 bytes = 8 + sizeof(*ss->tx.req_list) * (ss->tx.max_desc + 4);
2935 ss->tx.req_bytes = kmalloc(bytes, M_DEVBUF, M_WAITOK);
2936 /* Ensure req_list entries are aligned to 8 bytes */
2937 ss->tx.req_list = (mcp_kreq_ether_send_t *)
2938 ((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
2940 /* Allocate the tx busdma segment list */
2941 bytes = sizeof(*ss->tx.seg_list) * ss->tx.max_desc;
2942 ss->tx.seg_list = kmalloc(bytes, M_DEVBUF, M_WAITOK);
2944 /* Allocate the tx host info ring */
2945 bytes = tx_ring_entries * sizeof(*ss->tx.info);
2946 ss->tx.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2948 /* Allocate the tx busdma resources */
2949 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
2951 sc->tx_boundary, /* boundary */
2952 BUS_SPACE_MAXADDR, /* low */
2953 BUS_SPACE_MAXADDR, /* high */
2954 NULL, NULL, /* filter */
2956 sizeof(struct ether_vlan_header),
/* Reserve 2 descriptors for the pseudo header / checksum offload. */
2958 ss->tx.max_desc - 2, /* num segs */
2959 sc->tx_boundary, /* maxsegsz */
2960 BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW |
2961 BUS_DMA_ONEBPAGE, /* flags */
2962 &ss->tx.dmat); /* tag */
2964 device_printf(sc->dev, "Err %d allocating tx dmat\n", err);
2969 * Now use these tags to setup DMA maps for each slot in the ring
2971 for (i = 0; i <= ss->tx.mask; i++) {
2972 err = bus_dmamap_create(ss->tx.dmat,
2973 BUS_DMA_WAITOK | BUS_DMA_ONEBPAGE, &ss->tx.info[i].map);
2977 device_printf(sc->dev, "Err %d tx dmamap\n", err);
2978 for (j = 0; j < i; ++j) {
2979 bus_dmamap_destroy(ss->tx.dmat,
2980 ss->tx.info[j].map);
2982 bus_dma_tag_destroy(ss->tx.dmat);
/*
 * mxge_alloc_rings: query the firmware for the send ring size,
 * derive tx/rx entry counts, size the software send queue, and
 * allocate the rings of every slice.
 * NOTE(review): elided lines appear to include the error return
 * after the failed GET_SEND_RING_SIZE command — confirm upstream.
 */
2991 mxge_alloc_rings(mxge_softc_t *sc)
2995 int tx_ring_entries, rx_ring_entries;
2998 /* Get ring sizes */
2999 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3001 device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3004 tx_ring_size = cmd.data0;
/* Entry counts: firmware reports sizes in bytes, convert to slots. */
3006 tx_ring_entries = tx_ring_size / sizeof(mcp_kreq_ether_send_t);
3007 rx_ring_entries = sc->rx_ring_size / sizeof(mcp_dma_addr_t);
3008 ifq_set_maxlen(&sc->ifp->if_snd, tx_ring_entries - 1);
3009 ifq_set_ready(&sc->ifp->if_snd);
3011 for (slice = 0; slice < sc->num_slices; slice++) {
3012 err = mxge_alloc_slice_rings(&sc->ss[slice],
3013 rx_ring_entries, tx_ring_entries);
3015 device_printf(sc->dev,
3016 "alloc %d slice rings failed\n", slice);
/*
 * mxge_choose_params: choose the receive cluster size for a given
 * MTU.  The frame buffer must hold MTU + Ethernet header + VLAN
 * encapsulation + firmware pad; use a standard cluster when that
 * fits, otherwise a jumbo page-sized cluster (asserted to suffice).
 */
3024 mxge_choose_params(int mtu, int *cl_size)
3026 int bufsize = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN + MXGEFW_PAD;
3028 if (bufsize < MCLBYTES) {
3029 *cl_size = MCLBYTES;
3031 KASSERT(bufsize < MJUMPAGESIZE, ("invalid MTU %d", mtu));
3032 *cl_size = MJUMPAGESIZE;
/*
 * mxge_slice_open: bring one slice online.  Queries the firmware for
 * the NIC-resident ("lanai") send/receive ring offsets, wires up the
 * send go/stop doorbells for this slice, then stocks the small and
 * big receive rings with buffers.  Big-ring shadow addresses are
 * poisoned to 0xffffffff before stocking.
 * NOTE(review): elided lines include error returns and the rx_big
 * lanai assignment's left-hand side context — confirm upstream.
 */
3037 mxge_slice_open(struct mxge_slice_state *ss, int cl_size)
/* Slice index = offset of this slice state within the array. */
3042 slice = ss - ss->sc->ss;
3045 * Get the lanai pointers to the send and receive rings
3048 #ifndef IFNET_BUF_RING
3049 /* We currently only send from the first slice */
3053 err = mxge_send_cmd(ss->sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3054 ss->tx.lanai = (volatile mcp_kreq_ether_send_t *)
3055 (ss->sc->sram + cmd.data0);
/* Per-slice doorbells live at fixed 64-byte strides in SRAM. */
3056 ss->tx.send_go = (volatile uint32_t *)
3057 (ss->sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3058 ss->tx.send_stop = (volatile uint32_t *)
3059 (ss->sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3060 #ifndef IFNET_BUF_RING
3065 err |= mxge_send_cmd(ss->sc, MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3066 ss->rx_small.lanai =
3067 (volatile mcp_kreq_ether_recv_t *)(ss->sc->sram + cmd.data0);
3070 err |= mxge_send_cmd(ss->sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3072 (volatile mcp_kreq_ether_recv_t *)(ss->sc->sram + cmd.data0);
3075 if_printf(ss->sc->ifp,
3076 "failed to get ring sizes or locations\n");
3081 * Stock small receive ring
3083 for (i = 0; i <= ss->rx_small.mask; i++) {
3084 err = mxge_get_buf_small(&ss->rx_small,
3085 ss->rx_small.info[i].map, i, TRUE);
3087 if_printf(ss->sc->ifp, "alloced %d/%d smalls\n", i,
3088 ss->rx_small.mask + 1);
3094 * Stock big receive ring
3096 for (i = 0; i <= ss->rx_big.mask; i++) {
/* Poison shadow addresses so unfilled slots are detectable. */
3097 ss->rx_big.shadow[i].addr_low = 0xffffffff;
3098 ss->rx_big.shadow[i].addr_high = 0xffffffff;
3101 ss->rx_big.cl_size = cl_size;
3102 ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3103 EVL_ENCAPLEN + MXGEFW_PAD;
3105 for (i = 0; i <= ss->rx_big.mask; i++) {
3106 err = mxge_get_buf_big(&ss->rx_big,
3107 ss->rx_big.info[i].map, i, TRUE);
3109 if_printf(ss->sc->ifp, "alloced %d/%d bigs\n", i,
3110 ss->rx_big.mask + 1);
/*
 * mxge_open: bring the interface fully up.  Sequence: refresh the MAC
 * address, reset the NIC, program the RSS indirection table when
 * multiple slices are in use, select the TSO mode (disabling TSO
 * capabilities if the firmware refuses NDIS mode), tell the firmware
 * the MTU and small/big buffer sizes, point it at the per-slice stats
 * DMA blocks (falling back to the obsolete stats command, which also
 * disables multicast support, when STATS_DMA_V2 fails), open every
 * slice, and finally issue ETHERNET_UP and mark the ifnet running.
 * Called with the interface serializer held (asserted).
 * NOTE(review): numerous error-path lines (returns/gotos) are elided
 * in this extract; the trailing mxge_free_mbufs() is presumably an
 * abort path — confirm upstream.
 */
3118 mxge_open(mxge_softc_t *sc)
3120 struct ifnet *ifp = sc->ifp;
3122 int err, slice, cl_size, i;
3124 volatile uint8_t *itable;
3125 struct mxge_slice_state *ss;
3127 ASSERT_SERIALIZED(ifp->if_serializer);
3129 /* Copy the MAC address in case it was overridden */
3130 bcopy(IF_LLADDR(ifp), sc->mac_addr, ETHER_ADDR_LEN);
3132 err = mxge_reset(sc, 1);
3134 if_printf(ifp, "failed to reset\n");
3138 if (sc->num_slices > 1) {
3139 /* Setup the indirection table */
3140 cmd.data0 = sc->num_slices;
3141 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE, &cmd);
3143 err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET, &cmd);
3145 if_printf(ifp, "failed to setup rss tables\n");
3149 /* Just enable an identity mapping */
3150 itable = sc->sram + cmd.data0;
3151 for (i = 0; i < sc->num_slices; i++)
3152 itable[i] = (uint8_t)i;
/* Hash incoming TCP/IPv4 to pick the receive slice. */
3155 cmd.data1 = MXGEFW_RSS_HASH_TYPE_TCP_IPV4;
3156 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3158 if_printf(ifp, "failed to enable slices\n");
3163 cmd.data0 = MXGEFW_TSO_MODE_NDIS;
3164 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_TSO_MODE, &cmd);
3167 * Can't change TSO mode to NDIS, never allow TSO then
3169 if_printf(ifp, "failed to set TSO mode\n");
3170 ifp->if_capenable &= ~IFCAP_TSO;
3171 ifp->if_capabilities &= ~IFCAP_TSO;
3172 ifp->if_hwassist &= ~CSUM_TSO;
3175 mxge_choose_params(ifp->if_mtu, &cl_size);
3178 err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS, &cmd);
3180 * Error is only meaningful if we're trying to set
3181 * MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1
3185 * Give the firmware the mtu and the big and small buffer
3186 * sizes. The firmware wants the big buf size to be a power
3187 * of two. Luckily, FreeBSD's clusters are powers of two
3189 cmd.data0 = ifp->if_mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
3190 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3192 /* XXX need to cut MXGEFW_PAD here? */
3193 cmd.data0 = MHLEN - MXGEFW_PAD;
3194 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE, &cmd);
3196 cmd.data0 = cl_size;
3197 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3200 if_printf(ifp, "failed to setup params\n");
3204 /* Now give him the pointer to the stats block */
3206 #ifdef IFNET_BUF_RING
3207 slice < sc->num_slices;
3212 ss = &sc->ss[slice];
3213 cmd.data0 = MXGE_LOWPART_TO_U32(ss->fw_stats_dma.dmem_busaddr);
3214 cmd.data1 = MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.dmem_busaddr);
3215 cmd.data2 = sizeof(struct mcp_irq_data);
/* Encode the slice number in the upper 16 bits of data2. */
3216 cmd.data2 |= (slice << 16);
3217 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
/* Fallback: old firmware only DMAs the send_done_count field. */
3221 bus = sc->ss->fw_stats_dma.dmem_busaddr;
3222 bus += offsetof(struct mcp_irq_data, send_done_count);
3223 cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3224 cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3225 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3228 /* Firmware cannot support multicast without STATS_DMA_V2 */
3229 sc->fw_multicast_support = 0;
3231 sc->fw_multicast_support = 1;
3235 if_printf(ifp, "failed to setup params\n");
3239 for (slice = 0; slice < sc->num_slices; slice++) {
3240 err = mxge_slice_open(&sc->ss[slice], cl_size);
3242 if_printf(ifp, "couldn't open slice %d\n", slice);
3247 /* Finally, start the firmware running */
3248 err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3250 if_printf(ifp, "Couldn't bring up link\n");
3253 ifp->if_flags |= IFF_RUNNING;
3254 ifq_clr_oactive(&ifp->if_snd);
/* NOTE(review): presumably an error-unwind label precedes this. */
3259 mxge_free_mbufs(sc);
/*
 * mxge_close: bring the interface down.  Clears RUNNING/oactive,
 * sends ETHERNET_DOWN to the firmware, then waits (serializer
 * dropped around the delay) for the "down" interrupt to bump
 * sc->down_cnt before freeing all ring mbufs.  Called with the
 * interface serializer held (asserted).
 * NOTE(review): lines are elided (e.g. use of the 'down' argument
 * and the wait loop structure) — confirm upstream.
 */
3264 mxge_close(mxge_softc_t *sc, int down)
3266 struct ifnet *ifp = sc->ifp;
3268 int err, old_down_cnt;
3270 ASSERT_SERIALIZED(ifp->if_serializer);
3272 ifp->if_flags &= ~IFF_RUNNING;
3273 ifq_clr_oactive(&ifp->if_snd);
3276 old_down_cnt = sc->down_cnt;
3279 err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3281 if_printf(ifp, "Couldn't bring down link\n");
3283 if (old_down_cnt == sc->down_cnt) {
3284 /* Wait for down irq */
/* Drop the serializer so the interrupt handler can run. */
3285 lwkt_serialize_exit(ifp->if_serializer);
3286 DELAY(10 * sc->intr_coal_delay);
3287 lwkt_serialize_enter(ifp->if_serializer);
3291 if (old_down_cnt == sc->down_cnt)
3292 if_printf(ifp, "never got down irq\n");
3294 mxge_free_mbufs(sc);
/*
 * mxge_setup_cfg_space: locate the PCI Express capability, record the
 * negotiated link width, and set the device-control max read request
 * size to 4KB (value 5 in bits 14:12); on a post-watchdog re-run the
 * previously saved pectl value is restored instead.  Finally enables
 * bus mastering.
 * NOTE(review): the '®' below is mojibake for '&reg' (address of the
 * capability-offset variable) introduced by this extraction — the
 * code token must be restored to '&reg' from the upstream source.
 */
3298 mxge_setup_cfg_space(mxge_softc_t *sc)
3300 device_t dev = sc->dev;
3302 uint16_t lnk, pectl;
3304 /* Find the PCIe link width and set max read request to 4KB */
3305 if (pci_find_extcap(dev, PCIY_EXPRESS, ®) == 0) {
/* Link status register is at capability offset 0x12. */
3306 lnk = pci_read_config(dev, reg + 0x12, 2);
3307 sc->link_width = (lnk >> 4) & 0x3f;
3309 if (sc->pectl == 0) {
3310 pectl = pci_read_config(dev, reg + 0x8, 2);
3311 pectl = (pectl & ~0x7000) | (5 << 12);
3312 pci_write_config(dev, reg + 0x8, pectl, 2);
3315 /* Restore saved pectl after watchdog reset */
3316 pci_write_config(dev, reg + 0x8, sc->pectl, 2);
3320 /* Enable DMA and memory space access */
3321 pci_enable_busmaster(dev);
/*
 * mxge_read_reboot: fetch the NIC's reboot-status word through the
 * vendor-specific PCI capability window: enable 32-bit read mode,
 * point the window at register 0xfffffff0, and read the data port.
 * Returns (uint32_t)-1 if the vendor capability cannot be found.
 */
3325 mxge_read_reboot(mxge_softc_t *sc)
3327 device_t dev = sc->dev;
3330 /* find the vendor specific offset */
3331 if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
3332 device_printf(sc->dev,
3333 "could not find vendor specific offset\n");
3334 return (uint32_t)-1;
3336 /* enable read32 mode */
3337 pci_write_config(dev, vs + 0x10, 0x3, 1);
3338 /* tell NIC which register to read */
3339 pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3340 return (pci_read_config(dev, vs + 0x14, 4));
/*
 * mxge_watchdog_reset: recover from a firmware hang.  Reads PCI
 * COMMAND to distinguish three cases: device gone (0xffff even after
 * a grace wait), device rebooted (bus-master bit cleared — restore
 * config space, reload firmware, reopen the interface), or device
 * still present (log and skip the reset).  Reschedules the tick
 * callout at the end.
 * NOTE(review): several lines (close/quiesce calls, gotos, return)
 * are elided in this extract — confirm the recovery sequence
 * upstream.
 */
3344 mxge_watchdog_reset(mxge_softc_t *sc)
3346 struct pci_devinfo *dinfo;
3353 device_printf(sc->dev, "Watchdog reset!\n");
3356 * check to see if the NIC rebooted. If it did, then all of
3357 * PCI config space has been reset, and things like the
3358 * busmaster bit will be zero. If this is the case, then we
3359 * must restore PCI config space before the NIC can be used
3362 cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3363 if (cmd == 0xffff) {
3365 * maybe the watchdog caught the NIC rebooting; wait
3366 * up to 100ms for it to finish. If it does not come
3367 * back, then give up
3370 cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3371 if (cmd == 0xffff) {
3372 device_printf(sc->dev, "NIC disappeared!\n");
3375 if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3376 /* print the reboot status */
3377 reboot = mxge_read_reboot(sc);
3378 device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3380 running = sc->ifp->if_flags & IFF_RUNNING;
3384 * quiesce NIC so that TX routines will not try to
3385 * xmit after restoration of BAR
3388 /* Mark the link as down */
3389 if (sc->link_state) {
3390 sc->ifp->if_link_state = LINK_STATE_DOWN;
3391 if_link_state_change(sc->ifp);
3395 /* restore PCI configuration space */
3396 dinfo = device_get_ivars(sc->dev);
3397 pci_cfg_restore(sc->dev, dinfo);
3399 /* and redo any changes we made to our config space */
3400 mxge_setup_cfg_space(sc);
3403 err = mxge_load_firmware(sc, 0);
3405 device_printf(sc->dev,
3406 "Unable to re-load f/w\n");
3410 err = mxge_open(sc);
/* Kick the transmit path again once the interface is reopened. */
3411 if_devstart_sched(sc->ifp);
3414 sc->watchdog_resets++;
3416 device_printf(sc->dev,
3417 "NIC did not reboot, not resetting\n");
3421 device_printf(sc->dev, "watchdog reset failed\n");
3425 callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
/*
 * mxge_warn_stuck: dump the transmit ring state of a slice suspected
 * of being stuck — request/done counters, queue_active, activate/
 * deactivate counters, and host vs firmware completion counts.
 * Note the tx argument is immediately re-pointed at the slice's ring.
 */
3430 mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
3432 tx = &sc->ss[slice].tx;
3433 device_printf(sc->dev, "slice %d struck? ring state:\n", slice);
3434 device_printf(sc->dev,
3435 "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3436 tx->req, tx->done, tx->queue_active);
3437 device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3438 tx->activate, tx->deactivate);
3439 device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3441 be32toh(sc->ss->fw_stats->send_done_count));
/*
 * mxge_watchdog: per-tick transmit stall detector.  A slice is
 * considered stuck when it has outstanding transmits whose done
 * counter has not advanced since the previous tick; if the firmware's
 * dropped-pause counter also did not move, the NIC is reset,
 * otherwise the stall is attributed to flow control from the link
 * partner.  Snapshots of req/done/rx_pause are kept for the next
 * tick, and a deferred media probe is run if requested.
 * NOTE(review): loop header lines and the tx pointer assignment are
 * elided in this extract — confirm upstream.
 */
3445 mxge_watchdog(mxge_softc_t *sc)
3448 uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3451 /* see if we have outstanding transmits, which
3452 have been pending for more than mxge_ticks */
3454 #ifdef IFNET_BUF_RING
3455 (i < sc->num_slices) && (err == 0);
3457 (i < 1) && (err == 0);
3461 if (tx->req != tx->done &&
3462 tx->watchdog_req != tx->watchdog_done &&
3463 tx->done == tx->watchdog_done) {
3464 /* check for pause blocking before resetting */
3465 if (tx->watchdog_rx_pause == rx_pause) {
3466 mxge_warn_stuck(sc, tx, i);
3467 mxge_watchdog_reset(sc);
3471 device_printf(sc->dev, "Flow control blocking "
3472 "xmits, check link partner\n");
/* Remember this tick's counters to compare against next tick. */
3475 tx->watchdog_req = tx->req;
3476 tx->watchdog_done = tx->done;
3477 tx->watchdog_rx_pause = rx_pause;
3480 if (sc->need_media_probe)
3481 mxge_media_probe(sc);
/*
 * mxge_update_stats: aggregate per-slice packet counters into the
 * ifnet statistics.  Computes the packet delta since the last update
 * (apparently the function's return value — the return line is
 * elided in this extract) and, under IFNET_BUF_RING, also folds in
 * byte/multicast/drop counters.
 */
3486 mxge_update_stats(mxge_softc_t *sc)
3488 struct mxge_slice_state *ss;
3490 u_long ipackets = 0, old_ipackets;
3491 u_long opackets = 0, old_opackets;
3492 #ifdef IFNET_BUF_RING
3500 for (slice = 0; slice < sc->num_slices; slice++) {
3501 ss = &sc->ss[slice];
3502 ipackets += ss->ipackets;
3503 opackets += ss->opackets;
3504 #ifdef IFNET_BUF_RING
3505 obytes += ss->obytes;
3506 omcasts += ss->omcasts;
3507 odrops += ss->tx.br->br_drops;
3509 oerrors += ss->oerrors;
3511 IFNET_STAT_GET(sc->ifp, ipackets, old_ipackets);
3512 IFNET_STAT_GET(sc->ifp, opackets, old_opackets);
/* pkts = packets (rx+tx) seen since the previous update. */
3514 pkts = ipackets - old_ipackets;
3515 pkts += opackets - old_opackets;
3517 IFNET_STAT_SET(sc->ifp, ipackets, ipackets);
3518 IFNET_STAT_SET(sc->ifp, opackets, opackets);
3519 #ifdef IFNET_BUF_RING
3520 sc->ifp->if_obytes = obytes;
3521 sc->ifp->if_omcasts = omcasts;
3522 sc->ifp->if_snd.ifq_drops = odrops;
3524 IFNET_STAT_SET(sc->ifp, oerrors, oerrors);
/*
 * mxge_tick: periodic callout.  Under the interface serializer it
 * aggregates statistics, runs the watchdog every 5th invocation
 * (countdown from 4), verifies the NIC has not lost its bus-master
 * bit while idle (resetting if so), and reschedules itself — with a
 * longer period when the NIC is idle, per the comment below.
 * NOTE(review): several lines (variable declarations, the idle-period
 * computation) are elided in this extract.
 */
3529 mxge_tick(void *arg)
3531 mxge_softc_t *sc = arg;
3537 lwkt_serialize_enter(sc->ifp->if_serializer);
3540 running = sc->ifp->if_flags & IFF_RUNNING;
3542 /* aggregate stats from different slices */
3543 pkts = mxge_update_stats(sc);
3544 if (!sc->watchdog_countdown) {
3545 err = mxge_watchdog(sc);
3546 sc->watchdog_countdown = 4;
3548 sc->watchdog_countdown--;
3551 /* ensure NIC did not suffer h/w fault while idle */
3552 cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3553 if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3555 mxge_watchdog_reset(sc);
3558 /* look less often if NIC is idle */
3563 callout_reset(&sc->co_hdl, ticks, mxge_tick, sc);
3565 lwkt_serialize_exit(sc->ifp->if_serializer);
/*
 * mxge_media_change: ifmedia change callback.
 * NOTE(review): the body is entirely elided in this extract —
 * consult the upstream source for its implementation.
 */
3569 mxge_media_change(struct ifnet *ifp)
/*
 * mxge_change_mtu: validate and apply a new MTU.  Rejects MTUs above
 * the NIC limit or whose on-wire frame would be under 60 bytes; when
 * the interface is running it is reopened, and on reopen failure the
 * old MTU is restored.
 * NOTE(review): lines are elided (EINVAL return, if_mtu assignment,
 * the close call) — confirm upstream.
 */
3575 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3577 struct ifnet *ifp = sc->ifp;
3578 int real_mtu, old_mtu;
3581 real_mtu = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
3582 if (mtu > sc->max_mtu || real_mtu < 60)
3585 old_mtu = ifp->if_mtu;
3587 if (ifp->if_flags & IFF_RUNNING) {
3589 err = mxge_open(sc);
3591 ifp->if_mtu = old_mtu;
/*
 * mxge_media_status: ifmedia status callback — report the link as
 * valid full-duplex Ethernet, ACTIVE when sc->link_state is set, and
 * include the currently detected media subtype.
 */
3600 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3602 mxge_softc_t *sc = ifp->if_softc;
3607 ifmr->ifm_status = IFM_AVALID;
3608 ifmr->ifm_active = IFM_ETHER | IFM_FDX;
3609 ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
3610 ifmr->ifm_active |= sc->current_media;
/*
 * mxge_ioctl: interface ioctl handler (serializer held, asserted).
 * Handles MTU changes, IFF_UP/IFF_RUNNING transitions (open/close,
 * promiscuous and multicast updates), multicast list changes,
 * capability toggles (TXCSUM/TSO with matching if_hwassist updates,
 * RXCSUM, VLAN_HWTAGGING), media ioctls via ifmedia, and defers
 * everything else to ether_ioctl().
 * NOTE(review): the switch(command) line and several case labels,
 * breaks and close calls are elided in this extract.
 */
3614 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data,
3615 struct ucred *cr __unused)
3617 mxge_softc_t *sc = ifp->if_softc;
3618 struct ifreq *ifr = (struct ifreq *)data;
3622 ASSERT_SERIALIZED(ifp->if_serializer);
3625 err = mxge_change_mtu(sc, ifr->ifr_mtu);
3632 if (ifp->if_flags & IFF_UP) {
3633 if (!(ifp->if_flags & IFF_RUNNING)) {
3634 err = mxge_open(sc);
3636 /* take care of promis can allmulti
3638 mxge_change_promisc(sc,
3639 ifp->if_flags & IFF_PROMISC);
3640 mxge_set_multicast_list(sc);
3643 if (ifp->if_flags & IFF_RUNNING) {
3651 mxge_set_multicast_list(sc);
/* Capability toggles: keep if_hwassist consistent with capenable. */
3655 mask = ifr->ifr_reqcap ^ ifp->if_capenable;
3656 if (mask & IFCAP_TXCSUM) {
3657 ifp->if_capenable ^= IFCAP_TXCSUM;
3658 if (ifp->if_capenable & IFCAP_TXCSUM)
3659 ifp->if_hwassist |= CSUM_TCP | CSUM_UDP;
3661 ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP);
3663 if (mask & IFCAP_TSO) {
3664 ifp->if_capenable ^= IFCAP_TSO;
3665 if (ifp->if_capenable & IFCAP_TSO)
3666 ifp->if_hwassist |= CSUM_TSO;
3668 ifp->if_hwassist &= ~CSUM_TSO;
3670 if (mask & IFCAP_RXCSUM)
3671 ifp->if_capenable ^= IFCAP_RXCSUM;
3672 if (mask & IFCAP_VLAN_HWTAGGING)
3673 ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
/* Media ioctls: refresh the probe, then let ifmedia handle it. */
3677 mxge_media_probe(sc);
3678 err = ifmedia_ioctl(ifp, (struct ifreq *)data,
3679 &sc->media, command);
3683 err = ether_ioctl(ifp, command, data);
/*
 * mxge_fetch_tunables: copy global tunables into the softc, clamping
 * them to sane ranges: interrupt coalescing delay to [0, 10000] (else
 * the default), tick period defaulting to hz/2, and the transmit
 * throttle to [MXGE_MIN_THROTTLE, MXGE_MAX_THROTTLE] when non-zero.
 */
3690 mxge_fetch_tunables(mxge_softc_t *sc)
3692 sc->intr_coal_delay = mxge_intr_coal_delay;
3693 if (sc->intr_coal_delay < 0 || sc->intr_coal_delay > (10 * 1000))
3694 sc->intr_coal_delay = MXGE_INTR_COAL_DELAY;
3697 if (mxge_ticks == 0)
3698 mxge_ticks = hz / 2;
3700 sc->pause = mxge_flow_control;
3702 sc->throttle = mxge_throttle;
3703 if (sc->throttle && sc->throttle > MXGE_MAX_THROTTLE)
3704 sc->throttle = MXGE_MAX_THROTTLE;
3705 if (sc->throttle && sc->throttle < MXGE_MIN_THROTTLE)
3706 sc->throttle = MXGE_MIN_THROTTLE;
/*
 * mxge_free_slices: free the per-slice firmware-stats and rx_done
 * DMA areas for every slice, then free the slice array itself.
 * NOTE(review): the ss assignment inside the loop and the guard for
 * a NULL sc->ss are elided in this extract.
 */
3710 mxge_free_slices(mxge_softc_t *sc)
3712 struct mxge_slice_state *ss;
3718 for (i = 0; i < sc->num_slices; i++) {
3720 if (ss->fw_stats != NULL) {
3721 mxge_dma_free(&ss->fw_stats_dma);
3722 ss->fw_stats = NULL;
3724 if (ss->rx_done.entry != NULL) {
3725 mxge_dma_free(&ss->rx_done.dma);
3726 ss->rx_done.entry = NULL;
3729 kfree(sc->ss, M_DEVBUF);
/*
 * mxge_alloc_slices: query the firmware rx ring size, allocate the
 * slice-state array, then for each slice allocate the rx interrupt
 * completion queue (2 slots per rx descriptor, 4KB aligned) and the
 * 64-byte-aligned firmware stats block.
 * NOTE(review): error unwinding (goto targets / mxge_free_slices
 * call) and the per-iteration ss assignment are elided here.
 */
3734 mxge_alloc_slices(mxge_softc_t *sc)
3737 struct mxge_slice_state *ss;
3739 int err, i, max_intr_slots;
3741 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
3743 device_printf(sc->dev, "Cannot determine rx ring size\n");
3746 sc->rx_ring_size = cmd.data0;
/* Interrupt queue has room for 2 completions per rx descriptor. */
3747 max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
3749 bytes = sizeof(*sc->ss) * sc->num_slices;
3750 sc->ss = kmalloc(bytes, M_DEVBUF, M_WAITOK | M_ZERO);
3752 for (i = 0; i < sc->num_slices; i++) {
3758 * Allocate per-slice rx interrupt queues
3760 bytes = max_intr_slots * sizeof(*ss->rx_done.entry);
3761 err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
3763 device_printf(sc->dev,
3764 "alloc %d slice rx_done failed\n", i);
3767 ss->rx_done.entry = ss->rx_done.dma.dmem_addr;
3770 * Allocate the per-slice firmware stats; stats
3771 * (including tx) are used used only on the first
3774 #ifndef IFNET_BUF_RING
3779 bytes = sizeof(*ss->fw_stats);
3780 err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
3781 sizeof(*ss->fw_stats), 64);
3783 device_printf(sc->dev,
3784 "alloc %d fw_stats failed\n", i);
3787 ss->fw_stats = ss->fw_stats_dma.dmem_addr;
/*
 * mxge_slice_probe: decide how many slices (RSS queues) to use.
 * Bails out to a single slice when multi-slice is disabled, the
 * system is uniprocessor, or too few MSI-X vectors exist.  Otherwise
 * loads the RSS-capable firmware variant matching the current
 * aligned/unaligned choice, resets the NIC, sizes the interrupt
 * queue, asks the firmware for its maximum RSS queue count, and
 * clamps the result by MSI-X vectors, CPU count / mxge_max_slices,
 * and down to a power of two.  On any failure it restores the
 * original firmware.
 * NOTE(review): early returns, the msix_cnt < 2 check's consequence,
 * and the power-of-two reduction statement are elided here.
 */
3793 mxge_slice_probe(mxge_softc_t *sc)
3797 int msix_cnt, status, max_intr_slots;
3804 * Don't enable multiple slices if they are not enabled,
3805 * or if this is not an SMP system
3807 if (mxge_max_slices == 0 || mxge_max_slices == 1 || ncpus < 2)
3810 /* see how many MSI-X interrupts are available */
3811 msix_cnt = pci_msix_count(sc->dev);
3815 /* now load the slice aware firmware see what it supports */
3816 old_fw = sc->fw_name;
3817 if (old_fw == mxge_fw_aligned)
3818 sc->fw_name = mxge_fw_rss_aligned;
3820 sc->fw_name = mxge_fw_rss_unaligned;
3821 status = mxge_load_firmware(sc, 0);
3823 device_printf(sc->dev, "Falling back to a single slice\n");
3827 /* try to send a reset command to the card to see if it
3829 memset(&cmd, 0, sizeof (cmd));
3830 status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
3832 device_printf(sc->dev, "failed reset\n");
3836 /* get rx ring size */
3837 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
3839 device_printf(sc->dev, "Cannot determine rx ring size\n");
3842 max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
3844 /* tell it the size of the interrupt queues */
3845 cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
3846 status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
3848 device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
3852 /* ask the maximum number of slices it supports */
3853 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
3855 device_printf(sc->dev,
3856 "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
3859 sc->num_slices = cmd.data0;
3860 if (sc->num_slices > msix_cnt)
3861 sc->num_slices = msix_cnt;
3863 if (mxge_max_slices == -1) {
3864 /* cap to number of CPUs in system */
3865 if (sc->num_slices > ncpus)
3866 sc->num_slices = ncpus;
3868 if (sc->num_slices > mxge_max_slices)
3869 sc->num_slices = mxge_max_slices;
3871 /* make sure it is a power of two */
3872 while (sc->num_slices & (sc->num_slices - 1))
3876 device_printf(sc->dev, "using %d slices\n",
/* Failure path: fall back to the original single-slice firmware. */
3882 sc->fw_name = old_fw;
3883 (void) mxge_load_firmware(sc, 0);
/*
 * mxge_add_msix_irqs: set up one MSI-X vector per slice.  Allocates
 * the MSI-X table BAR resource, num_slices message vectors, a SYS_RES_IRQ
 * resource per vector, and installs mxge_intr for each slice under
 * the interface serializer.  Error paths unwind in reverse order via
 * the labelled blocks at the bottom (teardown intrs, release IRQ
 * resources, release MSI, release the table BAR).
 * NOTE(review): rid computations, several goto/return lines and the
 * success-path verbose printout's guard are elided in this extract.
 */
3888 mxge_add_msix_irqs(mxge_softc_t *sc)
3891 int count, err, i, rid;
3894 sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
3897 if (sc->msix_table_res == NULL) {
3898 device_printf(sc->dev, "couldn't alloc MSIX table res\n");
3902 count = sc->num_slices;
3903 err = pci_alloc_msix(sc->dev, &count);
3905 device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d"
3906 "err = %d \n", sc->num_slices, err);
3907 goto abort_with_msix_table;
3909 if (count < sc->num_slices) {
3910 device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
3911 count, sc->num_slices);
3912 device_printf(sc->dev,
3913 "Try setting hw.mxge.max_slices to %d\n",
3916 goto abort_with_msix;
3918 bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
3919 sc->msix_irq_res = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
3920 if (sc->msix_irq_res == NULL) {
3922 goto abort_with_msix;
3925 for (i = 0; i < sc->num_slices; i++) {
3927 sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
3930 if (sc->msix_irq_res[i] == NULL) {
3931 device_printf(sc->dev, "couldn't allocate IRQ res"
3932 " for message %d\n", i);
3934 goto abort_with_res;
3938 bytes = sizeof (*sc->msix_ih) * sc->num_slices;
3939 sc->msix_ih = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
3941 for (i = 0; i < sc->num_slices; i++) {
/* Each vector's handler gets its own slice state as the argument. */
3942 err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
3944 mxge_intr, &sc->ss[i], &sc->msix_ih[i],
3945 sc->ifp->if_serializer);
3947 device_printf(sc->dev, "couldn't setup intr for "
3949 goto abort_with_intr;
3954 device_printf(sc->dev, "using %d msix IRQs:",
3956 for (i = 0; i < sc->num_slices; i++)
3957 kprintf(" %ld", rman_get_start(sc->msix_irq_res[i]));
/* --- error unwind: teardown any handlers already installed --- */
3963 for (i = 0; i < sc->num_slices; i++) {
3964 if (sc->msix_ih[i] != NULL) {
3965 bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
3967 sc->msix_ih[i] = NULL;
3970 kfree(sc->msix_ih, M_DEVBUF);
/* --- release any IRQ resources already allocated --- */
3974 for (i = 0; i < sc->num_slices; i++) {
3976 if (sc->msix_irq_res[i] != NULL)
3977 bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
3978 sc->msix_irq_res[i]);
3979 sc->msix_irq_res[i] = NULL;
3981 kfree(sc->msix_irq_res, M_DEVBUF);
3985 pci_release_msi(sc->dev);
3987 abort_with_msix_table:
3988 bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
3989 sc->msix_table_res);
/*
 * mxge_add_single_irq: allocate a single interrupt (MSI when enabled
 * and available, else legacy — recorded in sc->irq_type), allocate
 * the IRQ resource, and install mxge_intr for slice 0 under the
 * interface serializer.  Returns the bus_setup_intr() status.
 */
3996 mxge_add_single_irq(mxge_softc_t *sc)
4000 sc->irq_type = pci_alloc_1intr(sc->dev, mxge_msi_enable,
4001 &sc->irq_rid, &irq_flags);
4003 sc->irq_res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ,
4004 &sc->irq_rid, irq_flags);
4005 if (sc->irq_res == NULL) {
4006 device_printf(sc->dev, "could not alloc interrupt\n");
4010 return bus_setup_intr(sc->dev, sc->irq_res, INTR_MPSAFE,
4011 mxge_intr, &sc->ss[0], &sc->ih, sc->ifp->if_serializer);
/*
 * mxge_rem_msix_irqs: full teardown of the MSI-X setup — per-slice
 * interrupt handlers, per-slice IRQ resources, the MSI allocation,
 * and the MSI-X table BAR resource (mirrors mxge_add_msix_irqs's
 * error-unwind sequence).
 * NOTE(review): the per-iteration rid computation is elided here.
 */
4016 mxge_rem_msix_irqs(mxge_softc_t *sc)
4020 for (i = 0; i < sc->num_slices; i++) {
4021 if (sc->msix_ih[i] != NULL) {
4022 bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4024 sc->msix_ih[i] = NULL;
4027 kfree(sc->msix_ih, M_DEVBUF);
4029 for (i = 0; i < sc->num_slices; i++) {
4031 if (sc->msix_irq_res[i] != NULL)
4032 bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4033 sc->msix_irq_res[i]);
4034 sc->msix_irq_res[i] = NULL;
4036 kfree(sc->msix_irq_res, M_DEVBUF);
4038 bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4039 sc->msix_table_res);
4041 pci_release_msi(sc->dev);
/*
 * mxge_add_irq: choose between MSI-X (multi-slice) and a single
 * interrupt.  The `0 &&` retry branch is intentionally dead code —
 * it would re-try MSI-X and then fall back to a single IRQ.
 */
4047 mxge_add_irq(mxge_softc_t *sc)
4052 if (sc->num_slices > 1)
4053 err = mxge_add_msix_irqs(sc);
4055 err = mxge_add_single_irq(sc);
/* Disabled retry path (note the constant 0 in the condition). */
4057 if (0 && err == 0 && sc->num_slices > 1) {
4058 mxge_rem_msix_irqs(sc);
4059 err = mxge_add_msix_irqs(sc);
4063 return mxge_add_single_irq(sc);
/*
 * mxge_attach: device attach.  Ordering: init ifnet naming/ifmedia,
 * fetch tunables, create the parent busdma tag, init the tick
 * callout, program PCI config space, map the SRAM BAR (with a sanity
 * check against the resource size), copy out and parse the EEPROM
 * strings, allocate command/zero-pad/dma-bench DMA buffers, select
 * and load firmware, probe and allocate slices, reset, allocate
 * rings, fill in ifnet capabilities and methods, attach the Ethernet
 * layer, set max_mtu, hook up the interrupt (after ether_ifattach),
 * add sysctls and start the tick callout.
 * NOTE(review): error-unwind paths (goto labels / detach-on-failure)
 * and several statements are elided in this extract — confirm the
 * failure handling upstream.
 */
4068 mxge_attach(device_t dev)
4070 mxge_softc_t *sc = device_get_softc(dev);
4071 struct ifnet *ifp = &sc->arpcom.ac_if;
4075 * Avoid rewriting half the lines in this file to use
4076 * &sc->arpcom.ac_if instead
4080 if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4081 ifmedia_init(&sc->media, 0, mxge_media_change, mxge_media_status);
4083 mxge_fetch_tunables(sc);
4085 err = bus_dma_tag_create(NULL, /* parent */
4088 BUS_SPACE_MAXADDR, /* low */
4089 BUS_SPACE_MAXADDR, /* high */
4090 NULL, NULL, /* filter */
4091 BUS_SPACE_MAXSIZE_32BIT,/* maxsize */
4093 BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */
4095 &sc->parent_dmat); /* tag */
4097 device_printf(dev, "Err %d allocating parent dmat\n", err);
4101 callout_init_mp(&sc->co_hdl);
4103 mxge_setup_cfg_space(sc);
4106 * Map the board into the kernel
4109 sc->mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
4111 if (sc->mem_res == NULL) {
4112 device_printf(dev, "could not map memory\n");
4117 sc->sram = rman_get_virtual(sc->mem_res);
/* 2MB SRAM minus firmware-reserved regions minus a 0x100 guard. */
4118 sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4119 if (sc->sram_size > rman_get_size(sc->mem_res)) {
4120 device_printf(dev, "impossible memory region size %ld\n",
4121 rman_get_size(sc->mem_res));
4127 * Make NULL terminated copy of the EEPROM strings section of
4130 bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4131 bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4132 rman_get_bushandle(sc->mem_res),
4133 sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4134 sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE - 2);
4135 err = mxge_parse_strings(sc);
4137 device_printf(dev, "parse EEPROM string failed\n");
4142 * Enable write combining for efficient use of PCIe bus
4147 * Allocate the out of band DMA memory
4149 err = mxge_dma_alloc(sc, &sc->cmd_dma, sizeof(mxge_cmd_t), 64);
4151 device_printf(dev, "alloc cmd DMA buf failed\n");
4154 sc->cmd = sc->cmd_dma.dmem_addr;
4156 err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4158 device_printf(dev, "alloc zeropad DMA buf failed\n");
4162 err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4164 device_printf(dev, "alloc dmabench DMA buf failed\n");
4168 /* Select & load the firmware */
4169 err = mxge_select_firmware(sc);
4171 device_printf(dev, "select firmware failed\n");
4175 mxge_slice_probe(sc);
4176 err = mxge_alloc_slices(sc);
4178 device_printf(dev, "alloc slices failed\n");
4182 err = mxge_reset(sc, 0);
4184 device_printf(dev, "reset failed\n");
4188 err = mxge_alloc_rings(sc);
4190 device_printf(dev, "failed to allocate rings\n");
4194 ifp->if_baudrate = IF_Gbps(10UL);
4195 ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO;
4196 ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4198 ifp->if_capabilities |= IFCAP_VLAN_MTU;
4200 /* Well, its software, sigh */
4201 ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING;
4203 ifp->if_capenable = ifp->if_capabilities;
4206 ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4207 ifp->if_init = mxge_init;
4208 ifp->if_ioctl = mxge_ioctl;
4209 ifp->if_start = mxge_start;
4212 /* Increase TSO burst length */
4213 ifp->if_tsolen = (32 * ETHERMTU);
4215 /* Initialise the ifmedia structure */
4216 mxge_media_init(sc);
4217 mxge_media_probe(sc);
4219 ether_ifattach(ifp, sc->mac_addr, NULL);
4223 * We are not ready to do "gather" jumbo frame, so
4224 * limit MTU to MJUMPAGESIZE
4226 sc->max_mtu = MJUMPAGESIZE -
4227 ETHER_HDR_LEN - EVL_ENCAPLEN - MXGEFW_PAD - 1;
4230 /* must come after ether_ifattach() */
4231 err = mxge_add_irq(sc);
4233 device_printf(dev, "alloc and setup intr failed\n");
4234 ether_ifdetach(ifp);
/* Bind the software send queue to the interrupt's CPU. */
4237 ifq_set_cpuid(&ifp->if_snd, rman_get_cpuid(sc->irq_res));
4239 mxge_add_sysctls(sc);
4241 callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
/*
 * mxge_detach -- device detach method: tear down everything
 * mxge_attach() set up, roughly in reverse order of acquisition.
 *
 * NOTE(review): this is an elided extract — several guard/brace lines
 * between the visible statements are not shown here, so the comments
 * below describe only what the visible code demonstrates.
 */
4250 mxge_detach(device_t dev)
4252 	mxge_softc_t *sc = device_get_softc(dev);
	/* Interface/interrupt teardown only applies if attach completed. */
4254 	if (device_is_attached(dev)) {
4255 		struct ifnet *ifp = sc->ifp;
		/* Hold the per-ifnet serializer while quiescing the NIC. */
4257 		lwkt_serialize_enter(ifp->if_serializer);
4260 		if (ifp->if_flags & IFF_RUNNING)
		/* Stop the periodic tick callout before removing the interrupt. */
4262 		callout_stop(&sc->co_hdl);
4264 		bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4266 		lwkt_serialize_exit(ifp->if_serializer);
		/*
		 * NOTE(review): callout_terminate() is deliberately called
		 * after dropping the serializer — presumably to let a
		 * still-running callout complete; confirm against
		 * callout(9) semantics.
		 */
4268 		callout_terminate(&sc->co_hdl);
4270 		ether_ifdetach(ifp);
4272 	ifmedia_removeall(&sc->media);
	/*
	 * NOTE(review): mxge_dummy_rdma(sc, 0) looks like it tells the
	 * firmware to stop its dummy RDMA activity before the DMA
	 * buffers it may touch are freed below — TODO confirm; the
	 * full guard condition is elided from this extract.
	 */
4274 	if (sc->cmd != NULL && sc->zeropad_dma.dmem_addr != NULL &&
4276 		mxge_dummy_rdma(sc, 0);
4278 	mxge_rem_sysctls(sc);
4279 	mxge_free_rings(sc);
4281 	/* MUST after sysctls and rings are freed */
4282 	mxge_free_slices(sc);
	/* Free the out-of-band DMA areas allocated during attach
	 * (dmabench, zeropad, cmd); each is guarded against a partial
	 * attach where it was never allocated. */
4284 	if (sc->dmabench_dma.dmem_addr != NULL)
4285 		mxge_dma_free(&sc->dmabench_dma);
4286 	if (sc->zeropad_dma.dmem_addr != NULL)
4287 		mxge_dma_free(&sc->zeropad_dma);
4288 	if (sc->cmd_dma.dmem_addr != NULL)
4289 		mxge_dma_free(&sc->cmd_dma);
	/* Release the interrupt resource, then any MSI allocation. */
4291 	if (sc->irq_res != NULL) {
4292 		bus_release_resource(dev, SYS_RES_IRQ, sc->irq_rid,
4295 	if (sc->irq_type == PCI_INTR_TYPE_MSI)
4296 		pci_release_msi(dev);
	/* Release the memory-mapped BAR that covers the NIC SRAM. */
4298 	if (sc->mem_res != NULL) {
4299 		bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS,
	/* Destroy the parent DMA tag last, after all child maps are gone. */
4303 	if (sc->parent_dmat != NULL)
4304 		bus_dma_tag_destroy(sc->parent_dmat);
4310 mxge_shutdown(device_t dev)