1 /******************************************************************************
3 Copyright (c) 2006-2013, Myricom Inc.
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
9 1. Redistributions of source code must retain the above copyright notice,
10 this list of conditions and the following disclaimer.
12 2. Neither the name of the Myricom Inc, nor the names of its
13 contributors may be used to endorse or promote products derived from
14 this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
28 $FreeBSD: head/sys/dev/mxge/if_mxge.c 254263 2013-08-12 23:30:01Z scottl $
30 ***************************************************************************/
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/linker.h>
37 #include <sys/firmware.h>
38 #include <sys/endian.h>
39 #include <sys/in_cksum.h>
40 #include <sys/sockio.h>
42 #include <sys/malloc.h>
43 #include <sys/kernel.h>
44 #include <sys/module.h>
45 #include <sys/serialize.h>
46 #include <sys/socket.h>
47 #include <sys/sysctl.h>
50 #include <net/if_arp.h>
51 #include <net/ifq_var.h>
52 #include <net/ethernet.h>
53 #include <net/if_dl.h>
54 #include <net/if_media.h>
58 #include <net/if_types.h>
59 #include <net/vlan/if_vlan_var.h>
62 #include <netinet/in_systm.h>
63 #include <netinet/in.h>
64 #include <netinet/ip.h>
65 #include <netinet/tcp.h>
70 #include <bus/pci/pcireg.h>
71 #include <bus/pci/pcivar.h>
72 #include <bus/pci/pci_private.h> /* XXX for pci_cfg_restore */
74 #include <vm/vm.h> /* for pmap_mapdev() */
77 #if defined(__i386__) || defined(__x86_64__)
78 #include <machine/specialreg.h>
81 #include <dev/netif/mxge/mxge_mcp.h>
82 #include <dev/netif/mxge/mcp_gen_header.h>
83 #include <dev/netif/mxge/if_mxge_var.h>
/*
 * Driver-wide tunables, exposed as hw.mxge.* loader tunables below.
 * Each TUNABLE_INT() binds a loader variable to the matching static int.
 */
86 static int mxge_nvidia_ecrc_enable = 1;
87 static int mxge_force_firmware = 0;
88 static int mxge_intr_coal_delay = MXGE_INTR_COAL_DELAY;
89 static int mxge_deassert_wait = 1;
90 static int mxge_flow_control = 1;
91 static int mxge_ticks;
92 static int mxge_max_slices = 1;
93 static int mxge_always_promisc = 0;
94 static int mxge_throttle = 0;
95 static int mxge_msi_enable = 1;
/* Firmware image names: aligned vs. unaligned PCIe-completion variants,
 * with and without RSS (multi-slice) support. */
97 static char *mxge_fw_unaligned = "mxge_ethp_z8e";
98 static char *mxge_fw_aligned = "mxge_eth_z8e";
99 static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
100 static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
102 TUNABLE_INT("hw.mxge.max_slices", &mxge_max_slices);
103 TUNABLE_INT("hw.mxge.flow_control_enabled", &mxge_flow_control);
104 TUNABLE_INT("hw.mxge.intr_coal_delay", &mxge_intr_coal_delay);
105 TUNABLE_INT("hw.mxge.nvidia_ecrc_enable", &mxge_nvidia_ecrc_enable);
106 TUNABLE_INT("hw.mxge.force_firmware", &mxge_force_firmware);
107 TUNABLE_INT("hw.mxge.deassert_wait", &mxge_deassert_wait);
108 TUNABLE_INT("hw.mxge.ticks", &mxge_ticks);
109 TUNABLE_INT("hw.mxge.always_promisc", &mxge_always_promisc);
110 TUNABLE_INT("hw.mxge.throttle", &mxge_throttle);
111 TUNABLE_INT("hw.mxge.msi.enable", &mxge_msi_enable);
/*
 * newbus driver glue: device-method prototypes, method table, driver
 * descriptor and module declarations.
 * NOTE(review): this listing elides lines of the original (e.g. the
 * DEVMETHOD_END terminator and closing braces of the tables below).
 */
113 static int mxge_probe(device_t dev);
114 static int mxge_attach(device_t dev);
115 static int mxge_detach(device_t dev);
116 static int mxge_shutdown(device_t dev);
117 static void mxge_intr(void *arg);
119 static device_method_t mxge_methods[] = {
120 /* Device interface */
121 DEVMETHOD(device_probe, mxge_probe),
122 DEVMETHOD(device_attach, mxge_attach),
123 DEVMETHOD(device_detach, mxge_detach),
124 DEVMETHOD(device_shutdown, mxge_shutdown),
128 static driver_t mxge_driver = {
131 sizeof(mxge_softc_t),
134 static devclass_t mxge_devclass;
136 /* Declare ourselves to be a child of the PCI bus.*/
137 DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, NULL, NULL);
/* The driver decompresses its firmware with zlib and loads it via firmware(9). */
138 MODULE_DEPEND(mxge, firmware, 1, 1, 1);
139 MODULE_DEPEND(mxge, zlib, 1, 1, 1);
/* Forward declarations for routines used before their definitions. */
141 static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
142 static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
143 static int mxge_close(mxge_softc_t *sc, int down);
144 static int mxge_open(mxge_softc_t *sc);
145 static void mxge_tick(void *arg);
/*
 * Device probe: match Myricom Z8E / Z8E_9 PCI IDs and set a human-readable
 * description keyed on the board revision.
 * NOTE(review): the switch header, braces and return statements are elided
 * in this listing.
 */
148 mxge_probe(device_t dev)
152 if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
153 ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
154 (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
155 rev = pci_get_revid(dev);
157 case MXGE_PCI_REV_Z8E:
158 device_set_desc(dev, "Myri10G-PCIE-8A");
160 case MXGE_PCI_REV_Z8ES:
161 device_set_desc(dev, "Myri10G-PCIE-8B");
/* Unknown revision: still attach, but warn. */
164 device_set_desc(dev, "Myri10G-PCIE-8??");
165 device_printf(dev, "Unrecognized rev %d NIC\n",
/*
 * Enable write-combining PIO to the NIC SRAM on x86/x86_64 by changing
 * the page attributes of the mapped BAR to PAT_WRITE_COMBINING.
 */
175 mxge_enable_wc(mxge_softc_t *sc)
177 #if defined(__i386__) || defined(__x86_64__)
181 len = rman_get_size(sc->mem_res);
182 pmap_change_attr((vm_offset_t) sc->sram, len / PAGE_SIZE,
183 PAT_WRITE_COMBINING);
/*
 * Allocate a zeroed, coherent DMA region of `bytes` with the requested
 * alignment. NOTE(review): the boundary computed for the >4KB/4KB-aligned
 * case is elided from this listing.
 */
188 mxge_dma_alloc(mxge_softc_t *sc, bus_dmamem_t *dma, size_t bytes,
189 bus_size_t alignment)
194 if (bytes > 4096 && alignment == 4096)
199 err = bus_dmamem_coherent(sc->parent_dmat, alignment, boundary,
200 BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, bytes,
201 BUS_DMA_WAITOK | BUS_DMA_ZERO, dma);
203 device_printf(sc->dev, "bus_dmamem_coherent failed: %d\n", err);
/* Release a DMA region created by mxge_dma_alloc(): unload the map,
 * free the memory and destroy the tag, in that order. */
210 mxge_dma_free(bus_dmamem_t *dma)
212 bus_dmamap_unload(dma->dmem_tag, dma->dmem_map);
213 bus_dmamem_free(dma->dmem_tag, dma->dmem_addr, dma->dmem_map);
214 bus_dma_tag_destroy(dma->dmem_tag);
218 * The eeprom strings on the lanaiX have the format
/*
 * Parse the NUL-separated EEPROM key=value strings: extract the MAC
 * address ("MAC="), product code ("PC="), and serial number ("SN=",
 * overridden by "SN2=" when present).
 * NOTE(review): MAC octet-loop structure and error-path lines are elided
 * from this listing.
 */
224 mxge_parse_strings(mxge_softc_t *sc)
227 int i, found_mac, found_sn2;
230 ptr = sc->eeprom_strings;
233 while (*ptr != '\0') {
234 if (strncmp(ptr, "MAC=", 4) == 0) {
237 sc->mac_addr[i] = strtoul(ptr, &endptr, 16);
/* each MAC octet must be exactly two hex digits */
238 if (endptr - ptr != 2)
247 } else if (strncmp(ptr, "PC=", 3) == 0) {
249 strlcpy(sc->product_code_string, ptr,
250 sizeof(sc->product_code_string));
251 } else if (!found_sn2 && (strncmp(ptr, "SN=", 3) == 0)) {
253 strlcpy(sc->serial_number_string, ptr,
254 sizeof(sc->serial_number_string));
255 } else if (strncmp(ptr, "SN2=", 4) == 0) {
256 /* SN2 takes precedence over SN */
259 strlcpy(sc->serial_number_string, ptr,
260 sizeof(sc->serial_number_string));
/* skip to the next NUL-terminated string */
262 while (*ptr++ != '\0') {}
269 device_printf(sc->dev, "failed to parse eeprom_strings\n");
273 #if defined(__i386__) || defined(__x86_64__)
/*
 * Enable ECRC generation on an upstream Nvidia (CK804/MCP55) PCIe bridge
 * so completions are 8-byte aligned. Because normal config-space access
 * cannot reach extended registers here, the bridge's extended config
 * space is located (magic base for CK804, chipset register 0x90 for
 * MCP55) and mapped directly with pmap_mapdev().
 * NOTE(review): several lines (returns, the ECRC-enable bit write at
 * offset 0x178, closing braces) are elided from this listing.
 */
276 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
279 unsigned long base, off;
281 device_t pdev, mcp55;
282 uint16_t vendor_id, device_id, word;
283 uintptr_t bus, slot, func, ivend, idev;
/* honor the hw.mxge.nvidia_ecrc_enable tunable */
286 if (!mxge_nvidia_ecrc_enable)
289 pdev = device_get_parent(device_get_parent(sc->dev));
291 device_printf(sc->dev, "could not find parent?\n");
294 vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
295 device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
/* only Nvidia bridges are handled */
297 if (vendor_id != 0x10de)
302 if (device_id == 0x005d) {
303 /* ck804, base address is magic */
305 } else if (device_id >= 0x0374 && device_id <= 0x378) {
306 /* mcp55, base address stored in chipset */
307 mcp55 = pci_find_bsf(0, 0, 0);
309 0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
310 0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
311 word = pci_read_config(mcp55, 0x90, 2);
312 base = ((unsigned long)word & 0x7ffeU) << 25;
320 * Test below is commented because it is believed that doing
321 * config read/write beyond 0xff will access the config space
322 * for the next larger function. Uncomment this and remove
323 * the hacky pmap_mapdev() way of accessing config space when
324 * FreeBSD grows support for extended pcie config space access
328 * See if we can, by some miracle, access the extended
331 val = pci_read_config(pdev, 0x178, 4);
332 if (val != 0xffffffff) {
334 pci_write_config(pdev, 0x178, val, 4);
339 * Rather than using normal pci config space writes, we must
340 * map the Nvidia config space ourselves. This is because on
341 * opteron/nvidia class machine the 0xe000000 mapping is
342 * handled by the nvidia chipset, that means the internal PCI
343 * device (the on-chip northbridge), or the amd-8131 bridge
344 * and things behind them are not visible by this method.
/* query the bridge's bus/slot/function and IDs through bus ivars */
347 BUS_READ_IVAR(device_get_parent(pdev), pdev,
349 BUS_READ_IVAR(device_get_parent(pdev), pdev,
350 PCI_IVAR_SLOT, &slot);
351 BUS_READ_IVAR(device_get_parent(pdev), pdev,
352 PCI_IVAR_FUNCTION, &func);
353 BUS_READ_IVAR(device_get_parent(pdev), pdev,
354 PCI_IVAR_VENDOR, &ivend);
355 BUS_READ_IVAR(device_get_parent(pdev), pdev,
356 PCI_IVAR_DEVICE, &idev);
/* extended config space offset for this bus/slot/func */
358 off = base + 0x00100000UL * (unsigned long)bus +
359 0x00001000UL * (unsigned long)(func + 8 * slot);
361 /* map it into the kernel */
362 va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
364 device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
367 /* get a pointer to the config space mapped into the kernel */
368 cfgptr = va + (off & PAGE_MASK);
370 /* make sure that we can really access it */
371 vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
372 device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
373 if (!(vendor_id == ivend && device_id == idev)) {
374 device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
375 vendor_id, device_id);
376 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
/* ECRC control register lives at extended offset 0x178 */
380 ptr32 = (uint32_t*)(cfgptr + 0x178);
383 if (val == 0xffffffff) {
384 device_printf(sc->dev, "extended mapping failed\n");
385 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
389 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
391 device_printf(sc->dev, "Enabled ECRC on upstream "
392 "Nvidia bridge at %d:%d:%d\n",
393 (int)bus, (int)slot, (int)func);
397 #else /* __i386__ || __x86_64__ */
/* Non-x86 stub: an Nforce4 chipset should never appear here; just warn. */
400 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
402 device_printf(sc->dev, "Nforce 4 chipset on non-x86/x86_64!?!?!\n");
/*
 * Run the firmware's DMA benchmark (or, for MXGEFW_CMD_UNALIGNED_TEST,
 * the unaligned-completion detector) and record read/write/read+write
 * throughput in MB/s in the softc. The data2 multiplier selects the
 * transfer direction; cmd.data0 returns transfers<<16 | ticks(0.5us).
 * NOTE(review): abort labels and some early-exit lines are elided.
 */
408 mxge_dma_test(mxge_softc_t *sc, int test_type)
411 bus_addr_t dmatest_bus = sc->dmabench_dma.dmem_busaddr;
414 const char *test = " ";
417 * Run a small DMA test.
418 * The magic multipliers to the length tell the firmware
419 * to do DMA read, write, or read+write tests. The
420 * results are returned in cmd.data0. The upper 16
421 * bits of the return is the number of transfers completed.
422 * The lower 16 bits is the time in 0.5us ticks that the
423 * transfers took to complete.
426 len = sc->tx_boundary;
/* read test: data2 = len * 0x10000 */
428 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
429 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
430 cmd.data2 = len * 0x10000;
431 status = mxge_send_cmd(sc, test_type, &cmd);
436 sc->read_dma = ((cmd.data0>>16) * len * 2) / (cmd.data0 & 0xffff);
/* write test: data2 = len * 0x1 */
438 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
439 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
440 cmd.data2 = len * 0x1;
441 status = mxge_send_cmd(sc, test_type, &cmd);
446 sc->write_dma = ((cmd.data0>>16) * len * 2) / (cmd.data0 & 0xffff);
/* concurrent read+write test: data2 = len * 0x10001 */
448 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
449 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
450 cmd.data2 = len * 0x10001;
451 status = mxge_send_cmd(sc, test_type, &cmd);
456 sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
457 (cmd.data0 & 0xffff);
/* the unaligned test is expected to fail on unaligned chipsets; only
 * complain for the plain benchmark */
460 if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST) {
461 device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
468 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
469 * when the PCI-E Completion packets are aligned on an 8-byte
470 * boundary. Some PCI-E chip sets always align Completion packets; on
471 * the ones that do not, the alignment can be enforced by enabling
472 * ECRC generation (if supported).
474 * When PCI-E Completion packets are not aligned, it is actually more
475 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
477 * If the driver can neither enable ECRC nor verify that it has
478 * already been enabled, then it must use a firmware image which works
479 * around unaligned completion packets (ethp_z8e.dat), and it should
480 * also ensure that it never gives the device a Read-DMA which is
481 * larger than 2KB by setting the tx_boundary to 2KB. If ECRC is
482 * enabled, then the driver should use the aligned (eth_z8e.dat)
483 * firmware image, and set tx_boundary to 4KB.
/*
 * Decide whether the aligned firmware can be used: verify the PCIe max
 * read request size is 4KB, load the aligned image, try to enable ECRC
 * on an Nvidia bridge, then run the unaligned-completion DMA test.
 * Returns 0 when the aligned firmware is safe to keep.
 * NOTE(review): several return statements are elided from this listing.
 */
486 mxge_firmware_probe(mxge_softc_t *sc)
488 device_t dev = sc->dev;
492 sc->tx_boundary = 4096;
495 * Verify the max read request size was set to 4KB
496 * before trying the test with 4KB.
498 if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
499 pectl = pci_read_config(dev, reg + 0x8, 2);
500 if ((pectl & (5 << 12)) != (5 << 12)) {
501 device_printf(dev, "Max Read Req. size != 4k (0x%x)\n",
503 sc->tx_boundary = 2048;
508 * Load the optimized firmware (which assumes aligned PCIe
509 * completions) in order to see if it works on this host.
511 sc->fw_name = mxge_fw_aligned;
512 status = mxge_load_firmware(sc, 1);
517 * Enable ECRC if possible
519 mxge_enable_nvidia_ecrc(sc);
522 * Run a DMA test which watches for unaligned completions and
523 * aborts on the first one seen. Not required on Z8ES or newer.
525 if (pci_get_revid(sc->dev) >= MXGE_PCI_REV_Z8ES)
528 status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
530 return 0; /* keep the aligned firmware */
533 device_printf(dev, "DMA test failed: %d\n", status);
534 if (status == ENOSYS) {
535 device_printf(dev, "Falling back to ethp! "
536 "Please install up to date fw\n");
/*
 * Choose the firmware image and tx_boundary: honor the force_firmware
 * tunable (or throttling, which requires the unaligned image), assume
 * aligned completions on narrow (<=x4) links, otherwise probe, and
 * finally load the selected image.
 * NOTE(review): the aligned/unaligned branch structure is partially
 * elided from this listing.
 */
542 mxge_select_firmware(mxge_softc_t *sc)
545 int force_firmware = mxge_force_firmware;
/* throttling implies the unaligned firmware */
548 force_firmware = sc->throttle;
550 if (force_firmware != 0) {
551 if (force_firmware == 1)
556 device_printf(sc->dev,
557 "Assuming %s completions (forced)\n",
558 aligned ? "aligned" : "unaligned");
564 * If the PCIe link width is 4 or less, we can use the aligned
565 * firmware and skip any checks
567 if (sc->link_width != 0 && sc->link_width <= 4) {
568 device_printf(sc->dev, "PCIe x%d Link, "
569 "expect reduced performance\n", sc->link_width);
574 if (mxge_firmware_probe(sc) == 0)
579 sc->fw_name = mxge_fw_aligned;
580 sc->tx_boundary = 4096;
582 sc->fw_name = mxge_fw_unaligned;
583 sc->tx_boundary = 2048;
585 return mxge_load_firmware(sc, 0);
/*
 * Sanity-check a firmware header: verify the MCP type, record the
 * version string for sysctl, parse major.minor.tiny, and reject images
 * whose major/minor do not match what the driver was built against.
 */
589 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
591 if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
592 device_printf(sc->dev, "Bad firmware type: 0x%x\n",
593 be32toh(hdr->mcp_type));
597 /* Save firmware version for sysctl */
598 strlcpy(sc->fw_version, hdr->version, sizeof(sc->fw_version));
600 device_printf(sc->dev, "firmware id: %s\n", hdr->version);
602 ksscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
603 &sc->fw_ver_minor, &sc->fw_ver_tiny);
605 if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR &&
606 sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
607 device_printf(sc->dev, "Found firmware version %s\n",
609 device_printf(sc->dev, "Driver needs %d.%d\n",
610 MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
/* zlib allocator callback: allocate items*size bytes from M_TEMP. */
617 z_alloc(void *nil, u_int items, u_int size)
619 return kmalloc(items * size, M_TEMP, M_WAITOK);
/* zlib deallocator callback (body elided in this listing). */
623 z_free(void *nil, void *ptr)
/*
 * Fetch the firmware image named sc->fw_name via firmware(9), inflate it
 * with zlib (the image's "version" field holds the uncompressed size),
 * validate its header, then PIO-copy it into NIC SRAM in 256-byte chunks.
 * NOTE(review): error labels (abort_with_*) and the inflateEnd/size
 * bookkeeping lines are elided from this listing.
 */
629 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
632 char *inflate_buffer;
633 const struct firmware *fw;
634 const mcp_gen_header_t *hdr;
641 fw = firmware_get(sc->fw_name);
643 device_printf(sc->dev, "Could not find firmware image %s\n",
648 /* Setup zlib and decompress f/w */
649 bzero(&zs, sizeof(zs));
652 status = inflateInit(&zs);
653 if (status != Z_OK) {
659 * The uncompressed size is stored as the firmware version,
660 * which would otherwise go unused
662 fw_len = (size_t)fw->version;
663 inflate_buffer = kmalloc(fw_len, M_TEMP, M_WAITOK);
664 zs.avail_in = fw->datasize;
665 zs.next_in = __DECONST(char *, fw->data);
666 zs.avail_out = fw_len;
667 zs.next_out = inflate_buffer;
668 status = inflate(&zs, Z_FINISH);
669 if (status != Z_STREAM_END) {
670 device_printf(sc->dev, "zlib %d\n", status);
672 goto abort_with_buffer;
/* locate the MCP header inside the inflated image */
677 htobe32(*(const uint32_t *)(inflate_buffer + MCP_HEADER_PTR_OFFSET));
678 if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
679 device_printf(sc->dev, "Bad firmware file");
681 goto abort_with_buffer;
683 hdr = (const void*)(inflate_buffer + hdr_offset);
685 status = mxge_validate_firmware(sc, hdr);
687 goto abort_with_buffer;
689 /* Copy the inflated firmware to NIC SRAM. */
690 for (i = 0; i < fw_len; i += 256) {
691 mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i, inflate_buffer + i,
692 min(256U, (unsigned)(fw_len - i)));
/* common cleanup */
701 kfree(inflate_buffer, M_TEMP);
704 firmware_put(fw, FIRMWARE_UNLOAD);
709 * Enable or disable periodic RDMAs from the host to make certain
710 * chipsets resend dropped PCIe messages
/*
 * Build a 6-word boot command (confirm addr, confirm data 0xffffffff,
 * dummy addr, enable flag) on an 8-byte-aligned stack buffer, PIO it to
 * the MXGEFW_BOOT_DUMMY_RDMA doorbell, and poll up to 20 iterations for
 * the firmware to write -1 to the confirmation word.
 * NOTE(review): the buf_bytes declaration, *confirm clear, and the DELAY
 * in the poll loop are elided from this listing.
 */
713 mxge_dummy_rdma(mxge_softc_t *sc, int enable)
716 volatile uint32_t *confirm;
717 volatile char *submit;
718 uint32_t *buf, dma_low, dma_high;
/* align buf to 8 bytes within buf_bytes */
721 buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
723 /* Clear confirmation addr */
724 confirm = (volatile uint32_t *)sc->cmd;
729 * Send an rdma command to the PCIe engine, and wait for the
730 * response in the confirmation address. The firmware should
731 * write a -1 there to indicate it is alive and well
733 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.dmem_busaddr);
734 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.dmem_busaddr);
735 buf[0] = htobe32(dma_high); /* confirm addr MSW */
736 buf[1] = htobe32(dma_low); /* confirm addr LSW */
737 buf[2] = htobe32(0xffffffff); /* confirm data */
738 dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.dmem_busaddr);
739 dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.dmem_busaddr);
740 buf[3] = htobe32(dma_high); /* dummy addr MSW */
741 buf[4] = htobe32(dma_low); /* dummy addr LSW */
742 buf[5] = htobe32(enable); /* enable? */
744 submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
746 mxge_pio_copy(submit, buf, 64);
/* poll for the firmware's -1 acknowledgement */
751 while (*confirm != 0xffffffff && i < 20) {
755 if (*confirm != 0xffffffff) {
756 if_printf(sc->ifp, "dummy rdma %s failed (%p = 0x%x)",
757 (enable ? "enable" : "disable"), confirm, *confirm);
/*
 * Issue a command to the firmware: marshal data0-2 big-endian into an
 * 8-byte-aligned mcp_cmd_t, point the response at the coherent cmd DMA
 * block, PIO it to MXGEFW_ETH_CMD, and poll the response word for up to
 * 20 sleep intervals. Translates firmware result codes to errno values.
 * NOTE(review): the result-code-to-errno assignments, DELAY/sleep lines
 * and final return are elided from this listing.
 */
762 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
765 char buf_bytes[sizeof(*buf) + 8];
766 volatile mcp_cmd_response_t *response = sc->cmd;
767 volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
768 uint32_t dma_low, dma_high;
769 int err, sleep_total = 0;
771 /* Ensure buf is aligned to 8 bytes */
772 buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
774 buf->data0 = htobe32(data->data0);
775 buf->data1 = htobe32(data->data1);
776 buf->data2 = htobe32(data->data2);
777 buf->cmd = htobe32(cmd);
778 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.dmem_busaddr);
779 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.dmem_busaddr);
781 buf->response_addr.low = htobe32(dma_low);
782 buf->response_addr.high = htobe32(dma_high);
/* sentinel: firmware overwrites this when it responds */
784 response->result = 0xffffffff;
786 mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
792 for (sleep_total = 0; sleep_total < 20; sleep_total++) {
794 switch (be32toh(response->result)) {
796 data->data0 = be32toh(response->data);
802 case MXGEFW_CMD_UNKNOWN:
805 case MXGEFW_CMD_ERROR_UNALIGNED:
808 case MXGEFW_CMD_ERROR_BUSY:
811 case MXGEFW_CMD_ERROR_I2C_ABSENT:
815 if_printf(sc->ifp, "command %d failed, result = %d\n",
816 cmd, be32toh(response->result));
/* fell out of the poll loop: command timed out */
824 if_printf(sc->ifp, "command %d timed out result = %d\n",
825 cmd, be32toh(response->result));
/*
 * Adopt the firmware already running on the NIC: locate its header via
 * the pointer at MCP_HEADER_PTR_OFFSET in SRAM, copy the header to host
 * memory, validate it, and flag the known 1.4.4-1.4.11 rx-filter bug
 * that requires ALLMULTI to receive broadcasts.
 */
831 mxge_adopt_running_firmware(mxge_softc_t *sc)
833 struct mcp_gen_header *hdr;
834 const size_t bytes = sizeof(struct mcp_gen_header);
839 * Find running firmware header
842 htobe32(*(volatile uint32_t *)(sc->sram + MCP_HEADER_PTR_OFFSET));
844 if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
845 device_printf(sc->dev,
846 "Running firmware has bad header offset (%zu)\n",
852 * Copy header of running firmware from SRAM to host memory to
855 hdr = kmalloc(bytes, M_DEVBUF, M_WAITOK);
856 bus_space_read_region_1(rman_get_bustag(sc->mem_res),
857 rman_get_bushandle(sc->mem_res), hdr_offset, (char *)hdr, bytes);
858 status = mxge_validate_firmware(sc, hdr);
859 kfree(hdr, M_DEVBUF);
862 * Check to see if adopted firmware has bug where adopting
863 * it will cause broadcasts to be filtered unless the NIC
864 * is kept in ALLMULTI mode
866 if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
867 sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
868 sc->adopted_rx_filter_bug = 1;
869 device_printf(sc->dev, "Adopting fw %d.%d.%d: "
870 "working around rx filter bug\n",
871 sc->fw_ver_major, sc->fw_ver_minor, sc->fw_ver_tiny);
/*
 * Load firmware into the NIC. If decompress/copy fails and `adopt` is
 * set, fall back to adopting the firmware already running (downgrading
 * tx_boundary to 2048 and the unaligned image if needed). Otherwise
 * hand off to the bootstrap MCP with a 7-word command (skipping the
 * first 8 protected SRAM bytes) and poll for the -1 confirmation.
 * NOTE(review): buf_bytes declaration, *confirm clear, DELAY loop body
 * and returns are elided from this listing.
 */
878 mxge_load_firmware(mxge_softc_t *sc, int adopt)
880 volatile uint32_t *confirm;
881 volatile char *submit;
883 uint32_t *buf, size, dma_low, dma_high;
/* align buf to 8 bytes within buf_bytes */
886 buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
888 size = sc->sram_size;
889 status = mxge_load_firmware_helper(sc, &size);
895 * Try to use the currently running firmware, if
898 status = mxge_adopt_running_firmware(sc);
900 device_printf(sc->dev,
901 "failed to adopt running firmware\n");
904 device_printf(sc->dev,
905 "Successfully adopted running firmware\n");
907 if (sc->tx_boundary == 4096) {
908 device_printf(sc->dev,
909 "Using firmware currently running on NIC. "
911 device_printf(sc->dev,
912 "performance consider loading "
913 "optimized firmware\n");
/* adopted firmware: assume unaligned completions */
915 sc->fw_name = mxge_fw_unaligned;
916 sc->tx_boundary = 2048;
920 /* Clear confirmation addr */
921 confirm = (volatile uint32_t *)sc->cmd;
926 * Send a reload command to the bootstrap MCP, and wait for the
927 * response in the confirmation address. The firmware should
928 * write a -1 there to indicate it is alive and well
931 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.dmem_busaddr);
932 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.dmem_busaddr);
934 buf[0] = htobe32(dma_high); /* confirm addr MSW */
935 buf[1] = htobe32(dma_low); /* confirm addr LSW */
936 buf[2] = htobe32(0xffffffff); /* confirm data */
939 * FIX: All newest firmware should un-protect the bottom of
940 * the sram before handoff. However, the very first interfaces
941 * do not. Therefore the handoff copy must skip the first 8 bytes
943 /* where the code starts*/
944 buf[3] = htobe32(MXGE_FW_OFFSET + 8);
945 buf[4] = htobe32(size - 8); /* length of code */
946 buf[5] = htobe32(8); /* where to copy to */
947 buf[6] = htobe32(0); /* where to jump to */
949 submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
950 mxge_pio_copy(submit, buf, 64);
/* poll for the bootstrap's -1 acknowledgement */
955 while (*confirm != 0xffffffff && i < 20) {
959 if (*confirm != 0xffffffff) {
960 device_printf(sc->dev,"handoff failed (%p = 0x%x)",
/*
 * Program the station MAC address into the firmware: octets 0-3 packed
 * big-endian into data0, octets 4-5 into data1.
 */
968 mxge_update_mac_address(mxge_softc_t *sc)
971 uint8_t *addr = sc->mac_addr;
973 cmd.data0 = (addr[0] << 24) | (addr[1] << 16) |
974 (addr[2] << 8) | addr[3];
975 cmd.data1 = (addr[4] << 8) | (addr[5]);
976 return mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
/* Enable or disable link-level flow control in the firmware; logs on
 * failure. NOTE(review): the if/else structure and sc->pause update are
 * elided in this listing. */
980 mxge_change_pause(mxge_softc_t *sc, int pause)
986 status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL, &cmd);
988 status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL, &cmd);
990 device_printf(sc->dev, "Failed to set flow control mode\n");
/* Set promiscuous mode in the firmware; the hw.mxge.always_promisc
 * tunable forces it on regardless of the caller's request. */
998 mxge_change_promisc(mxge_softc_t *sc, int promisc)
1003 if (mxge_always_promisc)
1007 status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC, &cmd);
1009 status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC, &cmd);
1011 device_printf(sc->dev, "Failed to set promisc mode\n");
/*
 * Push the interface's multicast list to the firmware: go ALLMULTI while
 * updating, stay ALLMULTI if IFF_ALLMULTI is set or the adopted-firmware
 * rx-filter bug is present, otherwise flush all groups, join each
 * AF_LINK address, and re-enable filtering.
 * NOTE(review): early returns after the ALLMULTI shortcuts are elided.
 */
1015 mxge_set_multicast_list(mxge_softc_t *sc)
1018 struct ifmultiaddr *ifma;
1019 struct ifnet *ifp = sc->ifp;
1022 /* This firmware is known to not support multicast */
1023 if (!sc->fw_multicast_support)
1026 /* Disable multicast filtering while we play with the lists*/
1027 err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1029 device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
1030 " error status: %d\n", err);
/* stay in ALLMULTI to work around the adopted-firmware filter bug */
1034 if (sc->adopted_rx_filter_bug)
1037 if (ifp->if_flags & IFF_ALLMULTI) {
1038 /* Request to disable multicast filtering, so quit here */
1042 /* Flush all the filters */
1043 err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1045 device_printf(sc->dev,
1046 "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, "
1047 "error status: %d\n", err);
1052 * Walk the multicast list, and add each address
1054 TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1055 if (ifma->ifma_addr->sa_family != AF_LINK)
/* split the 6-byte link address across data0 (4 bytes) and data1 */
1058 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1060 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
1062 cmd.data0 = htonl(cmd.data0);
1063 cmd.data1 = htonl(cmd.data1);
1064 err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1066 device_printf(sc->dev, "Failed "
1067 "MXGEFW_JOIN_MULTICAST_GROUP, "
1068 "error status: %d\n", err);
1069 /* Abort, leaving multicast filtering off */
1074 /* Enable multicast filtering */
1075 err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1077 device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI, "
1078 "error status: %d\n", err);
/*
 * Return the largest MTU the hardware/firmware combination supports:
 * MXGEFW_MAX_MTU when page-sized jumbo clusters cover it or the firmware
 * accepts multi-buffer ("N big buffers") receives, else MJUMPAGESIZE
 * minus the firmware pad.
 */
1084 mxge_max_mtu(mxge_softc_t *sc)
1089 if (MJUMPAGESIZE - MXGEFW_PAD > MXGEFW_MAX_MTU)
1090 return MXGEFW_MAX_MTU - MXGEFW_PAD;
1092 /* try to set nbufs to see if it we can
1093 use virtually contiguous jumbos */
1095 status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1098 return MXGEFW_MAX_MTU - MXGEFW_PAD;
1100 /* otherwise, we're limited to MJUMPAGESIZE */
1101 return MJUMPAGESIZE - MXGEFW_PAD;
/*
 * Full firmware reset and re-initialization: issue MXGEFW_CMD_RESET,
 * restart dummy RDMAs, size the interrupt queue, (re)negotiate RSS
 * slices, optionally re-exchange per-slice interrupt queue DMA
 * addresses, fetch the coalescing/ack/deassert register offsets, run the
 * DMA benchmark, zero all mcp/driver shared slice state, and restore
 * MAC/promisc/pause/multicast settings and the throttle factor.
 * NOTE(review): many lines (returns, some cmd field assignments, slice
 * ring resets) are elided from this listing.
 */
1106 mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1108 struct mxge_slice_state *ss;
1109 mxge_rx_done_t *rx_done;
1110 volatile uint32_t *irq_claim;
1115 * Try to send a reset command to the card to see if it
1118 memset(&cmd, 0, sizeof (cmd));
1119 status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1121 if_printf(sc->ifp, "failed reset\n");
1125 mxge_dummy_rdma(sc, 1);
1127 /* Set the intrq size */
1128 cmd.data0 = sc->rx_ring_size;
1129 status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1132 * Even though we already know how many slices are supported
1133 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
1134 * has magic side effects, and must be called after a reset.
1135 * It must be called prior to calling any RSS related cmds,
1136 * including assigning an interrupt queue for anything but
1137 * slice 0. It must also be called *after*
1138 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1139 * the firmware to compute offsets.
1141 if (sc->num_slices > 1) {
1142 /* Ask the maximum number of slices it supports */
1143 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
1145 if_printf(sc->ifp, "failed to get number of slices\n");
1150 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1151 * to setting up the interrupt queue DMA
1153 cmd.data0 = sc->num_slices;
1154 cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
1155 #ifdef IFNET_BUF_RING
1156 cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1158 status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES, &cmd);
1160 if_printf(sc->ifp, "failed to set number of slices\n");
1165 if (interrupts_setup) {
1166 /* Now exchange information about interrupts */
1167 for (slice = 0; slice < sc->num_slices; slice++) {
1168 rx_done = &sc->ss[slice].rx_done;
1169 memset(rx_done->entry, 0, sc->rx_ring_size);
1171 MXGE_LOWPART_TO_U32(rx_done->dma.dmem_busaddr);
1173 MXGE_HIGHPART_TO_U32(rx_done->dma.dmem_busaddr);
1175 status |= mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_DMA,
/* fetch SRAM offsets of the interrupt-control registers */
1180 status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET,
1182 sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1184 status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1185 irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1187 status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET, &cmd);
1188 sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1191 if_printf(sc->ifp, "failed set interrupt parameters\n");
1195 *sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1197 /* Run a DMA benchmark */
1198 mxge_dma_test(sc, MXGEFW_DMA_TEST);
1200 for (slice = 0; slice < sc->num_slices; slice++) {
1201 ss = &sc->ss[slice];
/* each slice gets a pair of claim registers */
1203 ss->irq_claim = irq_claim + (2 * slice);
1205 /* Reset mcp/driver shared state back to 0 */
1206 ss->rx_done.idx = 0;
1207 ss->rx_done.cnt = 0;
1210 ss->tx.pkt_done = 0;
1211 ss->tx.queue_active = 0;
1212 ss->tx.activate = 0;
1213 ss->tx.deactivate = 0;
1218 ss->rx_small.cnt = 0;
1219 if (ss->fw_stats != NULL)
1220 bzero(ss->fw_stats, sizeof(*ss->fw_stats));
1222 sc->rdma_tags_available = 15;
/* restore interface state that the reset wiped */
1224 status = mxge_update_mac_address(sc);
1225 mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
1226 mxge_change_pause(sc, sc->pause);
1227 mxge_set_multicast_list(sc);
1230 cmd.data0 = sc->throttle;
1231 if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd))
1232 if_printf(sc->ifp, "can't enable throttle\n");
/*
 * sysctl handler for hw.<unit>.throttle: validate the new value against
 * MXGE_MIN/MAX_THROTTLE and push it to the firmware under the interface
 * serializer before committing it to the softc.
 */
1238 mxge_change_throttle(SYSCTL_HANDLER_ARGS)
1243 unsigned int throttle;
1246 throttle = sc->throttle;
1247 err = sysctl_handle_int(oidp, &throttle, arg2, req);
/* no-op when unchanged */
1252 if (throttle == sc->throttle)
1255 if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
1259 lwkt_serialize_enter(sc->ifp->if_serializer);
1261 cmd.data0 = throttle;
1262 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
1264 sc->throttle = throttle;
1266 lwkt_serialize_exit(sc->ifp->if_serializer);
/*
 * sysctl handler for interrupt coalescing delay (usecs): reject 0 and
 * values above 1s, then write the big-endian value straight into the
 * firmware register via intr_coal_delay_ptr under the serializer.
 */
1271 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1274 unsigned int intr_coal_delay;
1278 intr_coal_delay = sc->intr_coal_delay;
1279 err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1283 if (intr_coal_delay == sc->intr_coal_delay)
1286 if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1290 lwkt_serialize_enter(sc->ifp->if_serializer);
1292 *sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1293 sc->intr_coal_delay = intr_coal_delay;
1295 lwkt_serialize_exit(sc->ifp->if_serializer);
/*
 * sysctl handler for flow control enable/disable: delegates to
 * mxge_change_pause() under the interface serializer.
 */
1300 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1303 unsigned int enabled;
1307 enabled = sc->pause;
1308 err = sysctl_handle_int(oidp, &enabled, arg2, req);
1312 if (enabled == sc->pause)
1316 lwkt_serialize_enter(sc->ifp->if_serializer);
1317 err = mxge_change_pause(sc, enabled);
1318 lwkt_serialize_exit(sc->ifp->if_serializer);
/* Read-only sysctl shim for big-endian firmware counters: byte-swap the
 * value at arg1 before handing it to sysctl_handle_int(). */
1324 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1330 arg2 = be32toh(*(int *)arg1);
1332 err = sysctl_handle_int(oidp, arg1, arg2, req);
/*
 * Tear down all sysctl trees: per-slice contexts first, then the slice
 * container, then the device's top-level tree. Each pointer is NULLed
 * after its context is freed so the teardown is idempotent.
 */
1338 mxge_rem_sysctls(mxge_softc_t *sc)
1340 if (sc->ss != NULL) {
1341 struct mxge_slice_state *ss;
1344 for (slice = 0; slice < sc->num_slices; slice++) {
1345 ss = &sc->ss[slice];
1346 if (ss->sysctl_tree != NULL) {
1347 sysctl_ctx_free(&ss->sysctl_ctx);
1348 ss->sysctl_tree = NULL;
1353 if (sc->slice_sysctl_tree != NULL) {
1354 sysctl_ctx_free(&sc->slice_sysctl_ctx);
1355 sc->slice_sysctl_tree = NULL;
1358 if (sc->sysctl_tree != NULL) {
1359 sysctl_ctx_free(&sc->sysctl_ctx);
1360 sc->sysctl_tree = NULL;
/*
 * Register the hw.mxgeN.* sysctl tree: static device information
 * (firmware version, serial number, link width, DMA benchmarks),
 * run-time tunables (interrupt coalescing, throttle, flow control),
 * big-endian firmware counters exported through mxge_handle_be32(),
 * and per-slice debug counters under the "slice.N" subtree.
 *
 * Fixes three copy-pasted description strings:
 *  - "flow_control_enabled" was described as the interrupt coalescing
 *    delay; it actually toggles Ethernet PAUSE flow control.
 *  - "rx_big_cnt" was described as "rx_small_cnt".
 *  - "tx_pkt_done" was described as "tx_done".
 */
1365 mxge_add_sysctls(mxge_softc_t *sc)
1367 struct sysctl_ctx_list *ctx;
1368 struct sysctl_oid_list *children;
1370 struct mxge_slice_state *ss;
1374 ctx = &sc->sysctl_ctx;
1375 sysctl_ctx_init(ctx);
1376 sc->sysctl_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_STATIC_CHILDREN(_hw),
1377 OID_AUTO, device_get_nameunit(sc->dev), CTLFLAG_RD, 0, "");
1378 if (sc->sysctl_tree == NULL) {
1379 device_printf(sc->dev, "can't add sysctl node\n");
1383 children = SYSCTL_CHILDREN(sc->sysctl_tree);
1384 fw = sc->ss[0].fw_stats;
1387 * Random information
1389 SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "firmware_version",
1390 CTLFLAG_RD, &sc->fw_version, 0, "firmware version");
1392 SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "serial_number",
1393 CTLFLAG_RD, &sc->serial_number_string, 0, "serial number");
1395 SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "product_code",
1396 CTLFLAG_RD, &sc->product_code_string, 0, "product code");
1398 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "pcie_link_width",
1399 CTLFLAG_RD, &sc->link_width, 0, "link width");
1401 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_boundary",
1402 CTLFLAG_RD, &sc->tx_boundary, 0, "tx boundary");
1404 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "write_combine",
1405 CTLFLAG_RD, &sc->wc, 0, "write combining PIO");
1407 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "read_dma_MBs",
1408 CTLFLAG_RD, &sc->read_dma, 0, "DMA Read speed in MB/s");
1410 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "write_dma_MBs",
1411 CTLFLAG_RD, &sc->write_dma, 0, "DMA Write speed in MB/s");
1413 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "read_write_dma_MBs",
1414 CTLFLAG_RD, &sc->read_write_dma, 0,
1415 "DMA concurrent Read/Write speed in MB/s");
1417 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "watchdog_resets",
1418 CTLFLAG_RD, &sc->watchdog_resets, 0,
1419 "Number of times NIC was reset");
1422 * Performance related tunables
1424 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "intr_coal_delay",
1425 CTLTYPE_INT|CTLFLAG_RW, sc, 0, mxge_change_intr_coal, "I",
1426 "Interrupt coalescing delay in usecs");
1428 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "throttle",
1429 CTLTYPE_INT|CTLFLAG_RW, sc, 0, mxge_change_throttle, "I",
1430 "Transmit throttling");
1432 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "flow_control_enabled",
1433 CTLTYPE_INT|CTLFLAG_RW, sc, 0, mxge_change_flow_control, "I",
1434 "Ethernet PAUSE flow control enabled");
1436 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "deassert_wait",
1437 CTLFLAG_RW, &mxge_deassert_wait, 0,
1438 "Wait for IRQ line to go low in ihandler");
1441 * Stats block from firmware is in network byte order.
1444 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "link_up",
1445 CTLTYPE_INT|CTLFLAG_RD, &fw->link_up, 0,
1446 mxge_handle_be32, "I", "link up");
1448 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rdma_tags_available",
1449 CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available, 0,
1450 mxge_handle_be32, "I", "rdma_tags_available");
1452 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_bad_crc32",
1453 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_bad_crc32, 0,
1454 mxge_handle_be32, "I", "dropped_bad_crc32");
1456 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_bad_phy",
1457 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_bad_phy, 0,
1458 mxge_handle_be32, "I", "dropped_bad_phy");
1460 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_link_error_or_filtered",
1461 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_error_or_filtered, 0,
1462 mxge_handle_be32, "I", "dropped_link_error_or_filtered");
1464 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_link_overflow",
1465 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow, 0,
1466 mxge_handle_be32, "I", "dropped_link_overflow");
1468 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_multicast_filtered",
1469 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_multicast_filtered, 0,
1470 mxge_handle_be32, "I", "dropped_multicast_filtered");
1472 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_no_big_buffer",
1473 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer, 0,
1474 mxge_handle_be32, "I", "dropped_no_big_buffer");
1476 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_no_small_buffer",
1477 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_small_buffer, 0,
1478 mxge_handle_be32, "I", "dropped_no_small_buffer");
1480 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_overrun",
1481 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun, 0,
1482 mxge_handle_be32, "I", "dropped_overrun");
1484 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_pause",
1485 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_pause, 0,
1486 mxge_handle_be32, "I", "dropped_pause");
1488 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_runt",
1489 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt, 0,
1490 mxge_handle_be32, "I", "dropped_runt");
1492 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_unicast_filtered",
1493 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered, 0,
1494 mxge_handle_be32, "I", "dropped_unicast_filtered");
1496 /* add counters exported for debugging from all slices */
1497 sysctl_ctx_init(&sc->slice_sysctl_ctx);
1498 sc->slice_sysctl_tree = SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx,
1499 children, OID_AUTO, "slice", CTLFLAG_RD, 0, "");
1500 if (sc->slice_sysctl_tree == NULL) {
1501 device_printf(sc->dev, "can't add slice sysctl node\n");
1505 for (slice = 0; slice < sc->num_slices; slice++) {
1506 ss = &sc->ss[slice];
1507 sysctl_ctx_init(&ss->sysctl_ctx);
1508 ctx = &ss->sysctl_ctx;
1509 children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1510 ksprintf(slice_num, "%d", slice);
1511 ss->sysctl_tree = SYSCTL_ADD_NODE(ctx, children, OID_AUTO,
1512 slice_num, CTLFLAG_RD, 0, "");
1513 if (ss->sysctl_tree == NULL) {
1514 device_printf(sc->dev,
1515 "can't add %d slice sysctl node\n", slice);
1516 return; /* XXX continue? */
1518 children = SYSCTL_CHILDREN(ss->sysctl_tree);
1521 * XXX change to ULONG
1524 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_small_cnt",
1525 CTLFLAG_RD, &ss->rx_small.cnt, 0, "rx_small_cnt");
1527 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_big_cnt",
1528 CTLFLAG_RD, &ss->rx_big.cnt, 0, "rx_big_cnt");
1530 #ifndef IFNET_BUF_RING
1531 /* only transmit from slice 0 for now */
1536 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_req",
1537 CTLFLAG_RD, &ss->tx.req, 0, "tx_req");
1539 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_done",
1540 CTLFLAG_RD, &ss->tx.done, 0, "tx_done");
1542 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_pkt_done",
1543 CTLFLAG_RD, &ss->tx.pkt_done, 0, "tx_pkt_done");
1545 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_stall",
1546 CTLFLAG_RD, &ss->tx.stall, 0, "tx_stall");
1548 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_wake",
1549 CTLFLAG_RD, &ss->tx.wake, 0, "tx_wake");
1551 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_defrag",
1552 CTLFLAG_RD, &ss->tx.defrag, 0, "tx_defrag");
1554 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_queue_active",
1555 CTLFLAG_RD, &ss->tx.queue_active, 0, "tx_queue_active");
1557 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_activate",
1558 CTLFLAG_RD, &ss->tx.activate, 0, "tx_activate");
1560 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_deactivate",
1561 CTLFLAG_RD, &ss->tx.deactivate, 0, "tx_deactivate");
/*
 * Write a chain of send descriptors to the NIC in reverse order so
 * the first (valid-flagged) slot is written last; ring wrap is
 * handled by masking the slot index with tx->mask (ring size - 1).
 * NOTE(review): the loop header around the visible statements is
 * elided from this listing.
 */
1566 * Copy an array of mcp_kreq_ether_send_t's to the mcp. Copy
1567 * backwards one at a time and handle ring wraps
1570 mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1571 mcp_kreq_ether_send_t *src, int cnt)
1573 int idx, starting_slot;
1574 starting_slot = tx->req;
1577 idx = (starting_slot + cnt) & tx->mask;
1578 mxge_pio_copy(&tx->lanai[idx],
1579 &src[cnt], sizeof(*src));
1585 * Copy an array of mcp_kreq_ether_send_t's to the mcp. Copy
1586 * at most 32 bytes at a time, so as to avoid involving the software
1587 * pio handler in the nic. We re-write the first segment's flags
1588 * to mark them valid only after writing the entire chain
1591 mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src, int cnt)
1595 volatile uint32_t *dst_ints;
1596 mcp_kreq_ether_send_t *srcp;
1597 volatile mcp_kreq_ether_send_t *dstp, *dst;
1600 idx = tx->req & tx->mask;
/* remember the real flags; the chain is written with the first
 * descriptor invalid, then made valid at the very end */
1602 last_flags = src->flags;
1605 dst = dstp = &tx->lanai[idx];
/* fast path: chain does not wrap the ring -- copy two descriptors
 * (32 bytes) per PIO burst */
1608 if ((idx + cnt) < tx->mask) {
1609 for (i = 0; i < (cnt - 1); i += 2) {
1610 mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1611 wmb(); /* force write every 32 bytes */
/* wrap case: write descriptors back-to-front, leaving slot 0 of the
 * chain for the final valid write below */
1616 /* submit all but the first request, and ensure
1617 that it is submitted below */
1618 mxge_submit_req_backwards(tx, src, cnt);
1622 /* submit the first request */
1623 mxge_pio_copy(dstp, srcp, sizeof(*src));
1624 wmb(); /* barrier before setting valid flag */
1627 /* re-write the last 32-bits with the valid flags */
1628 src->flags = last_flags;
1629 src_ints = (uint32_t *)src;
1631 dst_ints = (volatile uint32_t *)dst;
1633 *dst_ints = *src_ints;
/*
 * Ensure a TSO mbuf's first segment contains the full ethernet + IP +
 * TCP header (csum_lhlen + csum_iphlen + csum_thlen bytes), pulling
 * it up with m_pullup() if necessary.  Takes struct mbuf ** because
 * m_pullup() may replace the mbuf.
 */
1639 mxge_pullup_tso(struct mbuf **mp)
1641 int hoff, iphlen, thoff;
1645 KASSERT(M_WRITABLE(m), ("TSO mbuf not writable"));
1647 iphlen = m->m_pkthdr.csum_iphlen;
1648 thoff = m->m_pkthdr.csum_thlen;
1649 hoff = m->m_pkthdr.csum_lhlen;
1651 KASSERT(iphlen > 0, ("invalid ip hlen"));
1652 KASSERT(thoff > 0, ("invalid tcp hlen"));
1653 KASSERT(hoff > 0, ("invalid ether hlen"));
1655 if (__predict_false(m->m_len < hoff + iphlen + thoff)) {
1656 m = m_pullup(m, hoff + iphlen + thoff);
/*
 * Build and submit the send-descriptor chain for a TSO packet.
 * Walks the busdma segment list, chopping segments at MSS boundaries
 * ("cuts") and retroactively patching each request's rdma_count (see
 * the long comment below).  On overflow of tx->max_desc the mapping
 * is unloaded and the error path prints diagnostics.
 * NOTE(review): this listing is elided; the while-loop body and the
 * error labels are only partially visible -- do not restructure.
 */
1667 mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1671 mcp_kreq_ether_send_t *req;
1672 bus_dma_segment_t *seg;
1673 uint32_t low, high_swapped;
1674 int len, seglen, cum_len, cum_len_next;
1675 int next_is_first, chop, cnt, rdma_count, small;
1676 uint16_t pseudo_hdr_offset, cksum_offset, mss;
1677 uint8_t flags, flags_next;
1680 mss = m->m_pkthdr.tso_segsz;
1682 /* negative cum_len signifies to the
1683 * send loop that we are still in the
1684 * header portion of the TSO packet.
1686 cum_len = -(m->m_pkthdr.csum_lhlen + m->m_pkthdr.csum_iphlen +
1687 m->m_pkthdr.csum_thlen);
1689 /* TSO implies checksum offload on this hardware */
1690 cksum_offset = m->m_pkthdr.csum_lhlen + m->m_pkthdr.csum_iphlen;
1691 flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1693 /* for TSO, pseudo_hdr_offset holds mss.
1694 * The firmware figures out where to put
1695 * the checksum by parsing the header. */
1696 pseudo_hdr_offset = htobe16(mss);
1703 /* "rdma_count" is the number of RDMAs belonging to the
1704 * current packet BEFORE the current send request. For
1705 * non-TSO packets, this is equal to "count".
1706 * For TSO packets, rdma_count needs to be reset
1707 * to 0 after a segment cut.
1709 * The rdma_count field of the send request is
1710 * the number of RDMAs of the packet starting at
1711 * that request. For TSO send requests with one ore more cuts
1712 * in the middle, this is the number of RDMAs starting
1713 * after the last cut in the request. All previous
1714 * segments before the last cut implicitly have 1 RDMA.
1716 * Since the number of RDMAs is not known beforehand,
1717 * it must be filled-in retroactively - after each
1718 * segmentation cut or at the end of the entire packet.
1721 while (busdma_seg_cnt) {
1722 /* Break the busdma segment up into pieces*/
1723 low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1724 high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1728 flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1730 cum_len_next = cum_len + seglen;
/* patch the rdma_count of the request that started this RDMA run */
1731 (req-rdma_count)->rdma_count = rdma_count + 1;
1732 if (__predict_true(cum_len >= 0)) {
/* payload region: chop at each MSS boundary */
1734 chop = (cum_len_next > mss);
1735 cum_len_next = cum_len_next % mss;
1736 next_is_first = (cum_len_next == 0);
1737 flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1738 flags_next |= next_is_first *
1740 rdma_count |= -(chop | next_is_first);
1741 rdma_count += chop & !next_is_first;
1742 } else if (cum_len_next >= 0) {
/* header just ended: first payload descriptor of the packet */
1747 small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1748 flags_next = MXGEFW_FLAGS_TSO_PLD |
1749 MXGEFW_FLAGS_FIRST |
1750 (small * MXGEFW_FLAGS_SMALL);
1753 req->addr_high = high_swapped;
1754 req->addr_low = htobe32(low);
1755 req->pseudo_hdr_offset = pseudo_hdr_offset;
1757 req->rdma_count = 1;
1758 req->length = htobe16(seglen);
1759 req->cksum_offset = cksum_offset;
1760 req->flags = flags | ((cum_len & 1) *
1761 MXGEFW_FLAGS_ALIGN_ODD);
1764 cum_len = cum_len_next;
1769 if (__predict_false(cksum_offset > seglen))
1770 cksum_offset -= seglen;
1773 if (__predict_false(cnt > tx->max_desc))
/* final retroactive rdma_count patch for the last run */
1779 (req-rdma_count)->rdma_count = rdma_count;
1783 req->flags |= MXGEFW_FLAGS_TSO_LAST;
1784 } while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1786 tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1787 mxge_submit_req(tx, tx->req_list, cnt);
1788 #ifdef IFNET_BUF_RING
1789 if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1790 /* tell the NIC to start polling this slice */
1792 tx->queue_active = 1;
/* error path: descriptor overflow -- drop the mapping and complain */
1800 bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1804 kprintf("tx->max_desc exceeded via TSO!\n");
1805 kprintf("mss = %d, %ld, %d!\n", mss,
1806 (long)seg - (long)tx->seg_list, tx->max_desc);
/*
 * Map an outgoing mbuf for DMA and build its send-descriptor chain.
 * TSO frames are pulled up and diverted to mxge_encap_tso(); other
 * frames get checksum-offload flags as requested by the stack, and
 * runts are padded to 60 bytes using the pre-zeroed zeropad DMA
 * buffer.  NOTE(review): listing is elided -- the error paths and
 * some loop scaffolding are not visible.
 */
1812 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
1815 mcp_kreq_ether_send_t *req;
1816 bus_dma_segment_t *seg;
1818 int cnt, cum_len, err, i, idx, odd_flag;
1819 uint16_t pseudo_hdr_offset;
1820 uint8_t flags, cksum_offset;
1825 if (m->m_pkthdr.csum_flags & CSUM_TSO) {
1826 if (mxge_pullup_tso(&m))
1830 /* (try to) map the frame for DMA */
1831 idx = tx->req & tx->mask;
/* reserve 2 descriptors of slack for TSO boundary splits */
1832 err = bus_dmamap_load_mbuf_defrag(tx->dmat, tx->info[idx].map, &m,
1833 tx->seg_list, tx->max_desc - 2, &cnt, BUS_DMA_NOWAIT);
1834 if (__predict_false(err != 0))
1836 bus_dmamap_sync(tx->dmat, tx->info[idx].map, BUS_DMASYNC_PREWRITE);
1837 tx->info[idx].m = m;
1839 /* TSO is different enough, we handle it in another routine */
1840 if (m->m_pkthdr.csum_flags & CSUM_TSO) {
1841 mxge_encap_tso(ss, m, cnt);
1847 pseudo_hdr_offset = 0;
1848 flags = MXGEFW_FLAGS_NO_TSO;
1850 /* checksum offloading? */
1851 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
1852 cksum_offset = m->m_pkthdr.csum_lhlen + m->m_pkthdr.csum_iphlen;
1853 pseudo_hdr_offset = cksum_offset + m->m_pkthdr.csum_data;
1854 pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
1855 req->cksum_offset = cksum_offset;
1856 flags |= MXGEFW_FLAGS_CKSUM;
1857 odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
1861 if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
1862 flags |= MXGEFW_FLAGS_SMALL;
1864 /* convert segments into a request list */
1867 req->flags = MXGEFW_FLAGS_FIRST;
1868 for (i = 0; i < cnt; i++) {
1870 htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
1872 htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1873 req->length = htobe16(seg->ds_len);
1874 req->cksum_offset = cksum_offset;
1875 if (cksum_offset > seg->ds_len)
1876 cksum_offset -= seg->ds_len;
1879 req->pseudo_hdr_offset = pseudo_hdr_offset;
1880 req->pad = 0; /* complete solid 16-byte block */
1881 req->rdma_count = 1;
1882 req->flags |= flags | ((cum_len & 1) * odd_flag);
1883 cum_len += seg->ds_len;
1889 /* pad runts to 60 bytes */
1893 htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.dmem_busaddr));
1895 htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.dmem_busaddr));
1896 req->length = htobe16(60 - cum_len);
1897 req->cksum_offset = 0;
1898 req->pseudo_hdr_offset = pseudo_hdr_offset;
1899 req->pad = 0; /* complete solid 16-byte block */
1900 req->rdma_count = 1;
1901 req->flags |= flags | ((cum_len & 1) * odd_flag);
1905 tx->req_list[0].rdma_count = cnt;
1907 /* print what the firmware will see */
1908 for (i = 0; i < cnt; i++) {
1909 kprintf("%d: addr: 0x%x 0x%x len:%d pso%d,"
1910 "cso:%d, flags:0x%x, rdma:%d\n",
1911 i, (int)ntohl(tx->req_list[i].addr_high),
1912 (int)ntohl(tx->req_list[i].addr_low),
1913 (int)ntohs(tx->req_list[i].length),
1914 (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
1915 tx->req_list[i].cksum_offset, tx->req_list[i].flags,
1916 tx->req_list[i].rdma_count);
1918 kprintf("--------------\n");
1920 tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1921 mxge_submit_req(tx, tx->req_list, cnt);
1922 #ifdef IFNET_BUF_RING
1923 if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1924 /* tell the NIC to start polling this slice */
1926 tx->queue_active = 1;
/*
 * Drain the interface send queue while transmit descriptors remain
 * (ring slack > max_desc); mark the queue OACTIVE when the ring
 * fills so the stack stops handing us packets.
 */
1939 mxge_start_locked(struct mxge_slice_state *ss)
1949 while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
1950 m = ifq_dequeue(&ifp->if_snd);
1954 /* let BPF see it */
1957 /* give it to the nic */
1961 /* ran out of transmit slots */
1962 ifq_set_oactive(&ifp->if_snd);
/*
 * ifnet if_start entry point: asserts the default ALTQ subqueue and
 * the ifnet serializer, then transmits via slice 0 only.
 */
1966 mxge_start(struct ifnet *ifp, struct ifaltq_subque *ifsq)
1968 mxge_softc_t *sc = ifp->if_softc;
1969 struct mxge_slice_state *ss;
1971 ASSERT_ALTQ_SQ_DEFAULT(ifp, ifsq);
1972 ASSERT_SERIALIZED(sc->ifp->if_serializer);
1973 /* only use the first slice for now */
1975 mxge_start_locked(ss);
1979 * copy an array of mcp_kreq_ether_recv_t's to the mcp. Copy
1980 * at most 32 bytes at a time, so as to avoid involving the software
1981 * pio handler in the nic. We re-write the first segment's low
1982 * DMA address to mark it valid only after we write the entire chunk
1986 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
1987 mcp_kreq_ether_recv_t *src)
/* stash the real low address and poison it so the NIC ignores the
 * chunk until the final write below makes it valid */
1991 low = src->addr_low;
1992 src->addr_low = 0xffffffff;
1993 mxge_pio_copy(dst, src, 4 * sizeof (*src));
1995 mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
/* restore the host copy, then make the NIC copy valid */
1997 src->addr_low = low;
1998 dst->addr_low = low;
/*
 * Allocate and DMA-map a small (MHLEN) receive mbuf for ring slot
 * idx, record its address in the shadow ring, and hand the shadow
 * entries to the NIC in batches of 8 via mxge_submit_8rx().
 */
2003 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2005 bus_dma_segment_t seg;
2007 mxge_rx_ring_t *rx = &ss->rx_small;
2010 m = m_gethdr(MB_DONTWAIT, MT_DATA);
2016 m->m_len = m->m_pkthdr.len = MHLEN;
2017 err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2018 &seg, 1, &cnt, BUS_DMA_NOWAIT);
2020 kprintf("can't dmamap small (%d)\n", err);
2024 rx->info[idx].m = m;
2025 rx->shadow[idx].addr_low =
2026 htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2027 rx->shadow[idx].addr_high =
2028 htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
/* every 8th slot, push the batch of 8 shadow entries to the NIC */
2032 mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
/*
 * Allocate and DMA-map a "big" receive buffer (cluster or jumbo
 * cluster, per rx->cl_size) for ring slot idx; fill the shadow ring
 * and submit to the NIC in batches of 8.  With MXGE_VIRT_JUMBOS a
 * single jumbo maps to several consecutive shadow entries.
 */
2037 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2039 bus_dma_segment_t seg[3];
2041 mxge_rx_ring_t *rx = &ss->rx_big;
2044 if (rx->cl_size == MCLBYTES)
2045 m = m_getcl(MB_DONTWAIT, MT_DATA, M_PKTHDR);
2048 m = m_getjcl(MB_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2051 * XXX: allocate normal sized buffers for big buffers.
2052 * We should be fine as long as we don't get any jumbo frames
2054 m = m_getcl(MB_DONTWAIT, MT_DATA, M_PKTHDR);
2062 m->m_len = m->m_pkthdr.len = rx->mlen;
2063 err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2064 seg, 1, &cnt, BUS_DMA_NOWAIT);
2066 kprintf("can't dmamap big (%d)\n", err);
2070 rx->info[idx].m = m;
2071 rx->shadow[idx].addr_low =
2072 htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2073 rx->shadow[idx].addr_high =
2074 htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2076 #if MXGE_VIRT_JUMBOS
/* spread the remaining DMA segments over the following slots */
2077 for (i = 1; i < cnt; i++) {
2078 rx->shadow[idx + i].addr_low =
2079 htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2080 rx->shadow[idx + i].addr_high =
2081 htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2086 for (i = 0; i < rx->nbufs; i++) {
2087 if ((idx & 7) == 7) {
2088 mxge_submit_8rx(&rx->lanai[idx - 7],
2089 &rx->shadow[idx - 7]);
2097 * Myri10GE hardware checksums are not valid if the sender
2098 * padded the frame with non-zero padding. This is because
2099 * the firmware just does a simple 16-bit 1s complement
2100 * checksum across the entire frame, excluding the first 14
2101 * bytes. It is best to simply to check the checksum and
2102 * tell the stack about it only if the checksum is good
2104 static inline uint16_t
2105 mxge_rx_csum(struct mbuf *m, int csum)
2107 struct ether_header *eh;
2111 eh = mtod(m, struct ether_header *);
2113 /* only deal with IPv4 TCP & UDP for now */
2114 if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2116 ip = (struct ip *)(eh + 1);
2117 if (__predict_false(ip->ip_p != IPPROTO_TCP &&
2118 ip->ip_p != IPPROTO_UDP))
/* fold the pseudo-header into the firmware's raw sum; a good
 * packet yields zero */
2121 c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2122 htonl(ntohs(csum) + ntohs(ip->ip_len) +
2123 - (ip->ip_hl << 2) + ip->ip_p));
/*
 * Strip an in-band 802.1q header from a received frame: adjust the
 * partial checksum to exclude the 4 encapsulation bytes, record the
 * tag in the mbuf header (M_VLANTAG), and slide the ethernet
 * addresses forward over the removed header.
 */
2132 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2134 struct ether_vlan_header *evl;
2137 evl = mtod(m, struct ether_vlan_header *);
2140 * fix checksum by subtracting EVL_ENCAPLEN bytes
2141 * after what the firmware thought was the end of the ethernet
2145 /* put checksum into host byte order */
2146 *csum = ntohs(*csum);
2147 partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
/* one's-complement subtraction of the VLAN bytes, with end-around
 * carry folds */
2148 (*csum) += ~partial;
2149 (*csum) += ((*csum) < ~partial);
2150 (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2151 (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2153 /* restore checksum to network byte order;
2154 later consumers expect this */
2155 *csum = htons(*csum);
2158 m->m_pkthdr.ether_vlantag = ntohs(evl->evl_tag);
2159 m->m_flags |= M_VLANTAG;
2162 * Remove the 802.1q header by copying the Ethernet
2163 * addresses over it and adjusting the beginning of
2164 * the data in the mbuf. The encapsulated Ethernet
2165 * type field is already in place.
2167 bcopy((char *)evl, (char *)evl + EVL_ENCAPLEN,
2168 ETHER_HDR_LEN - ETHER_TYPE_LEN);
2169 m_adj(m, EVL_ENCAPLEN);
/*
 * Process one received frame from the big-buffer ring: replace the
 * consumed mbuf (dropping the frame and recycling the old mbuf if
 * allocation fails), swap DMA maps, strip VLAN, validate the
 * hardware checksum, set flowid for RSS, and pass the packet to
 * the stack via ifp->if_input.
 */
2174 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2179 struct ether_header *eh;
2181 bus_dmamap_t old_map;
2187 idx = rx->cnt & rx->mask;
2188 rx->cnt += rx->nbufs;
2189 /* save a pointer to the received mbuf */
2190 m = rx->info[idx].m;
2191 /* try to replace the received mbuf */
2192 if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2193 /* drop the frame -- the old mbuf is re-cycled */
2194 IFNET_STAT_INC(ifp, ierrors, 1);
2198 /* unmap the received buffer */
2199 old_map = rx->info[idx].map;
2200 bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2201 bus_dmamap_unload(rx->dmat, old_map);
2203 /* swap the bus_dmamap_t's */
2204 rx->info[idx].map = rx->extra_map;
2205 rx->extra_map = old_map;
2207 /* mcp implicitly skips 1st 2 bytes so that packet is properly
2209 m->m_data += MXGEFW_PAD;
2211 m->m_pkthdr.rcvif = ifp;
2212 m->m_len = m->m_pkthdr.len = len;
2214 eh = mtod(m, struct ether_header *);
2215 if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2216 mxge_vlan_tag_remove(m, &csum);
2218 /* if the checksum is valid, mark it in the mbuf header */
2219 if ((ifp->if_capenable & IFCAP_RXCSUM) &&
2220 0 == mxge_rx_csum(m, csum)) {
2221 /* Tell the stack that the checksum is good */
2222 m->m_pkthdr.csum_data = 0xffff;
2223 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2227 /* flowid only valid if RSS hashing is enabled */
2228 if (sc->num_slices > 1) {
2229 m->m_pkthdr.flowid = (ss - sc->ss);
2230 m->m_flags |= M_FLOWID;
2233 ifp->if_input(ifp, m);
/*
 * Small-buffer twin of mxge_rx_done_big(): recycle-or-replace the
 * ring mbuf, swap DMA maps, strip VLAN, verify checksum, set flowid,
 * and hand the frame to the stack.  Kept structurally identical to
 * the big-ring path.
 */
2237 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2241 struct ether_header *eh;
2244 bus_dmamap_t old_map;
2250 idx = rx->cnt & rx->mask;
2252 /* save a pointer to the received mbuf */
2253 m = rx->info[idx].m;
2254 /* try to replace the received mbuf */
2255 if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2256 /* drop the frame -- the old mbuf is re-cycled */
2257 IFNET_STAT_INC(ifp, ierrors, 1);
2261 /* unmap the received buffer */
2262 old_map = rx->info[idx].map;
2263 bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2264 bus_dmamap_unload(rx->dmat, old_map);
2266 /* swap the bus_dmamap_t's */
2267 rx->info[idx].map = rx->extra_map;
2268 rx->extra_map = old_map;
2270 /* mcp implicitly skips 1st 2 bytes so that packet is properly
2272 m->m_data += MXGEFW_PAD;
2274 m->m_pkthdr.rcvif = ifp;
2275 m->m_len = m->m_pkthdr.len = len;
2277 eh = mtod(m, struct ether_header *);
2278 if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2279 mxge_vlan_tag_remove(m, &csum);
2281 /* if the checksum is valid, mark it in the mbuf header */
2282 if ((ifp->if_capenable & IFCAP_RXCSUM) &&
2283 0 == mxge_rx_csum(m, csum)) {
2284 /* Tell the stack that the checksum is good */
2285 m->m_pkthdr.csum_data = 0xffff;
2286 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2290 /* flowid only valid if RSS hashing is enabled */
2291 if (sc->num_slices > 1) {
2292 m->m_pkthdr.flowid = (ss - sc->ss);
2293 m->m_flags |= M_FLOWID;
2296 ifp->if_input(ifp, m);
/*
 * Drain the firmware's rx-done ring: dispatch each completed frame
 * to the small or big receive path based on length, clearing each
 * entry's length as the consumption handshake.  Bails out after
 * mask/2 entries to bound time in the interrupt path.
 */
2300 mxge_clean_rx_done(struct mxge_slice_state *ss)
2302 mxge_rx_done_t *rx_done = &ss->rx_done;
2307 while (rx_done->entry[rx_done->idx].length != 0) {
2308 length = ntohs(rx_done->entry[rx_done->idx].length);
2309 rx_done->entry[rx_done->idx].length = 0;
2310 checksum = rx_done->entry[rx_done->idx].checksum;
2311 if (length <= (MHLEN - MXGEFW_PAD))
2312 mxge_rx_done_small(ss, length, checksum);
2314 mxge_rx_done_big(ss, length, checksum);
2316 rx_done->idx = rx_done->cnt & rx_done->mask;
2318 /* limit potential for livelock */
2319 if (__predict_false(++limit > rx_done->mask / 2))
/*
 * Reclaim transmit descriptors up to the firmware's completion index
 * mcp_idx: free mbufs, unload DMA maps, update byte/multicast stats,
 * clear OACTIVE once a quarter of the ring is free, and restart the
 * send queue if packets are waiting.
 */
2325 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2335 ASSERT_SERIALIZED(ifp->if_serializer);
2336 while (tx->pkt_done != mcp_idx) {
2337 idx = tx->done & tx->mask;
2339 m = tx->info[idx].m;
2340 /* mbuf and DMA map only attached to the first
2343 ss->obytes += m->m_pkthdr.len;
2344 if (m->m_flags & M_MCAST)
2347 tx->info[idx].m = NULL;
2348 map = tx->info[idx].map;
2349 bus_dmamap_unload(tx->dmat, map);
/* flag marks the last descriptor of a packet; elided lines
 * presumably advance tx->pkt_done here */
2352 if (tx->info[idx].flag) {
2353 tx->info[idx].flag = 0;
2358 /* If we have space, clear OACTIVE to tell the stack that
2359 its OK to send packets */
2360 if (tx->req - tx->done < (tx->mask + 1)/4)
2361 ifq_clr_oactive(&ifp->if_snd);
2363 if (!ifq_is_empty(&ifp->if_snd))
2366 #ifdef IFNET_BUF_RING
2367 if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2368 /* let the NIC stop polling this queue, since there
2369 * are no more transmits pending */
2370 if (tx->req == tx->done) {
2372 tx->queue_active = 0;
/*
 * XFP 10GbE compliance byte decode table: each entry maps one bit of
 * the module's compliance register to an ifmedia type.  Entries with
 * flag 0 are recognized but have no corresponding ifmedia type.
 */
2380 static struct mxge_media_type mxge_xfp_media_types[] = {
2381 {IFM_10G_CX4, 0x7f, "10GBASE-CX4 (module)"},
2382 {IFM_10G_SR, (1 << 7), "10GBASE-SR"},
2383 {IFM_10G_LR, (1 << 6), "10GBASE-LR"},
2384 {0, (1 << 5), "10GBASE-ER"},
2385 {IFM_10G_LRM, (1 << 4), "10GBASE-LRM"},
2386 {0, (1 << 3), "10GBASE-SW"},
2387 {0, (1 << 2), "10GBASE-LW"},
2388 {0, (1 << 1), "10GBASE-EW"},
2389 {0, (1 << 0), "Reserved"}
/*
 * SFP+ compliance byte decode table; first entry (bitmask 0) is the
 * default when no compliance bit is set (direct-attach Twinax).
 */
2392 static struct mxge_media_type mxge_sfp_media_types[] = {
2393 {IFM_10G_TWINAX, 0, "10GBASE-Twinax"},
2394 {0, (1 << 7), "Reserved"},
2395 {IFM_10G_LRM, (1 << 6), "10GBASE-LRM"},
2396 {IFM_10G_LR, (1 << 5), "10GBASE-LR"},
2397 {IFM_10G_SR, (1 << 4), "10GBASE-SR"},
2398 {IFM_10G_TWINAX,(1 << 0), "10GBASE-Twinax"}
/*
 * Install media_type as the single supported and current ifmedia
 * entry (always full-duplex Ethernet) and cache it in sc.
 */
2402 mxge_media_set(mxge_softc_t *sc, int media_type)
2404 ifmedia_add(&sc->media, IFM_ETHER | IFM_FDX | media_type, 0, NULL);
2405 ifmedia_set(&sc->media, IFM_ETHER | IFM_FDX | media_type);
2406 sc->current_media = media_type;
2407 sc->media.ifm_media = sc->media.ifm_cur->ifm_media;
/*
 * Classify the NIC's connector (CX4 / XFP / SFP+ / Quad Ribbon
 * Fiber) from the character after the third dash of the EEPROM
 * product code, defaulting media to IFM_AUTO until probed.
 */
2411 mxge_media_init(mxge_softc_t *sc)
2416 ifmedia_removeall(&sc->media);
2417 mxge_media_set(sc, IFM_AUTO);
2420 * parse the product code to deterimine the interface type
2421 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2422 * after the 3rd dash in the driver's cached copy of the
2423 * EEPROM's product code string.
2425 ptr = sc->product_code_string;
2427 device_printf(sc->dev, "Missing product code\n");
2431 for (i = 0; i < 3; i++, ptr++) {
2432 ptr = strchr(ptr, '-');
2434 device_printf(sc->dev, "only %d dashes in PC?!?\n", i);
/* -C or -2C is CX4 copper */
2438 if (*ptr == 'C' || *(ptr +1) == 'C') {
2440 sc->connector = MXGE_CX4;
2441 mxge_media_set(sc, IFM_10G_CX4);
2442 } else if (*ptr == 'Q') {
2443 /* -Q is Quad Ribbon Fiber */
2444 sc->connector = MXGE_QRF;
2445 device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2446 /* FreeBSD has no media type for Quad ribbon fiber */
2447 } else if (*ptr == 'R') {
/* -R is XFP: actual module type probed later via I2C */
2449 sc->connector = MXGE_XFP;
2450 } else if (*ptr == 'S' || *(ptr +1) == 'S') {
2451 /* -S or -2S is SFP+ */
2452 sc->connector = MXGE_SFP;
2454 device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2459 * Determine the media type for a NIC. Some XFPs will identify
2460 * themselves only when their link is up, so this is initiated via a
2461 * link up interrupt. However, this can potentially take up to
2462 * several milliseconds, so it is run via the watchdog routine, rather
2463 * than in the interrupt handler itself.
2466 mxge_media_probe(mxge_softc_t *sc)
2469 const char *cage_type;
2470 struct mxge_media_type *mxge_media_types = NULL;
2471 int i, err, ms, mxge_media_type_entries;
2474 sc->need_media_probe = 0;
2476 if (sc->connector == MXGE_XFP) {
2478 mxge_media_types = mxge_xfp_media_types;
2479 mxge_media_type_entries = sizeof(mxge_xfp_media_types) /
2480 sizeof(mxge_xfp_media_types[0]);
2481 byte = MXGE_XFP_COMPLIANCE_BYTE;
2483 } else if (sc->connector == MXGE_SFP) {
2484 /* -S or -2S is SFP+ */
2485 mxge_media_types = mxge_sfp_media_types;
2486 mxge_media_type_entries = sizeof(mxge_sfp_media_types) /
2487 sizeof(mxge_sfp_media_types[0]);
2491 /* nothing to do; media type cannot change */
2496 * At this point we know the NIC has an XFP cage, so now we
2497 * try to determine what is in the cage by using the
2498 * firmware's XFP I2C commands to read the XFP 10GbE compilance
2499 * register. We read just one byte, which may take over
2503 cmd.data0 = 0; /* just fetch 1 byte, not all 256 */
2505 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2506 if (err == MXGEFW_CMD_ERROR_I2C_FAILURE)
2507 device_printf(sc->dev, "failed to read XFP\n");
2508 if (err == MXGEFW_CMD_ERROR_I2C_ABSENT)
2509 device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
2510 if (err != MXGEFW_CMD_OK)
2513 /* Now we wait for the data to be cached */
2515 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
/* poll up to ~50ms for the firmware to cache the I2C byte */
2516 for (ms = 0; err == EBUSY && ms < 50; ms++) {
2519 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2521 if (err != MXGEFW_CMD_OK) {
2522 device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
2523 cage_type, err, ms);
/* entry 0 is matched by full equality (special bitmask), the rest
 * by individual bit tests */
2527 if (cmd.data0 == mxge_media_types[0].bitmask) {
2529 device_printf(sc->dev, "%s:%s\n", cage_type,
2530 mxge_media_types[0].name);
2532 if (sc->current_media != mxge_media_types[0].flag) {
2533 mxge_media_init(sc);
2534 mxge_media_set(sc, mxge_media_types[0].flag);
2538 for (i = 1; i < mxge_media_type_entries; i++) {
2539 if (cmd.data0 & mxge_media_types[i].bitmask) {
2541 device_printf(sc->dev, "%s:%s\n", cage_type,
2542 mxge_media_types[i].name);
2545 if (sc->current_media != mxge_media_types[i].flag) {
2546 mxge_media_init(sc);
2547 mxge_media_set(sc, mxge_media_types[i].flag);
2553 device_printf(sc->dev, "%s media 0x%x unknown\n", cage_type,
/*
 * Per-slice interrupt handler.  Non-zero slices (MSI-X) only clean
 * their rx ring and re-arm.  Slice 0 additionally: deasserts a
 * legacy IRQ, loops on the DMA'd stats block while it stays valid,
 * reaps tx completions and rx frames, and propagates firmware
 * link-state and RDMA-tag changes to the stack.
 */
2559 mxge_intr(void *arg)
2561 struct mxge_slice_state *ss = arg;
2562 mxge_softc_t *sc = ss->sc;
2563 mcp_irq_data_t *stats = ss->fw_stats;
2564 mxge_tx_ring_t *tx = &ss->tx;
2565 mxge_rx_done_t *rx_done = &ss->rx_done;
2566 uint32_t send_done_count;
2570 #ifndef IFNET_BUF_RING
2571 /* an interrupt on a non-zero slice is implicitly valid
2572 since MSI-X irqs are not shared */
2574 mxge_clean_rx_done(ss);
2575 *ss->irq_claim = be32toh(3);
2580 /* make sure the DMA has finished */
2581 if (!stats->valid) {
2584 valid = stats->valid;
2586 if (sc->irq_type == PCI_INTR_TYPE_LEGACY) {
2587 /* lower legacy IRQ */
2588 *sc->irq_deassert = 0;
2589 if (!mxge_deassert_wait)
2590 /* don't wait for conf. that irq is low */
2596 /* loop while waiting for legacy irq deassertion */
2598 /* check for transmit completes and receives */
2599 send_done_count = be32toh(stats->send_done_count);
2600 while ((send_done_count != tx->pkt_done) ||
2601 (rx_done->entry[rx_done->idx].length != 0)) {
2602 if (send_done_count != tx->pkt_done)
2603 mxge_tx_done(ss, (int)send_done_count);
2604 mxge_clean_rx_done(ss);
2605 send_done_count = be32toh(stats->send_done_count);
2607 if (sc->irq_type == PCI_INTR_TYPE_LEGACY && mxge_deassert_wait)
2609 } while (*((volatile uint8_t *) &stats->valid));
2611 /* fw link & error stats meaningful only on the first slice */
2612 if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
2613 if (sc->link_state != stats->link_up) {
2614 sc->link_state = stats->link_up;
2615 if (sc->link_state) {
2616 sc->ifp->if_link_state = LINK_STATE_UP;
2617 if_link_state_change(sc->ifp);
2619 device_printf(sc->dev, "link up\n");
2621 sc->ifp->if_link_state = LINK_STATE_DOWN;
2622 if_link_state_change(sc->ifp);
2624 device_printf(sc->dev, "link down\n");
/* some XFPs only identify after link-up; schedule a probe */
2626 sc->need_media_probe = 1;
2628 if (sc->rdma_tags_available !=
2629 be32toh(stats->rdma_tags_available)) {
2630 sc->rdma_tags_available =
2631 be32toh(stats->rdma_tags_available);
2632 device_printf(sc->dev, "RDMA timed out! %d tags "
2633 "left\n", sc->rdma_tags_available);
2636 if (stats->link_down) {
2637 sc->down_cnt += stats->link_down;
2639 sc->ifp->if_link_state = LINK_STATE_DOWN;
2640 if_link_state_change(sc->ifp);
2644 /* check to see if we have rx token to pass back */
2646 *ss->irq_claim = be32toh(3);
2647 *(ss->irq_claim + 1) = be32toh(3);
/*
 * Bring the interface up (called with the serializer held) unless it
 * is already marked IFF_RUNNING.
 */
2651 mxge_init(void *arg)
2653 struct mxge_softc *sc = arg;
2655 ASSERT_SERIALIZED(sc->ifp->if_serializer);
2656 if ((sc->ifp->if_flags & IFF_RUNNING) == 0)
/*
 * Unload and free every mbuf still attached to this slice's big-rx,
 * small-rx, and (slice 0 only) tx rings, NULLing each slot so the
 * routine is safe to call repeatedly.
 */
2661 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
2665 for (i = 0; i <= ss->rx_big.mask; i++) {
2666 if (ss->rx_big.info[i].m == NULL)
2668 bus_dmamap_unload(ss->rx_big.dmat,
2669 ss->rx_big.info[i].map);
2670 m_freem(ss->rx_big.info[i].m);
2671 ss->rx_big.info[i].m = NULL;
2674 for (i = 0; i <= ss->rx_small.mask; i++) {
2675 if (ss->rx_small.info[i].m == NULL)
2677 bus_dmamap_unload(ss->rx_small.dmat,
2678 ss->rx_small.info[i].map);
2679 m_freem(ss->rx_small.info[i].m);
2680 ss->rx_small.info[i].m = NULL;
2683 /* transmit ring used only on the first slice */
2684 if (ss->tx.info == NULL)
2687 for (i = 0; i <= ss->tx.mask; i++) {
2688 ss->tx.info[i].flag = 0;
2689 if (ss->tx.info[i].m == NULL)
2691 bus_dmamap_unload(ss->tx.dmat,
2692 ss->tx.info[i].map);
2693 m_freem(ss->tx.info[i].m);
2694 ss->tx.info[i].m = NULL;
/*
 * mxge_free_mbufs -- free the ring-held mbufs of every slice by
 * delegating to mxge_free_slice_mbufs() per slice.
 */
2699 mxge_free_mbufs(mxge_softc_t *sc)
2703 	for (slice = 0; slice < sc->num_slices; slice++)
2704 		mxge_free_slice_mbufs(&sc->ss[slice]);
/*
 * mxge_free_slice_rings -- tear down all per-slice ring resources:
 * the rx_done DMA memory, the TX request/segment scratch areas, the
 * RX shadow rings, and the per-slot busdma maps plus their tags.
 * Each pointer is NULLed after freeing so the routine is safe to call
 * on a partially-constructed slice.
 * NOTE(review): extraction dropped lines (closing braces etc.); original
 * numbering is non-contiguous.
 */
2708 mxge_free_slice_rings(struct mxge_slice_state *ss)
2712 	if (ss->rx_done.entry != NULL) {
2713 		mxge_dma_free(&ss->rx_done.dma);
2714 		ss->rx_done.entry = NULL;
2717 	if (ss->tx.req_bytes != NULL) {
2718 		kfree(ss->tx.req_bytes, M_DEVBUF);
2719 		ss->tx.req_bytes = NULL;
2722 	if (ss->tx.seg_list != NULL) {
2723 		kfree(ss->tx.seg_list, M_DEVBUF);
2724 		ss->tx.seg_list = NULL;
2727 	if (ss->rx_small.shadow != NULL) {
2728 		kfree(ss->rx_small.shadow, M_DEVBUF);
2729 		ss->rx_small.shadow = NULL;
2732 	if (ss->rx_big.shadow != NULL) {
2733 		kfree(ss->rx_big.shadow, M_DEVBUF);
2734 		ss->rx_big.shadow = NULL;
2737 	if (ss->tx.info != NULL) {
2738 		if (ss->tx.dmat != NULL) {
2739 			for (i = 0; i <= ss->tx.mask; i++) {
2740 				bus_dmamap_destroy(ss->tx.dmat,
2741 				    ss->tx.info[i].map);
2743 			bus_dma_tag_destroy(ss->tx.dmat);
2745 		kfree(ss->tx.info, M_DEVBUF);
2749 	if (ss->rx_small.info != NULL) {
2750 		if (ss->rx_small.dmat != NULL) {
2751 			for (i = 0; i <= ss->rx_small.mask; i++) {
2752 				bus_dmamap_destroy(ss->rx_small.dmat,
2753 				    ss->rx_small.info[i].map);
2755 			bus_dmamap_destroy(ss->rx_small.dmat,
2756 			    ss->rx_small.extra_map);
2757 			bus_dma_tag_destroy(ss->rx_small.dmat);
2759 		kfree(ss->rx_small.info, M_DEVBUF);
2760 		ss->rx_small.info = NULL;
2763 	if (ss->rx_big.info != NULL) {
2764 		if (ss->rx_big.dmat != NULL) {
2765 			for (i = 0; i <= ss->rx_big.mask; i++) {
2766 				bus_dmamap_destroy(ss->rx_big.dmat,
2767 				    ss->rx_big.info[i].map);
2769 			bus_dmamap_destroy(ss->rx_big.dmat,
2770 			    ss->rx_big.extra_map);
2771 			bus_dma_tag_destroy(ss->rx_big.dmat);
2773 		kfree(ss->rx_big.info, M_DEVBUF);
2774 		ss->rx_big.info = NULL;
/*
 * mxge_free_rings -- free the ring resources of every slice by calling
 * mxge_free_slice_rings() per slice.
 */
2779 mxge_free_rings(mxge_softc_t *sc)
2786 	for (slice = 0; slice < sc->num_slices; slice++)
2787 		mxge_free_slice_rings(&sc->ss[slice]);
/*
 * mxge_alloc_slice_rings -- allocate all per-slice ring state:
 * RX shadow/info arrays, busdma tags and per-slot maps for the small
 * (MHLEN) and big (page-sized segments) receive rings, and -- on the
 * first slice only when IFNET_BUF_RING is not defined -- the TX request
 * copy block, segment list, info ring, tag and maps.  Ring masks are
 * ring_entries - 1 (entries are powers of two).  Error paths unwind the
 * maps created so far before destroying the tag.
 * NOTE(review): extraction dropped lines (alignment arguments, error
 * gotos, closing braces); original numbering is non-contiguous.
 */
2791 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
2792     int tx_ring_entries)
2794 	mxge_softc_t *sc = ss->sc;
2799 	 * Allocate per-slice receive resources
2802 	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
2803 	ss->rx_done.mask = (2 * rx_ring_entries) - 1;
2805 	/* Allocate the rx shadow rings */
2806 	bytes = rx_ring_entries * sizeof(*ss->rx_small.shadow);
2807 	ss->rx_small.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2809 	bytes = rx_ring_entries * sizeof(*ss->rx_big.shadow);
2810 	ss->rx_big.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2812 	/* Allocate the rx host info rings */
2813 	bytes = rx_ring_entries * sizeof(*ss->rx_small.info);
2814 	ss->rx_small.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2816 	bytes = rx_ring_entries * sizeof(*ss->rx_big.info);
2817 	ss->rx_big.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2819 	/* Allocate the rx busdma resources */
2820 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
2822 				 4096,			/* boundary */
2823 				 BUS_SPACE_MAXADDR,	/* low */
2824 				 BUS_SPACE_MAXADDR,	/* high */
2825 				 NULL, NULL,		/* filter */
2826 				 MHLEN,			/* maxsize */
2828 				 MHLEN,			/* maxsegsize */
2829 				 BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW,
2831 				 &ss->rx_small.dmat);	/* tag */
2833 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
2838 	err = bus_dmamap_create(ss->rx_small.dmat, BUS_DMA_WAITOK,
2839 	    &ss->rx_small.extra_map);
2841 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n", err);
2842 		bus_dma_tag_destroy(ss->rx_small.dmat);
2843 		ss->rx_small.dmat = NULL;
2846 	for (i = 0; i <= ss->rx_small.mask; i++) {
2847 		err = bus_dmamap_create(ss->rx_small.dmat, BUS_DMA_WAITOK,
2848 		    &ss->rx_small.info[i].map);
2852 			device_printf(sc->dev, "Err %d rx_small dmamap\n", err);
2854 			for (j = 0; j < i; ++j) {
2855 				bus_dmamap_destroy(ss->rx_small.dmat,
2856 				    ss->rx_small.info[j].map);
2858 			bus_dmamap_destroy(ss->rx_small.dmat,
2859 			    ss->rx_small.extra_map);
2860 			bus_dma_tag_destroy(ss->rx_small.dmat);
2861 			ss->rx_small.dmat = NULL;
2866 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
2868 				 4096,			/* boundary */
2869 				 BUS_SPACE_MAXADDR,	/* low */
2870 				 BUS_SPACE_MAXADDR,	/* high */
2871 				 NULL, NULL,		/* filter */
2872 				 3*4096,		/* maxsize */
2874 				 4096,			/* maxsegsize*/
2875 				 BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW,
2877 				 &ss->rx_big.dmat);	/* tag */
2879 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
2884 	err = bus_dmamap_create(ss->rx_big.dmat, BUS_DMA_WAITOK,
2885 	    &ss->rx_big.extra_map);
2887 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n", err);
2888 		bus_dma_tag_destroy(ss->rx_big.dmat);
2889 		ss->rx_big.dmat = NULL;
2892 	for (i = 0; i <= ss->rx_big.mask; i++) {
2893 		err = bus_dmamap_create(ss->rx_big.dmat, BUS_DMA_WAITOK,
2894 		    &ss->rx_big.info[i].map);
2898 			device_printf(sc->dev, "Err %d rx_big dmamap\n", err);
2899 			for (j = 0; j < i; ++j) {
2900 				bus_dmamap_destroy(ss->rx_big.dmat,
2901 				    ss->rx_big.info[j].map);
2903 			bus_dmamap_destroy(ss->rx_big.dmat,
2904 			    ss->rx_big.extra_map);
2905 			bus_dma_tag_destroy(ss->rx_big.dmat);
2906 			ss->rx_big.dmat = NULL;
2912 	 * Now allocate TX resources
2915 #ifndef IFNET_BUF_RING
2916 	/* only use a single TX ring for now */
2917 	if (ss != ss->sc->ss)
2921 	ss->tx.mask = tx_ring_entries - 1;
2922 	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
2924 	/* Allocate the tx request copy block XXX */
2925 	bytes = 8 + sizeof(*ss->tx.req_list) * (ss->tx.max_desc + 4);
2926 	ss->tx.req_bytes = kmalloc(bytes, M_DEVBUF, M_WAITOK);
2927 	/* Ensure req_list entries are aligned to 8 bytes */
2928 	ss->tx.req_list = (mcp_kreq_ether_send_t *)
2929 	    ((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
2931 	/* Allocate the tx busdma segment list */
2932 	bytes = sizeof(*ss->tx.seg_list) * ss->tx.max_desc;
2933 	ss->tx.seg_list = kmalloc(bytes, M_DEVBUF, M_WAITOK);
2935 	/* Allocate the tx host info ring */
2936 	bytes = tx_ring_entries * sizeof(*ss->tx.info);
2937 	ss->tx.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2939 	/* Allocate the tx busdma resources */
2940 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
2942 				 sc->tx_boundary,	/* boundary */
2943 				 BUS_SPACE_MAXADDR,	/* low */
2944 				 BUS_SPACE_MAXADDR,	/* high */
2945 				 NULL, NULL,		/* filter */
2947 				 sizeof(struct ether_vlan_header),
2949 				 ss->tx.max_desc - 2,	/* num segs */
2950 				 sc->tx_boundary,	/* maxsegsz */
2951 				 BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW |
2952 				 BUS_DMA_ONEBPAGE,	/* flags */
2953 				 &ss->tx.dmat);		/* tag */
2955 		device_printf(sc->dev, "Err %d allocating tx dmat\n", err);
2960 	 * Now use these tags to setup DMA maps for each slot in the ring
2962 	for (i = 0; i <= ss->tx.mask; i++) {
2963 		err = bus_dmamap_create(ss->tx.dmat,
2964 		    BUS_DMA_WAITOK | BUS_DMA_ONEBPAGE, &ss->tx.info[i].map);
2968 			device_printf(sc->dev, "Err %d tx dmamap\n", err);
2969 			for (j = 0; j < i; ++j) {
2970 				bus_dmamap_destroy(ss->tx.dmat,
2971 				    ss->tx.info[j].map);
2973 			bus_dma_tag_destroy(ss->tx.dmat);
/*
 * mxge_alloc_rings -- query the firmware for send/receive ring sizes,
 * size the ifnet send queue accordingly, then allocate every slice's
 * rings via mxge_alloc_slice_rings().
 * NOTE(review): extraction dropped lines (error returns / cleanup);
 * original numbering is non-contiguous.
 */
2982 mxge_alloc_rings(mxge_softc_t *sc)
2986 	int tx_ring_entries, rx_ring_entries;
2989 	/* Get ring sizes */
2990 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
2992 		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
2995 	tx_ring_size = cmd.data0;
2997 	tx_ring_entries = tx_ring_size / sizeof(mcp_kreq_ether_send_t);
2998 	rx_ring_entries = sc->rx_ring_size / sizeof(mcp_dma_addr_t);
2999 	ifq_set_maxlen(&sc->ifp->if_snd, tx_ring_entries - 1);
3000 	ifq_set_ready(&sc->ifp->if_snd);
3002 	for (slice = 0; slice < sc->num_slices; slice++) {
3003 		err = mxge_alloc_slice_rings(&sc->ss[slice],
3004 		    rx_ring_entries, tx_ring_entries);
3006 			device_printf(sc->dev,
3007 			    "alloc %d slice rings failed\n", slice);
/*
 * mxge_choose_params -- given the MTU, pick the cluster size, the
 * firmware "big buffer" size and the number of buffers per frame.
 * Frames that fit MCLBYTES or MJUMPAGESIZE use one buffer; larger
 * frames use MJUM9BYTES clusters, optionally carved into 4KB virtual
 * chunks when MXGE_VIRT_JUMBOS is set.
 * NOTE(review): extraction dropped lines (nbufs assignments, returns,
 * #else/#endif); original numbering is non-contiguous.
 */
3015 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3017 	int bufsize = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN + MXGEFW_PAD;
3019 	if (bufsize < MCLBYTES) {
3020 		/* easy, everything fits in a single buffer */
3021 		*big_buf_size = MCLBYTES;
3022 		*cl_size = MCLBYTES;
3027 	if (bufsize < MJUMPAGESIZE) {
3028 		/* still easy, everything still fits in a single buffer */
3029 		*big_buf_size = MJUMPAGESIZE;
3030 		*cl_size = MJUMPAGESIZE;
3034 #if MXGE_VIRT_JUMBOS
3035 	/* now we need to use virtually contiguous buffers */
3036 	*cl_size = MJUM9BYTES;
3037 	*big_buf_size = 4096;
3038 	*nbufs = mtu / 4096 + 1;
3039 	/* needs to be a power of two, so round up */
3043 	*cl_size = MJUM9BYTES;
3044 	*big_buf_size = MJUM9BYTES;
/*
 * mxge_slice_open -- per-slice bring-up: fetch the lanai (NIC SRAM)
 * pointers for the send ring, send_go/send_stop doorbells and both
 * receive rings from the firmware, then stock the small RX ring
 * completely and the big RX ring every `nbufs` slots (shadow entries
 * are poisoned to 0xffffffff first).  rx_big.mlen is the max frame
 * length including Ethernet/VLAN headers and the firmware pad.
 * NOTE(review): extraction dropped lines (error returns, #else/#endif
 * branches); original numbering is non-contiguous.
 */
3050 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3058 	slice = ss - sc->ss;
3060 	/* get the lanai pointers to the send and receive rings */
3063 #ifndef IFNET_BUF_RING
3064 	/* We currently only send from the first slice */
3068 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3070 	    (volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3071 	ss->tx.send_go = (volatile uint32_t *)
3072 	    (sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3073 	ss->tx.send_stop = (volatile uint32_t *)
3074 	    (sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3075 #ifndef IFNET_BUF_RING
3079 	err |= mxge_send_cmd(sc,
3080 	    MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3081 	ss->rx_small.lanai =
3082 	    (volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3084 	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3086 	    (volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3089 		device_printf(sc->dev,
3090 		    "failed to get ring sizes or locations\n");
3094 	/* stock receive rings */
3095 	for (i = 0; i <= ss->rx_small.mask; i++) {
3096 		map = ss->rx_small.info[i].map;
3097 		err = mxge_get_buf_small(ss, map, i);
3099 			device_printf(sc->dev, "alloced %d/%d smalls\n",
3100 			    i, ss->rx_small.mask + 1);
3104 	for (i = 0; i <= ss->rx_big.mask; i++) {
3105 		ss->rx_big.shadow[i].addr_low = 0xffffffff;
3106 		ss->rx_big.shadow[i].addr_high = 0xffffffff;
3108 	ss->rx_big.nbufs = nbufs;
3109 	ss->rx_big.cl_size = cl_size;
3110 	ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3111 	    EVL_ENCAPLEN + MXGEFW_PAD;
3112 	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3113 		map = ss->rx_big.info[i].map;
3114 		err = mxge_get_buf_big(ss, map, i);
3116 			device_printf(sc->dev, "alloced %d/%d bigs\n",
3117 			    i, ss->rx_big.mask + 1);
/*
 * mxge_open -- full interface bring-up, serializer held.  Sequence:
 * re-copy the MAC address, reset the NIC, program the RSS indirection
 * table when multiple slices are in use, set the TSO mode (disabling
 * TSO capabilities if the firmware rejects MXGEFW_TSO_MODE_NDIS),
 * choose and program MTU/buffer sizes, hand the firmware the per-slice
 * stats DMA blocks (falling back to the obsolete single stats DMA and
 * disabling multicast support if STATS_DMA_V2 fails), open each slice,
 * and finally issue MXGEFW_CMD_ETHERNET_UP and mark IFF_RUNNING.
 * On failure after ring stocking, mxge_free_mbufs() cleans up.
 * NOTE(review): extraction dropped lines (goto labels, loop headers,
 * #else/#endif); original numbering is non-contiguous.
 */
3125 mxge_open(mxge_softc_t *sc)
3127 	struct ifnet *ifp = sc->ifp;
3129 	int err, big_bytes, nbufs, slice, cl_size, i;
3131 	volatile uint8_t *itable;
3132 	struct mxge_slice_state *ss;
3134 	ASSERT_SERIALIZED(ifp->if_serializer);
3136 	/* Copy the MAC address in case it was overridden */
3137 	bcopy(IF_LLADDR(ifp), sc->mac_addr, ETHER_ADDR_LEN);
3139 	err = mxge_reset(sc, 1);
3141 		if_printf(ifp, "failed to reset\n");
3145 	if (sc->num_slices > 1) {
3146 		/* Setup the indirection table */
3147 		cmd.data0 = sc->num_slices;
3148 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE, &cmd);
3150 		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET, &cmd);
3152 			if_printf(ifp, "failed to setup rss tables\n");
3156 		/* Just enable an identity mapping */
3157 		itable = sc->sram + cmd.data0;
3158 		for (i = 0; i < sc->num_slices; i++)
3159 			itable[i] = (uint8_t)i;
3162 		cmd.data1 = MXGEFW_RSS_HASH_TYPE_TCP_IPV4;
3163 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3165 			if_printf(ifp, "failed to enable slices\n");
3170 	cmd.data0 = MXGEFW_TSO_MODE_NDIS;
3171 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_TSO_MODE, &cmd);
3174 		 * Can't change TSO mode to NDIS, never allow TSO then
3176 		if_printf(ifp, "failed to set TSO mode\n");
3177 		ifp->if_capenable &= ~IFCAP_TSO;
3178 		ifp->if_capabilities &= ~IFCAP_TSO;
3179 		ifp->if_hwassist &= ~CSUM_TSO;
3182 	mxge_choose_params(ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3185 	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS, &cmd);
3187 	 * Error is only meaningful if we're trying to set
3188 	 * MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1
3190 	if (err && nbufs > 1) {
3191 		if_printf(ifp, "Failed to set alway-use-n to %d\n", nbufs);
3196 	 * Give the firmware the mtu and the big and small buffer
3197 	 * sizes.  The firmware wants the big buf size to be a power
3198 	 * of two. Luckily, FreeBSD's clusters are powers of two
3200 	cmd.data0 = ifp->if_mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
3201 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3203 	cmd.data0 = MHLEN - MXGEFW_PAD;
3204 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE, &cmd);
3206 	cmd.data0 = big_bytes;
3207 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3210 		if_printf(ifp, "failed to setup params\n");
3214 	/* Now give him the pointer to the stats block */
3216 #ifdef IFNET_BUF_RING
3217 	    slice < sc->num_slices;
3222 		ss = &sc->ss[slice];
3223 		cmd.data0 = MXGE_LOWPART_TO_U32(ss->fw_stats_dma.dmem_busaddr);
3224 		cmd.data1 = MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.dmem_busaddr);
3225 		cmd.data2 = sizeof(struct mcp_irq_data);
3226 		cmd.data2 |= (slice << 16);
3227 		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3231 		bus = sc->ss->fw_stats_dma.dmem_busaddr;
3232 		bus += offsetof(struct mcp_irq_data, send_done_count);
3233 		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3234 		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3235 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3238 		/* Firmware cannot support multicast without STATS_DMA_V2 */
3239 		sc->fw_multicast_support = 0;
3241 		sc->fw_multicast_support = 1;
3245 		if_printf(ifp, "failed to setup params\n");
3249 	for (slice = 0; slice < sc->num_slices; slice++) {
3250 		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3252 			if_printf(ifp, "couldn't open slice %d\n", slice);
3257 	/* Finally, start the firmware running */
3258 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3260 		if_printf(ifp, "Couldn't bring up link\n");
3263 	ifp->if_flags |= IFF_RUNNING;
3264 	ifq_clr_oactive(&ifp->if_snd);
3269 	mxge_free_mbufs(sc);
/*
 * mxge_close -- bring the interface down (serializer held).  Clears
 * IFF_RUNNING, issues MXGEFW_CMD_ETHERNET_DOWN, then waits (serializer
 * dropped around the DELAY) for the firmware's "down" interrupt, which
 * is observed as an increment of sc->down_cnt.  Warns if the interrupt
 * never arrives, then frees all ring-held mbufs.
 * NOTE(review): extraction dropped lines (the `down` parameter's guard
 * and closing braces); original numbering is non-contiguous.
 */
3274 mxge_close(mxge_softc_t *sc, int down)
3277 	int err, old_down_cnt;
3279 	ASSERT_SERIALIZED(sc->ifp->if_serializer);
3281 	sc->ifp->if_flags &= ~IFF_RUNNING;
3283 	old_down_cnt = sc->down_cnt;
3285 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3287 		device_printf(sc->dev,
3288 		    "Couldn't bring down link\n");
3290 	if (old_down_cnt == sc->down_cnt) {
3291 		/* wait for down irq */
3292 		lwkt_serialize_exit(sc->ifp->if_serializer);
3293 		DELAY(10 * sc->intr_coal_delay);
3294 		lwkt_serialize_enter(sc->ifp->if_serializer);
3297 	if (old_down_cnt == sc->down_cnt) {
3298 		device_printf(sc->dev, "never got down irq\n");
3301 	mxge_free_mbufs(sc);
/*
 * mxge_setup_cfg_space -- (re)program PCI config space for the NIC:
 * record the negotiated PCIe link width, set the max read request size
 * to 4KB (encoding 5 in PCIER_DEVCTRL bits 14:12) on first call, or
 * restore the pectl value saved before a watchdog reset, and enable
 * bus mastering / memory space access.
 *
 * Fix: the pci_find_extcap() call read "®" -- an HTML-entity-decode
 * corruption of "&reg" -- which is not valid C; restored to "&reg".
 * NOTE(review): extraction dropped lines (the `reg` declaration and
 * closing braces); original numbering is non-contiguous.
 */
3307 mxge_setup_cfg_space(mxge_softc_t *sc)
3309 	device_t dev = sc->dev;
3311 	uint16_t lnk, pectl;
3313 	/* Find the PCIe link width and set max read request to 4KB */
3314 	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
3315 		lnk = pci_read_config(dev, reg + 0x12, 2);
3316 		sc->link_width = (lnk >> 4) & 0x3f;
3318 		if (sc->pectl == 0) {
3319 			pectl = pci_read_config(dev, reg + 0x8, 2);
3320 			pectl = (pectl & ~0x7000) | (5 << 12);
3321 			pci_write_config(dev, reg + 0x8, pectl, 2);
3324 			/* Restore saved pectl after watchdog reset */
3325 			pci_write_config(dev, reg + 0x8, sc->pectl, 2);
3329 	/* Enable DMA and memory space access */
3330 	pci_enable_busmaster(dev);
/*
 * mxge_read_reboot -- read the NIC's reboot status register via the
 * vendor-specific PCI capability: enable read32 mode, point the window
 * at register 0xfffffff0, and read the result back.  Returns
 * (uint32_t)-1 if the vendor capability cannot be found.
 */
3334 mxge_read_reboot(mxge_softc_t *sc)
3336 	device_t dev = sc->dev;
3339 	/* find the vendor specific offset */
3340 	if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
3341 		device_printf(sc->dev,
3342 		    "could not find vendor specific offset\n");
3343 		return (uint32_t)-1;
3345 	/* enable read32 mode */
3346 	pci_write_config(dev, vs + 0x10, 0x3, 1);
3347 	/* tell NIC which register to read */
3348 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3349 	return (pci_read_config(dev, vs + 0x14, 4));
/*
 * mxge_watchdog_reset -- recover a wedged NIC.  Detects a firmware
 * reboot by reading PCIR_COMMAND: 0xffff means the device is (perhaps
 * temporarily) gone; a cleared busmaster bit means config space was
 * wiped by a reboot.  In the latter case it reports the reboot status,
 * marks the link down, restores PCI config space (pci_cfg_restore +
 * mxge_setup_cfg_space), reloads firmware, and re-opens the interface
 * if it had been running.  Reschedules the tick callout on exit.
 * NOTE(review): extraction dropped lines (retry loop, close/quiesce
 * calls, else branches); original numbering is non-contiguous.
 */
3353 mxge_watchdog_reset(mxge_softc_t *sc)
3355 	struct pci_devinfo *dinfo;
3362 	device_printf(sc->dev, "Watchdog reset!\n");
3365 	 * check to see if the NIC rebooted.  If it did, then all of
3366 	 * PCI config space has been reset, and things like the
3367 	 * busmaster bit will be zero.  If this is the case, then we
3368 	 * must restore PCI config space before the NIC can be used
3371 	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3372 	if (cmd == 0xffff) {
3374 		 * maybe the watchdog caught the NIC rebooting; wait
3375 		 * up to 100ms for it to finish.  If it does not come
3376 		 * back, then give up
3379 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3380 		if (cmd == 0xffff) {
3381 			device_printf(sc->dev, "NIC disappeared!\n");
3384 	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3385 		/* print the reboot status */
3386 		reboot = mxge_read_reboot(sc);
3387 		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3389 		running = sc->ifp->if_flags & IFF_RUNNING;
3393 			 * quiesce NIC so that TX routines will not try to
3394 			 * xmit after restoration of BAR
3397 			/* Mark the link as down */
3398 			if (sc->link_state) {
3399 				sc->ifp->if_link_state = LINK_STATE_DOWN;
3400 				if_link_state_change(sc->ifp);
3404 		/* restore PCI configuration space */
3405 		dinfo = device_get_ivars(sc->dev);
3406 		pci_cfg_restore(sc->dev, dinfo);
3408 		/* and redo any changes we made to our config space */
3409 		mxge_setup_cfg_space(sc);
3412 			err = mxge_load_firmware(sc, 0);
3414 				device_printf(sc->dev,
3415 				    "Unable to re-load f/w\n");
3419 			err = mxge_open(sc);
3420 			if_devstart_sched(sc->ifp);
3423 		sc->watchdog_resets++;
3425 		device_printf(sc->dev,
3426 		    "NIC did not reboot, not resetting\n");
3430 		device_printf(sc->dev, "watchdog reset failed\n");
3434 	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
/*
 * mxge_warn_stuck -- dump a stuck TX ring's state (req/done indices,
 * queue_active, activate/deactivate counters, pkt_done vs. the
 * firmware's send_done_count) for the given slice.
 * NOTE: "struck?" in the message matches the upstream driver verbatim.
 */
3439 mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
3441 	tx = &sc->ss[slice].tx;
3442 	device_printf(sc->dev, "slice %d struck? ring state:\n", slice);
3443 	device_printf(sc->dev,
3444 	    "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3445 	    tx->req, tx->done, tx->queue_active);
3446 	device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3447 	    tx->activate, tx->deactivate);
3448 	device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3450 	    be32toh(sc->ss->fw_stats->send_done_count));
/*
 * mxge_watchdog -- periodic TX-hang check.  For each TX ring with
 * outstanding work that has made no progress since the last tick
 * (done == watchdog_done), either reset the NIC (if not blocked by
 * flow control) or warn that a link partner is pausing us.  The
 * dropped_pause firmware counter distinguishes the two cases.  Also
 * triggers a media probe when one was requested.
 * NOTE(review): extraction dropped lines (loop header, tx assignment,
 * #else branch); original numbering is non-contiguous.
 */
3454 mxge_watchdog(mxge_softc_t *sc)
3457 	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3460 	/* see if we have outstanding transmits, which
3461 	   have been pending for more than mxge_ticks */
3463 #ifdef IFNET_BUF_RING
3464 	    (i < sc->num_slices) && (err == 0);
3466 	    (i < 1) && (err == 0);
3470 		if (tx->req != tx->done &&
3471 		    tx->watchdog_req != tx->watchdog_done &&
3472 		    tx->done == tx->watchdog_done) {
3473 			/* check for pause blocking before resetting */
3474 			if (tx->watchdog_rx_pause == rx_pause) {
3475 				mxge_warn_stuck(sc, tx, i);
3476 				mxge_watchdog_reset(sc);
3480 				device_printf(sc->dev, "Flow control blocking "
3481 				    "xmits, check link partner\n");
3484 		tx->watchdog_req = tx->req;
3485 		tx->watchdog_done = tx->done;
3486 		tx->watchdog_rx_pause = rx_pause;
3489 	if (sc->need_media_probe)
3490 		mxge_media_probe(sc);
/*
 * mxge_update_stats -- aggregate per-slice packet/error counters into
 * the ifnet statistics, and return the number of packets (rx + tx)
 * since the previous call -- presumably used by the caller to decide
 * NIC idleness (TODO confirm; the return statement is not visible in
 * this extraction).  With IFNET_BUF_RING, byte/mcast/drop counters are
 * aggregated as well.
 */
3495 mxge_update_stats(mxge_softc_t *sc)
3497 	struct mxge_slice_state *ss;
3499 	u_long ipackets = 0, old_ipackets;
3500 	u_long opackets = 0, old_opackets;
3501 #ifdef IFNET_BUF_RING
3509 	for (slice = 0; slice < sc->num_slices; slice++) {
3510 		ss = &sc->ss[slice];
3511 		ipackets += ss->ipackets;
3512 		opackets += ss->opackets;
3513 #ifdef IFNET_BUF_RING
3514 		obytes += ss->obytes;
3515 		omcasts += ss->omcasts;
3516 		odrops += ss->tx.br->br_drops;
3518 		oerrors += ss->oerrors;
3520 	IFNET_STAT_GET(sc->ifp, ipackets, old_ipackets);
3521 	IFNET_STAT_GET(sc->ifp, opackets, old_opackets);
3523 	pkts = ipackets - old_ipackets;
3524 	pkts += opackets - old_opackets;
3526 	IFNET_STAT_SET(sc->ifp, ipackets, ipackets);
3527 	IFNET_STAT_SET(sc->ifp, opackets, opackets);
3528 #ifdef IFNET_BUF_RING
3529 	sc->ifp->if_obytes = obytes;
3530 	sc->ifp->if_omcasts = omcasts;
3531 	sc->ifp->if_snd.ifq_drops = odrops;
3533 	IFNET_STAT_SET(sc->ifp, oerrors, oerrors);
/*
 * mxge_tick -- periodic callout (every mxge_ticks).  Under the ifnet
 * serializer: aggregate stats, run the watchdog every 5th tick while
 * the interface is running, verify the busmaster bit is still set
 * (a cleared bit while idle means a hardware fault -> watchdog reset),
 * and reschedule itself -- less frequently when the NIC is idle.
 * NOTE(review): extraction dropped lines (ticks computation, braces);
 * original numbering is non-contiguous.
 */
3538 mxge_tick(void *arg)
3540 	mxge_softc_t *sc = arg;
3546 	lwkt_serialize_enter(sc->ifp->if_serializer);
3549 	running = sc->ifp->if_flags & IFF_RUNNING;
3551 		/* aggregate stats from different slices */
3552 		pkts = mxge_update_stats(sc);
3553 		if (!sc->watchdog_countdown) {
3554 			err = mxge_watchdog(sc);
3555 			sc->watchdog_countdown = 4;
3557 		sc->watchdog_countdown--;
3560 		/* ensure NIC did not suffer h/w fault while idle */
3561 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3562 		if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3564 			mxge_watchdog_reset(sc);
3567 	/* look less often if NIC is idle */
3572 		callout_reset(&sc->co_hdl, ticks, mxge_tick, sc);
3574 	lwkt_serialize_exit(sc->ifp->if_serializer);
/*
 * mxge_media_change -- ifmedia change callback.  Body is entirely
 * elided by the extraction; upstream this is a stub (media cannot be
 * changed) -- TODO confirm against the original file.
 */
3578 mxge_media_change(struct ifnet *ifp)
/*
 * mxge_change_mtu -- validate and apply a new MTU.  Rejects frames
 * larger than sc->max_mtu or smaller than 60 bytes (including
 * Ethernet + VLAN headers).  If the interface is running it is
 * closed and re-opened; on re-open failure the old MTU is restored
 * and a best-effort re-open attempted.
 * NOTE(review): extraction dropped lines (EINVAL return, mtu
 * assignment, close call); original numbering is non-contiguous.
 */
3584 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3586 	struct ifnet *ifp = sc->ifp;
3587 	int real_mtu, old_mtu;
3590 	if (ifp->if_serializer)
3591 		ASSERT_SERIALIZED(ifp->if_serializer);
3593 	real_mtu = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
3594 	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
3596 	old_mtu = ifp->if_mtu;
3598 	if (ifp->if_flags & IFF_RUNNING) {
3600 		err = mxge_open(sc);
3602 			ifp->if_mtu = old_mtu;
3604 			(void) mxge_open(sc);
/*
 * mxge_media_status -- ifmedia status callback: always reports valid
 * full-duplex Ethernet, sets IFM_ACTIVE when the driver's cached link
 * state is up, and adds the currently detected media subtype.
 */
3611 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3613 	mxge_softc_t *sc = ifp->if_softc;
3618 	ifmr->ifm_status = IFM_AVALID;
3619 	ifmr->ifm_active = IFM_ETHER | IFM_FDX;
3620 	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
3621 	ifmr->ifm_active |= sc->current_media;
/*
 * mxge_ioctl -- ifnet ioctl handler (serializer held).  Handles MTU
 * changes, IFF_UP/IFF_RUNNING transitions (open/close + promisc and
 * multicast updates), multicast list changes, capability toggling
 * (TXCSUM/TSO with matching if_hwassist updates, RXCSUM, VLAN
 * hardware tagging) and media ioctls; everything else falls through
 * to ether_ioctl().
 * NOTE(review): extraction dropped lines (switch/case labels, close
 * calls, default branch); original numbering is non-contiguous.
 */
3625 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data,
3626     struct ucred *cr __unused)
3628 	mxge_softc_t *sc = ifp->if_softc;
3629 	struct ifreq *ifr = (struct ifreq *)data;
3633 	ASSERT_SERIALIZED(ifp->if_serializer);
3636 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
3643 		if (ifp->if_flags & IFF_UP) {
3644 			if (!(ifp->if_flags & IFF_RUNNING)) {
3645 				err = mxge_open(sc);
3647 				/* take care of promis can allmulti
3649 				mxge_change_promisc(sc,
3650 				    ifp->if_flags & IFF_PROMISC);
3651 				mxge_set_multicast_list(sc);
3654 			if (ifp->if_flags & IFF_RUNNING) {
3662 		mxge_set_multicast_list(sc);
3666 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
3667 		if (mask & IFCAP_TXCSUM) {
3668 			ifp->if_capenable ^= IFCAP_TXCSUM;
3669 			if (ifp->if_capenable & IFCAP_TXCSUM)
3670 				ifp->if_hwassist |= CSUM_TCP | CSUM_UDP;
3672 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP);
3674 		if (mask & IFCAP_TSO) {
3675 			ifp->if_capenable ^= IFCAP_TSO;
3676 			if (ifp->if_capenable & IFCAP_TSO)
3677 				ifp->if_hwassist |= CSUM_TSO;
3679 				ifp->if_hwassist &= ~CSUM_TSO;
3681 		if (mask & IFCAP_RXCSUM)
3682 			ifp->if_capenable ^= IFCAP_RXCSUM;
3683 		if (mask & IFCAP_VLAN_HWTAGGING)
3684 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
3688 		mxge_media_probe(sc);
3689 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
3690 		    &sc->media, command);
3694 		err = ether_ioctl(ifp, command, data);
/*
 * mxge_fetch_tunables -- copy module-level tunables into the softc,
 * clamping them to sane ranges: interrupt coalescing delay to
 * [0, 10ms], mxge_ticks defaults to hz/2, and the throttle value to
 * [MXGE_MIN_THROTTLE, MXGE_MAX_THROTTLE] when non-zero.
 */
3701 mxge_fetch_tunables(mxge_softc_t *sc)
3703 	sc->intr_coal_delay = mxge_intr_coal_delay;
3704 	if (sc->intr_coal_delay < 0 || sc->intr_coal_delay > (10 * 1000))
3705 		sc->intr_coal_delay = MXGE_INTR_COAL_DELAY;
3708 	if (mxge_ticks == 0)
3709 		mxge_ticks = hz / 2;
3711 	sc->pause = mxge_flow_control;
3713 	sc->throttle = mxge_throttle;
3714 	if (sc->throttle && sc->throttle > MXGE_MAX_THROTTLE)
3715 		sc->throttle = MXGE_MAX_THROTTLE;
3716 	if (sc->throttle && sc->throttle < MXGE_MIN_THROTTLE)
3717 		sc->throttle = MXGE_MIN_THROTTLE;
/*
 * mxge_free_slices -- free per-slice firmware-stats and rx_done DMA
 * memory for every slice, then free the slice array itself.  Safe on
 * partially-initialized slices (NULL checks before each free).
 * NOTE(review): extraction dropped lines (early-return when sc->ss is
 * NULL, ss assignment); original numbering is non-contiguous.
 */
3721 mxge_free_slices(mxge_softc_t *sc)
3723 	struct mxge_slice_state *ss;
3729 	for (i = 0; i < sc->num_slices; i++) {
3731 		if (ss->fw_stats != NULL) {
3732 			mxge_dma_free(&ss->fw_stats_dma);
3733 			ss->fw_stats = NULL;
3735 		if (ss->rx_done.entry != NULL) {
3736 			mxge_dma_free(&ss->rx_done.dma);
3737 			ss->rx_done.entry = NULL;
3740 	kfree(sc->ss, M_DEVBUF);
/*
 * mxge_alloc_slices -- allocate the slice array and, per slice, the
 * rx_done interrupt queue (page-aligned DMA, 2 entries per RX ring
 * slot) and the 64-byte-aligned firmware stats block.  Without
 * IFNET_BUF_RING only the first slice gets stats (TX stats live there).
 * NOTE(review): extraction dropped lines (ss assignment, error unwind,
 * #endif); original numbering is non-contiguous.
 */
3745 mxge_alloc_slices(mxge_softc_t *sc)
3748 	struct mxge_slice_state *ss;
3750 	int err, i, max_intr_slots;
3752 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
3754 		device_printf(sc->dev, "Cannot determine rx ring size\n");
3757 	sc->rx_ring_size = cmd.data0;
3758 	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
3760 	bytes = sizeof(*sc->ss) * sc->num_slices;
3761 	sc->ss = kmalloc(bytes, M_DEVBUF, M_WAITOK | M_ZERO);
3763 	for (i = 0; i < sc->num_slices; i++) {
3769 		 * Allocate per-slice rx interrupt queues
3771 		bytes = max_intr_slots * sizeof(*ss->rx_done.entry);
3772 		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
3774 			device_printf(sc->dev,
3775 			    "alloc %d slice rx_done failed\n", i);
3778 		ss->rx_done.entry = ss->rx_done.dma.dmem_addr;
3781 		 * Allocate the per-slice firmware stats; stats
3782 		 * (including tx) are used used only on the first
3785 #ifndef IFNET_BUF_RING
3790 		bytes = sizeof(*ss->fw_stats);
3791 		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
3792 		    sizeof(*ss->fw_stats), 64);
3794 			device_printf(sc->dev,
3795 			    "alloc %d fw_stats failed\n", i);
3798 		ss->fw_stats = ss->fw_stats_dma.dmem_addr;
/*
 * mxge_slice_probe -- decide how many slices (RSS queues) to use.
 * Requires the multi-slice tunable, an SMP system, and enough MSI-X
 * vectors.  Loads the RSS-capable firmware variant, resets the NIC,
 * sizes the interrupt queue, asks the firmware for its maximum RSS
 * queue count, then caps that by the MSI-X count, the CPU count (or
 * the tunable) and rounds down to a power of two.  On any failure it
 * falls back to single-slice operation and reloads the original
 * firmware (the abort path at the end).
 * NOTE(review): extraction dropped lines (early returns, goto labels,
 * num_slices halving); original numbering is non-contiguous.
 */
3804 mxge_slice_probe(mxge_softc_t *sc)
3808 	int msix_cnt, status, max_intr_slots;
3815 	 * Don't enable multiple slices if they are not enabled,
3816 	 * or if this is not an SMP system
3818 	if (mxge_max_slices == 0 || mxge_max_slices == 1 || ncpus < 2)
3821 	/* see how many MSI-X interrupts are available */
3822 	msix_cnt = pci_msix_count(sc->dev);
3826 	/* now load the slice aware firmware see what it supports */
3827 	old_fw = sc->fw_name;
3828 	if (old_fw == mxge_fw_aligned)
3829 		sc->fw_name = mxge_fw_rss_aligned;
3831 		sc->fw_name = mxge_fw_rss_unaligned;
3832 	status = mxge_load_firmware(sc, 0);
3834 		device_printf(sc->dev, "Falling back to a single slice\n");
3838 	/* try to send a reset command to the card to see if it
3840 	memset(&cmd, 0, sizeof (cmd));
3841 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
3843 		device_printf(sc->dev, "failed reset\n");
3847 	/* get rx ring size */
3848 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
3850 		device_printf(sc->dev, "Cannot determine rx ring size\n");
3853 	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
3855 	/* tell it the size of the interrupt queues */
3856 	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
3857 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
3859 		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
3863 	/* ask the maximum number of slices it supports */
3864 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
3866 		device_printf(sc->dev,
3867 		    "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
3870 	sc->num_slices = cmd.data0;
3871 	if (sc->num_slices > msix_cnt)
3872 		sc->num_slices = msix_cnt;
3874 	if (mxge_max_slices == -1) {
3875 		/* cap to number of CPUs in system */
3876 		if (sc->num_slices > ncpus)
3877 			sc->num_slices = ncpus;
3879 		if (sc->num_slices > mxge_max_slices)
3880 			sc->num_slices = mxge_max_slices;
3882 	/* make sure it is a power of two */
3883 	while (sc->num_slices & (sc->num_slices - 1))
3887 		device_printf(sc->dev, "using %d slices\n",
3893 	sc->fw_name = old_fw;
3894 	(void) mxge_load_firmware(sc, 0);
/*
 * mxge_add_msix_irqs -- allocate and wire up one MSI-X vector per
 * slice: map the MSI-X table BAR (BAR 2), allocate num_slices vectors
 * (suggesting a smaller hw.mxge.max_slices if fewer are granted),
 * allocate an IRQ resource per vector, and install mxge_intr for each
 * slice under the ifnet serializer.  The labeled abort paths unwind
 * in reverse order: handlers, IRQ resources, MSI-X allocation, table
 * resource.
 * NOTE(review): extraction dropped lines (rid computation, NULL checks,
 * goto labels); original numbering is non-contiguous.
 */
3899 mxge_add_msix_irqs(mxge_softc_t *sc)
3902 	int count, err, i, rid;
3905 	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
3908 	if (sc->msix_table_res == NULL) {
3909 		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
3913 	count = sc->num_slices;
3914 	err = pci_alloc_msix(sc->dev, &count);
3916 		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d"
3917 		    "err = %d \n", sc->num_slices, err);
3918 		goto abort_with_msix_table;
3920 	if (count < sc->num_slices) {
3921 		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
3922 		    count, sc->num_slices);
3923 		device_printf(sc->dev,
3924 		    "Try setting hw.mxge.max_slices to %d\n",
3927 		goto abort_with_msix;
3929 	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
3930 	sc->msix_irq_res = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
3931 	if (sc->msix_irq_res == NULL) {
3933 		goto abort_with_msix;
3936 	for (i = 0; i < sc->num_slices; i++) {
3938 		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
3941 		if (sc->msix_irq_res[i] == NULL) {
3942 			device_printf(sc->dev, "couldn't allocate IRQ res"
3943 			    " for message %d\n", i);
3945 			goto abort_with_res;
3949 	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
3950 	sc->msix_ih = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
3952 	for (i = 0; i < sc->num_slices; i++) {
3953 		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
3955 		    mxge_intr, &sc->ss[i], &sc->msix_ih[i],
3956 		    sc->ifp->if_serializer);
3958 			device_printf(sc->dev, "couldn't setup intr for "
3960 			goto abort_with_intr;
3965 		device_printf(sc->dev, "using %d msix IRQs:",
3967 		for (i = 0; i < sc->num_slices; i++)
3968 			kprintf(" %ld", rman_get_start(sc->msix_irq_res[i]));
3974 	for (i = 0; i < sc->num_slices; i++) {
3975 		if (sc->msix_ih[i] != NULL) {
3976 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
3978 			sc->msix_ih[i] = NULL;
3981 	kfree(sc->msix_ih, M_DEVBUF);
3985 	for (i = 0; i < sc->num_slices; i++) {
3987 		if (sc->msix_irq_res[i] != NULL)
3988 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
3989 			    sc->msix_irq_res[i]);
3990 		sc->msix_irq_res[i] = NULL;
3992 	kfree(sc->msix_irq_res, M_DEVBUF);
3996 	pci_release_msi(sc->dev);
3998 abort_with_msix_table:
3999 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4000 	    sc->msix_table_res);
/*
 * mxge_add_single_irq -- allocate one interrupt (MSI if enabled and
 * available, else legacy INTx), record its type/rid in the softc, and
 * install mxge_intr for slice 0 under the ifnet serializer.  Returns
 * the bus_setup_intr() status (ENXIO path for a failed resource
 * allocation is partially elided by the extraction).
 */
4007 mxge_add_single_irq(mxge_softc_t *sc)
4011 	sc->irq_type = pci_alloc_1intr(sc->dev, mxge_msi_enable,
4012 	    &sc->irq_rid, &irq_flags);
4014 	sc->irq_res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ,
4015 	    &sc->irq_rid, irq_flags);
4016 	if (sc->irq_res == NULL) {
4017 		device_printf(sc->dev, "could not alloc interrupt\n");
4021 	return bus_setup_intr(sc->dev, sc->irq_res, INTR_MPSAFE,
4022 	    mxge_intr, &sc->ss[0], &sc->ih, sc->ifp->if_serializer);
/*
 * mxge_rem_msix_irqs -- mirror of mxge_add_msix_irqs: tear down every
 * installed handler, release each per-slice IRQ resource, free the
 * bookkeeping arrays, release the MSI-X table BAR and the MSI-X
 * allocation itself.
 * NOTE(review): extraction dropped lines (rid computation, braces);
 * original numbering is non-contiguous.
 */
4027 mxge_rem_msix_irqs(mxge_softc_t *sc)
4031 	for (i = 0; i < sc->num_slices; i++) {
4032 		if (sc->msix_ih[i] != NULL) {
4033 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4035 			sc->msix_ih[i] = NULL;
4038 	kfree(sc->msix_ih, M_DEVBUF);
4040 	for (i = 0; i < sc->num_slices; i++) {
4042 		if (sc->msix_irq_res[i] != NULL)
4043 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4044 			    sc->msix_irq_res[i]);
4045 		sc->msix_irq_res[i] = NULL;
4047 	kfree(sc->msix_irq_res, M_DEVBUF);
4049 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4050 	    sc->msix_table_res);
4052 	pci_release_msi(sc->dev);
/*
 * mxge_add_irq -- choose the interrupt strategy: MSI-X when running
 * multiple slices, otherwise a single MSI/INTx vector.  The dead
 * `if (0 && ...)` retry branch (fall back from a failed MSI-X setup
 * to a second attempt, then single IRQ) matches upstream and is
 * intentionally disabled.
 */
4058 mxge_add_irq(mxge_softc_t *sc)
4063 	if (sc->num_slices > 1)
4064 		err = mxge_add_msix_irqs(sc);
4066 		err = mxge_add_single_irq(sc);
4068 	if (0 && err == 0 && sc->num_slices > 1) {
4069 		mxge_rem_msix_irqs(sc);
4070 		err = mxge_add_msix_irqs(sc);
4074 		return mxge_add_single_irq(sc);
/*
 * mxge_attach -- device attach: initialize the ifnet and ifmedia,
 * fetch tunables, create the parent busdma tag, set up PCI config
 * space, map the 16MB SRAM BAR and compute the usable SRAM size,
 * copy out the EEPROM strings (MAC address etc.), allocate the
 * command/zeropad/dmabench DMA buffers, select and load firmware,
 * probe and allocate slices, reset, allocate rings, fill in ifnet
 * capabilities and methods, probe media, ether_ifattach(), install
 * the interrupt (after attach, as required), bind the send queue to
 * the IRQ's CPU, add sysctls, and start the tick callout.
 * NOTE(review): extraction dropped many lines (error labels, ifp
 * assignment, failure unwinds); original numbering is non-contiguous.
 */
4079 mxge_attach(device_t dev)
4081 	mxge_softc_t *sc = device_get_softc(dev);
4082 	struct ifnet *ifp = &sc->arpcom.ac_if;
4086 	 * Avoid rewriting half the lines in this file to use
4087 	 * &sc->arpcom.ac_if instead
4091 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4092 	ifmedia_init(&sc->media, 0, mxge_media_change, mxge_media_status);
4094 	mxge_fetch_tunables(sc);
4096 	err = bus_dma_tag_create(NULL,			/* parent */
4099 				 BUS_SPACE_MAXADDR,	/* low */
4100 				 BUS_SPACE_MAXADDR,	/* high */
4101 				 NULL, NULL,		/* filter */
4102 				 BUS_SPACE_MAXSIZE_32BIT,/* maxsize */
4104 				 BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */
4106 				 &sc->parent_dmat);	/* tag */
4108 		device_printf(dev, "Err %d allocating parent dmat\n", err);
4112 	callout_init_mp(&sc->co_hdl);
4114 	mxge_setup_cfg_space(sc);
4117 	 * Map the board into the kernel
4120 	sc->mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
4122 	if (sc->mem_res == NULL) {
4123 		device_printf(dev, "could not map memory\n");
4128 	sc->sram = rman_get_virtual(sc->mem_res);
4129 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4130 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4131 		device_printf(dev, "impossible memory region size %ld\n",
4132 		    rman_get_size(sc->mem_res));
4138 	 * Make NULL terminated copy of the EEPROM strings section of
4141 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4142 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4143 	    rman_get_bushandle(sc->mem_res),
4144 	    sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4145 	    sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE - 2);
4146 	err = mxge_parse_strings(sc);
4148 		device_printf(dev, "parse EEPROM string failed\n");
4153 	 * Enable write combining for efficient use of PCIe bus
4158 	 * Allocate the out of band DMA memory
4160 	err = mxge_dma_alloc(sc, &sc->cmd_dma, sizeof(mxge_cmd_t), 64);
4162 		device_printf(dev, "alloc cmd DMA buf failed\n");
4165 	sc->cmd = sc->cmd_dma.dmem_addr;
4167 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4169 		device_printf(dev, "alloc zeropad DMA buf failed\n");
4173 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4175 		device_printf(dev, "alloc dmabench DMA buf failed\n");
4179 	/* Select & load the firmware */
4180 	err = mxge_select_firmware(sc);
4182 		device_printf(dev, "select firmware failed\n");
4186 	mxge_slice_probe(sc);
4187 	err = mxge_alloc_slices(sc);
4189 		device_printf(dev, "alloc slices failed\n");
4193 	err = mxge_reset(sc, 0);
4195 		device_printf(dev, "reset failed\n");
4199 	err = mxge_alloc_rings(sc);
4201 		device_printf(dev, "failed to allocate rings\n");
4205 	ifp->if_baudrate = IF_Gbps(10UL);
4206 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO;
4207 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4209 	ifp->if_capabilities |= IFCAP_VLAN_MTU;
4211 	/* Well, its software, sigh */
4212 	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING;
4214 	ifp->if_capenable = ifp->if_capabilities;
4217 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4218 	ifp->if_init = mxge_init;
4219 	ifp->if_ioctl = mxge_ioctl;
4220 	ifp->if_start = mxge_start;
4223 	/* Increase TSO burst length */
4224 	ifp->if_tsolen = (32 * ETHERMTU);
4226 	/* Initialise the ifmedia structure */
4227 	mxge_media_init(sc);
4228 	mxge_media_probe(sc);
4230 	ether_ifattach(ifp, sc->mac_addr, NULL);
4232 	sc->max_mtu = ETHERMTU + EVL_ENCAPLEN;
4235 	/* must come after ether_ifattach() */
4236 	err = mxge_add_irq(sc);
4238 		device_printf(dev, "alloc and setup intr failed\n");
4239 		ether_ifdetach(ifp);
4242 	ifq_set_cpuid(&ifp->if_snd, rman_get_cpuid(sc->irq_res));
4244 	mxge_add_sysctls(sc);
4246 	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
/*
 * Device detach entry point.
 *
 * Tears down everything mxge_attach() set up, in reverse order: stop
 * the interface and callout under the ifnet serializer, tear down the
 * interrupt, detach the ethernet layer, then release sysctls, rings,
 * slices, the out-of-band DMA buffers, the IRQ/memory resources, and
 * finally the parent DMA tag.  Each free is guarded so a partially
 * attached device can be detached safely.
 */
4255 mxge_detach(device_t dev)
4257 mxge_softc_t *sc = device_get_softc(dev);
4259 if (device_is_attached(dev)) {
4260 struct ifnet *ifp = sc->ifp;
/* Serialize against the interrupt/ioctl paths while stopping. */
4262 lwkt_serialize_enter(ifp->if_serializer);
4265 if (ifp->if_flags & IFF_RUNNING)
4267 callout_stop(&sc->co_hdl);
4269 bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4271 lwkt_serialize_exit(ifp->if_serializer);
/* After the intr is gone, make sure the callout has fully drained. */
4273 callout_terminate(&sc->co_hdl);
4275 ether_ifdetach(ifp);
4277 ifmedia_removeall(&sc->media);
/*
 * Quiesce firmware DMA only if the buffers it touches were actually
 * allocated (attach may have failed partway through).
 */
4279 if (sc->cmd != NULL && sc->zeropad_dma.dmem_addr != NULL &&
4281 mxge_dummy_rdma(sc, 0);
4283 mxge_rem_sysctls(sc);
4284 mxge_free_rings(sc);
4286 /* MUST after sysctls and rings are freed */
4287 mxge_free_slices(sc);
/* Free the out-of-band DMA buffers in reverse allocation order. */
4289 if (sc->dmabench_dma.dmem_addr != NULL)
4290 mxge_dma_free(&sc->dmabench_dma);
4291 if (sc->zeropad_dma.dmem_addr != NULL)
4292 mxge_dma_free(&sc->zeropad_dma);
4293 if (sc->cmd_dma.dmem_addr != NULL)
4294 mxge_dma_free(&sc->cmd_dma);
4296 if (sc->irq_res != NULL) {
4297 bus_release_resource(dev, SYS_RES_IRQ, sc->irq_rid,
/* MSI must be explicitly released after the IRQ resource. */
4300 if (sc->irq_type == PCI_INTR_TYPE_MSI)
4301 pci_release_msi(dev);
4303 if (sc->mem_res != NULL) {
4304 bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS,
/* Parent tag last: all child DMA maps must already be destroyed. */
4308 if (sc->parent_dmat != NULL)
4309 bus_dma_tag_destroy(sc->parent_dmat);
4315 mxge_shutdown(device_t dev)