1 /******************************************************************************
3 Copyright (c) 2006-2013, Myricom Inc.
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
9 1. Redistributions of source code must retain the above copyright notice,
10 this list of conditions and the following disclaimer.
12 2. Neither the name of the Myricom Inc, nor the names of its
13 contributors may be used to endorse or promote products derived from
14 this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
28 $FreeBSD: head/sys/dev/mxge/if_mxge.c 254263 2013-08-12 23:30:01Z scottl $
30 ***************************************************************************/
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/linker.h>
37 #include <sys/firmware.h>
38 #include <sys/endian.h>
39 #include <sys/in_cksum.h>
40 #include <sys/sockio.h>
42 #include <sys/malloc.h>
43 #include <sys/kernel.h>
44 #include <sys/module.h>
45 #include <sys/serialize.h>
46 #include <sys/socket.h>
47 #include <sys/sysctl.h>
50 #include <net/if_arp.h>
51 #include <net/ifq_var.h>
52 #include <net/ethernet.h>
53 #include <net/if_dl.h>
54 #include <net/if_media.h>
58 #include <net/if_types.h>
59 #include <net/vlan/if_vlan_var.h>
62 #include <netinet/in_systm.h>
63 #include <netinet/in.h>
64 #include <netinet/ip.h>
65 #include <netinet/tcp.h>
70 #include <bus/pci/pcireg.h>
71 #include <bus/pci/pcivar.h>
72 #include <bus/pci/pci_private.h> /* XXX for pci_cfg_restore */
74 #include <vm/vm.h> /* for pmap_mapdev() */
77 #if defined(__i386__) || defined(__x86_64__)
78 #include <machine/specialreg.h>
81 #include <dev/netif/mxge/mxge_mcp.h>
82 #include <dev/netif/mxge/mcp_gen_header.h>
83 #include <dev/netif/mxge/if_mxge_var.h>
86 static int mxge_nvidia_ecrc_enable = 1;
87 static int mxge_force_firmware = 0;
88 static int mxge_intr_coal_delay = MXGE_INTR_COAL_DELAY;
89 static int mxge_deassert_wait = 1;
90 static int mxge_flow_control = 1;
91 static int mxge_ticks;
92 static int mxge_max_slices = 1;
93 static int mxge_always_promisc = 0;
94 static int mxge_throttle = 0;
95 static int mxge_msi_enable = 1;
97 static const char *mxge_fw_unaligned = "mxge_ethp_z8e";
98 static const char *mxge_fw_aligned = "mxge_eth_z8e";
99 static const char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
100 static const char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
102 TUNABLE_INT("hw.mxge.max_slices", &mxge_max_slices);
103 TUNABLE_INT("hw.mxge.flow_control_enabled", &mxge_flow_control);
104 TUNABLE_INT("hw.mxge.intr_coal_delay", &mxge_intr_coal_delay);
105 TUNABLE_INT("hw.mxge.nvidia_ecrc_enable", &mxge_nvidia_ecrc_enable);
106 TUNABLE_INT("hw.mxge.force_firmware", &mxge_force_firmware);
107 TUNABLE_INT("hw.mxge.deassert_wait", &mxge_deassert_wait);
108 TUNABLE_INT("hw.mxge.ticks", &mxge_ticks);
109 TUNABLE_INT("hw.mxge.always_promisc", &mxge_always_promisc);
110 TUNABLE_INT("hw.mxge.throttle", &mxge_throttle);
111 TUNABLE_INT("hw.mxge.msi.enable", &mxge_msi_enable);
113 static int mxge_probe(device_t dev);
114 static int mxge_attach(device_t dev);
115 static int mxge_detach(device_t dev);
116 static int mxge_shutdown(device_t dev);
117 static void mxge_intr(void *arg);
119 static device_method_t mxge_methods[] = {
120 /* Device interface */
121 DEVMETHOD(device_probe, mxge_probe),
122 DEVMETHOD(device_attach, mxge_attach),
123 DEVMETHOD(device_detach, mxge_detach),
124 DEVMETHOD(device_shutdown, mxge_shutdown),
128 static driver_t mxge_driver = {
131 sizeof(mxge_softc_t),
134 static devclass_t mxge_devclass;
136 /* Declare ourselves to be a child of the PCI bus.*/
137 DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, NULL, NULL);
138 MODULE_DEPEND(mxge, firmware, 1, 1, 1);
139 MODULE_DEPEND(mxge, zlib, 1, 1, 1);
141 static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
142 static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
143 static void mxge_close(mxge_softc_t *sc, int down);
144 static int mxge_open(mxge_softc_t *sc);
145 static void mxge_tick(void *arg);
146 static void mxge_watchdog_reset(mxge_softc_t *sc);
147 static void mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice);
/*
 * PCI probe: match Myricom vendor + Z8E/Z8E_9 device IDs and set a
 * human-readable description keyed off the PCI revision ID.
 * NOTE(review): this excerpt elides the switch/return scaffolding;
 * visible tokens are preserved verbatim.
 */
150 mxge_probe(device_t dev)
152 if (pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM &&
153 (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E ||
154 pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9)) {
155 int rev = pci_get_revid(dev);
158 case MXGE_PCI_REV_Z8E:
159 device_set_desc(dev, "Myri10G-PCIE-8A");
161 case MXGE_PCI_REV_Z8ES:
162 device_set_desc(dev, "Myri10G-PCIE-8B");
/* Unknown revision: still attach, but warn. */
165 device_set_desc(dev, "Myri10G-PCIE-8??");
166 device_printf(dev, "Unrecognized rev %d NIC\n", rev);
/*
 * Enable write-combining PIO to the NIC SRAM window on x86/x86_64 by
 * switching the mapping's page attribute to PAT_WRITE_COMBINING.
 */
175 mxge_enable_wc(mxge_softc_t *sc)
177 #if defined(__i386__) || defined(__x86_64__)
181 len = rman_get_size(sc->mem_res);
182 pmap_change_attr((vm_offset_t) sc->sram, len / PAGE_SIZE,
183 PAT_WRITE_COMBINING);
/*
 * Allocate a coherent, zeroed DMA region of 'bytes' with the given
 * alignment from the softc's parent DMA tag.
 */
188 mxge_dma_alloc(mxge_softc_t *sc, bus_dmamem_t *dma, size_t bytes,
189 bus_size_t alignment)
/* NOTE(review): boundary selection for >4KB page-aligned requests is elided here. */
194 if (bytes > 4096 && alignment == 4096)
199 err = bus_dmamem_coherent(sc->parent_dmat, alignment, boundary,
200 BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, bytes,
201 BUS_DMA_WAITOK | BUS_DMA_ZERO, dma);
203 device_printf(sc->dev, "bus_dmamem_coherent failed: %d\n", err);
/*
 * Tear down a DMA region allocated by mxge_dma_alloc():
 * unload the map, free the memory, then destroy the tag.
 */
210 mxge_dma_free(bus_dmamem_t *dma)
212 bus_dmamap_unload(dma->dmem_tag, dma->dmem_map);
213 bus_dmamem_free(dma->dmem_tag, dma->dmem_addr, dma->dmem_map);
214 bus_dma_tag_destroy(dma->dmem_tag);
218 * The eeprom strings on the lanaiX have the format
/*
 * Walk the NUL-separated EEPROM string list and extract:
 *   MAC=  -> sc->mac_addr (hex bytes),
 *   PC=   -> sc->product_code_string,
 *   SN=   -> sc->serial_number_string (unless SN2 already seen),
 *   SN2=  -> sc->serial_number_string (takes precedence over SN).
 */
224 mxge_parse_strings(mxge_softc_t *sc)
227 int i, found_mac, found_sn2;
230 ptr = sc->eeprom_strings;
233 while (*ptr != '\0') {
234 if (strncmp(ptr, "MAC=", 4) == 0) {
/* Each MAC octet must parse as exactly two hex digits. */
237 sc->mac_addr[i] = strtoul(ptr, &endptr, 16);
238 if (endptr - ptr != 2)
247 } else if (strncmp(ptr, "PC=", 3) == 0) {
249 strlcpy(sc->product_code_string, ptr,
250 sizeof(sc->product_code_string));
251 } else if (!found_sn2 && (strncmp(ptr, "SN=", 3) == 0)) {
253 strlcpy(sc->serial_number_string, ptr,
254 sizeof(sc->serial_number_string));
255 } else if (strncmp(ptr, "SN2=", 4) == 0) {
256 /* SN2 takes precedence over SN */
259 strlcpy(sc->serial_number_string, ptr,
260 sizeof(sc->serial_number_string));
/* Advance past the current NUL-terminated string. */
262 while (*ptr++ != '\0') {}
269 device_printf(sc->dev, "failed to parse eeprom_strings\n");
273 #if defined(__i386__) || defined(__x86_64__)
/*
 * Attempt to enable ECRC generation on an upstream Nvidia (CK804/MCP55)
 * PCIe bridge so the NIC receives 8-byte-aligned completions.  Because
 * the OS lacks extended (>0xff) config space access here, the bridge's
 * config space is located by physical address and mapped directly with
 * pmap_mapdev().  Gated by the hw.mxge.nvidia_ecrc_enable tunable.
 */
276 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
279 unsigned long base, off;
281 device_t pdev, mcp55;
282 uint16_t vendor_id, device_id, word;
283 uintptr_t bus, slot, func, ivend, idev;
286 if (!mxge_nvidia_ecrc_enable)
/* The upstream bridge is the grandparent of the NIC device. */
289 pdev = device_get_parent(device_get_parent(sc->dev));
291 device_printf(sc->dev, "could not find parent?\n");
294 vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
295 device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
/* Only Nvidia (0x10de) bridges are handled. */
297 if (vendor_id != 0x10de)
302 if (device_id == 0x005d) {
303 /* ck804, base address is magic */
305 } else if (device_id >= 0x0374 && device_id <= 0x378) {
306 /* mcp55, base address stored in chipset */
307 mcp55 = pci_find_bsf(0, 0, 0);
309 0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
310 0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
311 word = pci_read_config(mcp55, 0x90, 2);
312 base = ((unsigned long)word & 0x7ffeU) << 25;
320 * Test below is commented because it is believed that doing
321 * config read/write beyond 0xff will access the config space
322 * for the next larger function. Uncomment this and remove
323 * the hacky pmap_mapdev() way of accessing config space when
324 * FreeBSD grows support for extended pcie config space access
328 * See if we can, by some miracle, access the extended
331 val = pci_read_config(pdev, 0x178, 4);
332 if (val != 0xffffffff) {
334 pci_write_config(pdev, 0x178, val, 4);
339 * Rather than using normal pci config space writes, we must
340 * map the Nvidia config space ourselves. This is because on
341 * opteron/nvidia class machine the 0xe000000 mapping is
342 * handled by the nvidia chipset, that means the internal PCI
343 * device (the on-chip northbridge), or the amd-8131 bridge
344 * and things behind them are not visible by this method.
347 BUS_READ_IVAR(device_get_parent(pdev), pdev,
349 BUS_READ_IVAR(device_get_parent(pdev), pdev,
350 PCI_IVAR_SLOT, &slot);
351 BUS_READ_IVAR(device_get_parent(pdev), pdev,
352 PCI_IVAR_FUNCTION, &func);
353 BUS_READ_IVAR(device_get_parent(pdev), pdev,
354 PCI_IVAR_VENDOR, &ivend);
355 BUS_READ_IVAR(device_get_parent(pdev), pdev,
356 PCI_IVAR_DEVICE, &idev);
/* Compute the physical address of this function's config space. */
358 off = base + 0x00100000UL * (unsigned long)bus +
359 0x00001000UL * (unsigned long)(func + 8 * slot);
361 /* map it into the kernel */
362 va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
364 device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
367 /* get a pointer to the config space mapped into the kernel */
368 cfgptr = va + (off & PAGE_MASK);
370 /* make sure that we can really access it */
371 vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
372 device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
373 if (!(vendor_id == ivend && device_id == idev)) {
374 device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
375 vendor_id, device_id);
376 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
/* 0x178 is the extended register holding the ECRC enable bits. */
380 ptr32 = (uint32_t*)(cfgptr + 0x178);
383 if (val == 0xffffffff) {
384 device_printf(sc->dev, "extended mapping failed\n");
385 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
389 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
391 device_printf(sc->dev, "Enabled ECRC on upstream "
392 "Nvidia bridge at %d:%d:%d\n",
393 (int)bus, (int)slot, (int)func);
397 #else /* __i386__ || __x86_64__ */
/* Non-x86 stub: ECRC enabling via chipset poking is x86-only. */
400 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
402 device_printf(sc->dev, "Nforce 4 chipset on non-x86/x86_64!?!?!\n");
/*
 * Run the firmware's DMA benchmark against the dmabench buffer.
 * The len multiplier selects the mode: 0x10000 = read, 0x1 = write,
 * 0x10001 = concurrent read+write.  Results land in sc->read_dma,
 * sc->write_dma and sc->read_write_dma as MB/s figures.
 */
408 mxge_dma_test(mxge_softc_t *sc, int test_type)
411 bus_addr_t dmatest_bus = sc->dmabench_dma.dmem_busaddr;
414 const char *test = " ";
417 * Run a small DMA test.
418 * The magic multipliers to the length tell the firmware
419 * to do DMA read, write, or read+write tests. The
420 * results are returned in cmd.data0. The upper 16
421 * bits of the return is the number of transfers completed.
422 * The lower 16 bits is the time in 0.5us ticks that the
423 * transfers took to complete.
426 len = sc->tx_boundary;
428 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
429 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
430 cmd.data2 = len * 0x10000;
431 status = mxge_send_cmd(sc, test_type, &cmd);
/* transfers * bytes * 2 (0.5us ticks) / ticks -> MB/s */
436 sc->read_dma = ((cmd.data0>>16) * len * 2) / (cmd.data0 & 0xffff);
438 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
439 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
440 cmd.data2 = len * 0x1;
441 status = mxge_send_cmd(sc, test_type, &cmd);
446 sc->write_dma = ((cmd.data0>>16) * len * 2) / (cmd.data0 & 0xffff);
448 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
449 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
450 cmd.data2 = len * 0x10001;
451 status = mxge_send_cmd(sc, test_type, &cmd);
/* read+write moves twice the data per transfer, hence the extra *2. */
456 sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
457 (cmd.data0 & 0xffff);
/* The unaligned-completion probe is allowed to fail silently. */
460 if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST) {
461 device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
468 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
469 * when the PCI-E Completion packets are aligned on an 8-byte
470 * boundary. Some PCI-E chip sets always align Completion packets; on
471 * the ones that do not, the alignment can be enforced by enabling
472 * ECRC generation (if supported).
474 * When PCI-E Completion packets are not aligned, it is actually more
475 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
477 * If the driver can neither enable ECRC nor verify that it has
478 * already been enabled, then it must use a firmware image which works
479 * around unaligned completion packets (ethp_z8e.dat), and it should
480 * also ensure that it never gives the device a Read-DMA which is
481 * larger than 2KB by setting the tx_boundary to 2KB. If ECRC is
482 * enabled, then the driver should use the aligned (eth_z8e.dat)
483 * firmware image, and set tx_boundary to 4KB.
/*
 * Decide whether the aligned-completion firmware can be used on this
 * host: verify the PCIe Max Read Request size, load the aligned image,
 * try to enable ECRC on an Nvidia bridge, then run the firmware's
 * unaligned-completion detector (skipped on Z8ES and newer).
 * Returns 0 if the aligned firmware is safe to keep.
 *
 * FIX: the third argument to pci_find_extcap() had been corrupted by an
 * HTML-entity mis-decode ("&reg" -> the '(R)' glyph); restored to &reg.
 */
486 mxge_firmware_probe(mxge_softc_t *sc)
488 device_t dev = sc->dev;
/* Optimistically assume 4KB Read-DMAs are safe. */
492 sc->tx_boundary = 4096;
495 * Verify the max read request size was set to 4KB
496 * before trying the test with 4KB.
498 if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
499 pectl = pci_read_config(dev, reg + 0x8, 2);
500 if ((pectl & (5 << 12)) != (5 << 12)) {
501 device_printf(dev, "Max Read Req. size != 4k (0x%x)\n",
503 sc->tx_boundary = 2048;
508 * Load the optimized firmware (which assumes aligned PCIe
509 * completions) in order to see if it works on this host.
511 sc->fw_name = mxge_fw_aligned;
512 status = mxge_load_firmware(sc, 1);
517 * Enable ECRC if possible
519 mxge_enable_nvidia_ecrc(sc);
522 * Run a DMA test which watches for unaligned completions and
523 * aborts on the first one seen. Not required on Z8ES or newer.
525 if (pci_get_revid(sc->dev) >= MXGE_PCI_REV_Z8ES)
528 status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
530 return 0; /* keep the aligned firmware */
533 device_printf(dev, "DMA test failed: %d\n", status);
534 if (status == ENOSYS) {
535 device_printf(dev, "Falling back to ethp! "
536 "Please install up to date fw\n");
/*
 * Choose between the aligned and unaligned firmware images.
 * Order of precedence: explicit force (tunable, or implied by
 * throttling), narrow (<= x4) PCIe links which always behave aligned,
 * then the live probe in mxge_firmware_probe().  Falls back to the
 * unaligned image with a 2KB tx_boundary when nothing proves alignment.
 */
542 mxge_select_firmware(mxge_softc_t *sc)
545 int force_firmware = mxge_force_firmware;
/* Throttling requires a specific firmware, so it forces the choice. */
548 force_firmware = sc->throttle;
550 if (force_firmware != 0) {
551 if (force_firmware == 1)
556 device_printf(sc->dev,
557 "Assuming %s completions (forced)\n",
558 aligned ? "aligned" : "unaligned");
564 * If the PCIe link width is 4 or less, we can use the aligned
565 * firmware and skip any checks
567 if (sc->link_width != 0 && sc->link_width <= 4) {
568 device_printf(sc->dev, "PCIe x%d Link, "
569 "expect reduced performance\n", sc->link_width);
574 if (mxge_firmware_probe(sc) == 0)
579 sc->fw_name = mxge_fw_aligned;
580 sc->tx_boundary = 4096;
582 sc->fw_name = mxge_fw_unaligned;
583 sc->tx_boundary = 2048;
585 return mxge_load_firmware(sc, 0);
/*
 * Sanity-check a firmware image header: confirm the MCP type is
 * Ethernet, record the version string for sysctl, and verify the
 * major/minor version matches what this driver was built against.
 */
589 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
591 if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
592 if_printf(sc->ifp, "Bad firmware type: 0x%x\n",
593 be32toh(hdr->mcp_type));
597 /* Save firmware version for sysctl */
598 strlcpy(sc->fw_version, hdr->version, sizeof(sc->fw_version));
600 if_printf(sc->ifp, "firmware id: %s\n", hdr->version);
602 ksscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
603 &sc->fw_ver_minor, &sc->fw_ver_tiny);
/* Only major.minor must match; the tiny component may differ. */
605 if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR &&
606 sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
607 if_printf(sc->ifp, "Found firmware version %s\n",
609 if_printf(sc->ifp, "Driver needs %d.%d\n",
610 MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
/*
 * zlib allocation callback (zalloc hook for inflateInit).
 * NOTE(review): items * size is not overflow-checked — presumably safe
 * for the small, fixed zlib state sizes; confirm if reused elsewhere.
 */
617 z_alloc(void *nil, u_int items, u_int size)
619 return kmalloc(items * size, M_TEMP, M_WAITOK);
/* zlib free callback (zfree hook); body elided in this excerpt. */
623 z_free(void *nil, void *ptr)
/*
 * Fetch the firmware(9) image named sc->fw_name, zlib-inflate it
 * (the uncompressed size is smuggled in fw->version), validate its
 * embedded MCP header, and PIO-copy it into NIC SRAM at MXGE_FW_OFFSET
 * in 256-byte chunks.  Cleanup is via goto labels on each failure path.
 */
629 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
632 char *inflate_buffer;
633 const struct firmware *fw;
634 const mcp_gen_header_t *hdr;
641 fw = firmware_get(sc->fw_name);
643 if_printf(sc->ifp, "Could not find firmware image %s\n",
648 /* Setup zlib and decompress f/w */
649 bzero(&zs, sizeof(zs));
652 status = inflateInit(&zs);
653 if (status != Z_OK) {
659 * The uncompressed size is stored as the firmware version,
660 * which would otherwise go unused
662 fw_len = (size_t)fw->version;
663 inflate_buffer = kmalloc(fw_len, M_TEMP, M_WAITOK);
664 zs.avail_in = fw->datasize;
665 zs.next_in = __DECONST(char *, fw->data);
666 zs.avail_out = fw_len;
667 zs.next_out = inflate_buffer;
668 status = inflate(&zs, Z_FINISH);
669 if (status != Z_STREAM_END) {
670 if_printf(sc->ifp, "zlib %d\n", status);
672 goto abort_with_buffer;
/* Header offset is stored big-endian inside the image itself. */
677 htobe32(*(const uint32_t *)(inflate_buffer + MCP_HEADER_PTR_OFFSET));
678 if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
679 if_printf(sc->ifp, "Bad firmware file");
681 goto abort_with_buffer;
683 hdr = (const void*)(inflate_buffer + hdr_offset);
685 status = mxge_validate_firmware(sc, hdr);
687 goto abort_with_buffer;
689 /* Copy the inflated firmware to NIC SRAM. */
690 for (i = 0; i < fw_len; i += 256) {
691 mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i, inflate_buffer + i,
692 min(256U, (unsigned)(fw_len - i)));
701 kfree(inflate_buffer, M_TEMP);
704 firmware_put(fw, FIRMWARE_UNLOAD);
709 * Enable or disable periodic RDMAs from the host to make certain
710 * chipsets resend dropped PCIe messages
/*
 * Enable/disable the firmware's periodic dummy RDMA, used to make
 * certain chipsets resend dropped PCIe messages.  Builds an 8-byte-
 * aligned request on the stack, PIO-copies it to the boot dummy-RDMA
 * doorbell, then polls the confirmation word for the firmware's
 * 0xffffffff acknowledgement.
 */
713 mxge_dummy_rdma(mxge_softc_t *sc, int enable)
716 volatile uint32_t *confirm;
717 volatile char *submit;
718 uint32_t *buf, dma_low, dma_high;
/* Round the stack buffer up to an 8-byte boundary. */
721 buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
723 /* Clear confirmation addr */
724 confirm = (volatile uint32_t *)sc->cmd;
729 * Send an rdma command to the PCIe engine, and wait for the
730 * response in the confirmation address. The firmware should
731 * write a -1 there to indicate it is alive and well
733 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.dmem_busaddr);
734 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.dmem_busaddr);
735 buf[0] = htobe32(dma_high); /* confirm addr MSW */
736 buf[1] = htobe32(dma_low); /* confirm addr LSW */
737 buf[2] = htobe32(0xffffffff); /* confirm data */
738 dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.dmem_busaddr);
739 dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.dmem_busaddr);
740 buf[3] = htobe32(dma_high); /* dummy addr MSW */
741 buf[4] = htobe32(dma_low); /* dummy addr LSW */
742 buf[5] = htobe32(enable); /* enable? */
744 submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
746 mxge_pio_copy(submit, buf, 64);
/* Poll up to 20 iterations for the firmware's -1 acknowledgement. */
751 while (*confirm != 0xffffffff && i < 20) {
755 if (*confirm != 0xffffffff) {
756 if_printf(sc->ifp, "dummy rdma %s failed (%p = 0x%x)",
757 (enable ? "enable" : "disable"), confirm, *confirm);
/*
 * Issue a command to the running firmware: marshal the three data
 * words big-endian into an 8-byte-aligned mcp_cmd_t, PIO-copy it to
 * the SRAM command doorbell, and poll the DMA'd response area (up to
 * 20 iterations) for a result.  On MXGEFW_CMD_OK the response data
 * word is returned in data->data0.
 */
762 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
765 char buf_bytes[sizeof(*buf) + 8];
766 volatile mcp_cmd_response_t *response = sc->cmd;
767 volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
768 uint32_t dma_low, dma_high;
769 int err, sleep_total = 0;
771 /* Ensure buf is aligned to 8 bytes */
772 buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
774 buf->data0 = htobe32(data->data0);
775 buf->data1 = htobe32(data->data1);
776 buf->data2 = htobe32(data->data2);
777 buf->cmd = htobe32(cmd);
778 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.dmem_busaddr);
779 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.dmem_busaddr);
781 buf->response_addr.low = htobe32(dma_low);
782 buf->response_addr.high = htobe32(dma_high);
/* 0xffffffff sentinel: firmware overwrites it when it responds. */
784 response->result = 0xffffffff;
786 mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
792 for (sleep_total = 0; sleep_total < 20; sleep_total++) {
794 switch (be32toh(response->result)) {
796 data->data0 = be32toh(response->data);
802 case MXGEFW_CMD_UNKNOWN:
805 case MXGEFW_CMD_ERROR_UNALIGNED:
808 case MXGEFW_CMD_ERROR_BUSY:
811 case MXGEFW_CMD_ERROR_I2C_ABSENT:
815 if_printf(sc->ifp, "command %d failed, result = %d\n",
816 cmd, be32toh(response->result));
/* Fell out of the poll loop: firmware never responded. */
824 if_printf(sc->ifp, "command %d timed out result = %d\n",
825 cmd, be32toh(response->result));
/*
 * Validate the firmware already running on the NIC (e.g. loaded by a
 * prior driver or the eeprom) so it can be adopted without a reload.
 * Copies the MCP header out of SRAM into host memory for validation,
 * and flags fw 1.4.4 - 1.4.11 which need the ALLMULTI rx-filter
 * workaround.
 */
831 mxge_adopt_running_firmware(mxge_softc_t *sc)
833 struct mcp_gen_header *hdr;
834 const size_t bytes = sizeof(struct mcp_gen_header);
839 * Find running firmware header
842 htobe32(*(volatile uint32_t *)(sc->sram + MCP_HEADER_PTR_OFFSET));
844 if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
845 if_printf(sc->ifp, "Running firmware has bad header offset "
846 "(%zu)\n", hdr_offset);
851 * Copy header of running firmware from SRAM to host memory to
854 hdr = kmalloc(bytes, M_DEVBUF, M_WAITOK);
855 bus_space_read_region_1(rman_get_bustag(sc->mem_res),
856 rman_get_bushandle(sc->mem_res), hdr_offset, (char *)hdr, bytes);
857 status = mxge_validate_firmware(sc, hdr);
858 kfree(hdr, M_DEVBUF);
861 * Check to see if adopted firmware has bug where adopting
862 * it will cause broadcasts to be filtered unless the NIC
863 * is kept in ALLMULTI mode
865 if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
866 sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
867 sc->adopted_rx_filter_bug = 1;
868 if_printf(sc->ifp, "Adopting fw %d.%d.%d: "
869 "working around rx filter bug\n",
870 sc->fw_ver_major, sc->fw_ver_minor, sc->fw_ver_tiny);
/*
 * Load (or, with adopt != 0, fall back to adopting) firmware on the
 * NIC.  On a fresh load, hands the freshly-copied image to the
 * bootstrap MCP via the boot-handoff doorbell, skipping the first 8
 * protected bytes of SRAM, then polls the confirmation word for the
 * firmware's 0xffffffff acknowledgement.
 */
877 mxge_load_firmware(mxge_softc_t *sc, int adopt)
879 volatile uint32_t *confirm;
880 volatile char *submit;
882 uint32_t *buf, size, dma_low, dma_high;
/* Round the stack buffer up to an 8-byte boundary. */
885 buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
887 size = sc->sram_size;
888 status = mxge_load_firmware_helper(sc, &size);
894 * Try to use the currently running firmware, if
897 status = mxge_adopt_running_firmware(sc);
900 "failed to adopt running firmware\n");
903 if_printf(sc->ifp, "Successfully adopted running firmware\n");
/* An adopted 4KB-boundary firmware may not be the optimized one. */
905 if (sc->tx_boundary == 4096) {
907 "Using firmware currently running on NIC. "
909 if_printf(sc->ifp, "performance consider loading "
910 "optimized firmware\n");
912 sc->fw_name = mxge_fw_unaligned;
913 sc->tx_boundary = 2048;
917 /* Clear confirmation addr */
918 confirm = (volatile uint32_t *)sc->cmd;
923 * Send a reload command to the bootstrap MCP, and wait for the
924 * response in the confirmation address. The firmware should
925 * write a -1 there to indicate it is alive and well
928 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.dmem_busaddr);
929 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.dmem_busaddr);
931 buf[0] = htobe32(dma_high); /* confirm addr MSW */
932 buf[1] = htobe32(dma_low); /* confirm addr LSW */
933 buf[2] = htobe32(0xffffffff); /* confirm data */
936 * FIX: All newest firmware should un-protect the bottom of
937 * the sram before handoff. However, the very first interfaces
938 * do not. Therefore the handoff copy must skip the first 8 bytes
940 /* where the code starts*/
941 buf[3] = htobe32(MXGE_FW_OFFSET + 8);
942 buf[4] = htobe32(size - 8); /* length of code */
943 buf[5] = htobe32(8); /* where to copy to */
944 buf[6] = htobe32(0); /* where to jump to */
946 submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
947 mxge_pio_copy(submit, buf, 64);
/* Poll up to 20 iterations for the firmware's -1 acknowledgement. */
952 while (*confirm != 0xffffffff && i < 20) {
956 if (*confirm != 0xffffffff) {
957 if_printf(sc->ifp,"handoff failed (%p = 0x%x)",
/*
 * Program the firmware with the current MAC address: octets 0-3 packed
 * big-endian into data0, octets 4-5 into data1.
 */
965 mxge_update_mac_address(mxge_softc_t *sc)
968 uint8_t *addr = sc->mac_addr;
970 cmd.data0 = (addr[0] << 24) | (addr[1] << 16) |
971 (addr[2] << 8) | addr[3];
972 cmd.data1 = (addr[4] << 8) | (addr[5]);
973 return mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
/*
 * Enable or disable link-level flow control (pause frames) in the
 * firmware, logging on failure.
 */
977 mxge_change_pause(mxge_softc_t *sc, int pause)
983 status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL, &cmd);
985 status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL, &cmd);
987 if_printf(sc->ifp, "Failed to set flow control mode\n");
/*
 * Set the firmware's promiscuous-mode filter.  The
 * hw.mxge.always_promisc tunable forces promiscuous on regardless
 * of the requested state.
 */
995 mxge_change_promisc(mxge_softc_t *sc, int promisc)
1000 if (mxge_always_promisc)
1004 status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC, &cmd);
1006 status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC, &cmd);
1008 if_printf(sc->ifp, "Failed to set promisc mode\n");
/*
 * Sync the firmware's multicast filter with the interface's multicast
 * list: temporarily go ALLMULTI, flush the old filters, re-add every
 * AF_LINK group address, then re-enable filtering.  Stays in ALLMULTI
 * if the interface requests it or the adopted-firmware rx-filter bug
 * workaround is active.
 */
1012 mxge_set_multicast_list(mxge_softc_t *sc)
1015 struct ifmultiaddr *ifma;
1016 struct ifnet *ifp = sc->ifp;
1019 /* This firmware is known to not support multicast */
1020 if (!sc->fw_multicast_support)
1023 /* Disable multicast filtering while we play with the lists*/
1024 err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1026 if_printf(ifp, "Failed MXGEFW_ENABLE_ALLMULTI, "
1027 "error status: %d\n", err);
/* Buggy adopted firmware must stay in ALLMULTI (see adopt path). */
1031 if (sc->adopted_rx_filter_bug)
1034 if (ifp->if_flags & IFF_ALLMULTI) {
1035 /* Request to disable multicast filtering, so quit here */
1039 /* Flush all the filters */
1040 err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1042 if_printf(ifp, "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, "
1043 "error status: %d\n", err);
1048 * Walk the multicast list, and add each address
1050 TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1051 if (ifma->ifma_addr->sa_family != AF_LINK)
/* First 4 MAC octets into data0, last 2 into data1 (big-endian). */
1054 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1056 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
1058 cmd.data0 = htonl(cmd.data0);
1059 cmd.data1 = htonl(cmd.data1);
1060 err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1062 if_printf(ifp, "Failed MXGEFW_JOIN_MULTICAST_GROUP, "
1063 "error status: %d\n", err);
1064 /* Abort, leaving multicast filtering off */
1069 /* Enable multicast filtering */
1070 err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1072 if_printf(ifp, "Failed MXGEFW_DISABLE_ALLMULTI, "
1073 "error status: %d\n", err);
/*
 * Determine the largest usable MTU: the firmware maximum when jumbo
 * page clusters are big enough (or the firmware accepts virtually
 * contiguous jumbos), otherwise limited by MJUMPAGESIZE.
 */
1079 mxge_max_mtu(mxge_softc_t *sc)
1084 if (MJUMPAGESIZE - MXGEFW_PAD > MXGEFW_MAX_MTU)
1085 return MXGEFW_MAX_MTU - MXGEFW_PAD;
1087 /* try to set nbufs to see if it we can
1088 use virtually contiguous jumbos */
1090 status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1093 return MXGEFW_MAX_MTU - MXGEFW_PAD;
1095 /* otherwise, we're limited to MJUMPAGESIZE */
1096 return MJUMPAGESIZE - MXGEFW_PAD;
/*
 * Full firmware reset and re-initialization sequence: reset command,
 * dummy-RDMA enable, intrq sizing, RSS slice setup (with its required
 * command ordering), optional interrupt-queue DMA exchange, interrupt
 * coalescing/ack/deassert offsets, DMA benchmark, per-slice shared
 * state reset, and finally MAC/promisc/pause/multicast reprogramming
 * plus the throttle factor.
 */
1101 mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1103 struct mxge_slice_state *ss;
1104 mxge_rx_done_t *rx_done;
1105 volatile uint32_t *irq_claim;
1110 * Try to send a reset command to the card to see if it
1113 memset(&cmd, 0, sizeof (cmd));
1114 status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1116 if_printf(sc->ifp, "failed reset\n");
1120 mxge_dummy_rdma(sc, 1);
1122 /* Set the intrq size */
1123 cmd.data0 = sc->rx_ring_size;
1124 status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1127 * Even though we already know how many slices are supported
1128 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
1129 * has magic side effects, and must be called after a reset.
1130 * It must be called prior to calling any RSS related cmds,
1131 * including assigning an interrupt queue for anything but
1132 * slice 0. It must also be called *after*
1133 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1134 * the firmware to compute offsets.
1136 if (sc->num_slices > 1) {
1137 /* Ask the maximum number of slices it supports */
1138 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
1140 if_printf(sc->ifp, "failed to get number of slices\n");
1145 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1146 * to setting up the interrupt queue DMA
1148 cmd.data0 = sc->num_slices;
1149 cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
1150 #ifdef IFNET_BUF_RING
1151 cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1153 status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES, &cmd);
1155 if_printf(sc->ifp, "failed to set number of slices\n");
1160 if (interrupts_setup) {
1161 /* Now exchange information about interrupts */
1162 for (slice = 0; slice < sc->num_slices; slice++) {
1163 rx_done = &sc->ss[slice].rx_done;
1164 memset(rx_done->entry, 0, sc->rx_ring_size);
1166 MXGE_LOWPART_TO_U32(rx_done->dma.dmem_busaddr);
1168 MXGE_HIGHPART_TO_U32(rx_done->dma.dmem_busaddr);
1170 status |= mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_DMA,
/* The firmware returns SRAM offsets for these control words. */
1175 status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET,
1177 sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1179 status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1180 irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1182 status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET, &cmd);
1183 sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1186 if_printf(sc->ifp, "failed set interrupt parameters\n");
1190 *sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1192 /* Run a DMA benchmark */
1193 mxge_dma_test(sc, MXGEFW_DMA_TEST);
1195 for (slice = 0; slice < sc->num_slices; slice++) {
1196 ss = &sc->ss[slice];
/* Each slice owns a pair of claim words in SRAM. */
1198 ss->irq_claim = irq_claim + (2 * slice);
1200 /* Reset mcp/driver shared state back to 0 */
1201 ss->rx_done.idx = 0;
1202 ss->rx_done.cnt = 0;
1205 ss->tx.pkt_done = 0;
1206 ss->tx.queue_active = 0;
1207 ss->tx.activate = 0;
1208 ss->tx.deactivate = 0;
1210 ss->rx_small.cnt = 0;
1211 if (ss->fw_stats != NULL)
1212 bzero(ss->fw_stats, sizeof(*ss->fw_stats));
1214 sc->rdma_tags_available = 15;
1216 status = mxge_update_mac_address(sc);
1217 mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
1218 mxge_change_pause(sc, sc->pause);
1219 mxge_set_multicast_list(sc);
1222 cmd.data0 = sc->throttle;
1223 if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd))
1224 if_printf(sc->ifp, "can't enable throttle\n");
/*
 * Sysctl handler for the transmit throttle factor: validate the new
 * value against MXGE_MIN/MAX_THROTTLE, push it to the firmware under
 * the interface serializer, and record it in the softc on success.
 */
1230 mxge_change_throttle(SYSCTL_HANDLER_ARGS)
1235 unsigned int throttle;
1238 throttle = sc->throttle;
1239 err = sysctl_handle_int(oidp, &throttle, arg2, req);
/* No-op if the value is unchanged. */
1243 if (throttle == sc->throttle)
1246 if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
1249 lwkt_serialize_enter(sc->ifp->if_serializer);
1251 cmd.data0 = throttle;
1252 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
1254 sc->throttle = throttle;
1256 lwkt_serialize_exit(sc->ifp->if_serializer);
/*
 * Sysctl handler for the interrupt coalescing delay (usecs): range
 * check (non-zero, <= 1s), then write the big-endian value directly
 * into the firmware's SRAM control word under the serializer.
 */
1261 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1264 unsigned int intr_coal_delay;
1268 intr_coal_delay = sc->intr_coal_delay;
1269 err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1273 if (intr_coal_delay == sc->intr_coal_delay)
1276 if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1279 lwkt_serialize_enter(sc->ifp->if_serializer);
1281 *sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1282 sc->intr_coal_delay = intr_coal_delay;
1284 lwkt_serialize_exit(sc->ifp->if_serializer);
/*
 * Sysctl handler for flow control: delegate the actual firmware
 * programming to mxge_change_pause() under the serializer.
 */
1289 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1292 unsigned int enabled;
1296 enabled = sc->pause;
1297 err = sysctl_handle_int(oidp, &enabled, arg2, req);
1301 if (enabled == sc->pause)
1304 lwkt_serialize_enter(sc->ifp->if_serializer);
1305 err = mxge_change_pause(sc, enabled);
1306 lwkt_serialize_exit(sc->ifp->if_serializer);
/*
 * Read-only sysctl handler that byte-swaps a big-endian firmware
 * counter (arg1) to host order before exporting it.
 */
1312 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1318 arg2 = be32toh(*(int *)arg1);
1320 err = sysctl_handle_int(oidp, arg1, arg2, req);
/*
 * Tear down all sysctl trees created by mxge_add_sysctls():
 * per-slice trees first, then the slice container, then the
 * device-level tree.  Each pointer is NULLed after freeing so the
 * teardown is idempotent.
 */
1326 mxge_rem_sysctls(mxge_softc_t *sc)
1328 if (sc->ss != NULL) {
1329 struct mxge_slice_state *ss;
1332 for (slice = 0; slice < sc->num_slices; slice++) {
1333 ss = &sc->ss[slice];
1334 if (ss->sysctl_tree != NULL) {
1335 sysctl_ctx_free(&ss->sysctl_ctx);
1336 ss->sysctl_tree = NULL;
1341 if (sc->slice_sysctl_tree != NULL) {
1342 sysctl_ctx_free(&sc->slice_sysctl_ctx);
1343 sc->slice_sysctl_tree = NULL;
1346 if (sc->sysctl_tree != NULL) {
1347 sysctl_ctx_free(&sc->sysctl_ctx);
1348 sc->sysctl_tree = NULL;
/*
 * Create the hw.mxgeN sysctl tree: static device information, tunables,
 * the firmware's big-endian statistics block, and a per-slice subtree of
 * debugging counters.  Undone by mxge_rem_sysctls().
 *
 * Fixes in this revision (copy-paste description strings only; no
 * behavioral change):
 *  - "flow_control_enabled" was described as "Interrupt coalescing
 *    delay in usecs" (copied from the intr_coal_delay entry above it)
 *  - "rx_big_cnt" was described as "rx_small_cnt"
 *  - "tx_pkt_done" was described as "tx_done"
 */
1353 mxge_add_sysctls(mxge_softc_t *sc)
1355 struct sysctl_ctx_list *ctx;
1356 struct sysctl_oid_list *children;
1358 struct mxge_slice_state *ss;
1362 ctx = &sc->sysctl_ctx;
1363 sysctl_ctx_init(ctx);
1364 sc->sysctl_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_STATIC_CHILDREN(_hw),
1365 OID_AUTO, device_get_nameunit(sc->dev), CTLFLAG_RD, 0, "");
1366 if (sc->sysctl_tree == NULL) {
1367 device_printf(sc->dev, "can't add sysctl node\n");
1371 children = SYSCTL_CHILDREN(sc->sysctl_tree);
1372 fw = sc->ss[0].fw_stats;
1375 * Random information
1377 SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "firmware_version",
1378 CTLFLAG_RD, &sc->fw_version, 0, "firmware version");
1380 SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "serial_number",
1381 CTLFLAG_RD, &sc->serial_number_string, 0, "serial number");
1383 SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "product_code",
1384 CTLFLAG_RD, &sc->product_code_string, 0, "product code");
1386 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "pcie_link_width",
1387 CTLFLAG_RD, &sc->link_width, 0, "link width");
1389 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_boundary",
1390 CTLFLAG_RD, &sc->tx_boundary, 0, "tx boundary");
1392 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "write_combine",
1393 CTLFLAG_RD, &sc->wc, 0, "write combining PIO");
1395 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "read_dma_MBs",
1396 CTLFLAG_RD, &sc->read_dma, 0, "DMA Read speed in MB/s");
1398 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "write_dma_MBs",
1399 CTLFLAG_RD, &sc->write_dma, 0, "DMA Write speed in MB/s");
1401 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "read_write_dma_MBs",
1402 CTLFLAG_RD, &sc->read_write_dma, 0,
1403 "DMA concurrent Read/Write speed in MB/s");
1405 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "watchdog_resets",
1406 CTLFLAG_RD, &sc->watchdog_resets, 0,
1407 "Number of times NIC was reset");
1410 * Performance related tunables
1412 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "intr_coal_delay",
1413 CTLTYPE_INT|CTLFLAG_RW, sc, 0, mxge_change_intr_coal, "I",
1414 "Interrupt coalescing delay in usecs");
1416 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "throttle",
1417 CTLTYPE_INT|CTLFLAG_RW, sc, 0, mxge_change_throttle, "I",
1418 "Transmit throttling");
1420 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "flow_control_enabled",
1421 CTLTYPE_INT|CTLFLAG_RW, sc, 0, mxge_change_flow_control, "I",
1422 "Enable 802.3x flow control (pause frames)");
1424 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "deassert_wait",
1425 CTLFLAG_RW, &mxge_deassert_wait, 0,
1426 "Wait for IRQ line to go low in ihandler");
1429 * Stats block from firmware is in network byte order.
1432 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "link_up",
1433 CTLTYPE_INT|CTLFLAG_RD, &fw->link_up, 0,
1434 mxge_handle_be32, "I", "link up");
1436 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rdma_tags_available",
1437 CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available, 0,
1438 mxge_handle_be32, "I", "rdma_tags_available");
1440 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_bad_crc32",
1441 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_bad_crc32, 0,
1442 mxge_handle_be32, "I", "dropped_bad_crc32");
1444 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_bad_phy",
1445 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_bad_phy, 0,
1446 mxge_handle_be32, "I", "dropped_bad_phy");
1448 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_link_error_or_filtered",
1449 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_error_or_filtered, 0,
1450 mxge_handle_be32, "I", "dropped_link_error_or_filtered");
1452 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_link_overflow",
1453 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow, 0,
1454 mxge_handle_be32, "I", "dropped_link_overflow");
1456 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_multicast_filtered",
1457 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_multicast_filtered, 0,
1458 mxge_handle_be32, "I", "dropped_multicast_filtered");
1460 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_no_big_buffer",
1461 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer, 0,
1462 mxge_handle_be32, "I", "dropped_no_big_buffer");
1464 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_no_small_buffer",
1465 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_small_buffer, 0,
1466 mxge_handle_be32, "I", "dropped_no_small_buffer");
1468 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_overrun",
1469 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun, 0,
1470 mxge_handle_be32, "I", "dropped_overrun");
1472 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_pause",
1473 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_pause, 0,
1474 mxge_handle_be32, "I", "dropped_pause");
1476 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_runt",
1477 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt, 0,
1478 mxge_handle_be32, "I", "dropped_runt");
1480 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_unicast_filtered",
1481 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered, 0,
1482 mxge_handle_be32, "I", "dropped_unicast_filtered");
1484 /* add counters exported for debugging from all slices */
1485 sysctl_ctx_init(&sc->slice_sysctl_ctx);
1486 sc->slice_sysctl_tree = SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx,
1487 children, OID_AUTO, "slice", CTLFLAG_RD, 0, "");
1488 if (sc->slice_sysctl_tree == NULL) {
1489 device_printf(sc->dev, "can't add slice sysctl node\n");
1493 for (slice = 0; slice < sc->num_slices; slice++) {
1494 ss = &sc->ss[slice];
1495 sysctl_ctx_init(&ss->sysctl_ctx);
1496 ctx = &ss->sysctl_ctx;
1497 children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1498 ksprintf(slice_num, "%d", slice);
1499 ss->sysctl_tree = SYSCTL_ADD_NODE(ctx, children, OID_AUTO,
1500 slice_num, CTLFLAG_RD, 0, "");
1501 if (ss->sysctl_tree == NULL) {
1502 device_printf(sc->dev,
1503 "can't add %d slice sysctl node\n", slice);
1504 return; /* XXX continue? */
1506 children = SYSCTL_CHILDREN(ss->sysctl_tree);
1509 * XXX change to ULONG
1512 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_small_cnt",
1513 CTLFLAG_RD, &ss->rx_small.cnt, 0, "rx_small_cnt");
1515 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_big_cnt",
1516 CTLFLAG_RD, &ss->rx_big.cnt, 0, "rx_big_cnt");
1518 #ifndef IFNET_BUF_RING
1519 /* only transmit from slice 0 for now */
1524 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_req",
1525 CTLFLAG_RD, &ss->tx.req, 0, "tx_req");
1527 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_done",
1528 CTLFLAG_RD, &ss->tx.done, 0, "tx_done");
1530 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_pkt_done",
1531 CTLFLAG_RD, &ss->tx.pkt_done, 0, "tx_pkt_done");
1533 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_queue_active",
1534 CTLFLAG_RD, &ss->tx.queue_active, 0, "tx_queue_active");
1536 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_activate",
1537 CTLFLAG_RD, &ss->tx.activate, 0, "tx_activate");
1539 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_deactivate",
1540 CTLFLAG_RD, &ss->tx.deactivate, 0, "tx_deactivate");
1545 * Copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1546 * backwards one at a time and handle ring wraps
/*
 * Descriptors are written last-to-first so the first (valid-flag
 * carrying) slot is only reached by the caller afterwards.
 */
1548 static __inline void
1549 mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1550     mcp_kreq_ether_send_t *src, int cnt)
1552 int idx, starting_slot;
1554 starting_slot = tx->req;
/* NOTE(review): the enclosing loop header is elided in this listing;
 * idx wraps around the ring via tx->mask. */
1557 idx = (starting_slot + cnt) & tx->mask;
1558 mxge_pio_copy(&tx->lanai[idx], &src[cnt], sizeof(*src));
1564 * Copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1565 * at most 32 bytes at a time, so as to avoid involving the software
1566 * pio handler in the nic.  We re-write the first segment's flags
1567 * to mark them valid only after writing the entire chain
1569 static __inline void
1570 mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src, int cnt)
1574 volatile uint32_t *dst_ints;
1575 mcp_kreq_ether_send_t *srcp;
1576 volatile mcp_kreq_ether_send_t *dstp, *dst;
1579 idx = tx->req & tx->mask;
/* Remember the real flags; the first slot is written invalid first */
1581 last_flags = src->flags;
1584 dst = dstp = &tx->lanai[idx];
/* Fast path: the chain does not wrap the ring */
1587 if ((idx + cnt) < tx->mask) {
1588 for (i = 0; i < cnt - 1; i += 2) {
1589 mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1590 wmb(); /* force write every 32 bytes */
1596 * Submit all but the first request, and ensure
1597 * that it is submitted below
1599 mxge_submit_req_backwards(tx, src, cnt);
1603 /* Submit the first request */
1604 mxge_pio_copy(dstp, srcp, sizeof(*src));
1605 wmb(); /* barrier before setting valid flag */
1608 /* Re-write the last 32-bits with the valid flags */
1609 src->flags = last_flags;
1610 src_ints = (uint32_t *)src;
1612 dst_ints = (volatile uint32_t *)dst;
1614 *dst_ints = *src_ints;
/*
 * Ensure the entire TSO header (link + IP + TCP) is contiguous in the
 * first mbuf, pulling it up if necessary.  Header lengths come from the
 * csum_* fields the stack filled in on the packet header.
 */
1620 mxge_pullup_tso(struct mbuf **mp)
1622 int hoff, iphlen, thoff;
1626 KASSERT(M_WRITABLE(m), ("TSO mbuf not writable"));
1628 iphlen = m->m_pkthdr.csum_iphlen;
1629 thoff = m->m_pkthdr.csum_thlen;
1630 hoff = m->m_pkthdr.csum_lhlen;
1632 KASSERT(iphlen > 0, ("invalid ip hlen"));
1633 KASSERT(thoff > 0, ("invalid tcp hlen"));
1634 KASSERT(hoff > 0, ("invalid ether hlen"));
/* Headers split across mbufs: coalesce them into the first one */
1636 if (__predict_false(m->m_len < hoff + iphlen + thoff)) {
1637 m = m_pullup(m, hoff + iphlen + thoff);
/*
 * Build the firmware send-request chain for a TSO packet that has
 * already been DMA-mapped into busdma_seg_cnt segments.  Splits
 * segments at MSS boundaries ("cuts") and back-patches rdma_count
 * fields, which cannot be known until the next cut or end of packet.
 * NOTE(review): this listing elides interior lines (loop bodies,
 * error paths); comments describe only what is visible.
 */
1648 mxge_encap_tso(mxge_tx_ring_t *tx, struct mbuf *m, int busdma_seg_cnt)
1650 mcp_kreq_ether_send_t *req;
1651 bus_dma_segment_t *seg;
1652 uint32_t low, high_swapped;
1653 int len, seglen, cum_len, cum_len_next;
1654 int next_is_first, chop, cnt, rdma_count, small;
1655 uint16_t pseudo_hdr_offset, cksum_offset, mss;
1656 uint8_t flags, flags_next;
1658 mss = m->m_pkthdr.tso_segsz;
1661 * Negative cum_len signifies to the send loop that we are
1662 * still in the header portion of the TSO packet.
1664 cum_len = -(m->m_pkthdr.csum_lhlen + m->m_pkthdr.csum_iphlen +
1665     m->m_pkthdr.csum_thlen);
1668 * TSO implies checksum offload on this hardware
1670 cksum_offset = m->m_pkthdr.csum_lhlen + m->m_pkthdr.csum_iphlen;
1671 flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1674 * For TSO, pseudo_hdr_offset holds mss.  The firmware figures
1675 * out where to put the checksum by parsing the header.
1677 pseudo_hdr_offset = htobe16(mss);
1685 * "rdma_count" is the number of RDMAs belonging to the current
1686 * packet BEFORE the current send request.  For non-TSO packets,
1687 * this is equal to "count".
1689 * For TSO packets, rdma_count needs to be reset to 0 after a
1692 * The rdma_count field of the send request is the number of
1693 * RDMAs of the packet starting at that request.  For TSO send
1694 * requests with one ore more cuts in the middle, this is the
1695 * number of RDMAs starting after the last cut in the request.
1696 * All previous segments before the last cut implicitly have 1
1699 * Since the number of RDMAs is not known beforehand, it must be
1700 * filled-in retroactively - after each segmentation cut or at
1701 * the end of the entire packet.
1704 while (busdma_seg_cnt) {
1706 * Break the busdma segment up into pieces
1708 low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1709 high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1713 flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1715 cum_len_next = cum_len + seglen;
/* Back-patch the rdma_count of the request that started this run */
1716 (req - rdma_count)->rdma_count = rdma_count + 1;
1717 if (__predict_true(cum_len >= 0)) {
/* Payload portion: cut at each MSS boundary */
1719 chop = (cum_len_next > mss);
1720 cum_len_next = cum_len_next % mss;
1721 next_is_first = (cum_len_next == 0);
1722 flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1724     next_is_first * MXGEFW_FLAGS_FIRST;
1725 rdma_count |= -(chop | next_is_first);
1726 rdma_count += chop & !next_is_first;
1727 } else if (cum_len_next >= 0) {
/* Header just ended: first payload request of the packet */
1732 small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1733 flags_next = MXGEFW_FLAGS_TSO_PLD |
1734     MXGEFW_FLAGS_FIRST |
1735     (small * MXGEFW_FLAGS_SMALL);
1738 req->addr_high = high_swapped;
1739 req->addr_low = htobe32(low);
1740 req->pseudo_hdr_offset = pseudo_hdr_offset;
1742 req->rdma_count = 1;
1743 req->length = htobe16(seglen);
1744 req->cksum_offset = cksum_offset;
1746     flags | ((cum_len & 1) * MXGEFW_FLAGS_ALIGN_ODD);
1749 cum_len = cum_len_next;
1754 if (__predict_false(cksum_offset > seglen))
1755 cksum_offset -= seglen;
/* Too many descriptors for one ring submission -- bail to error path */
1758 if (__predict_false(cnt > tx->max_desc))
/* Final back-patch for the requests after the last cut */
1764 (req - rdma_count)->rdma_count = rdma_count;
1768 req->flags |= MXGEFW_FLAGS_TSO_LAST;
1769 } while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1771 tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1772 mxge_submit_req(tx, tx->req_list, cnt);
1773 #ifdef IFNET_BUF_RING
1774 if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1775 /* tell the NIC to start polling this slice */
1777 tx->queue_active = 1;
/* Error path: release the DMA mapping for the dropped packet */
1785 bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1788 /* TODO update oerror counter */
/*
 * Encapsulate one mbuf chain into firmware send requests on the slice's
 * TX ring: DMA-map the chain, hand TSO frames to mxge_encap_tso(), and
 * otherwise emit one request per DMA segment (plus a zero-pad request
 * for runts shorter than 60 bytes).
 * NOTE(review): interior lines are elided in this listing.
 */
1794 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
1797 mcp_kreq_ether_send_t *req;
1798 bus_dma_segment_t *seg;
1800 int cnt, cum_len, err, i, idx, odd_flag;
1801 uint16_t pseudo_hdr_offset;
1802 uint8_t flags, cksum_offset;
/* TSO headers must be contiguous before mapping */
1807 if (m->m_pkthdr.csum_flags & CSUM_TSO) {
1808 err = mxge_pullup_tso(&m);
1809 if (__predict_false(err))
1814 * Map the frame for DMA
1816 idx = tx->req & tx->mask;
1817 err = bus_dmamap_load_mbuf_defrag(tx->dmat, tx->info[idx].map, &m,
1818     tx->seg_list, tx->max_desc - 2, &cnt, BUS_DMA_NOWAIT);
1819 if (__predict_false(err != 0))
1821 bus_dmamap_sync(tx->dmat, tx->info[idx].map, BUS_DMASYNC_PREWRITE);
1822 tx->info[idx].m = m;
1825 * TSO is different enough, we handle it in another routine
1827 if (m->m_pkthdr.csum_flags & CSUM_TSO)
1828 return mxge_encap_tso(tx, m, cnt);
1832 pseudo_hdr_offset = 0;
1833 flags = MXGEFW_FLAGS_NO_TSO;
1836 * Checksum offloading
1838 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
1839 cksum_offset = m->m_pkthdr.csum_lhlen + m->m_pkthdr.csum_iphlen;
1840 pseudo_hdr_offset = cksum_offset + m->m_pkthdr.csum_data;
1841 pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
1842 req->cksum_offset = cksum_offset;
1843 flags |= MXGEFW_FLAGS_CKSUM;
1844 odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
1848 if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
1849 flags |= MXGEFW_FLAGS_SMALL;
1852 * Convert segments into a request list
1856 req->flags = MXGEFW_FLAGS_FIRST;
1857 for (i = 0; i < cnt; i++) {
1858 req->addr_low = htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
1859 req->addr_high = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1860 req->length = htobe16(seg->ds_len);
1861 req->cksum_offset = cksum_offset;
1862 if (cksum_offset > seg->ds_len)
1863 cksum_offset -= seg->ds_len;
1866 req->pseudo_hdr_offset = pseudo_hdr_offset;
1867 req->pad = 0; /* complete solid 16-byte block */
1868 req->rdma_count = 1;
1869 req->flags |= flags | ((cum_len & 1) * odd_flag);
1870 cum_len += seg->ds_len;
1878 * Pad runt to 60 bytes
/* Padding is DMA'd from a pre-allocated all-zero buffer */
1883     htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.dmem_busaddr));
1885     htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.dmem_busaddr));
1886 req->length = htobe16(60 - cum_len);
1887 req->cksum_offset = 0;
1888 req->pseudo_hdr_offset = pseudo_hdr_offset;
1889 req->pad = 0; /* complete solid 16-byte block */
1890 req->rdma_count = 1;
1891 req->flags |= flags | ((cum_len & 1) * odd_flag);
1895 tx->req_list[0].rdma_count = cnt;
1897 /* print what the firmware will see */
1898 for (i = 0; i < cnt; i++) {
1899 kprintf("%d: addr: 0x%x 0x%x len:%d pso%d,"
1900     "cso:%d, flags:0x%x, rdma:%d\n",
1901     i, (int)ntohl(tx->req_list[i].addr_high),
1902     (int)ntohl(tx->req_list[i].addr_low),
1903     (int)ntohs(tx->req_list[i].length),
1904     (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
1905     tx->req_list[i].cksum_offset, tx->req_list[i].flags,
1906     tx->req_list[i].rdma_count);
1908 kprintf("--------------\n");
1910 tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1911 mxge_submit_req(tx, tx->req_list, cnt);
1912 #ifdef IFNET_BUF_RING
1913 if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1914 /* tell the NIC to start polling this slice */
1916 tx->queue_active = 1;
/*
 * ifnet transmit-start routine: drain the send queue into slice 0's TX
 * ring while descriptor space remains; set OACTIVE when the ring is
 * nearly full so the stack stops handing us packets.
 */
1930 mxge_start(struct ifnet *ifp, struct ifaltq_subque *ifsq)
1932 mxge_softc_t *sc = ifp->if_softc;
1933 struct mxge_slice_state *ss;
1937 ASSERT_ALTQ_SQ_DEFAULT(ifp, ifsq);
1938 ASSERT_SERIALIZED(sc->ifp->if_serializer);
1940 if ((ifp->if_flags & IFF_RUNNING) == 0 || ifsq_is_oactive(ifsq))
1943 /* XXX Only use the first slice for now */
/* Loop while at least max_desc free slots remain on the ring */
1947 while (tx->mask - (tx->req - tx->done) > tx->max_desc) {
1951 m = ifsq_dequeue(ifsq);
1956 error = mxge_encap(ss, m);
1961 /* Ran out of transmit slots */
1962 ifsq_set_oactive(ifsq);
/*
 * Transmit watchdog: if the pause-frame counter has NOT advanced since
 * the last timeout, the NIC is genuinely stuck -- warn and reset it.
 * Otherwise the stall is caused by the link partner's flow control, so
 * just log that and remember the new pause count.
 */
1969 mxge_watchdog(struct ifnet *ifp)
1971 struct mxge_softc *sc = ifp->if_softc;
1972 uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
1973 mxge_tx_ring_t *tx = &sc->ss[0].tx;
1975 ASSERT_SERIALIZED(ifp->if_serializer);
1977 /* Check for pause blocking before resetting */
1978 if (tx->watchdog_rx_pause == rx_pause) {
1979 mxge_warn_stuck(sc, tx, 0);
1980 mxge_watchdog_reset(sc);
1983 if_printf(ifp, "Flow control blocking xmits, "
1984     "check link partner\n");
1986 tx->watchdog_rx_pause = rx_pause;
1990 * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
1991 * at most 32 bytes at a time, so as to avoid involving the software
1992 * pio handler in the nic.  We re-write the first segment's low
1993 * DMA address to mark it valid only after we write the entire chunk
1996 static __inline void
1997 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
1998     mcp_kreq_ether_recv_t *src)
/* Poison the first slot's address so the NIC ignores it until done */
2002 low = src->addr_low;
2003 src->addr_low = 0xffffffff;
2004 mxge_pio_copy(dst, src, 4 * sizeof (*src));
2006 mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
/* Restore the real address; writing dst makes the block valid */
2008 src->addr_low = low;
2009 dst->addr_low = low;
/*
 * Allocate, DMA-map, and post one small (MHLEN) receive buffer at ring
 * slot idx.  During initialization (init != 0) allocation may block and
 * failures bail out early; at runtime failures leave the old buffer in
 * place.  Buffers are handed to the NIC in batches of 8.
 */
2014 mxge_get_buf_small(mxge_rx_ring_t *rx, bus_dmamap_t map, int idx,
2017 bus_dma_segment_t seg;
2019 int cnt, err, mflag;
2021 mflag = MB_DONTWAIT;
2022 if (__predict_false(init))
2025 m = m_gethdr(mflag, MT_DATA);
2029 if (__predict_false(init)) {
2031 * During initialization, there
2032 * is nothing to setup; bail out
2038 m->m_len = m->m_pkthdr.len = MHLEN;
2040 err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2041     &seg, 1, &cnt, BUS_DMA_NOWAIT);
2044 if (__predict_false(init)) {
2046 * During initialization, there
2047 * is nothing to setup; bail out
2054 rx->info[idx].m = m;
2055 rx->shadow[idx].addr_low = htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2056 rx->shadow[idx].addr_high = htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
/* Every 8th slot: push the batch of shadow entries to the NIC */
2060 mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
/*
 * Same as mxge_get_buf_small() but for the big receive ring: uses a
 * cluster (MCLBYTES) or jumbo-page mbuf depending on the configured
 * cluster size.  Buffers are posted to the NIC in batches of 8.
 */
2065 mxge_get_buf_big(mxge_rx_ring_t *rx, bus_dmamap_t map, int idx,
2068 bus_dma_segment_t seg;
2070 int cnt, err, mflag;
2072 mflag = MB_DONTWAIT;
2073 if (__predict_false(init))
2076 if (rx->cl_size == MCLBYTES)
2077 m = m_getcl(mflag, MT_DATA, M_PKTHDR);
2079 m = m_getjcl(mflag, MT_DATA, M_PKTHDR, MJUMPAGESIZE);
2083 if (__predict_false(init)) {
2085 * During initialization, there
2086 * is nothing to setup; bail out
2092 m->m_len = m->m_pkthdr.len = rx->mlen;
2094 err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2095     &seg, 1, &cnt, BUS_DMA_NOWAIT);
2098 if (__predict_false(init)) {
2100 * During initialization, there
2101 * is nothing to setup; bail out
2108 rx->info[idx].m = m;
2109 rx->shadow[idx].addr_low = htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2110 rx->shadow[idx].addr_high = htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
/* Every 8th slot: push the batch of shadow entries to the NIC */
2114 mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2119 * Myri10GE hardware checksums are not valid if the sender
2120 * padded the frame with non-zero padding.  This is because
2121 * the firmware just does a simple 16-bit 1s complement
2122 * checksum across the entire frame, excluding the first 14
2123 * bytes.  It is best to simply to check the checksum and
2124 * tell the stack about it only if the checksum is good
2126 static __inline uint16_t
2127 mxge_rx_csum(struct mbuf *m, int csum)
2129 const struct ether_header *eh;
2130 const struct ip *ip;
2133 eh = mtod(m, const struct ether_header *);
2135 /* Only deal with IPv4 TCP & UDP for now */
2136 if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2139 ip = (const struct ip *)(eh + 1);
2140 if (__predict_false(ip->ip_p != IPPROTO_TCP && ip->ip_p != IPPROTO_UDP))
/* Fold the pseudo-header into the firmware's raw frame checksum */
2144 c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2145     htonl(ntohs(csum) + ntohs(ip->ip_len) +
2146     - (ip->ip_hl << 2) + ip->ip_p));
/*
 * Strip an in-band 802.1q tag from a received frame: adjust the partial
 * checksum (the firmware summed the tag bytes too), record the tag in
 * the mbuf header, and slide the Ethernet addresses over the 4-byte
 * encapsulation.
 */
2155 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2157 struct ether_vlan_header *evl;
2160 evl = mtod(m, struct ether_vlan_header *);
2163 * Fix checksum by subtracting EVL_ENCAPLEN bytes after
2164 * what the firmware thought was the end of the ethernet
2168 /* Put checksum into host byte order */
2169 *csum = ntohs(*csum);
2171 partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
/* One's-complement subtraction with end-around carry folding */
2173 *csum += ((*csum) < ~partial);
2174 *csum = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2175 *csum = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2178 * Restore checksum to network byte order;
2179 * later consumers expect this
2181 *csum = htons(*csum);
2184 m->m_pkthdr.ether_vlantag = ntohs(evl->evl_tag);
2185 m->m_flags |= M_VLANTAG;
2188 * Remove the 802.1q header by copying the Ethernet
2189 * addresses over it and adjusting the beginning of
2190 * the data in the mbuf.  The encapsulated Ethernet
2191 * type field is already in place.
2193 bcopy((char *)evl, (char *)evl + EVL_ENCAPLEN,
2194     ETHER_HDR_LEN - ETHER_TYPE_LEN);
2195 m_adj(m, EVL_ENCAPLEN);
/*
 * Handle one completed receive on the big ring: replace the buffer,
 * swap DMA maps, strip firmware pad and any VLAN tag, validate the
 * checksum, and pass the mbuf up the stack.  On buffer-replacement
 * failure the frame is dropped and the old mbuf is re-posted.
 */
2199 static __inline void
2200 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2205 const struct ether_header *eh;
2207 bus_dmamap_t old_map;
2214 idx = rx->cnt & rx->mask;
2217 /* Save a pointer to the received mbuf */
2218 m = rx->info[idx].m;
2220 /* Try to replace the received mbuf */
2221 if (mxge_get_buf_big(rx, rx->extra_map, idx, FALSE)) {
2222 /* Drop the frame -- the old mbuf is re-cycled */
2223 IFNET_STAT_INC(ifp, ierrors, 1);
2227 /* Unmap the received buffer */
2228 old_map = rx->info[idx].map;
2229 bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2230 bus_dmamap_unload(rx->dmat, old_map);
2232 /* Swap the bus_dmamap_t's */
2233 rx->info[idx].map = rx->extra_map;
2234 rx->extra_map = old_map;
2237 * mcp implicitly skips 1st 2 bytes so that packet is properly
2240 m->m_data += MXGEFW_PAD;
2242 m->m_pkthdr.rcvif = ifp;
2243 m->m_len = m->m_pkthdr.len = len;
2247 eh = mtod(m, const struct ether_header *);
2248 if (eh->ether_type == htons(ETHERTYPE_VLAN))
2249 mxge_vlan_tag_remove(m, &csum);
2251 /* If the checksum is valid, mark it in the mbuf header */
2252 if ((ifp->if_capenable & IFCAP_RXCSUM) &&
2253     mxge_rx_csum(m, csum) == 0) {
2254 /* Tell the stack that the checksum is good */
2255 m->m_pkthdr.csum_data = 0xffff;
2256 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2259 ifp->if_input(ifp, m);
/*
 * Small-ring counterpart of mxge_rx_done_big(): identical flow but
 * replenishes via mxge_get_buf_small().
 */
2262 static __inline void
2263 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2267 const struct ether_header *eh;
2270 bus_dmamap_t old_map;
2277 idx = rx->cnt & rx->mask;
2280 /* Save a pointer to the received mbuf */
2281 m = rx->info[idx].m;
2283 /* Try to replace the received mbuf */
2284 if (mxge_get_buf_small(rx, rx->extra_map, idx, FALSE)) {
2285 /* Drop the frame -- the old mbuf is re-cycled */
2286 IFNET_STAT_INC(ifp, ierrors, 1);
2290 /* Unmap the received buffer */
2291 old_map = rx->info[idx].map;
2292 bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2293 bus_dmamap_unload(rx->dmat, old_map);
2295 /* Swap the bus_dmamap_t's */
2296 rx->info[idx].map = rx->extra_map;
2297 rx->extra_map = old_map;
2300 * mcp implicitly skips 1st 2 bytes so that packet is properly
2303 m->m_data += MXGEFW_PAD;
2305 m->m_pkthdr.rcvif = ifp;
2306 m->m_len = m->m_pkthdr.len = len;
2310 eh = mtod(m, const struct ether_header *);
2311 if (eh->ether_type == htons(ETHERTYPE_VLAN))
2312 mxge_vlan_tag_remove(m, &csum);
2314 /* If the checksum is valid, mark it in the mbuf header */
2315 if ((ifp->if_capenable & IFCAP_RXCSUM) &&
2316     mxge_rx_csum(m, csum) == 0) {
2317 /* Tell the stack that the checksum is good */
2318 m->m_pkthdr.csum_data = 0xffff;
2319 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2322 ifp->if_input(ifp, m);
/*
 * Drain the slice's receive-completion ring.  A non-zero length marks a
 * valid entry; it is cleared after consumption so the NIC can reuse the
 * slot.  Frames that fit a small buffer were received on the small
 * ring, everything else on the big ring.
 */
2325 static __inline void
2326 mxge_clean_rx_done(struct mxge_slice_state *ss)
2328 mxge_rx_done_t *rx_done = &ss->rx_done;
2330 while (rx_done->entry[rx_done->idx].length != 0) {
2331 uint16_t length, checksum;
2333 length = ntohs(rx_done->entry[rx_done->idx].length);
2334 rx_done->entry[rx_done->idx].length = 0;
/* NOTE(review): checksum appears to stay in network byte order here;
 * mxge_rx_csum()/mxge_vlan_tag_remove() do the swapping. */
2336 checksum = rx_done->entry[rx_done->idx].checksum;
2338 if (length <= (MHLEN - MXGEFW_PAD))
2339 mxge_rx_done_small(ss, length, checksum);
2341 mxge_rx_done_big(ss, length, checksum);
2344 rx_done->idx = rx_done->cnt & rx_done->mask;
/*
 * Reap transmit completions up to the firmware's reported index
 * (mcp_idx): free mbufs and DMA maps of finished frames, then clear
 * OACTIVE and restart the queue once enough ring space is free.
 */
2348 static __inline void
2349 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2356 ASSERT_SERIALIZED(ifp->if_serializer);
2358 while (tx->pkt_done != mcp_idx) {
2362 idx = tx->done & tx->mask;
2365 m = tx->info[idx].m;
2367 * mbuf and DMA map only attached to the first
2372 tx->info[idx].m = NULL;
2373 bus_dmamap_unload(tx->dmat, tx->info[idx].map);
/* flag marks the last descriptor of a packet */
2376 if (tx->info[idx].flag) {
2377 tx->info[idx].flag = 0;
2383 * If we have space, clear OACTIVE to tell the stack that
2384 * its OK to send packets
2386 if (tx->req - tx->done < (tx->mask + 1) / 4) {
2387 ifq_clr_oactive(&ifp->if_snd);
2388 if (tx->req == tx->done)
2392 if (!ifq_is_empty(&ifp->if_snd))
2395 #ifdef IFNET_BUF_RING
2396 if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2397 /* let the NIC stop polling this queue, since there
2398 * are no more transmits pending */
2399 if (tx->req == tx->done) {
2401 tx->queue_active = 0;
/*
 * XFP module compliance bits (one bit per media type) mapped to ifmedia
 * types; entries with flag 0 have no corresponding IFM_* constant.
 */
2409 static struct mxge_media_type mxge_xfp_media_types[] = {
2410 {IFM_10G_CX4, 0x7f, "10GBASE-CX4 (module)"},
2411 {IFM_10G_SR, (1 << 7), "10GBASE-SR"},
2412 {IFM_10G_LR, (1 << 6), "10GBASE-LR"},
2413 {0, (1 << 5), "10GBASE-ER"},
2414 {IFM_10G_LRM, (1 << 4), "10GBASE-LRM"},
2415 {0, (1 << 3), "10GBASE-SW"},
2416 {0, (1 << 2), "10GBASE-LW"},
2417 {0, (1 << 1), "10GBASE-EW"},
2418 {0, (1 << 0), "Reserved"}
/*
 * SFP+ module compliance bits mapped to ifmedia types; the first entry
 * (bitmask 0) is the default used when no compliance bit matches.
 */
2421 static struct mxge_media_type mxge_sfp_media_types[] = {
2422 {IFM_10G_TWINAX, 0, "10GBASE-Twinax"},
2423 {0, (1 << 7), "Reserved"},
2424 {IFM_10G_LRM, (1 << 6), "10GBASE-LRM"},
2425 {IFM_10G_LR, (1 << 5), "10GBASE-LR"},
2426 {IFM_10G_SR, (1 << 4), "10GBASE-SR"},
2427 {IFM_10G_TWINAX,(1 << 0), "10GBASE-Twinax"}
/*
 * Register media_type (always full-duplex Ethernet) with ifmedia,
 * select it as current, and cache it in the softc.
 */
2431 mxge_media_set(mxge_softc_t *sc, int media_type)
2433 ifmedia_add(&sc->media, IFM_ETHER | IFM_FDX | media_type, 0, NULL);
2434 ifmedia_set(&sc->media, IFM_ETHER | IFM_FDX | media_type);
2435 sc->current_media = media_type;
2436 sc->media.ifm_media = sc->media.ifm_cur->ifm_media;
/*
 * Determine the connector type (CX4 / XFP / SFP+ / Quad Ribbon Fiber)
 * from the character after the third dash of the EEPROM product code,
 * and seed the ifmedia state accordingly.
 */
2440 mxge_media_init(mxge_softc_t *sc)
2445 ifmedia_removeall(&sc->media);
2446 mxge_media_set(sc, IFM_AUTO);
2449 * parse the product code to deterimine the interface type
2450 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2451 * after the 3rd dash in the driver's cached copy of the
2452 * EEPROM's product code string.
2454 ptr = sc->product_code_string;
2456 if_printf(sc->ifp, "Missing product code\n");
/* Advance past three '-' separators */
2460 for (i = 0; i < 3; i++, ptr++) {
2461 ptr = strchr(ptr, '-');
2463 if_printf(sc->ifp, "only %d dashes in PC?!?\n", i);
2467 if (*ptr == 'C' || *(ptr +1) == 'C') {
/* -C or -2C is CX4 copper */
2469 sc->connector = MXGE_CX4;
2470 mxge_media_set(sc, IFM_10G_CX4);
2471 } else if (*ptr == 'Q') {
2472 /* -Q is Quad Ribbon Fiber */
2473 sc->connector = MXGE_QRF;
2474 if_printf(sc->ifp, "Quad Ribbon Fiber Media\n");
2475 /* FreeBSD has no media type for Quad ribbon fiber */
2476 } else if (*ptr == 'R') {
/* -R is XFP; actual module type probed later via I2C */
2478 sc->connector = MXGE_XFP;
2479 } else if (*ptr == 'S' || *(ptr +1) == 'S') {
2480 /* -S or -2S is SFP+ */
2481 sc->connector = MXGE_SFP;
2483 if_printf(sc->ifp, "Unknown media type: %c\n", *ptr);
2488 * Determine the media type for a NIC.  Some XFPs will identify
2489 * themselves only when their link is up, so this is initiated via a
2490 * link up interrupt.  However, this can potentially take up to
2491 * several milliseconds, so it is run via the watchdog routine, rather
2492 * than in the interrupt handler itself.
2495 mxge_media_probe(mxge_softc_t *sc)
2498 const char *cage_type;
2499 struct mxge_media_type *mxge_media_types = NULL;
2500 int i, err, ms, mxge_media_type_entries;
2503 sc->need_media_probe = 0;
2505 if (sc->connector == MXGE_XFP) {
/* XFP cage: use the XFP compliance table */
2507 mxge_media_types = mxge_xfp_media_types;
2508 mxge_media_type_entries = sizeof(mxge_xfp_media_types) /
2509     sizeof(mxge_xfp_media_types[0]);
2510 byte = MXGE_XFP_COMPLIANCE_BYTE;
2512 } else if (sc->connector == MXGE_SFP) {
2513 /* -S or -2S is SFP+ */
2514 mxge_media_types = mxge_sfp_media_types;
2515 mxge_media_type_entries = sizeof(mxge_sfp_media_types) /
2516     sizeof(mxge_sfp_media_types[0]);
2520 /* nothing to do; media type cannot change */
2525 * At this point we know the NIC has an XFP cage, so now we
2526 * try to determine what is in the cage by using the
2527 * firmware's XFP I2C commands to read the XFP 10GbE compilance
2528 * register.  We read just one byte, which may take over
2532 cmd.data0 = 0; /* just fetch 1 byte, not all 256 */
2534 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2535 if (err == MXGEFW_CMD_ERROR_I2C_FAILURE)
2536 if_printf(sc->ifp, "failed to read XFP\n");
2537 if (err == MXGEFW_CMD_ERROR_I2C_ABSENT)
2538 if_printf(sc->ifp, "Type R/S with no XFP!?!?\n");
2539 if (err != MXGEFW_CMD_OK)
2542 /* Now we wait for the data to be cached */
2544 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
/* Poll (up to ~50ms) while the firmware caches the I2C byte */
2545 for (ms = 0; err == EBUSY && ms < 50; ms++) {
2548 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2550 if (err != MXGEFW_CMD_OK) {
2551 if_printf(sc->ifp, "failed to read %s (%d, %dms)\n",
2552     cage_type, err, ms);
/* First table entry is matched by exact value, not by bit test */
2556 if (cmd.data0 == mxge_media_types[0].bitmask) {
2558 if_printf(sc->ifp, "%s:%s\n", cage_type,
2559     mxge_media_types[0].name);
2561 if (sc->current_media != mxge_media_types[0].flag) {
2562 mxge_media_init(sc);
2563 mxge_media_set(sc, mxge_media_types[0].flag);
2567 for (i = 1; i < mxge_media_type_entries; i++) {
2568 if (cmd.data0 & mxge_media_types[i].bitmask) {
2570 if_printf(sc->ifp, "%s:%s\n", cage_type,
2571     mxge_media_types[i].name);
2574 if (sc->current_media != mxge_media_types[i].flag) {
2575 mxge_media_init(sc);
2576 mxge_media_set(sc, mxge_media_types[i].flag);
2582 if_printf(sc->ifp, "%s media 0x%x unknown\n", cage_type,
/*
 * Per-slice interrupt handler.  Non-zero slices (MSI-X) only clean the
 * receive ring and re-arm.  Slice 0 additionally processes transmit
 * completions, handles legacy-IRQ deassertion, and (being the only
 * slice with meaningful firmware stats) tracks link state changes and
 * RDMA timeouts before passing the IRQ token back to the NIC.
 */
2588 mxge_intr(void *arg)
2590 struct mxge_slice_state *ss = arg;
2591 mxge_softc_t *sc = ss->sc;
2592 mcp_irq_data_t *stats = ss->fw_stats;
2593 mxge_tx_ring_t *tx = &ss->tx;
2594 mxge_rx_done_t *rx_done = &ss->rx_done;
2595 uint32_t send_done_count;
2599 #ifndef IFNET_BUF_RING
2600 /* an interrupt on a non-zero slice is implicitly valid
2601    since MSI-X irqs are not shared */
2603 mxge_clean_rx_done(ss);
2604 *ss->irq_claim = be32toh(3);
2609 /* make sure the DMA has finished */
2610 if (!stats->valid) {
2613 valid = stats->valid;
2615 if (sc->irq_type == PCI_INTR_TYPE_LEGACY) {
2616 /* lower legacy IRQ */
2617 *sc->irq_deassert = 0;
2618 if (!mxge_deassert_wait)
2619 /* don't wait for conf. that irq is low */
2625 /* loop while waiting for legacy irq deassertion */
2627 /* check for transmit completes and receives */
2628 send_done_count = be32toh(stats->send_done_count);
2629 while ((send_done_count != tx->pkt_done) ||
2630     (rx_done->entry[rx_done->idx].length != 0)) {
2631 if (send_done_count != tx->pkt_done)
2632 mxge_tx_done(ss, (int)send_done_count);
2633 mxge_clean_rx_done(ss);
2634 send_done_count = be32toh(stats->send_done_count);
2636 if (sc->irq_type == PCI_INTR_TYPE_LEGACY && mxge_deassert_wait)
2638 } while (*((volatile uint8_t *) &stats->valid));
2640 /* fw link & error stats meaningful only on the first slice */
2641 if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
2642 if (sc->link_state != stats->link_up) {
2643 sc->link_state = stats->link_up;
2644 if (sc->link_state) {
2645 sc->ifp->if_link_state = LINK_STATE_UP;
2646 if_link_state_change(sc->ifp);
2648 device_printf(sc->dev, "link up\n");
2650 sc->ifp->if_link_state = LINK_STATE_DOWN;
2651 if_link_state_change(sc->ifp);
2653 device_printf(sc->dev, "link down\n");
/* Media may identify itself only once link is up; probe later */
2655 sc->need_media_probe = 1;
2657 if (sc->rdma_tags_available !=
2658     be32toh(stats->rdma_tags_available)) {
2659 sc->rdma_tags_available =
2660     be32toh(stats->rdma_tags_available);
2661 device_printf(sc->dev, "RDMA timed out! %d tags "
2662     "left\n", sc->rdma_tags_available);
2665 if (stats->link_down) {
2666 sc->down_cnt += stats->link_down;
2668 sc->ifp->if_link_state = LINK_STATE_DOWN;
2669 if_link_state_change(sc->ifp);
2673 /* check to see if we have rx token to pass back */
2675 *ss->irq_claim = be32toh(3);
2676 *(ss->irq_claim + 1) = be32toh(3);
/*
 * ifnet init entry point: (re)open the interface if it is not already
 * running.  Must be called with the interface serializer held.
 */
2680 mxge_init(void *arg)
2682 struct mxge_softc *sc = arg;
2684 ASSERT_SERIALIZED(sc->ifp->if_serializer);
2685 if ((sc->ifp->if_flags & IFF_RUNNING) == 0)
/*
 * Release every mbuf still attached to this slice's big RX, small RX,
 * and TX rings, unloading each DMA map first.  The TX ring pointers may
 * be NULL on slices that never transmit.
 */
2690 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
2694 for (i = 0; i <= ss->rx_big.mask; i++) {
2695 if (ss->rx_big.info[i].m == NULL)
2697 bus_dmamap_unload(ss->rx_big.dmat, ss->rx_big.info[i].map);
2698 m_freem(ss->rx_big.info[i].m);
2699 ss->rx_big.info[i].m = NULL;
2702 for (i = 0; i <= ss->rx_small.mask; i++) {
2703 if (ss->rx_small.info[i].m == NULL)
2705 bus_dmamap_unload(ss->rx_small.dmat, ss->rx_small.info[i].map);
2706 m_freem(ss->rx_small.info[i].m);
2707 ss->rx_small.info[i].m = NULL;
2710 /* Transmit ring used only on the first slice */
2711 if (ss->tx.info == NULL)
2714 for (i = 0; i <= ss->tx.mask; i++) {
2715 ss->tx.info[i].flag = 0;
2716 if (ss->tx.info[i].m == NULL)
2718 bus_dmamap_unload(ss->tx.dmat, ss->tx.info[i].map);
2719 m_freem(ss->tx.info[i].m);
2720 ss->tx.info[i].m = NULL;
/* Free the queued mbufs of every slice (see mxge_free_slice_mbufs). */
2725 mxge_free_mbufs(mxge_softc_t *sc)
2729 	for (slice = 0; slice < sc->num_slices; slice++)
2730 		mxge_free_slice_mbufs(&sc->ss[slice]);
2734 mxge_free_slice_rings(struct mxge_slice_state *ss)
2738 if (ss->rx_done.entry != NULL) {
2739 mxge_dma_free(&ss->rx_done.dma);
2740 ss->rx_done.entry = NULL;
2743 if (ss->tx.req_bytes != NULL) {
2744 kfree(ss->tx.req_bytes, M_DEVBUF);
2745 ss->tx.req_bytes = NULL;
2748 if (ss->tx.seg_list != NULL) {
2749 kfree(ss->tx.seg_list, M_DEVBUF);
2750 ss->tx.seg_list = NULL;
2753 if (ss->rx_small.shadow != NULL) {
2754 kfree(ss->rx_small.shadow, M_DEVBUF);
2755 ss->rx_small.shadow = NULL;
2758 if (ss->rx_big.shadow != NULL) {
2759 kfree(ss->rx_big.shadow, M_DEVBUF);
2760 ss->rx_big.shadow = NULL;
2763 if (ss->tx.info != NULL) {
2764 if (ss->tx.dmat != NULL) {
2765 for (i = 0; i <= ss->tx.mask; i++) {
2766 bus_dmamap_destroy(ss->tx.dmat,
2767 ss->tx.info[i].map);
2769 bus_dma_tag_destroy(ss->tx.dmat);
2771 kfree(ss->tx.info, M_DEVBUF);
2775 if (ss->rx_small.info != NULL) {
2776 if (ss->rx_small.dmat != NULL) {
2777 for (i = 0; i <= ss->rx_small.mask; i++) {
2778 bus_dmamap_destroy(ss->rx_small.dmat,
2779 ss->rx_small.info[i].map);
2781 bus_dmamap_destroy(ss->rx_small.dmat,
2782 ss->rx_small.extra_map);
2783 bus_dma_tag_destroy(ss->rx_small.dmat);
2785 kfree(ss->rx_small.info, M_DEVBUF);
2786 ss->rx_small.info = NULL;
2789 if (ss->rx_big.info != NULL) {
2790 if (ss->rx_big.dmat != NULL) {
2791 for (i = 0; i <= ss->rx_big.mask; i++) {
2792 bus_dmamap_destroy(ss->rx_big.dmat,
2793 ss->rx_big.info[i].map);
2795 bus_dmamap_destroy(ss->rx_big.dmat,
2796 ss->rx_big.extra_map);
2797 bus_dma_tag_destroy(ss->rx_big.dmat);
2799 kfree(ss->rx_big.info, M_DEVBUF);
2800 ss->rx_big.info = NULL;
/* Free the ring resources of every slice (see mxge_free_slice_rings). */
2805 mxge_free_rings(mxge_softc_t *sc)
2812 	for (slice = 0; slice < sc->num_slices; slice++)
2813 		mxge_free_slice_rings(&sc->ss[slice]);
/*
 * Allocate one slice's host-side ring state: shadow rings and info
 * arrays for rx_small/rx_big, busdma tags plus one dmamap per ring
 * slot (and one spare "extra" map per rx ring), and -- on the first
 * slice only, unless IFNET_BUF_RING -- the tx request block, segment
 * list, info array, tag and per-slot maps.
 * Ring entry counts must be powers of two (masks are entries - 1).
 * NOTE(review): listing elided -- error-return paths, some argument
 * lines of the bus_dma_tag_create() calls (alignment/nsegments), and
 * closing braces sit on missing lines.
 */
2817 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
2818     int tx_ring_entries)
2820 	mxge_softc_t *sc = ss->sc;
2825 	 * Allocate per-slice receive resources
2828 	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
	/* rx_done (completion) ring holds 2 entries per rx buffer */
2829 	ss->rx_done.mask = (2 * rx_ring_entries) - 1;
2831 	/* Allocate the rx shadow rings */
2832 	bytes = rx_ring_entries * sizeof(*ss->rx_small.shadow);
2833 	ss->rx_small.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2835 	bytes = rx_ring_entries * sizeof(*ss->rx_big.shadow);
2836 	ss->rx_big.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2838 	/* Allocate the rx host info rings */
2839 	bytes = rx_ring_entries * sizeof(*ss->rx_small.info);
2840 	ss->rx_small.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2842 	bytes = rx_ring_entries * sizeof(*ss->rx_big.info);
2843 	ss->rx_big.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2845 	/* Allocate the rx busdma resources */
2846 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
2848 				 4096,			/* boundary */
2849 				 BUS_SPACE_MAXADDR,	/* low */
2850 				 BUS_SPACE_MAXADDR,	/* high */
2851 				 NULL, NULL,		/* filter */
2852 				 MHLEN,			/* maxsize */
2854 				 MHLEN,			/* maxsegsize */
2855 				 BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW,
2857 				 &ss->rx_small.dmat);	/* tag */
2859 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
2864 	err = bus_dmamap_create(ss->rx_small.dmat, BUS_DMA_WAITOK,
2865 	    &ss->rx_small.extra_map);
2867 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n", err);
2868 		bus_dma_tag_destroy(ss->rx_small.dmat);
2869 		ss->rx_small.dmat = NULL;
2872 	for (i = 0; i <= ss->rx_small.mask; i++) {
2873 		err = bus_dmamap_create(ss->rx_small.dmat, BUS_DMA_WAITOK,
2874 		    &ss->rx_small.info[i].map);
2878 			device_printf(sc->dev, "Err %d rx_small dmamap\n", err);
			/* unwind maps created so far, then the tag itself */
2880 			for (j = 0; j < i; ++j) {
2881 				bus_dmamap_destroy(ss->rx_small.dmat,
2882 				    ss->rx_small.info[j].map);
2884 			bus_dmamap_destroy(ss->rx_small.dmat,
2885 			    ss->rx_small.extra_map);
2886 			bus_dma_tag_destroy(ss->rx_small.dmat);
2887 			ss->rx_small.dmat = NULL;
2892 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
2894 				 4096,			/* boundary */
2895 				 BUS_SPACE_MAXADDR,	/* low */
2896 				 BUS_SPACE_MAXADDR,	/* high */
2897 				 NULL, NULL,		/* filter */
2900 				 4096,			/* maxsegsize*/
2901 				 BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW,
2903 				 &ss->rx_big.dmat);	/* tag */
2905 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
2910 	err = bus_dmamap_create(ss->rx_big.dmat, BUS_DMA_WAITOK,
2911 	    &ss->rx_big.extra_map);
2913 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n", err);
2914 		bus_dma_tag_destroy(ss->rx_big.dmat);
2915 		ss->rx_big.dmat = NULL;
2918 	for (i = 0; i <= ss->rx_big.mask; i++) {
2919 		err = bus_dmamap_create(ss->rx_big.dmat, BUS_DMA_WAITOK,
2920 		    &ss->rx_big.info[i].map);
2924 			device_printf(sc->dev, "Err %d rx_big dmamap\n", err);
2925 			for (j = 0; j < i; ++j) {
2926 				bus_dmamap_destroy(ss->rx_big.dmat,
2927 				    ss->rx_big.info[j].map);
2929 			bus_dmamap_destroy(ss->rx_big.dmat,
2930 			    ss->rx_big.extra_map);
2931 			bus_dma_tag_destroy(ss->rx_big.dmat);
2932 			ss->rx_big.dmat = NULL;
2938 	 * Now allocate TX resources
2941 #ifndef IFNET_BUF_RING
2942 	/* only use a single TX ring for now */
2943 	if (ss != ss->sc->ss)
2947 	ss->tx.mask = tx_ring_entries - 1;
	/* cap descriptors per packet so one request block always fits */
2948 	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
2950 	/* Allocate the tx request copy block XXX */
2951 	bytes = 8 + sizeof(*ss->tx.req_list) * (ss->tx.max_desc + 4);
2952 	ss->tx.req_bytes = kmalloc(bytes, M_DEVBUF, M_WAITOK);
2953 	/* Ensure req_list entries are aligned to 8 bytes */
2954 	ss->tx.req_list = (mcp_kreq_ether_send_t *)
2955 	    ((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
2957 	/* Allocate the tx busdma segment list */
2958 	bytes = sizeof(*ss->tx.seg_list) * ss->tx.max_desc;
2959 	ss->tx.seg_list = kmalloc(bytes, M_DEVBUF, M_WAITOK);
2961 	/* Allocate the tx host info ring */
2962 	bytes = tx_ring_entries * sizeof(*ss->tx.info);
2963 	ss->tx.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2965 	/* Allocate the tx busdma resources */
2966 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
2968 				 sc->tx_boundary,	/* boundary */
2969 				 BUS_SPACE_MAXADDR,	/* low */
2970 				 BUS_SPACE_MAXADDR,	/* high */
2971 				 NULL, NULL,		/* filter */
2973 				 sizeof(struct ether_vlan_header),
2975 				 ss->tx.max_desc - 2,	/* num segs */
2976 				 sc->tx_boundary,	/* maxsegsz */
2977 				 BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW |
2978 				 BUS_DMA_ONEBPAGE,	/* flags */
2979 				 &ss->tx.dmat);		/* tag */
2981 		device_printf(sc->dev, "Err %d allocating tx dmat\n", err);
2986 	 * Now use these tags to setup DMA maps for each slot in the ring
2988 	for (i = 0; i <= ss->tx.mask; i++) {
2989 		err = bus_dmamap_create(ss->tx.dmat,
2990 		    BUS_DMA_WAITOK | BUS_DMA_ONEBPAGE, &ss->tx.info[i].map);
2994 			device_printf(sc->dev, "Err %d tx dmamap\n", err);
2995 			for (j = 0; j < i; ++j) {
2996 				bus_dmamap_destroy(ss->tx.dmat,
2997 				    ss->tx.info[j].map);
2999 			bus_dma_tag_destroy(ss->tx.dmat);
/*
 * Query the firmware for the send-ring size, derive per-ring entry
 * counts, size the ifnet send queue, then allocate ring state for
 * every slice.  sc->rx_ring_size was cached by mxge_alloc_slices().
 * NOTE(review): listing elided -- error-return statements after the
 * printfs sit on missing lines.
 */
3008 mxge_alloc_rings(mxge_softc_t *sc)
3012 	int tx_ring_entries, rx_ring_entries;
3015 	/* Get ring sizes */
3016 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3018 		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3021 	tx_ring_size = cmd.data0;
3023 	tx_ring_entries = tx_ring_size / sizeof(mcp_kreq_ether_send_t);
3024 	rx_ring_entries = sc->rx_ring_size / sizeof(mcp_dma_addr_t);
3025 	ifq_set_maxlen(&sc->ifp->if_snd, tx_ring_entries - 1);
3026 	ifq_set_ready(&sc->ifp->if_snd);
3028 	for (slice = 0; slice < sc->num_slices; slice++) {
3029 		err = mxge_alloc_slice_rings(&sc->ss[slice],
3030 		    rx_ring_entries, tx_ring_entries);
3032 			device_printf(sc->dev,
3033 			    "alloc %d slice rings failed\n", slice);
/*
 * Pick the rx_big cluster size for a given MTU: a standard MCLBYTES
 * cluster when the padded frame fits, else one MJUMPAGESIZE cluster
 * (MTUs beyond that are rejected earlier; the KASSERT enforces it).
 */
3041 mxge_choose_params(int mtu, int *cl_size)
3043 	int bufsize = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN + MXGEFW_PAD;
3045 	if (bufsize < MCLBYTES) {
3046 		*cl_size = MCLBYTES;
3048 		KASSERT(bufsize < MJUMPAGESIZE, ("invalid MTU %d", mtu));
3049 		*cl_size = MJUMPAGESIZE;
/*
 * Bring one slice online: fetch the firmware (lanai) pointers for the
 * send and both receive rings, then pre-fill the small and big rx
 * rings with mbufs.  `slice` is recovered from pointer arithmetic
 * against the slice array base.
 * NOTE(review): listing elided -- the #ifndef IFNET_BUF_RING branch
 * body at 3066 and several error-return lines are missing.
 */
3054 mxge_slice_open(struct mxge_slice_state *ss, int cl_size)
3059 	slice = ss - ss->sc->ss;
3062 	 * Get the lanai pointers to the send and receive rings
3065 #ifndef IFNET_BUF_RING
3066 	/* We currently only send from the first slice */
3070 	err = mxge_send_cmd(ss->sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3071 	ss->tx.lanai = (volatile mcp_kreq_ether_send_t *)
3072 	    (ss->sc->sram + cmd.data0);
	/* doorbell registers are spaced 64 bytes apart per slice */
3073 	ss->tx.send_go = (volatile uint32_t *)
3074 	    (ss->sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3075 	ss->tx.send_stop = (volatile uint32_t *)
3076 	    (ss->sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3077 #ifndef IFNET_BUF_RING
3082 	err |= mxge_send_cmd(ss->sc, MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3083 	ss->rx_small.lanai =
3084 	    (volatile mcp_kreq_ether_recv_t *)(ss->sc->sram + cmd.data0);
3087 	err |= mxge_send_cmd(ss->sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3089 	    (volatile mcp_kreq_ether_recv_t *)(ss->sc->sram + cmd.data0);
3092 		if_printf(ss->sc->ifp,
3093 		    "failed to get ring sizes or locations\n");
3098 	 * Stock small receive ring
3100 	for (i = 0; i <= ss->rx_small.mask; i++) {
3101 		err = mxge_get_buf_small(&ss->rx_small,
3102 		    ss->rx_small.info[i].map, i, TRUE);
3104 			if_printf(ss->sc->ifp, "alloced %d/%d smalls\n", i,
3105 			    ss->rx_small.mask + 1);
3111 	 * Stock big receive ring
	/* poison shadow addresses so unfilled slots are never DMA'd */
3113 	for (i = 0; i <= ss->rx_big.mask; i++) {
3114 		ss->rx_big.shadow[i].addr_low = 0xffffffff;
3115 		ss->rx_big.shadow[i].addr_high = 0xffffffff;
3118 	ss->rx_big.cl_size = cl_size;
3119 	ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3120 	    EVL_ENCAPLEN + MXGEFW_PAD;
3122 	for (i = 0; i <= ss->rx_big.mask; i++) {
3123 		err = mxge_get_buf_big(&ss->rx_big,
3124 		    ss->rx_big.info[i].map, i, TRUE);
3126 			if_printf(ss->sc->ifp, "alloced %d/%d bigs\n", i,
3127 			    ss->rx_big.mask + 1);
/*
 * Full device bring-up under the ifnet serializer: reset the NIC,
 * program the RSS indirection table when multi-slice, negotiate TSO
 * mode, tell firmware the MTU and buffer sizes, hand it the per-slice
 * stats DMA blocks, open each slice, then issue ETHERNET_UP and mark
 * the interface running.  On failure the elided `abort:` path frees
 * queued mbufs (mxge_free_mbufs at the tail).
 * NOTE(review): listing elided -- many `if (err != 0)` lines, `goto`s
 * and the loop header around 3223 are on missing lines.
 */
3135 mxge_open(mxge_softc_t *sc)
3137 	struct ifnet *ifp = sc->ifp;
3139 	int err, slice, cl_size, i;
3141 	volatile uint8_t *itable;
3142 	struct mxge_slice_state *ss;
3144 	ASSERT_SERIALIZED(ifp->if_serializer);
3146 	/* Copy the MAC address in case it was overridden */
3147 	bcopy(IF_LLADDR(ifp), sc->mac_addr, ETHER_ADDR_LEN);
3149 	err = mxge_reset(sc, 1);
3151 		if_printf(ifp, "failed to reset\n");
3155 	if (sc->num_slices > 1) {
3156 		/* Setup the indirection table */
3157 		cmd.data0 = sc->num_slices;
3158 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE, &cmd);
3160 		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET, &cmd);
3162 			if_printf(ifp, "failed to setup rss tables\n");
3166 		/* Just enable an identity mapping */
3167 		itable = sc->sram + cmd.data0;
3168 		for (i = 0; i < sc->num_slices; i++)
3169 			itable[i] = (uint8_t)i;
3172 		cmd.data1 = MXGEFW_RSS_HASH_TYPE_TCP_IPV4;
3173 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3175 			if_printf(ifp, "failed to enable slices\n");
3180 	cmd.data0 = MXGEFW_TSO_MODE_NDIS;
3181 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_TSO_MODE, &cmd);
3184 		 * Can't change TSO mode to NDIS, never allow TSO then
		/* strip TSO from capabilities/hwassist; not a fatal error */
3186 		if_printf(ifp, "failed to set TSO mode\n");
3187 		ifp->if_capenable &= ~IFCAP_TSO;
3188 		ifp->if_capabilities &= ~IFCAP_TSO;
3189 		ifp->if_hwassist &= ~CSUM_TSO;
3192 	mxge_choose_params(ifp->if_mtu, &cl_size);
3195 	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS, &cmd);
3197 	 * Error is only meaningful if we're trying to set
3198 	 * MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1
3202 	 * Give the firmware the mtu and the big and small buffer
3203 	 * sizes.  The firmware wants the big buf size to be a power
3204 	 * of two. Luckily, FreeBSD's clusters are powers of two
3206 	cmd.data0 = ifp->if_mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
3207 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3209 	/* XXX need to cut MXGEFW_PAD here? */
3210 	cmd.data0 = MHLEN - MXGEFW_PAD;
3211 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE, &cmd);
3213 	cmd.data0 = cl_size;
3214 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3217 		if_printf(ifp, "failed to setup params\n");
3221 	/* Now give him the pointer to the stats block */
3223 #ifdef IFNET_BUF_RING
3224 	     slice < sc->num_slices;
3229 		ss = &sc->ss[slice];
3230 		cmd.data0 = MXGE_LOWPART_TO_U32(ss->fw_stats_dma.dmem_busaddr);
3231 		cmd.data1 = MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.dmem_busaddr);
3232 		cmd.data2 = sizeof(struct mcp_irq_data);
		/* encode the slice number in the upper half of data2 */
3233 		cmd.data2 |= (slice << 16);
3234 		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
	/* fall back to the obsolete stats DMA command on old firmware */
3238 		bus = sc->ss->fw_stats_dma.dmem_busaddr;
3239 		bus += offsetof(struct mcp_irq_data, send_done_count);
3240 		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3241 		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3242 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3245 		/* Firmware cannot support multicast without STATS_DMA_V2 */
3246 		sc->fw_multicast_support = 0;
3248 		sc->fw_multicast_support = 1;
3252 		if_printf(ifp, "failed to setup params\n");
3256 	for (slice = 0; slice < sc->num_slices; slice++) {
3257 		err = mxge_slice_open(&sc->ss[slice], cl_size);
3259 			if_printf(ifp, "couldn't open slice %d\n", slice);
3264 	/* Finally, start the firmware running */
3265 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3267 		if_printf(ifp, "Couldn't bring up link\n");
3270 	ifp->if_flags |= IFF_RUNNING;
3271 	ifq_clr_oactive(&ifp->if_snd);
	/* error unwind (label elided): drop any rx mbufs we stocked */
3277 	mxge_free_mbufs(sc);
/*
 * Bring the interface down: clear IFF_RUNNING, send ETHERNET_DOWN and
 * wait (by watching sc->down_cnt, bumped by the "link down" interrupt)
 * for the firmware to acknowledge, temporarily dropping the serializer
 * while delaying.  Finally free all queued mbufs.
 * NOTE(review): `down` parameter use and the retry loop around the
 * DELAY are partially on elided lines -- confirm against full source.
 */
3282 mxge_close(mxge_softc_t *sc, int down)
3284 	struct ifnet *ifp = sc->ifp;
3286 	int err, old_down_cnt;
3288 	ASSERT_SERIALIZED(ifp->if_serializer);
3290 	ifp->if_flags &= ~IFF_RUNNING;
3291 	ifq_clr_oactive(&ifp->if_snd);
3295 	old_down_cnt = sc->down_cnt;
3298 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3300 		if_printf(ifp, "Couldn't bring down link\n");
3302 	if (old_down_cnt == sc->down_cnt) {
3303 		/* Wait for down irq */
3304 		lwkt_serialize_exit(ifp->if_serializer);
3305 		DELAY(10 * sc->intr_coal_delay);
3306 		lwkt_serialize_enter(ifp->if_serializer);
3310 	if (old_down_cnt == sc->down_cnt)
3311 		if_printf(ifp, "never got down irq\n");
3313 	mxge_free_mbufs(sc);
/*
 * Program PCI config space: read the PCIe link width from the link
 * status register (cap + 0x12), raise the max read request size to
 * 4KB in device control (cap + 0x8), and enable busmastering.  On a
 * watchdog reset the previously saved sc->pectl is restored instead.
 * NOTE(review): '®' below is a mangled '&reg' (HTML-entity damage in
 * this listing); the real argument is the address of `reg`.
 */
3317 mxge_setup_cfg_space(mxge_softc_t *sc)
3319 	device_t dev = sc->dev;
3321 	uint16_t lnk, pectl;
3323 	/* Find the PCIe link width and set max read request to 4KB */
3324 	if (pci_find_extcap(dev, PCIY_EXPRESS, ®) == 0) {
3325 		lnk = pci_read_config(dev, reg + 0x12, 2);
3326 		sc->link_width = (lnk >> 4) & 0x3f;
3328 		if (sc->pectl == 0) {
3329 			pectl = pci_read_config(dev, reg + 0x8, 2);
			/* MRRS field is bits 14:12; 5 == 4096 bytes */
3330 			pectl = (pectl & ~0x7000) | (5 << 12);
3331 			pci_write_config(dev, reg + 0x8, pectl, 2);
3334 			/* Restore saved pectl after watchdog reset */
3335 			pci_write_config(dev, reg + 0x8, sc->pectl, 2);
3339 	/* Enable DMA and memory space access */
3340 	pci_enable_busmaster(dev);
/*
 * Fetch the NIC's reboot status via its PCI vendor-specific
 * capability: enable 32-bit window reads, point the window at the
 * reboot-status register (0xfffffff0), and read it back.  Returns
 * (uint32_t)-1 if the vendor capability cannot be located.
 */
3344 mxge_read_reboot(mxge_softc_t *sc)
3346 	device_t dev = sc->dev;
3349 	/* Find the vendor specific offset */
3350 	if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
3351 		if_printf(sc->ifp, "could not find vendor specific offset\n");
3352 		return (uint32_t)-1;
3354 	/* Enable read32 mode */
3355 	pci_write_config(dev, vs + 0x10, 0x3, 1);
3356 	/* Tell NIC which register to read */
3357 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3358 	return pci_read_config(dev, vs + 0x14, 4);
/*
 * Recover from a watchdog-detected hang.  If PCI config reads 0xffff
 * the NIC is mid-reboot: poll briefly for it to return.  If the
 * busmaster bit was lost the NIC did reboot: report the reboot code,
 * quiesce and mark link down, restore config space, reload firmware,
 * and reopen the interface if it was running.  Otherwise (registers
 * intact) the hang is elsewhere and no reset is attempted.  The tick
 * callout is re-armed at the end regardless.
 * NOTE(review): the quiesce/close step around 3400 and several error
 * branches sit on elided lines.
 */
3362 mxge_watchdog_reset(mxge_softc_t *sc)
3364 	struct pci_devinfo *dinfo;
3371 	if_printf(sc->ifp, "Watchdog reset!\n");
3374 	 * Check to see if the NIC rebooted.  If it did, then all of
3375 	 * PCI config space has been reset, and things like the
3376 	 * busmaster bit will be zero.  If this is the case, then we
3377 	 * must restore PCI config space before the NIC can be used
3380 	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3381 	if (cmd == 0xffff) {
3383 		 * Maybe the watchdog caught the NIC rebooting; wait
3384 		 * up to 100ms for it to finish.  If it does not come
3385 		 * back, then give up
3388 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3390 			if_printf(sc->ifp, "NIC disappeared!\n");
3392 	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3393 		/* Print the reboot status */
3394 		reboot = mxge_read_reboot(sc);
3395 		if_printf(sc->ifp, "NIC rebooted, status = 0x%x\n", reboot);
3397 		running = sc->ifp->if_flags & IFF_RUNNING;
3400 		 * Quiesce NIC so that TX routines will not try to
3401 		 * xmit after restoration of BAR
3404 		/* Mark the link as down */
3405 		if (sc->link_state) {
3406 			sc->ifp->if_link_state = LINK_STATE_DOWN;
3407 			if_link_state_change(sc->ifp);
3411 		/* Restore PCI configuration space */
3412 		dinfo = device_get_ivars(sc->dev);
3413 		pci_cfg_restore(sc->dev, dinfo);
3415 		/* And redo any changes we made to our config space */
3416 		mxge_setup_cfg_space(sc);
3419 		err = mxge_load_firmware(sc, 0);
3421 			if_printf(sc->ifp, "Unable to re-load f/w\n");
3422 		if (running && !err) {
3423 			err = mxge_open(sc);
			/* kick the TX path again once reopened */
3424 			if_devstart_sched(sc->ifp);
3426 		sc->watchdog_resets++;
3428 		if_printf(sc->ifp, "NIC did not reboot, not resetting\n");
3432 		if_printf(sc->ifp, "watchdog reset failed\n");
3436 	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
/*
 * Dump TX ring state when a slice appears wedged: request/done
 * counters, queue_active, activate/deactivate counts, and the
 * host vs firmware send-completion counters.
 * (The "struck?" spelling matches the upstream FreeBSD driver.)
 */
3441 mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
3443 	if_printf(sc->ifp, "slice %d struck? ring state:\n", slice);
3444 	if_printf(sc->ifp, "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3445 	    tx->req, tx->done, tx->queue_active);
3446 	if_printf(sc->ifp, "tx.activate=%d tx.deactivate=%d\n",
3447 	    tx->activate, tx->deactivate);
3448 	if_printf(sc->ifp, "pkt_done=%d fw=%d\n",
3449 	    tx->pkt_done, be32toh(sc->ss->fw_stats->send_done_count));
/*
 * Fold per-slice packet counters into the ifnet statistics and return
 * the number of packets (rx+tx) seen since the previous call -- the
 * tick handler uses that delta to decide whether the NIC is idle.
 * With IFNET_BUF_RING, byte/mcast/drop counters are aggregated too.
 */
3453 mxge_update_stats(mxge_softc_t *sc)
3455 	struct mxge_slice_state *ss;
3457 	u_long ipackets = 0, old_ipackets;
3458 	u_long opackets = 0, old_opackets;
3459 #ifdef IFNET_BUF_RING
3467 	for (slice = 0; slice < sc->num_slices; slice++) {
3468 		ss = &sc->ss[slice];
3469 		ipackets += ss->ipackets;
3470 		opackets += ss->opackets;
3471 #ifdef IFNET_BUF_RING
3472 		obytes += ss->obytes;
3473 		omcasts += ss->omcasts;
3474 		odrops += ss->tx.br->br_drops;
3476 		oerrors += ss->oerrors;
3478 	IFNET_STAT_GET(sc->ifp, ipackets, old_ipackets);
3479 	IFNET_STAT_GET(sc->ifp, opackets, old_opackets);
	/* delta vs the totals published on the previous update */
3481 	pkts = ipackets - old_ipackets;
3482 	pkts += opackets - old_opackets;
3484 	IFNET_STAT_SET(sc->ifp, ipackets, ipackets);
3485 	IFNET_STAT_SET(sc->ifp, opackets, opackets);
3486 #ifdef IFNET_BUF_RING
3487 	sc->ifp->if_obytes = obytes;
3488 	sc->ifp->if_omcasts = omcasts;
3489 	sc->ifp->if_snd.ifq_drops = odrops;
3491 	IFNET_STAT_SET(sc->ifp, oerrors, oerrors);
/*
 * Periodic callout: under the ifnet serializer, aggregate stats and
 * re-probe media if flagged; when the interface is down, verify the
 * NIC has not lost its busmaster bit (hardware fault) and trigger a
 * watchdog reset if it has.  Re-arms itself with a longer period when
 * idle (interval selection is on elided lines).
 */
3496 mxge_tick(void *arg)
3498 	mxge_softc_t *sc = arg;
3504 	lwkt_serialize_enter(sc->ifp->if_serializer);
3507 	if (sc->ifp->if_flags & IFF_RUNNING) {
3508 		/* Aggregate stats from different slices */
3509 		pkts = mxge_update_stats(sc);
3510 		if (sc->need_media_probe)
3511 			mxge_media_probe(sc);
3514 		/* Ensure NIC did not suffer h/w fault while idle */
3515 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3516 		if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3518 			mxge_watchdog_reset(sc);
3521 		/* Look less often if NIC is idle */
3526 	callout_reset(&sc->co_hdl, ticks, mxge_tick, sc);
3528 	lwkt_serialize_exit(sc->ifp->if_serializer);
/*
 * ifmedia change callback.  NOTE(review): body entirely elided in this
 * listing; presumably rejects manual media changes (media is fixed).
 */
3532 mxge_media_change(struct ifnet *ifp)
/*
 * Validate and apply a new MTU.  Rejects values whose framed size is
 * below the ethernet minimum or above sc->max_mtu.  If the interface
 * is running it is closed and reopened (elided lines); on a failed
 * reopen the previous MTU is restored.
 */
3538 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3540 	struct ifnet *ifp = sc->ifp;
3541 	int real_mtu, old_mtu;
3544 	real_mtu = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
3545 	if (mtu > sc->max_mtu || real_mtu < 60)
3548 	old_mtu = ifp->if_mtu;
3550 	if (ifp->if_flags & IFF_RUNNING) {
3552 		err = mxge_open(sc);
3554 			ifp->if_mtu = old_mtu;
/*
 * ifmedia status callback: always reports full-duplex ethernet with
 * the currently detected media subtype; IFM_ACTIVE reflects the
 * firmware-reported link state.
 */
3563 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3565 	mxge_softc_t *sc = ifp->if_softc;
3570 	ifmr->ifm_status = IFM_AVALID;
3571 	ifmr->ifm_active = IFM_ETHER | IFM_FDX;
3572 	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
3573 	ifmr->ifm_active |= sc->current_media;
/*
 * ioctl handler (serialized).  Handles SIOCSIFMTU, SIOCSIFFLAGS
 * (open/close + promisc/allmulti), multicast list updates,
 * SIOCSIFCAP toggles (TXCSUM/TSO/RXCSUM/VLAN tagging, keeping
 * if_hwassist consistent with capenable) and media ioctls; everything
 * else falls through to ether_ioctl().
 * NOTE(review): the switch/case labels themselves are on elided lines.
 */
3577 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data,
3578     struct ucred *cr __unused)
3580 	mxge_softc_t *sc = ifp->if_softc;
3581 	struct ifreq *ifr = (struct ifreq *)data;
3584 	ASSERT_SERIALIZED(ifp->if_serializer);
3589 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
3596 		if (ifp->if_flags & IFF_UP) {
3597 			if (!(ifp->if_flags & IFF_RUNNING)) {
3598 				err = mxge_open(sc);
3601 				 * Take care of PROMISC and ALLMULTI
3604 				mxge_change_promisc(sc,
3605 				    ifp->if_flags & IFF_PROMISC);
3606 				mxge_set_multicast_list(sc);
3609 			if (ifp->if_flags & IFF_RUNNING)
3616 		mxge_set_multicast_list(sc);
3620 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
3621 		if (mask & IFCAP_TXCSUM) {
3622 			ifp->if_capenable ^= IFCAP_TXCSUM;
3623 			if (ifp->if_capenable & IFCAP_TXCSUM)
3624 				ifp->if_hwassist |= CSUM_TCP | CSUM_UDP;
3626 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP);
3628 		if (mask & IFCAP_TSO) {
3629 			ifp->if_capenable ^= IFCAP_TSO;
3630 			if (ifp->if_capenable & IFCAP_TSO)
3631 				ifp->if_hwassist |= CSUM_TSO;
3633 				ifp->if_hwassist &= ~CSUM_TSO;
3635 		if (mask & IFCAP_RXCSUM)
3636 			ifp->if_capenable ^= IFCAP_RXCSUM;
3637 		if (mask & IFCAP_VLAN_HWTAGGING)
3638 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
3642 		mxge_media_probe(sc);
3643 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
3644 		    &sc->media, command);
3648 		err = ether_ioctl(ifp, command, data);
/*
 * Copy module tunables into the softc, clamping each to a sane range:
 * interrupt coalescing delay (0..10ms, else default), tick period
 * (defaults to hz/2), flow control, and a throttle bounded to
 * [MXGE_MIN_THROTTLE, MXGE_MAX_THROTTLE] when non-zero.
 */
3655 mxge_fetch_tunables(mxge_softc_t *sc)
3657 	sc->intr_coal_delay = mxge_intr_coal_delay;
3658 	if (sc->intr_coal_delay < 0 || sc->intr_coal_delay > (10 * 1000))
3659 		sc->intr_coal_delay = MXGE_INTR_COAL_DELAY;
3662 	if (mxge_ticks == 0)
3663 		mxge_ticks = hz / 2;
3665 	sc->pause = mxge_flow_control;
3667 	sc->throttle = mxge_throttle;
3668 	if (sc->throttle && sc->throttle > MXGE_MAX_THROTTLE)
3669 		sc->throttle = MXGE_MAX_THROTTLE;
3670 	if (sc->throttle && sc->throttle < MXGE_MIN_THROTTLE)
3671 		sc->throttle = MXGE_MIN_THROTTLE;
/*
 * Free the per-slice firmware-stats and rx_done DMA blocks, then the
 * slice array itself.  Safe on partially-initialized slices because
 * each pointer is checked and NULLed.
 * NOTE(review): the `ss = &sc->ss[i]` assignment and the early-return
 * when sc->ss is NULL are on elided lines.
 */
3675 mxge_free_slices(mxge_softc_t *sc)
3677 	struct mxge_slice_state *ss;
3683 	for (i = 0; i < sc->num_slices; i++) {
3685 		if (ss->fw_stats != NULL) {
3686 			mxge_dma_free(&ss->fw_stats_dma);
3687 			ss->fw_stats = NULL;
3689 		if (ss->rx_done.entry != NULL) {
3690 			mxge_dma_free(&ss->rx_done.dma);
3691 			ss->rx_done.entry = NULL;
3694 	kfree(sc->ss, M_DEVBUF);
/*
 * Allocate the slice array and, per slice, a page-aligned rx
 * completion (rx_done) DMA ring sized for two entries per rx buffer,
 * plus a 64-byte-aligned firmware stats block.  Caches the firmware's
 * rx ring size in sc->rx_ring_size for mxge_alloc_rings().
 * NOTE(review): the per-iteration `ss = &sc->ss[i]` and error-unwind
 * paths are on elided lines.
 */
3699 mxge_alloc_slices(mxge_softc_t *sc)
3702 	struct mxge_slice_state *ss;
3704 	int err, i, max_intr_slots;
3706 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
3708 		device_printf(sc->dev, "Cannot determine rx ring size\n");
3711 	sc->rx_ring_size = cmd.data0;
3712 	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
3714 	bytes = sizeof(*sc->ss) * sc->num_slices;
3715 	sc->ss = kmalloc(bytes, M_DEVBUF, M_WAITOK | M_ZERO);
3717 	for (i = 0; i < sc->num_slices; i++) {
3723 		 * Allocate per-slice rx interrupt queues
3725 		bytes = max_intr_slots * sizeof(*ss->rx_done.entry);
3726 		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
3728 			device_printf(sc->dev,
3729 			    "alloc %d slice rx_done failed\n", i);
3732 		ss->rx_done.entry = ss->rx_done.dma.dmem_addr;
3735 		 * Allocate the per-slice firmware stats; stats
3736 		 * (including tx) are used used only on the first
3739 #ifndef IFNET_BUF_RING
3744 		bytes = sizeof(*ss->fw_stats);
3745 		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
3746 		    sizeof(*ss->fw_stats), 64);
3748 			device_printf(sc->dev,
3749 			    "alloc %d fw_stats failed\n", i);
3752 		ss->fw_stats = ss->fw_stats_dma.dmem_addr;
/*
 * Decide how many slices (RSS queues) to use.  Requires multi-slice
 * to be enabled, an SMP system, and enough MSI-X vectors.  Loads the
 * RSS-capable firmware, resets, sizes the interrupt queue, queries
 * MXGEFW_CMD_GET_MAX_RSS_QUEUES, then caps the count by MSI-X vector
 * count, CPU count / mxge_max_slices, and rounds down to a power of
 * two.  On any failure it falls back to the original single-slice
 * firmware (abort path at the tail).
 * NOTE(review): several `goto abort_with_fw` style branches and the
 * power-of-two decrement body are on elided lines.
 */
3758 mxge_slice_probe(mxge_softc_t *sc)
3762 	int msix_cnt, status, max_intr_slots;
3769 	 * Don't enable multiple slices if they are not enabled,
3770 	 * or if this is not an SMP system
3772 	if (mxge_max_slices == 0 || mxge_max_slices == 1 || ncpus < 2)
3775 	/* see how many MSI-X interrupts are available */
3776 	msix_cnt = pci_msix_count(sc->dev);
3780 	/* now load the slice aware firmware see what it supports */
3781 	old_fw = sc->fw_name;
3782 	if (old_fw == mxge_fw_aligned)
3783 		sc->fw_name = mxge_fw_rss_aligned;
3785 		sc->fw_name = mxge_fw_rss_unaligned;
3786 	status = mxge_load_firmware(sc, 0);
3788 		device_printf(sc->dev, "Falling back to a single slice\n");
3792 	/* try to send a reset command to the card to see if it
3794 	memset(&cmd, 0, sizeof (cmd));
3795 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
3797 		device_printf(sc->dev, "failed reset\n");
3801 	/* get rx ring size */
3802 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
3804 		device_printf(sc->dev, "Cannot determine rx ring size\n");
3807 	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
3809 	/* tell it the size of the interrupt queues */
3810 	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
3811 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
3813 		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
3817 	/* ask the maximum number of slices it supports */
3818 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
3820 		device_printf(sc->dev,
3821 		    "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
3824 	sc->num_slices = cmd.data0;
3825 	if (sc->num_slices > msix_cnt)
3826 		sc->num_slices = msix_cnt;
3828 	if (mxge_max_slices == -1) {
3829 		/* cap to number of CPUs in system */
3830 		if (sc->num_slices > ncpus)
3831 			sc->num_slices = ncpus;
3833 		if (sc->num_slices > mxge_max_slices)
3834 			sc->num_slices = mxge_max_slices;
3836 	/* make sure it is a power of two */
3837 	while (sc->num_slices & (sc->num_slices - 1))
3841 		device_printf(sc->dev, "using %d slices\n",
	/* fall back: restore the single-slice firmware */
3847 	sc->fw_name = old_fw;
3848 	(void) mxge_load_firmware(sc, 0);
/*
 * Allocate and wire one MSI-X vector per slice: map the MSI-X table
 * BAR, allocate `num_slices` messages (failing over with a hint to
 * lower hw.mxge.max_slices), allocate an IRQ resource and install
 * mxge_intr for each slice under the ifnet serializer.  Unwinds in
 * reverse order via the abort_with_* labels on any failure.
 * NOTE(review): rid setup lines and some kfree/NULL-check lines in
 * the unwind paths are elided.
 */
3853 mxge_add_msix_irqs(mxge_softc_t *sc)
3856 	int count, err, i, rid;
3859 	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
3862 	if (sc->msix_table_res == NULL) {
3863 		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
3867 	count = sc->num_slices;
3868 	err = pci_alloc_msix(sc->dev, &count);
3870 		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d"
3871 		    "err = %d \n", sc->num_slices, err);
3872 		goto abort_with_msix_table;
3874 	if (count < sc->num_slices) {
3875 		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
3876 		    count, sc->num_slices);
3877 		device_printf(sc->dev,
3878 		    "Try setting hw.mxge.max_slices to %d\n",
3881 		goto abort_with_msix;
3883 	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
3884 	sc->msix_irq_res = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
3885 	if (sc->msix_irq_res == NULL) {
3887 		goto abort_with_msix;
3890 	for (i = 0; i < sc->num_slices; i++) {
3892 		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
3895 		if (sc->msix_irq_res[i] == NULL) {
3896 			device_printf(sc->dev, "couldn't allocate IRQ res"
3897 			    " for message %d\n", i);
3899 			goto abort_with_res;
3903 	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
3904 	sc->msix_ih = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
3906 	for (i = 0; i < sc->num_slices; i++) {
3907 		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
3909 		    mxge_intr, &sc->ss[i], &sc->msix_ih[i],
3910 		    sc->ifp->if_serializer);
3912 			device_printf(sc->dev, "couldn't setup intr for "
3914 			goto abort_with_intr;
3919 		device_printf(sc->dev, "using %d msix IRQs:",
3921 		for (i = 0; i < sc->num_slices; i++)
3922 			kprintf(" %ld", rman_get_start(sc->msix_irq_res[i]));
	/* --- error unwind, reverse order of acquisition --- */
3928 	for (i = 0; i < sc->num_slices; i++) {
3929 		if (sc->msix_ih[i] != NULL) {
3930 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
3932 			sc->msix_ih[i] = NULL;
3935 	kfree(sc->msix_ih, M_DEVBUF);
3939 	for (i = 0; i < sc->num_slices; i++) {
3941 		if (sc->msix_irq_res[i] != NULL)
3942 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
3943 			    sc->msix_irq_res[i]);
3944 		sc->msix_irq_res[i] = NULL;
3946 	kfree(sc->msix_irq_res, M_DEVBUF);
3950 	pci_release_msi(sc->dev);
3952 abort_with_msix_table:
3953 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
3954 	    sc->msix_table_res);
/*
 * Single-vector path: allocate one interrupt (MSI if enabled, else
 * legacy INTx) via pci_alloc_1intr, grab the IRQ resource, and hook
 * mxge_intr on slice 0 under the ifnet serializer.  Returns the
 * bus_setup_intr error code (0 on success).
 */
3961 mxge_add_single_irq(mxge_softc_t *sc)
3965 	sc->irq_type = pci_alloc_1intr(sc->dev, mxge_msi_enable,
3966 	    &sc->irq_rid, &irq_flags);
3968 	sc->irq_res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ,
3969 	    &sc->irq_rid, irq_flags);
3970 	if (sc->irq_res == NULL) {
3971 		device_printf(sc->dev, "could not alloc interrupt\n");
3975 	return bus_setup_intr(sc->dev, sc->irq_res, INTR_MPSAFE,
3976 	    mxge_intr, &sc->ss[0], &sc->ih, sc->ifp->if_serializer);
/*
 * Tear down everything mxge_add_msix_irqs() set up: per-slice intr
 * handlers, IRQ resources, the MSI-X table BAR mapping, and the MSI-X
 * allocation itself.  Mirrors that function's abort labels.
 * NOTE(review): rid computation lines are elided.
 */
3981 mxge_rem_msix_irqs(mxge_softc_t *sc)
3985 	for (i = 0; i < sc->num_slices; i++) {
3986 		if (sc->msix_ih[i] != NULL) {
3987 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
3989 			sc->msix_ih[i] = NULL;
3992 	kfree(sc->msix_ih, M_DEVBUF);
3994 	for (i = 0; i < sc->num_slices; i++) {
3996 		if (sc->msix_irq_res[i] != NULL)
3997 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
3998 			    sc->msix_irq_res[i]);
3999 		sc->msix_irq_res[i] = NULL;
4001 	kfree(sc->msix_irq_res, M_DEVBUF);
4003 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4004 	    sc->msix_table_res);
4006 	pci_release_msi(sc->dev);
/*
 * Choose the interrupt strategy: MSI-X when multi-slice, otherwise a
 * single MSI/legacy vector.  The `if (0 && ...)` retry branch is dead
 * code (deliberately disabled upstream) that would have fallen back
 * from a failed MSI-X re-setup to a single IRQ.
 */
4012 mxge_add_irq(mxge_softc_t *sc)
4017 	if (sc->num_slices > 1)
4018 		err = mxge_add_msix_irqs(sc);
4020 		err = mxge_add_single_irq(sc);
4022 	if (0 && err == 0 && sc->num_slices > 1) {
4023 		mxge_rem_msix_irqs(sc);
4024 		err = mxge_add_msix_irqs(sc);
4028 		return mxge_add_single_irq(sc);
/*
 * Device attach: create the parent DMA tag, set up config space, map
 * the SRAM BAR, copy out the EEPROM strings (MAC address etc.), carve
 * the command/zeropad/dmabench DMA blocks, select and load firmware,
 * probe/allocate slices, reset, allocate rings, populate the ifnet
 * (capabilities, callbacks, TSO length), attach ethernet, install the
 * interrupt, register sysctls and start the tick callout.
 * NOTE(review): all the `goto abort_with_*` unwind branches between
 * these lines are elided; each failing step jumps to a label that
 * releases everything acquired before it.
 */
4033 mxge_attach(device_t dev)
4035 	mxge_softc_t *sc = device_get_softc(dev);
4036 	struct ifnet *ifp = &sc->arpcom.ac_if;
4040 	 * Avoid rewriting half the lines in this file to use
4041 	 * &sc->arpcom.ac_if instead
4045 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4046 	ifmedia_init(&sc->media, 0, mxge_media_change, mxge_media_status);
4048 	mxge_fetch_tunables(sc);
4050 	err = bus_dma_tag_create(NULL,			/* parent */
4053 				 BUS_SPACE_MAXADDR,	/* low */
4054 				 BUS_SPACE_MAXADDR,	/* high */
4055 				 NULL, NULL,		/* filter */
4056 				 BUS_SPACE_MAXSIZE_32BIT,/* maxsize */
4058 				 BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */
4060 				 &sc->parent_dmat);	/* tag */
4062 		device_printf(dev, "Err %d allocating parent dmat\n", err);
4066 	callout_init_mp(&sc->co_hdl);
4068 	mxge_setup_cfg_space(sc);
4071 	 * Map the board into the kernel
4074 	sc->mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
4076 	if (sc->mem_res == NULL) {
4077 		device_printf(dev, "could not map memory\n");
4082 	sc->sram = rman_get_virtual(sc->mem_res);
	/* 2MB SRAM minus firmware/scratch regions and a 0x100 guard */
4083 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4084 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4085 		device_printf(dev, "impossible memory region size %ld\n",
4086 		    rman_get_size(sc->mem_res));
4092 	 * Make NULL terminated copy of the EEPROM strings section of
4095 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4096 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4097 	    rman_get_bushandle(sc->mem_res),
4098 	    sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4099 	    sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE - 2);
4100 	err = mxge_parse_strings(sc);
4102 		device_printf(dev, "parse EEPROM string failed\n");
4107 	 * Enable write combining for efficient use of PCIe bus
4112 	 * Allocate the out of band DMA memory
4114 	err = mxge_dma_alloc(sc, &sc->cmd_dma, sizeof(mxge_cmd_t), 64);
4116 		device_printf(dev, "alloc cmd DMA buf failed\n");
4119 	sc->cmd = sc->cmd_dma.dmem_addr;
4121 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4123 		device_printf(dev, "alloc zeropad DMA buf failed\n");
4127 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4129 		device_printf(dev, "alloc dmabench DMA buf failed\n");
4133 	/* Select & load the firmware */
4134 	err = mxge_select_firmware(sc);
4136 		device_printf(dev, "select firmware failed\n");
4140 	mxge_slice_probe(sc);
4141 	err = mxge_alloc_slices(sc);
4143 		device_printf(dev, "alloc slices failed\n");
4147 	err = mxge_reset(sc, 0);
4149 		device_printf(dev, "reset failed\n");
4153 	err = mxge_alloc_rings(sc);
4155 		device_printf(dev, "failed to allocate rings\n");
4159 	ifp->if_baudrate = IF_Gbps(10UL);
4160 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO;
4161 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4163 	ifp->if_capabilities |= IFCAP_VLAN_MTU;
4165 	/* Well, its software, sigh */
4166 	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING;
4168 	ifp->if_capenable = ifp->if_capabilities;
4171 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4172 	ifp->if_init = mxge_init;
4173 	ifp->if_ioctl = mxge_ioctl;
4174 	ifp->if_start = mxge_start;
4175 	ifp->if_watchdog = mxge_watchdog;
4177 	/* Increase TSO burst length */
4178 	ifp->if_tsolen = (32 * ETHERMTU);
4180 	/* Initialise the ifmedia structure */
4181 	mxge_media_init(sc);
4182 	mxge_media_probe(sc);
4184 	ether_ifattach(ifp, sc->mac_addr, NULL);
4188 	 * We are not ready to do "gather" jumbo frame, so
4189 	 * limit MTU to MJUMPAGESIZE
4191 	sc->max_mtu = MJUMPAGESIZE -
4192 	    ETHER_HDR_LEN - EVL_ENCAPLEN - MXGEFW_PAD - 1;
4195 	/* must come after ether_ifattach() */
4196 	err = mxge_add_irq(sc);
4198 		device_printf(dev, "alloc and setup intr failed\n");
4199 		ether_ifdetach(ifp);
4202 	ifq_set_cpuid(&ifp->if_snd, rman_get_cpuid(sc->irq_res));
4204 	mxge_add_sysctls(sc);
4206 	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
/*
 * Device detach, in reverse order of attach: under the serializer,
 * close if running, stop the tick callout and tear down the IRQ;
 * then (outside the serializer) terminate the callout, detach
 * ethernet, remove media entries, quiesce firmware RDMA, free
 * sysctls/rings/slices, the three out-of-band DMA blocks, the IRQ
 * and memory resources, MSI, and finally the parent DMA tag.
 * NOTE(review): mxge_close() call and parts of the resource-release
 * conditions are on elided lines.
 */
4215 mxge_detach(device_t dev)
4217 	mxge_softc_t *sc = device_get_softc(dev);
4219 	if (device_is_attached(dev)) {
4220 		struct ifnet *ifp = sc->ifp;
4222 		lwkt_serialize_enter(ifp->if_serializer);
4225 		if (ifp->if_flags & IFF_RUNNING)
4227 		callout_stop(&sc->co_hdl);
4229 		bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4231 		lwkt_serialize_exit(ifp->if_serializer);
4233 		callout_terminate(&sc->co_hdl);
4235 		ether_ifdetach(ifp);
4237 	ifmedia_removeall(&sc->media);
4239 	if (sc->cmd != NULL && sc->zeropad_dma.dmem_addr != NULL &&
4241 		mxge_dummy_rdma(sc, 0);
4243 	mxge_rem_sysctls(sc);
4244 	mxge_free_rings(sc);
4246 	/* MUST after sysctls and rings are freed */
4247 	mxge_free_slices(sc);
4249 	if (sc->dmabench_dma.dmem_addr != NULL)
4250 		mxge_dma_free(&sc->dmabench_dma);
4251 	if (sc->zeropad_dma.dmem_addr != NULL)
4252 		mxge_dma_free(&sc->zeropad_dma);
4253 	if (sc->cmd_dma.dmem_addr != NULL)
4254 		mxge_dma_free(&sc->cmd_dma);
4256 	if (sc->irq_res != NULL) {
4257 		bus_release_resource(dev, SYS_RES_IRQ, sc->irq_rid,
4260 		if (sc->irq_type == PCI_INTR_TYPE_MSI)
4261 			pci_release_msi(dev);
4263 	if (sc->mem_res != NULL) {
4264 		bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS,
4268 	if (sc->parent_dmat != NULL)
4269 		bus_dma_tag_destroy(sc->parent_dmat);
4275 mxge_shutdown(device_t dev)