1 /******************************************************************************
3 Copyright (c) 2006-2013, Myricom Inc.
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
9 1. Redistributions of source code must retain the above copyright notice,
10 this list of conditions and the following disclaimer.
12 2. Neither the name of the Myricom Inc, nor the names of its
13 contributors may be used to endorse or promote products derived from
14 this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
28 $FreeBSD: head/sys/dev/mxge/if_mxge.c 254263 2013-08-12 23:30:01Z scottl $
30 ***************************************************************************/
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/linker.h>
37 #include <sys/firmware.h>
38 #include <sys/endian.h>
39 #include <sys/in_cksum.h>
40 #include <sys/sockio.h>
42 #include <sys/malloc.h>
43 #include <sys/kernel.h>
44 #include <sys/module.h>
45 #include <sys/serialize.h>
46 #include <sys/socket.h>
47 #include <sys/sysctl.h>
50 #include <net/if_arp.h>
51 #include <net/ifq_var.h>
52 #include <net/ethernet.h>
53 #include <net/if_dl.h>
54 #include <net/if_media.h>
58 #include <net/if_types.h>
59 #include <net/vlan/if_vlan_var.h>
62 #include <netinet/in_systm.h>
63 #include <netinet/in.h>
64 #include <netinet/ip.h>
65 #include <netinet/tcp.h>
70 #include <bus/pci/pcireg.h>
71 #include <bus/pci/pcivar.h>
72 #include <bus/pci/pci_private.h> /* XXX for pci_cfg_restore */
74 #include <vm/vm.h> /* for pmap_mapdev() */
77 #if defined(__i386__) || defined(__x86_64__)
78 #include <machine/specialreg.h>
81 #include <dev/netif/mxge/mxge_mcp.h>
82 #include <dev/netif/mxge/mcp_gen_header.h>
83 #include <dev/netif/mxge/if_mxge_var.h>
/*
 * Driver-wide tunables.  Each is exported as a hw.mxge.* loader tunable
 * below via TUNABLE_INT().
 * NOTE(review): this dump elides interior lines (embedded numbering is
 * non-contiguous); do not assume adjacency of statements.
 */
86 static int mxge_nvidia_ecrc_enable = 1;
87 static int mxge_force_firmware = 0;
88 static int mxge_intr_coal_delay = MXGE_INTR_COAL_DELAY;
89 static int mxge_deassert_wait = 1;
90 static int mxge_flow_control = 1;
91 static int mxge_ticks;
92 static int mxge_max_slices = 1;
93 static int mxge_always_promisc = 0;
94 static int mxge_throttle = 0;
95 static int mxge_msi_enable = 1;
/*
 * Firmware image names: the "ethp" variants tolerate unaligned PCIe
 * completions, the plain "eth" variants require aligned completions;
 * "rss" variants support multiple slices (see mxge_select_firmware()).
 */
97 static const char *mxge_fw_unaligned = "mxge_ethp_z8e";
98 static const char *mxge_fw_aligned = "mxge_eth_z8e";
99 static const char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
100 static const char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
/* Loader tunables mirroring the static variables above. */
102 TUNABLE_INT("hw.mxge.max_slices", &mxge_max_slices);
103 TUNABLE_INT("hw.mxge.flow_control_enabled", &mxge_flow_control);
104 TUNABLE_INT("hw.mxge.intr_coal_delay", &mxge_intr_coal_delay);
105 TUNABLE_INT("hw.mxge.nvidia_ecrc_enable", &mxge_nvidia_ecrc_enable);
106 TUNABLE_INT("hw.mxge.force_firmware", &mxge_force_firmware);
107 TUNABLE_INT("hw.mxge.deassert_wait", &mxge_deassert_wait);
108 TUNABLE_INT("hw.mxge.ticks", &mxge_ticks);
109 TUNABLE_INT("hw.mxge.always_promisc", &mxge_always_promisc);
110 TUNABLE_INT("hw.mxge.throttle", &mxge_throttle);
111 TUNABLE_INT("hw.mxge.msi.enable", &mxge_msi_enable);
/* newbus device interface entry points */
113 static int mxge_probe(device_t dev);
114 static int mxge_attach(device_t dev);
115 static int mxge_detach(device_t dev);
116 static int mxge_shutdown(device_t dev);
118 static device_method_t mxge_methods[] = {
119 /* Device interface */
120 DEVMETHOD(device_probe, mxge_probe),
121 DEVMETHOD(device_attach, mxge_attach),
122 DEVMETHOD(device_detach, mxge_detach),
123 DEVMETHOD(device_shutdown, mxge_shutdown),
127 static driver_t mxge_driver = {
130 sizeof(mxge_softc_t),
133 static devclass_t mxge_devclass;
135 /* Declare ourselves to be a child of the PCI bus.*/
136 DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, NULL, NULL);
/* Firmware images are zlib-compressed; both modules must be loaded. */
137 MODULE_DEPEND(mxge, firmware, 1, 1, 1);
138 MODULE_DEPEND(mxge, zlib, 1, 1, 1);
/* Forward declarations for routines referenced before definition. */
140 static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
141 static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
142 static void mxge_close(mxge_softc_t *sc, int down);
143 static int mxge_open(mxge_softc_t *sc);
144 static void mxge_tick(void *arg);
145 static void mxge_watchdog_reset(mxge_softc_t *sc);
146 static void mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice);
/*
 * mxge_probe: match Myricom Z8E / Z8E_9 PCI IDs and set a device
 * description keyed off the PCI revision id.
 * NOTE(review): the switch header, break/return statements and braces
 * are elided in this dump.
 */
149 mxge_probe(device_t dev)
151 if (pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM &&
152 (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E ||
153 pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9)) {
154 int rev = pci_get_revid(dev);
157 case MXGE_PCI_REV_Z8E:
158 device_set_desc(dev, "Myri10G-PCIE-8A");
160 case MXGE_PCI_REV_Z8ES:
161 device_set_desc(dev, "Myri10G-PCIE-8B");
/* default case: unknown revision, still claim the device */
164 device_set_desc(dev, "Myri10G-PCIE-8??");
165 device_printf(dev, "Unrecognized rev %d NIC\n", rev);
/*
 * mxge_enable_wc: mark the NIC SRAM mapping write-combining (x86/x86_64
 * only) to speed up PIO copies into the card.
 */
174 mxge_enable_wc(mxge_softc_t *sc)
176 #if defined(__i386__) || defined(__x86_64__)
180 len = rman_get_size(sc->mem_res);
181 pmap_change_attr((vm_offset_t) sc->sram, len / PAGE_SIZE,
182 PAT_WRITE_COMBINING);
/*
 * mxge_dma_alloc: allocate a coherent, zeroed DMA region of `bytes`
 * with the given alignment; boundary selection logic is elided here.
 */
187 mxge_dma_alloc(mxge_softc_t *sc, bus_dmamem_t *dma, size_t bytes,
188 bus_size_t alignment)
193 if (bytes > 4096 && alignment == 4096)
198 err = bus_dmamem_coherent(sc->parent_dmat, alignment, boundary,
199 BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, bytes,
200 BUS_DMA_WAITOK | BUS_DMA_ZERO, dma);
202 device_printf(sc->dev, "bus_dmamem_coherent failed: %d\n", err);
/*
 * mxge_dma_free: release a region obtained from mxge_dma_alloc():
 * unload the map, free the memory, destroy the tag — in that order.
 */
209 mxge_dma_free(bus_dmamem_t *dma)
211 bus_dmamap_unload(dma->dmem_tag, dma->dmem_map);
212 bus_dmamem_free(dma->dmem_tag, dma->dmem_addr, dma->dmem_map);
213 bus_dma_tag_destroy(dma->dmem_tag);
217 * The eeprom strings on the lanaiX have the format
/*
 * mxge_parse_strings: walk the NUL-separated EEPROM string list and
 * extract MAC address ("MAC="), product code ("PC="), and serial number
 * ("SN=" / "SN2=", with SN2 taking precedence).
 * NOTE(review): loop setup, the MAC hex-pair parsing loop, and the
 * found_mac/found_sn2 bookkeeping are partially elided in this dump.
 */
223 mxge_parse_strings(mxge_softc_t *sc)
226 int i, found_mac, found_sn2;
229 ptr = sc->eeprom_strings;
232 while (*ptr != '\0') {
233 if (strncmp(ptr, "MAC=", 4) == 0) {
/* parse one two-hex-digit byte of the MAC address */
236 sc->mac_addr[i] = strtoul(ptr, &endptr, 16);
237 if (endptr - ptr != 2)
246 } else if (strncmp(ptr, "PC=", 3) == 0) {
248 strlcpy(sc->product_code_string, ptr,
249 sizeof(sc->product_code_string));
250 } else if (!found_sn2 && (strncmp(ptr, "SN=", 3) == 0)) {
252 strlcpy(sc->serial_number_string, ptr,
253 sizeof(sc->serial_number_string));
254 } else if (strncmp(ptr, "SN2=", 4) == 0) {
255 /* SN2 takes precedence over SN */
258 strlcpy(sc->serial_number_string, ptr,
259 sizeof(sc->serial_number_string));
/* advance to the next NUL-terminated string */
261 while (*ptr++ != '\0') {}
268 device_printf(sc->dev, "failed to parse eeprom_strings\n");
272 #if defined(__i386__) || defined(__x86_64__)
/*
 * mxge_enable_nvidia_ecrc: enable ECRC generation on an upstream
 * Nvidia (ck804/mcp55) PCIe bridge so that completions are 8-byte
 * aligned, letting the driver use the faster "aligned" firmware.
 * Because DragonFly lacks extended (>0xff) config space access, the
 * bridge's extended config space is reached by pmap_mapdev()'ing the
 * chipset's memory-mapped config window directly.
 * NOTE(review): several error-return paths and the ck804 base-address
 * constant are elided in this dump.
 */
275 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
278 unsigned long base, off;
280 device_t pdev, mcp55;
281 uint16_t vendor_id, device_id, word;
282 uintptr_t bus, slot, func, ivend, idev;
/* honor the hw.mxge.nvidia_ecrc_enable tunable */
285 if (!mxge_nvidia_ecrc_enable)
288 pdev = device_get_parent(device_get_parent(sc->dev));
290 device_printf(sc->dev, "could not find parent?\n");
293 vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
294 device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
/* only Nvidia bridges are handled */
296 if (vendor_id != 0x10de)
301 if (device_id == 0x005d) {
302 /* ck804, base address is magic */
304 } else if (device_id >= 0x0374 && device_id <= 0x378) {
305 /* mcp55, base address stored in chipset */
306 mcp55 = pci_find_bsf(0, 0, 0);
308 0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
309 0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
310 word = pci_read_config(mcp55, 0x90, 2);
311 base = ((unsigned long)word & 0x7ffeU) << 25;
319 * Test below is commented because it is believed that doing
320 * config read/write beyond 0xff will access the config space
321 * for the next larger function. Uncomment this and remove
322 * the hacky pmap_mapdev() way of accessing config space when
323 * DragonFly grows support for extended pcie config space access.
327 * See if we can, by some miracle, access the extended
330 val = pci_read_config(pdev, 0x178, 4);
331 if (val != 0xffffffff) {
333 pci_write_config(pdev, 0x178, val, 4);
338 * Rather than using normal pci config space writes, we must
339 * map the Nvidia config space ourselves. This is because on
340 * opteron/nvidia class machine the 0xe000000 mapping is
341 * handled by the nvidia chipset, that means the internal PCI
342 * device (the on-chip northbridge), or the amd-8131 bridge
343 * and things behind them are not visible by this method.
/* fetch the bridge's bus/slot/function and IDs via bus IVARs */
346 BUS_READ_IVAR(device_get_parent(pdev), pdev,
348 BUS_READ_IVAR(device_get_parent(pdev), pdev,
349 PCI_IVAR_SLOT, &slot);
350 BUS_READ_IVAR(device_get_parent(pdev), pdev,
351 PCI_IVAR_FUNCTION, &func);
352 BUS_READ_IVAR(device_get_parent(pdev), pdev,
353 PCI_IVAR_VENDOR, &ivend);
354 BUS_READ_IVAR(device_get_parent(pdev), pdev,
355 PCI_IVAR_DEVICE, &idev);
/* compute the device's offset inside the mmconfig window */
357 off = base + 0x00100000UL * (unsigned long)bus +
358 0x00001000UL * (unsigned long)(func + 8 * slot);
360 /* map it into the kernel */
361 va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
363 device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
366 /* get a pointer to the config space mapped into the kernel */
367 cfgptr = va + (off & PAGE_MASK);
369 /* make sure that we can really access it */
370 vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
371 device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
372 if (!(vendor_id == ivend && device_id == idev)) {
373 device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
374 vendor_id, device_id);
375 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
/* read-modify-write of the ECRC enable register at offset 0x178 */
379 ptr32 = (uint32_t*)(cfgptr + 0x178);
382 if (val == 0xffffffff) {
383 device_printf(sc->dev, "extended mapping failed\n");
384 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
388 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
390 device_printf(sc->dev, "Enabled ECRC on upstream "
391 "Nvidia bridge at %d:%d:%d\n",
392 (int)bus, (int)slot, (int)func);
396 #else /* __i386__ || __x86_64__ */
/* Non-x86 stub: the Nforce4 workaround cannot apply here. */
399 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
401 device_printf(sc->dev, "Nforce 4 chipset on non-x86/x86_64!?!?!\n");
/*
 * mxge_dma_test: ask the firmware to run read, write, and read+write
 * DMA benchmarks against the dmabench buffer and record the resulting
 * throughputs (MB/s) in sc->read_dma / write_dma / read_write_dma.
 * test_type is either MXGEFW_DMA_TEST or MXGEFW_CMD_UNALIGNED_TEST.
 * NOTE(review): abort labels and status checks between the three
 * sub-tests are elided in this dump.
 */
407 mxge_dma_test(mxge_softc_t *sc, int test_type)
410 bus_addr_t dmatest_bus = sc->dmabench_dma.dmem_busaddr;
413 const char *test = " ";
416 * Run a small DMA test.
417 * The magic multipliers to the length tell the firmware
418 * to do DMA read, write, or read+write tests. The
419 * results are returned in cmd.data0. The upper 16
420 * bits of the return is the number of transfers completed.
421 * The lower 16 bits is the time in 0.5us ticks that the
422 * transfers took to complete.
425 len = sc->tx_boundary;
/* read test: length * 0x10000 selects DMA-read in the firmware */
427 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
428 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
429 cmd.data2 = len * 0x10000;
430 status = mxge_send_cmd(sc, test_type, &cmd);
/* transfers * len bytes each way / ticks (0.5us) => MB/s */
435 sc->read_dma = ((cmd.data0>>16) * len * 2) / (cmd.data0 & 0xffff);
/* write test: length * 0x1 selects DMA-write */
437 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
438 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
439 cmd.data2 = len * 0x1;
440 status = mxge_send_cmd(sc, test_type, &cmd);
445 sc->write_dma = ((cmd.data0>>16) * len * 2) / (cmd.data0 & 0xffff);
/* combined test: length * 0x10001 selects read+write */
447 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
448 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
449 cmd.data2 = len * 0x10001;
450 status = mxge_send_cmd(sc, test_type, &cmd);
455 sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
456 (cmd.data0 & 0xffff);
/* unaligned-test failures are expected and reported by the caller */
459 if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST) {
460 device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
467 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
468 * when the PCI-E Completion packets are aligned on an 8-byte
469 * boundary. Some PCI-E chip sets always align Completion packets; on
470 * the ones that do not, the alignment can be enforced by enabling
471 * ECRC generation (if supported).
473 * When PCI-E Completion packets are not aligned, it is actually more
474 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
476 * If the driver can neither enable ECRC nor verify that it has
477 * already been enabled, then it must use a firmware image which works
478 * around unaligned completion packets (ethp_z8e.dat), and it should
479 * also ensure that it never gives the device a Read-DMA which is
480 * larger than 2KB by setting the tx_boundary to 2KB. If ECRC is
481 * enabled, then the driver should use the aligned (eth_z8e.dat)
482 * firmware image, and set tx_boundary to 4KB.
/*
 * mxge_firmware_probe: decide whether the aligned firmware can be used
 * on this host.  Checks the PCIe Max Read Request size, loads the
 * aligned image, tries to enable Nvidia ECRC, and (on pre-Z8ES parts)
 * runs the unaligned-completion DMA test.  Returns 0 if the aligned
 * firmware is safe to keep.
 * NOTE(review): variable declarations and several returns are elided.
 */
485 mxge_firmware_probe(mxge_softc_t *sc)
487 device_t dev = sc->dev;
491 sc->tx_boundary = 4096;
494 * Verify the max read request size was set to 4KB
495 * before trying the test with 4KB.
497 if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
/* PCIe Device Control register lives at cap offset + 0x8 */
498 pectl = pci_read_config(dev, reg + 0x8, 2);
499 if ((pectl & (5 << 12)) != (5 << 12)) {
500 device_printf(dev, "Max Read Req. size != 4k (0x%x)\n",
502 sc->tx_boundary = 2048;
507 * Load the optimized firmware (which assumes aligned PCIe
508 * completions) in order to see if it works on this host.
510 sc->fw_name = mxge_fw_aligned;
511 status = mxge_load_firmware(sc, 1);
516 * Enable ECRC if possible
518 mxge_enable_nvidia_ecrc(sc);
521 * Run a DMA test which watches for unaligned completions and
522 * aborts on the first one seen. Not required on Z8ES or newer.
524 if (pci_get_revid(sc->dev) >= MXGE_PCI_REV_Z8ES)
527 status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
529 return 0; /* keep the aligned firmware */
532 device_printf(dev, "DMA test failed: %d\n", status);
533 if (status == ENOSYS) {
534 device_printf(dev, "Falling back to ethp! "
535 "Please install up to date fw\n");
/*
 * mxge_select_firmware: choose aligned vs unaligned firmware and the
 * matching tx_boundary (4KB vs 2KB), honoring hw.mxge.force_firmware
 * and throttling, then load the selected image.
 */
541 mxge_select_firmware(mxge_softc_t *sc)
544 int force_firmware = mxge_force_firmware;
/* throttling requires a specific firmware, so it forces selection */
547 force_firmware = sc->throttle;
549 if (force_firmware != 0) {
550 if (force_firmware == 1)
555 device_printf(sc->dev,
556 "Assuming %s completions (forced)\n",
557 aligned ? "aligned" : "unaligned");
563 * If the PCIe link width is 4 or less, we can use the aligned
564 * firmware and skip any checks
566 if (sc->link_width != 0 && sc->link_width <= 4) {
567 device_printf(sc->dev, "PCIe x%d Link, "
568 "expect reduced performance\n", sc->link_width);
/* full probe: keep aligned firmware only if the DMA test passed */
573 if (mxge_firmware_probe(sc) == 0)
578 sc->fw_name = mxge_fw_aligned;
579 sc->tx_boundary = 4096;
581 sc->fw_name = mxge_fw_unaligned;
582 sc->tx_boundary = 2048;
584 return mxge_load_firmware(sc, 0);
/*
 * mxge_validate_firmware: sanity-check an MCP image header — correct
 * type, and a version compatible with MXGEFW_VERSION_MAJOR/MINOR.
 * Stores the version string in sc->fw_version for sysctl export.
 */
588 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
590 if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
591 if_printf(sc->ifp, "Bad firmware type: 0x%x\n",
592 be32toh(hdr->mcp_type));
596 /* Save firmware version for sysctl */
597 strlcpy(sc->fw_version, hdr->version, sizeof(sc->fw_version));
599 if_printf(sc->ifp, "firmware id: %s\n", hdr->version);
601 ksscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
602 &sc->fw_ver_minor, &sc->fw_ver_tiny);
/* only major.minor must match the driver's expectation */
604 if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR &&
605 sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
606 if_printf(sc->ifp, "Found firmware version %s\n",
608 if_printf(sc->ifp, "Driver needs %d.%d\n",
609 MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
/* zlib allocator hooks backed by the kernel M_TEMP malloc type */
616 z_alloc(void *nil, u_int items, u_int size)
618 return kmalloc(items * size, M_TEMP, M_WAITOK);
622 z_free(void *nil, void *ptr)
/*
 * mxge_load_firmware_helper: fetch the named firmware image via
 * firmware(9), inflate it with zlib (the uncompressed size is smuggled
 * in fw->version), validate the embedded MCP header, and PIO-copy the
 * image into NIC SRAM at MXGE_FW_OFFSET in 256-byte chunks.
 * NOTE(review): inflateEnd/abort-label cleanup paths are elided here.
 */
628 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
631 char *inflate_buffer;
632 const struct firmware *fw;
633 const mcp_gen_header_t *hdr;
640 fw = firmware_get(sc->fw_name);
642 if_printf(sc->ifp, "Could not find firmware image %s\n",
647 /* Setup zlib and decompress f/w */
648 bzero(&zs, sizeof(zs));
651 status = inflateInit(&zs);
652 if (status != Z_OK) {
658 * The uncompressed size is stored as the firmware version,
659 * which would otherwise go unused
661 fw_len = (size_t)fw->version;
662 inflate_buffer = kmalloc(fw_len, M_TEMP, M_WAITOK);
663 zs.avail_in = fw->datasize;
664 zs.next_in = __DECONST(char *, fw->data);
665 zs.avail_out = fw_len;
666 zs.next_out = inflate_buffer;
667 status = inflate(&zs, Z_FINISH);
668 if (status != Z_STREAM_END) {
669 if_printf(sc->ifp, "zlib %d\n", status);
671 goto abort_with_buffer;
/* locate and validate the MCP header inside the inflated image */
676 htobe32(*(const uint32_t *)(inflate_buffer + MCP_HEADER_PTR_OFFSET));
677 if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
678 if_printf(sc->ifp, "Bad firmware file");
680 goto abort_with_buffer;
682 hdr = (const void*)(inflate_buffer + hdr_offset);
684 status = mxge_validate_firmware(sc, hdr);
686 goto abort_with_buffer;
688 /* Copy the inflated firmware to NIC SRAM. */
689 for (i = 0; i < fw_len; i += 256) {
690 mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i, inflate_buffer + i,
691 min(256U, (unsigned)(fw_len - i)));
/* common cleanup: free the inflate buffer and release the image */
700 kfree(inflate_buffer, M_TEMP);
703 firmware_put(fw, FIRMWARE_UNLOAD);
708 * Enable or disable periodic RDMAs from the host to make certain
709 * chipsets resend dropped PCIe messages
/*
 * mxge_dummy_rdma: build a 6-word bootstrap command in an 8-byte
 * aligned stack buffer and PIO it to MXGEFW_BOOT_DUMMY_RDMA, then poll
 * the DMA'd confirmation word for the firmware's 0xffffffff ack.
 * NOTE(review): buf_bytes declaration, wmb/DELAY calls, and loop
 * increments are elided in this dump.
 */
712 mxge_dummy_rdma(mxge_softc_t *sc, int enable)
715 volatile uint32_t *confirm;
716 volatile char *submit;
717 uint32_t *buf, dma_low, dma_high;
/* align buf to 8 bytes within the stack buffer */
720 buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
722 /* Clear confirmation addr */
723 confirm = (volatile uint32_t *)sc->cmd;
728 * Send an rdma command to the PCIe engine, and wait for the
729 * response in the confirmation address. The firmware should
730 * write a -1 there to indicate it is alive and well
732 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.dmem_busaddr);
733 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.dmem_busaddr);
734 buf[0] = htobe32(dma_high); /* confirm addr MSW */
735 buf[1] = htobe32(dma_low); /* confirm addr LSW */
736 buf[2] = htobe32(0xffffffff); /* confirm data */
737 dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.dmem_busaddr);
738 dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.dmem_busaddr);
739 buf[3] = htobe32(dma_high); /* dummy addr MSW */
740 buf[4] = htobe32(dma_low); /* dummy addr LSW */
741 buf[5] = htobe32(enable); /* enable? */
743 submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
745 mxge_pio_copy(submit, buf, 64);
/* poll up to 20 iterations for the firmware's ack */
750 while (*confirm != 0xffffffff && i < 20) {
754 if (*confirm != 0xffffffff) {
755 if_printf(sc->ifp, "dummy rdma %s failed (%p = 0x%x)",
756 (enable ? "enable" : "disable"), confirm, *confirm);
/*
 * mxge_send_cmd: issue a firmware command through the MXGEFW_ETH_CMD
 * mailbox.  The command and its three data words are byte-swapped to
 * big-endian, PIO'd to the card, and the response is polled in the
 * DMA-coherent response block (sc->cmd).  Translates firmware status
 * codes to errno values; returns the result via data->data0.
 * NOTE(review): errno assignments for each case and the polling delay
 * are elided in this dump.
 */
761 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
764 char buf_bytes[sizeof(*buf) + 8];
765 volatile mcp_cmd_response_t *response = sc->cmd;
766 volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
767 uint32_t dma_low, dma_high;
768 int err, sleep_total = 0;
770 /* Ensure buf is aligned to 8 bytes */
771 buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
773 buf->data0 = htobe32(data->data0);
774 buf->data1 = htobe32(data->data1);
775 buf->data2 = htobe32(data->data2);
776 buf->cmd = htobe32(cmd);
777 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.dmem_busaddr);
778 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.dmem_busaddr);
780 buf->response_addr.low = htobe32(dma_low);
781 buf->response_addr.high = htobe32(dma_high);
/* 0xffffffff marks "no response yet" for the polling loop below */
783 response->result = 0xffffffff;
785 mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
791 for (sleep_total = 0; sleep_total < 20; sleep_total++) {
793 switch (be32toh(response->result)) {
795 data->data0 = be32toh(response->data);
801 case MXGEFW_CMD_UNKNOWN:
804 case MXGEFW_CMD_ERROR_UNALIGNED:
807 case MXGEFW_CMD_ERROR_BUSY:
810 case MXGEFW_CMD_ERROR_I2C_ABSENT:
814 if_printf(sc->ifp, "command %d failed, result = %d\n",
815 cmd, be32toh(response->result));
823 if_printf(sc->ifp, "command %d timed out result = %d\n",
824 cmd, be32toh(response->result));
/*
 * mxge_adopt_running_firmware: validate the firmware already running on
 * the NIC (e.g. loaded by a previous OS or the eeprom) by copying its
 * MCP header out of SRAM and checking it, and flag the known 1.4.4-11
 * broadcast-filtering bug so the NIC is kept in ALLMULTI.
 */
830 mxge_adopt_running_firmware(mxge_softc_t *sc)
832 struct mcp_gen_header *hdr;
833 const size_t bytes = sizeof(struct mcp_gen_header);
838 * Find running firmware header
841 htobe32(*(volatile uint32_t *)(sc->sram + MCP_HEADER_PTR_OFFSET));
843 if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
844 if_printf(sc->ifp, "Running firmware has bad header offset "
845 "(%zu)\n", hdr_offset);
850 * Copy header of running firmware from SRAM to host memory to
853 hdr = kmalloc(bytes, M_DEVBUF, M_WAITOK);
854 bus_space_read_region_1(rman_get_bustag(sc->mem_res),
855 rman_get_bushandle(sc->mem_res), hdr_offset, (char *)hdr, bytes);
856 status = mxge_validate_firmware(sc, hdr);
857 kfree(hdr, M_DEVBUF);
860 * Check to see if adopted firmware has bug where adopting
861 * it will cause broadcasts to be filtered unless the NIC
862 * is kept in ALLMULTI mode
864 if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
865 sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
866 sc->adopted_rx_filter_bug = 1;
867 if_printf(sc->ifp, "Adopting fw %d.%d.%d: "
868 "working around rx filter bug\n",
869 sc->fw_ver_major, sc->fw_ver_minor, sc->fw_ver_tiny);
/*
 * mxge_load_firmware: load (or, on failure and adopt != 0, adopt) the
 * firmware, then hand off control to it via the MXGEFW_BOOT_HANDOFF
 * mailbox and poll the confirmation word for the 0xffffffff ack.
 * NOTE(review): buf_bytes declaration, delays/memory barriers, and
 * several returns are elided in this dump.
 */
876 mxge_load_firmware(mxge_softc_t *sc, int adopt)
878 volatile uint32_t *confirm;
879 volatile char *submit;
881 uint32_t *buf, size, dma_low, dma_high;
/* align buf to 8 bytes within the stack buffer */
884 buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
886 size = sc->sram_size;
887 status = mxge_load_firmware_helper(sc, &size);
893 * Try to use the currently running firmware, if
896 status = mxge_adopt_running_firmware(sc);
899 "failed to adopt running firmware\n");
902 if_printf(sc->ifp, "Successfully adopted running firmware\n");
/* adopted firmware on a 4KB host still forces the 2KB boundary */
904 if (sc->tx_boundary == 4096) {
906 "Using firmware currently running on NIC. "
908 if_printf(sc->ifp, "performance consider loading "
909 "optimized firmware\n");
911 sc->fw_name = mxge_fw_unaligned;
912 sc->tx_boundary = 2048;
916 /* Clear confirmation addr */
917 confirm = (volatile uint32_t *)sc->cmd;
922 * Send a reload command to the bootstrap MCP, and wait for the
923 * response in the confirmation address. The firmware should
924 * write a -1 there to indicate it is alive and well
927 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.dmem_busaddr);
928 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.dmem_busaddr);
930 buf[0] = htobe32(dma_high); /* confirm addr MSW */
931 buf[1] = htobe32(dma_low); /* confirm addr LSW */
932 buf[2] = htobe32(0xffffffff); /* confirm data */
935 * FIX: All newest firmware should un-protect the bottom of
936 * the sram before handoff. However, the very first interfaces
937 * do not. Therefore the handoff copy must skip the first 8 bytes
939 /* where the code starts*/
940 buf[3] = htobe32(MXGE_FW_OFFSET + 8);
941 buf[4] = htobe32(size - 8); /* length of code */
942 buf[5] = htobe32(8); /* where to copy to */
943 buf[6] = htobe32(0); /* where to jump to */
945 submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
946 mxge_pio_copy(submit, buf, 64);
/* poll up to 20 iterations for the bootstrap ack */
951 while (*confirm != 0xffffffff && i < 20) {
955 if (*confirm != 0xffffffff) {
956 if_printf(sc->ifp,"handoff failed (%p = 0x%x)",
/*
 * mxge_update_mac_address: pack the 6-byte station address into two
 * command words (bytes 0-3 in data0, bytes 4-5 in data1) and push it
 * to the firmware.
 */
964 mxge_update_mac_address(mxge_softc_t *sc)
967 uint8_t *addr = sc->mac_addr;
969 cmd.data0 = (addr[0] << 24) | (addr[1] << 16) |
970 (addr[2] << 8) | addr[3];
971 cmd.data1 = (addr[4] << 8) | (addr[5]);
972 return mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
/* mxge_change_pause: enable/disable firmware link flow control. */
976 mxge_change_pause(mxge_softc_t *sc, int pause)
982 status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL, &cmd);
984 status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL, &cmd);
986 if_printf(sc->ifp, "Failed to set flow control mode\n");
/*
 * mxge_change_promisc: set firmware promiscuous mode; the
 * hw.mxge.always_promisc tunable forces it on regardless.
 */
994 mxge_change_promisc(mxge_softc_t *sc, int promisc)
999 if (mxge_always_promisc)
1003 status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC, &cmd);
1005 status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC, &cmd);
1007 if_printf(sc->ifp, "Failed to set promisc mode\n");
/*
 * mxge_set_multicast_list: reprogram the firmware multicast filter from
 * the interface's address list.  Filtering is disabled (ALLMULTI) while
 * the list is rebuilt, and left disabled on IFF_ALLMULTI, on the
 * adopted-firmware rx filter bug, or on any command failure.
 */
1011 mxge_set_multicast_list(mxge_softc_t *sc)
1014 struct ifmultiaddr *ifma;
1015 struct ifnet *ifp = sc->ifp;
1018 /* This firmware is known to not support multicast */
1019 if (!sc->fw_multicast_support)
1022 /* Disable multicast filtering while we play with the lists*/
1023 err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1025 if_printf(ifp, "Failed MXGEFW_ENABLE_ALLMULTI, "
1026 "error status: %d\n", err);
/* stay in ALLMULTI to work around the adopted-firmware bug */
1030 if (sc->adopted_rx_filter_bug)
1033 if (ifp->if_flags & IFF_ALLMULTI) {
1034 /* Request to disable multicast filtering, so quit here */
1038 /* Flush all the filters */
1039 err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1041 if_printf(ifp, "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, "
1042 "error status: %d\n", err);
1047 * Walk the multicast list, and add each address
1049 TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1050 if (ifma->ifma_addr->sa_family != AF_LINK)
/* split the 6-byte group address across data0 (4) / data1 (2) */
1053 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1055 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
1057 cmd.data0 = htonl(cmd.data0);
1058 cmd.data1 = htonl(cmd.data1);
1059 err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1061 if_printf(ifp, "Failed MXGEFW_JOIN_MULTICAST_GROUP, "
1062 "error status: %d\n", err);
1063 /* Abort, leaving multicast filtering off */
1068 /* Enable multicast filtering */
1069 err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1071 if_printf(ifp, "Failed MXGEFW_DISABLE_ALLMULTI, "
1072 "error status: %d\n", err);
/*
 * mxge_max_mtu: report the largest MTU the receive path can support,
 * depending on whether the firmware can chain page-sized buffers.
 */
1078 mxge_max_mtu(mxge_softc_t *sc)
1083 if (MJUMPAGESIZE - MXGEFW_PAD > MXGEFW_MAX_MTU)
1084 return MXGEFW_MAX_MTU - MXGEFW_PAD;
1086 /* try to set nbufs to see if it we can
1087 use virtually contiguous jumbos */
1089 status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1092 return MXGEFW_MAX_MTU - MXGEFW_PAD;
1094 /* otherwise, we're limited to MJUMPAGESIZE */
1095 return MJUMPAGESIZE - MXGEFW_PAD;
/*
 * mxge_reset: bring the firmware to a known state — issue
 * MXGEFW_CMD_RESET, re-enable dummy RDMAs, size the interrupt queue,
 * (re)configure RSS slices, optionally re-exchange per-slice interrupt
 * queue DMA addresses, re-run the DMA benchmark, zero per-slice shared
 * state, and reapply MAC/promisc/pause/multicast/throttle settings.
 * interrupts_setup != 0 selects the full interrupt re-exchange path.
 * NOTE(review): error returns and some assignments are elided here.
 */
1100 mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1102 struct mxge_slice_state *ss;
1103 mxge_rx_done_t *rx_done;
1104 volatile uint32_t *irq_claim;
1109 * Try to send a reset command to the card to see if it
1112 memset(&cmd, 0, sizeof (cmd));
1113 status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1115 if_printf(sc->ifp, "failed reset\n");
1119 mxge_dummy_rdma(sc, 1);
1121 /* Set the intrq size */
1122 cmd.data0 = sc->rx_ring_size;
1123 status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1126 * Even though we already know how many slices are supported
1127 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
1128 * has magic side effects, and must be called after a reset.
1129 * It must be called prior to calling any RSS related cmds,
1130 * including assigning an interrupt queue for anything but
1131 * slice 0. It must also be called *after*
1132 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1133 * the firmware to compute offsets.
1135 if (sc->num_slices > 1) {
1136 /* Ask the maximum number of slices it supports */
1137 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
1139 if_printf(sc->ifp, "failed to get number of slices\n");
1144 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1145 * to setting up the interrupt queue DMA
1147 cmd.data0 = sc->num_slices;
1148 cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
1149 #ifdef IFNET_BUF_RING
1150 cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1152 status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES, &cmd);
1154 if_printf(sc->ifp, "failed to set number of slices\n");
1159 if (interrupts_setup) {
1160 /* Now exchange information about interrupts */
1161 for (slice = 0; slice < sc->num_slices; slice++) {
1162 rx_done = &sc->ss[slice].rx_done;
1163 memset(rx_done->entry, 0, sc->rx_ring_size);
1165 MXGE_LOWPART_TO_U32(rx_done->dma.dmem_busaddr);
1167 MXGE_HIGHPART_TO_U32(rx_done->dma.dmem_busaddr);
1169 status |= mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_DMA,
/* fetch SRAM offsets for coalescing delay, irq ack and deassert */
1174 status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET,
1176 sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1178 status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1179 irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1181 status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET, &cmd);
1182 sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1185 if_printf(sc->ifp, "failed set interrupt parameters\n");
1189 *sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1191 /* Run a DMA benchmark */
1192 mxge_dma_test(sc, MXGEFW_DMA_TEST);
1194 for (slice = 0; slice < sc->num_slices; slice++) {
1195 ss = &sc->ss[slice];
/* each slice owns a pair of irq-claim words */
1197 ss->irq_claim = irq_claim + (2 * slice);
1199 /* Reset mcp/driver shared state back to 0 */
1200 ss->rx_done.idx = 0;
1201 ss->rx_done.cnt = 0;
1204 ss->tx.pkt_done = 0;
1205 ss->tx.queue_active = 0;
1206 ss->tx.activate = 0;
1207 ss->tx.deactivate = 0;
1209 ss->rx_small.cnt = 0;
1210 if (ss->fw_stats != NULL)
1211 bzero(ss->fw_stats, sizeof(*ss->fw_stats));
1213 sc->rdma_tags_available = 15;
/* reapply host-side configuration after the firmware reset */
1215 status = mxge_update_mac_address(sc);
1216 mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
1217 mxge_change_pause(sc, sc->pause);
1218 mxge_set_multicast_list(sc);
1221 cmd.data0 = sc->throttle;
1222 if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd))
1223 if_printf(sc->ifp, "can't enable throttle\n");
/*
 * sysctl handler: hw.<unit>.throttle.  Validates the new value against
 * MXGE_MIN/MAX_THROTTLE and pushes it to the firmware under the ifnet
 * serializer before committing it to sc->throttle.
 */
1229 mxge_change_throttle(SYSCTL_HANDLER_ARGS)
1234 unsigned int throttle;
1237 throttle = sc->throttle;
1238 err = sysctl_handle_int(oidp, &throttle, arg2, req);
/* no-op when the value is unchanged */
1242 if (throttle == sc->throttle)
1245 if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
1248 lwkt_serialize_enter(sc->ifp->if_serializer);
1250 cmd.data0 = throttle;
1251 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
1253 sc->throttle = throttle;
1255 lwkt_serialize_exit(sc->ifp->if_serializer);
/*
 * sysctl handler: interrupt coalescing delay in usecs.  Writes the
 * big-endian value straight into the firmware's SRAM location
 * (sc->intr_coal_delay_ptr) under the ifnet serializer.
 */
1260 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1263 unsigned int intr_coal_delay;
1267 intr_coal_delay = sc->intr_coal_delay;
1268 err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1272 if (intr_coal_delay == sc->intr_coal_delay)
/* reject zero and anything over one second */
1275 if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1278 lwkt_serialize_enter(sc->ifp->if_serializer);
1280 *sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1281 sc->intr_coal_delay = intr_coal_delay;
1283 lwkt_serialize_exit(sc->ifp->if_serializer);
/*
 * sysctl handler: flow control on/off, delegated to
 * mxge_change_pause() under the ifnet serializer.
 */
1288 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1291 unsigned int enabled;
1295 enabled = sc->pause;
1296 err = sysctl_handle_int(oidp, &enabled, arg2, req);
1300 if (enabled == sc->pause)
1303 lwkt_serialize_enter(sc->ifp->if_serializer);
1304 err = mxge_change_pause(sc, enabled);
1305 lwkt_serialize_exit(sc->ifp->if_serializer);
/*
 * sysctl handler: expose a big-endian firmware counter (arg1) as a
 * host-order read-only integer.
 */
1311 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1317 arg2 = be32toh(*(int *)arg1);
1319 err = sysctl_handle_int(oidp, arg1, arg2, req);
/*
 * mxge_rem_sysctls: tear down the per-slice, slice-parent, and
 * device-level sysctl contexts, in that order, nulling each tree
 * pointer so a repeat call is harmless.
 */
1325 mxge_rem_sysctls(mxge_softc_t *sc)
1327 if (sc->ss != NULL) {
1328 struct mxge_slice_state *ss;
1331 for (slice = 0; slice < sc->num_slices; slice++) {
1332 ss = &sc->ss[slice];
1333 if (ss->sysctl_tree != NULL) {
1334 sysctl_ctx_free(&ss->sysctl_ctx);
1335 ss->sysctl_tree = NULL;
1340 if (sc->slice_sysctl_tree != NULL) {
1341 sysctl_ctx_free(&sc->slice_sysctl_ctx);
1342 sc->slice_sysctl_tree = NULL;
1345 if (sc->sysctl_tree != NULL) {
1346 sysctl_ctx_free(&sc->sysctl_ctx);
1347 sc->sysctl_tree = NULL;
/*
 * Build the hw.mxgeN sysctl tree: static device information (firmware
 * version, serial number, link width, DMA benchmark results), run-time
 * tunables (interrupt coalescing, throttle, flow control), the firmware
 * statistics block (network byte order, exported through
 * mxge_handle_be32()), and per-slice debug counters under
 * hw.mxgeN.slice.<n>.
 *
 * Fixes in this revision (copy-paste description strings only; no
 * behavioral change):
 *  - "flow_control_enabled" was described as "Interrupt coalescing delay
 *    in usecs" (duplicated from intr_coal_delay above);
 *  - "rx_big_cnt" was described as "rx_small_cnt";
 *  - "tx_pkt_done" was described as "tx_done".
 */
1352 mxge_add_sysctls(mxge_softc_t *sc)
1354 struct sysctl_ctx_list *ctx;
1355 struct sysctl_oid_list *children;
1357 struct mxge_slice_state *ss;
1361 ctx = &sc->sysctl_ctx;
1362 sysctl_ctx_init(ctx);
1363 sc->sysctl_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_STATIC_CHILDREN(_hw),
1364 OID_AUTO, device_get_nameunit(sc->dev), CTLFLAG_RD, 0, "");
1365 if (sc->sysctl_tree == NULL) {
1366 device_printf(sc->dev, "can't add sysctl node\n");
1370 children = SYSCTL_CHILDREN(sc->sysctl_tree);
1371 fw = sc->ss[0].fw_stats;
1374 * Random information
1376 SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "firmware_version",
1377 CTLFLAG_RD, &sc->fw_version, 0, "firmware version");
1379 SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "serial_number",
1380 CTLFLAG_RD, &sc->serial_number_string, 0, "serial number");
1382 SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "product_code",
1383 CTLFLAG_RD, &sc->product_code_string, 0, "product code");
1385 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "pcie_link_width",
1386 CTLFLAG_RD, &sc->link_width, 0, "link width");
1388 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_boundary",
1389 CTLFLAG_RD, &sc->tx_boundary, 0, "tx boundary");
1391 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "write_combine",
1392 CTLFLAG_RD, &sc->wc, 0, "write combining PIO");
1394 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "read_dma_MBs",
1395 CTLFLAG_RD, &sc->read_dma, 0, "DMA Read speed in MB/s");
1397 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "write_dma_MBs",
1398 CTLFLAG_RD, &sc->write_dma, 0, "DMA Write speed in MB/s");
1400 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "read_write_dma_MBs",
1401 CTLFLAG_RD, &sc->read_write_dma, 0,
1402 "DMA concurrent Read/Write speed in MB/s");
1404 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "watchdog_resets",
1405 CTLFLAG_RD, &sc->watchdog_resets, 0,
1406 "Number of times NIC was reset");
1409 * Performance related tunables
1411 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "intr_coal_delay",
1412 CTLTYPE_INT|CTLFLAG_RW, sc, 0, mxge_change_intr_coal, "I",
1413 "Interrupt coalescing delay in usecs");
1415 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "throttle",
1416 CTLTYPE_INT|CTLFLAG_RW, sc, 0, mxge_change_throttle, "I",
1417 "Transmit throttling");
1419 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "flow_control_enabled",
1420 CTLTYPE_INT|CTLFLAG_RW, sc, 0, mxge_change_flow_control, "I",
1421 "enable flow control for this interface");
1423 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "deassert_wait",
1424 CTLFLAG_RW, &mxge_deassert_wait, 0,
1425 "Wait for IRQ line to go low in ihandler");
1428 * Stats block from firmware is in network byte order.
1431 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "link_up",
1432 CTLTYPE_INT|CTLFLAG_RD, &fw->link_up, 0,
1433 mxge_handle_be32, "I", "link up");
1435 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rdma_tags_available",
1436 CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available, 0,
1437 mxge_handle_be32, "I", "rdma_tags_available");
1439 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_bad_crc32",
1440 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_bad_crc32, 0,
1441 mxge_handle_be32, "I", "dropped_bad_crc32");
1443 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_bad_phy",
1444 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_bad_phy, 0,
1445 mxge_handle_be32, "I", "dropped_bad_phy");
1447 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_link_error_or_filtered",
1448 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_error_or_filtered, 0,
1449 mxge_handle_be32, "I", "dropped_link_error_or_filtered");
1451 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_link_overflow",
1452 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow, 0,
1453 mxge_handle_be32, "I", "dropped_link_overflow");
1455 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_multicast_filtered",
1456 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_multicast_filtered, 0,
1457 mxge_handle_be32, "I", "dropped_multicast_filtered");
1459 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_no_big_buffer",
1460 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer, 0,
1461 mxge_handle_be32, "I", "dropped_no_big_buffer");
1463 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_no_small_buffer",
1464 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_small_buffer, 0,
1465 mxge_handle_be32, "I", "dropped_no_small_buffer");
1467 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_overrun",
1468 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun, 0,
1469 mxge_handle_be32, "I", "dropped_overrun");
1471 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_pause",
1472 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_pause, 0,
1473 mxge_handle_be32, "I", "dropped_pause");
1475 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_runt",
1476 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt, 0,
1477 mxge_handle_be32, "I", "dropped_runt");
1479 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_unicast_filtered",
1480 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered, 0,
1481 mxge_handle_be32, "I", "dropped_unicast_filtered");
1483 /* add counters exported for debugging from all slices */
1484 sysctl_ctx_init(&sc->slice_sysctl_ctx);
1485 sc->slice_sysctl_tree = SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx,
1486 children, OID_AUTO, "slice", CTLFLAG_RD, 0, "");
1487 if (sc->slice_sysctl_tree == NULL) {
1488 device_printf(sc->dev, "can't add slice sysctl node\n");
1492 for (slice = 0; slice < sc->num_slices; slice++) {
1493 ss = &sc->ss[slice];
1494 sysctl_ctx_init(&ss->sysctl_ctx);
1495 ctx = &ss->sysctl_ctx;
/* Each slice gets a numbered child node under hw.mxgeN.slice. */
1496 children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1497 ksprintf(slice_num, "%d", slice);
1498 ss->sysctl_tree = SYSCTL_ADD_NODE(ctx, children, OID_AUTO,
1499 slice_num, CTLFLAG_RD, 0, "");
1500 if (ss->sysctl_tree == NULL) {
1501 device_printf(sc->dev,
1502 "can't add %d slice sysctl node\n", slice);
1503 return; /* XXX continue? */
1505 children = SYSCTL_CHILDREN(ss->sysctl_tree);
1508 * XXX change to ULONG
1511 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_small_cnt",
1512 CTLFLAG_RD, &ss->rx_small.cnt, 0, "rx_small_cnt");
1514 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_big_cnt",
1515 CTLFLAG_RD, &ss->rx_big.cnt, 0, "rx_big_cnt");
1517 #ifndef IFNET_BUF_RING
1518 /* only transmit from slice 0 for now */
1523 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_req",
1524 CTLFLAG_RD, &ss->tx.req, 0, "tx_req");
1526 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_done",
1527 CTLFLAG_RD, &ss->tx.done, 0, "tx_done");
1529 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_pkt_done",
1530 CTLFLAG_RD, &ss->tx.pkt_done, 0, "tx_pkt_done");
1532 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_queue_active",
1533 CTLFLAG_RD, &ss->tx.queue_active, 0, "tx_queue_active");
1535 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_activate",
1536 CTLFLAG_RD, &ss->tx.activate, 0, "tx_activate");
1538 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_deactivate",
1539 CTLFLAG_RD, &ss->tx.deactivate, 0, "tx_deactivate");
/*
 * Slow-path PIO submit: copy the send requests to the NIC one at a time,
 * last-to-first, so the ring-wrap case is handled correctly.  Used by
 * mxge_submit_req() when the request span would cross the ring boundary.
 * NOTE(review): the loop decrement/termination lines are elided in this
 * listing — only the per-iteration copy is visible.
 */
1544 * Copy an array of mcp_kreq_ether_send_t's to the mcp. Copy
1545 * backwards one at a time and handle ring wraps
1547 static __inline void
1548 mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1549 mcp_kreq_ether_send_t *src, int cnt)
1551 int idx, starting_slot;
1553 starting_slot = tx->req;
/* Ring index of the cnt'th request, masked for wrap-around. */
1556 idx = (starting_slot + cnt) & tx->mask;
1557 mxge_pio_copy(&tx->lanai[idx], &src[cnt], sizeof(*src));
/*
 * Fast-path PIO submit of a chain of send requests.  Requests are copied
 * to the NIC 32 bytes (two requests) at a time; the first request's flags
 * word is written last so the firmware only sees a valid chain once it is
 * complete.  Falls back to mxge_submit_req_backwards() when the chain
 * would wrap the ring.
 */
1563 * Copy an array of mcp_kreq_ether_send_t's to the mcp. Copy
1564 * at most 32 bytes at a time, so as to avoid involving the software
1565 * pio handler in the nic. We re-write the first segment's flags
1566 * to mark them valid only after writing the entire chain
1568 static __inline void
1569 mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src, int cnt)
1573 volatile uint32_t *dst_ints;
1574 mcp_kreq_ether_send_t *srcp;
1575 volatile mcp_kreq_ether_send_t *dstp, *dst;
1578 idx = tx->req & tx->mask;
/* Stash the real flags; the chain is written with them cleared first. */
1580 last_flags = src->flags;
1583 dst = dstp = &tx->lanai[idx];
/* Whole chain fits before the ring wraps: copy in 32-byte pairs. */
1586 if ((idx + cnt) < tx->mask) {
1587 for (i = 0; i < cnt - 1; i += 2) {
1588 mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1589 wmb(); /* force write every 32 bytes */
1595 * Submit all but the first request, and ensure
1596 * that it is submitted below
1598 mxge_submit_req_backwards(tx, src, cnt);
1602 /* Submit the first request */
1603 mxge_pio_copy(dstp, srcp, sizeof(*src));
1604 wmb(); /* barrier before setting valid flag */
1607 /* Re-write the last 32-bits with the valid flags */
1608 src->flags = last_flags;
1609 src_ints = (uint32_t *)src;
1611 dst_ints = (volatile uint32_t *)dst;
/* Writing this word flips the chain to "valid" atomically for the NIC. */
1613 *dst_ints = *src_ints;
/*
 * Ensure the entire TSO header (ethernet + IP + TCP, as recorded in the
 * packet header's csum_lhlen/csum_iphlen/csum_thlen fields) is contiguous
 * in the first mbuf, pulling it up with m_pullup() if necessary.
 * NOTE(review): the m_pullup() failure handling and return lines are
 * elided in this listing.
 */
1619 mxge_pullup_tso(struct mbuf **mp)
1621 int hoff, iphlen, thoff;
1625 KASSERT(M_WRITABLE(m), ("TSO mbuf not writable"));
/* Header lengths were filled in by the stack for checksum/TSO offload. */
1627 iphlen = m->m_pkthdr.csum_iphlen;
1628 thoff = m->m_pkthdr.csum_thlen;
1629 hoff = m->m_pkthdr.csum_lhlen;
1631 KASSERT(iphlen > 0, ("invalid ip hlen"));
1632 KASSERT(thoff > 0, ("invalid tcp hlen"));
1633 KASSERT(hoff > 0, ("invalid ether hlen"));
1635 if (__predict_false(m->m_len < hoff + iphlen + thoff)) {
1636 m = m_pullup(m, hoff + iphlen + thoff);
/*
 * Build and submit the send-request chain for a TSO packet.  Walks the
 * busdma segment list, slicing segments at MSS boundaries ("cuts") so the
 * firmware can emit one ethernet frame per MSS of payload, and fills in
 * each request's rdma_count retroactively (see the long comment below).
 * A negative cum_len means we are still inside the protocol headers.
 * NOTE(review): listing is elided — the segment-advance, chop and
 * overflow-recovery lines between the visible statements are not shown.
 */
1647 mxge_encap_tso(mxge_tx_ring_t *tx, struct mbuf *m, int busdma_seg_cnt)
1649 mcp_kreq_ether_send_t *req;
1650 bus_dma_segment_t *seg;
1651 uint32_t low, high_swapped;
1652 int len, seglen, cum_len, cum_len_next;
1653 int next_is_first, chop, cnt, rdma_count, small;
1654 uint16_t pseudo_hdr_offset, cksum_offset, mss;
1655 uint8_t flags, flags_next;
1657 mss = m->m_pkthdr.tso_segsz;
1660 * Negative cum_len signifies to the send loop that we are
1661 * still in the header portion of the TSO packet.
1663 cum_len = -(m->m_pkthdr.csum_lhlen + m->m_pkthdr.csum_iphlen +
1664 m->m_pkthdr.csum_thlen);
1667 * TSO implies checksum offload on this hardware
1669 cksum_offset = m->m_pkthdr.csum_lhlen + m->m_pkthdr.csum_iphlen;
1670 flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1673 * For TSO, pseudo_hdr_offset holds mss. The firmware figures
1674 * out where to put the checksum by parsing the header.
1676 pseudo_hdr_offset = htobe16(mss);
1684 * "rdma_count" is the number of RDMAs belonging to the current
1685 * packet BEFORE the current send request. For non-TSO packets,
1686 * this is equal to "count".
1688 * For TSO packets, rdma_count needs to be reset to 0 after a
1691 * The rdma_count field of the send request is the number of
1692 * RDMAs of the packet starting at that request. For TSO send
1693 * requests with one ore more cuts in the middle, this is the
1694 * number of RDMAs starting after the last cut in the request.
1695 * All previous segments before the last cut implicitly have 1
1698 * Since the number of RDMAs is not known beforehand, it must be
1699 * filled-in retroactively - after each segmentation cut or at
1700 * the end of the entire packet.
1703 while (busdma_seg_cnt) {
1705 * Break the busdma segment up into pieces
1707 low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1708 high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1712 flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1714 cum_len_next = cum_len + seglen;
/* Retroactively fix the rdma_count of the request before this one. */
1715 (req - rdma_count)->rdma_count = rdma_count + 1;
1716 if (__predict_true(cum_len >= 0)) {
/* In payload: cut the segment at an MSS boundary if it crosses one. */
1718 chop = (cum_len_next > mss);
1719 cum_len_next = cum_len_next % mss;
1720 next_is_first = (cum_len_next == 0);
1721 flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1723 next_is_first * MXGEFW_FLAGS_FIRST;
1724 rdma_count |= -(chop | next_is_first);
1725 rdma_count += chop & !next_is_first;
1726 } else if (cum_len_next >= 0) {
/* Header just ended inside this segment: next request starts payload. */
1731 small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1732 flags_next = MXGEFW_FLAGS_TSO_PLD |
1733 MXGEFW_FLAGS_FIRST |
1734 (small * MXGEFW_FLAGS_SMALL);
1737 req->addr_high = high_swapped;
1738 req->addr_low = htobe32(low);
1739 req->pseudo_hdr_offset = pseudo_hdr_offset;
1741 req->rdma_count = 1;
1742 req->length = htobe16(seglen);
1743 req->cksum_offset = cksum_offset;
1745 flags | ((cum_len & 1) * MXGEFW_FLAGS_ALIGN_ODD);
1748 cum_len = cum_len_next;
/* cksum_offset walks toward 0 as header bytes are consumed. */
1753 if (__predict_false(cksum_offset > seglen))
1754 cksum_offset -= seglen;
/* Too many descriptors for one chain: bail to the drop path (elided). */
1757 if (__predict_false(cnt > tx->max_desc))
1763 (req - rdma_count)->rdma_count = rdma_count;
/* Walk back and mark the trailing requests of the last frame. */
1767 req->flags |= MXGEFW_FLAGS_TSO_LAST;
1768 } while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1770 tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1771 mxge_submit_req(tx, tx->req_list, cnt);
1772 #ifdef IFNET_BUF_RING
1773 if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1774 /* tell the NIC to start polling this slice */
1776 tx->queue_active = 1;
1784 bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
/*
 * Map an outgoing mbuf chain for DMA and build its send-request chain.
 * TSO packets are pulled up and handed to mxge_encap_tso(); everything
 * else gets one request per busdma segment, plus an optional zero-pad
 * request for runts (< 60 bytes).  The last descriptor's info flag is
 * set so mxge_tx_done() knows where the packet ends.
 * NOTE(review): listing is elided — error returns and some loop
 * bookkeeping lines between the visible statements are not shown.
 */
1790 mxge_encap(mxge_tx_ring_t *tx, struct mbuf *m)
1792 mcp_kreq_ether_send_t *req;
1793 bus_dma_segment_t *seg;
1794 int cnt, cum_len, err, i, idx, odd_flag;
1795 uint16_t pseudo_hdr_offset;
1796 uint8_t flags, cksum_offset;
/* TSO frames must have contiguous headers before mapping. */
1798 if (m->m_pkthdr.csum_flags & CSUM_TSO) {
1799 err = mxge_pullup_tso(&m);
1800 if (__predict_false(err))
1805 * Map the frame for DMA
1807 idx = tx->req & tx->mask;
/* Reserve 2 descriptors of headroom (e.g. for the runt pad below). */
1808 err = bus_dmamap_load_mbuf_defrag(tx->dmat, tx->info[idx].map, &m,
1809 tx->seg_list, tx->max_desc - 2, &cnt, BUS_DMA_NOWAIT);
1810 if (__predict_false(err != 0))
1812 bus_dmamap_sync(tx->dmat, tx->info[idx].map, BUS_DMASYNC_PREWRITE);
1813 tx->info[idx].m = m;
1816 * TSO is different enough, we handle it in another routine
1818 if (m->m_pkthdr.csum_flags & CSUM_TSO)
1819 return mxge_encap_tso(tx, m, cnt);
1823 pseudo_hdr_offset = 0;
1824 flags = MXGEFW_FLAGS_NO_TSO;
1827 * Checksum offloading
1829 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
1830 cksum_offset = m->m_pkthdr.csum_lhlen + m->m_pkthdr.csum_iphlen;
1831 pseudo_hdr_offset = cksum_offset + m->m_pkthdr.csum_data;
1832 pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
1833 req->cksum_offset = cksum_offset;
1834 flags |= MXGEFW_FLAGS_CKSUM;
/* Odd-alignment flag matters only when checksumming in firmware. */
1835 odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
1839 if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
1840 flags |= MXGEFW_FLAGS_SMALL;
1843 * Convert segments into a request list
1847 req->flags = MXGEFW_FLAGS_FIRST;
1848 for (i = 0; i < cnt; i++) {
1849 req->addr_low = htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
1850 req->addr_high = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1851 req->length = htobe16(seg->ds_len);
1852 req->cksum_offset = cksum_offset;
/* cksum_offset counts down as header bytes are consumed by segments. */
1853 if (cksum_offset > seg->ds_len)
1854 cksum_offset -= seg->ds_len;
1857 req->pseudo_hdr_offset = pseudo_hdr_offset;
1858 req->pad = 0; /* complete solid 16-byte block */
1859 req->rdma_count = 1;
1860 req->flags |= flags | ((cum_len & 1) * odd_flag);
1861 cum_len += seg->ds_len;
1869 * Pad runt to 60 bytes
/* Extra descriptor pointing at a pre-zeroed DMA buffer pads the frame. */
1873 req->addr_low = htobe32(
1874 MXGE_LOWPART_TO_U32(tx->sc->zeropad_dma.dmem_busaddr));
1875 req->addr_high = htobe32(
1876 MXGE_HIGHPART_TO_U32(tx->sc->zeropad_dma.dmem_busaddr));
1877 req->length = htobe16(60 - cum_len);
1878 req->cksum_offset = 0;
1879 req->pseudo_hdr_offset = pseudo_hdr_offset;
1880 req->pad = 0; /* complete solid 16-byte block */
1881 req->rdma_count = 1;
1882 req->flags |= flags | ((cum_len & 1) * odd_flag);
1886 tx->req_list[0].rdma_count = cnt;
1888 /* print what the firmware will see */
1889 for (i = 0; i < cnt; i++) {
1890 kprintf("%d: addr: 0x%x 0x%x len:%d pso%d,"
1891 "cso:%d, flags:0x%x, rdma:%d\n",
1892 i, (int)ntohl(tx->req_list[i].addr_high),
1893 (int)ntohl(tx->req_list[i].addr_low),
1894 (int)ntohs(tx->req_list[i].length),
1895 (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
1896 tx->req_list[i].cksum_offset, tx->req_list[i].flags,
1897 tx->req_list[i].rdma_count);
1899 kprintf("--------------\n");
/* Mark the packet's last descriptor, then hand the chain to the NIC. */
1901 tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1902 mxge_submit_req(tx, tx->req_list, cnt);
1903 #ifdef IFNET_BUF_RING
1904 if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1905 /* tell the NIC to start polling this slice */
1907 tx->queue_active = 1;
/*
 * ifnet if_start handler: dequeue packets from the ALTQ subqueue and
 * encapsulate them onto slice 0's transmit ring while descriptors are
 * available.  Must be called with the ifnet serializer held.
 */
1920 mxge_start(struct ifnet *ifp, struct ifaltq_subque *ifsq)
1922 mxge_softc_t *sc = ifp->if_softc;
1923 struct mxge_slice_state *ss;
1927 ASSERT_ALTQ_SQ_DEFAULT(ifp, ifsq);
1928 ASSERT_SERIALIZED(sc->ifp->if_serializer);
1930 if ((ifp->if_flags & IFF_RUNNING) == 0 || ifsq_is_oactive(ifsq))
1933 /* XXX Only use the first slice for now */
/* Keep going while a full worst-case packet still fits in the ring. */
1937 while (tx->mask - (tx->req - tx->done) > tx->max_desc) {
1941 m = ifsq_dequeue(ifsq);
1946 error = mxge_encap(tx, m);
1950 IFNET_STAT_INC(ifp, oerrors, 1);
1953 /* Ran out of transmit slots */
1954 ifsq_set_oactive(ifsq);
/*
 * Transmit watchdog.  If the firmware's dropped_pause counter has not
 * moved since the last check, the stall is not explained by flow control,
 * so warn and reset the NIC; otherwise the link partner's pause frames
 * are the likely cause and we only log it.
 */
1961 mxge_watchdog(struct ifnet *ifp)
1963 struct mxge_softc *sc = ifp->if_softc;
1964 uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
1965 mxge_tx_ring_t *tx = &sc->ss[0].tx;
1967 ASSERT_SERIALIZED(ifp->if_serializer);
1969 /* Check for pause blocking before resetting */
1970 if (tx->watchdog_rx_pause == rx_pause) {
/* Pause count unchanged: genuine stall -> diagnose and reset. */
1971 mxge_warn_stuck(sc, tx, 0);
1972 mxge_watchdog_reset(sc);
1975 if_printf(ifp, "Flow control blocking xmits, "
1976 "check link partner\n");
/* Remember the counter for the next watchdog pass. */
1978 tx->watchdog_rx_pause = rx_pause;
/*
 * PIO-copy a batch of 8 receive descriptors to the NIC.  The first
 * descriptor's low DMA address is temporarily poisoned (0xffffffff) while
 * the batch is copied, then rewritten last so the firmware only sees the
 * batch once it is fully in place.
 */
1982 * Copy an array of mcp_kreq_ether_recv_t's to the mcp. Copy
1983 * at most 32 bytes at a time, so as to avoid involving the software
1984 * pio handler in the nic. We re-write the first segment's low
1985 * DMA address to mark it valid only after we write the entire chunk
1988 static __inline void
1989 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
1990 mcp_kreq_ether_recv_t *src)
1994 low = src->addr_low;
/* Poison the valid-marker word so the NIC ignores the partial batch. */
1995 src->addr_low = 0xffffffff;
1996 mxge_pio_copy(dst, src, 4 * sizeof (*src));
1998 mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2000 src->addr_low = low;
/* Writing the real address last makes the whole batch visible. */
2001 dst->addr_low = low;
/*
 * Allocate, DMA-map and post a small (MHLEN) receive buffer at ring slot
 * idx.  During initialization (init != 0) allocation failures are
 * tolerated (nothing to recycle yet) and blocking allocation is used.
 * Descriptors are handed to the NIC in batches of 8 once slot idx fills
 * the 8th position.
 * NOTE(review): listing is elided — failure branches between the visible
 * statements are not shown.
 */
2006 mxge_get_buf_small(mxge_rx_ring_t *rx, bus_dmamap_t map, int idx,
2009 bus_dma_segment_t seg;
2011 int cnt, err, mflag;
2013 mflag = MB_DONTWAIT;
/* During init we may block for memory. */
2014 if (__predict_false(init))
2017 m = m_gethdr(mflag, MT_DATA);
2021 if (__predict_false(init)) {
2023 * During initialization, there
2024 * is nothing to setup; bail out
2030 m->m_len = m->m_pkthdr.len = MHLEN;
2032 err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2033 &seg, 1, &cnt, BUS_DMA_NOWAIT);
2036 if (__predict_false(init)) {
2038 * During initialization, there
2039 * is nothing to setup; bail out
2046 rx->info[idx].m = m;
/* Shadow ring keeps the host-side copy of each descriptor. */
2047 rx->shadow[idx].addr_low = htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2048 rx->shadow[idx].addr_high = htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
/* Every 8th slot, push the batch of 8 descriptors to the NIC. */
2052 mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
/*
 * Allocate, DMA-map and post a big receive buffer (cluster or jumbo page,
 * depending on rx->cl_size) at ring slot idx.  Same structure and init
 * semantics as mxge_get_buf_small().
 * NOTE(review): listing is elided — failure branches between the visible
 * statements are not shown.
 */
2057 mxge_get_buf_big(mxge_rx_ring_t *rx, bus_dmamap_t map, int idx,
2060 bus_dma_segment_t seg;
2062 int cnt, err, mflag;
2064 mflag = MB_DONTWAIT;
/* During init we may block for memory. */
2065 if (__predict_false(init))
/* Pick cluster size to match what the ring was configured for. */
2068 if (rx->cl_size == MCLBYTES)
2069 m = m_getcl(mflag, MT_DATA, M_PKTHDR);
2071 m = m_getjcl(mflag, MT_DATA, M_PKTHDR, MJUMPAGESIZE);
2075 if (__predict_false(init)) {
2077 * During initialization, there
2078 * is nothing to setup; bail out
2084 m->m_len = m->m_pkthdr.len = rx->mlen;
2086 err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2087 &seg, 1, &cnt, BUS_DMA_NOWAIT);
2090 if (__predict_false(init)) {
2092 * During initialization, there
2093 * is nothing to setup; bail out
2100 rx->info[idx].m = m;
2101 rx->shadow[idx].addr_low = htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2102 rx->shadow[idx].addr_high = htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
/* Every 8th slot, push the batch of 8 descriptors to the NIC. */
2106 mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
/*
 * Validate the firmware's raw 16-bit ones-complement checksum for an IPv4
 * TCP/UDP frame by folding in the pseudo-header.  Returns 0 when the
 * checksum verifies; non-IPv4/non-TCP/UDP frames are rejected (the
 * return statements for those paths are elided in this listing).
 */
2111 * Myri10GE hardware checksums are not valid if the sender
2112 * padded the frame with non-zero padding. This is because
2113 * the firmware just does a simple 16-bit 1s complement
2114 * checksum across the entire frame, excluding the first 14
2115 * bytes. It is best to simply to check the checksum and
2116 * tell the stack about it only if the checksum is good
2118 static __inline uint16_t
2119 mxge_rx_csum(struct mbuf *m, int csum)
2121 const struct ether_header *eh;
2122 const struct ip *ip;
2125 eh = mtod(m, const struct ether_header *);
2127 /* Only deal with IPv4 TCP & UDP for now */
2128 if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2131 ip = (const struct ip *)(eh + 1);
2132 if (__predict_false(ip->ip_p != IPPROTO_TCP && ip->ip_p != IPPROTO_UDP))
/* Fold the pseudo-header into the firmware's whole-frame checksum. */
2136 c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2137 htonl(ntohs(csum) + ntohs(ip->ip_len) +
2138 - (ip->ip_hl << 2) + ip->ip_p));
/*
 * Strip an 802.1Q header from a received frame: record the tag in the
 * mbuf packet header (M_VLANTAG), subtract the 4 encapsulation bytes
 * from the firmware's partial checksum, and slide the MAC addresses over
 * the removed header.
 */
2147 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2149 struct ether_vlan_header *evl;
2152 evl = mtod(m, struct ether_vlan_header *);
2155 * Fix checksum by subtracting EVL_ENCAPLEN bytes after
2156 * what the firmware thought was the end of the ethernet
2160 /* Put checksum into host byte order */
2161 *csum = ntohs(*csum);
/* The 4 bytes being removed, as seen by the checksum. */
2163 partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
/* Ones-complement subtract: borrow, then fold carries twice. */
2165 *csum += ((*csum) < ~partial);
2166 *csum = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2167 *csum = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2170 * Restore checksum to network byte order;
2171 * later consumers expect this
2173 *csum = htons(*csum);
2176 m->m_pkthdr.ether_vlantag = ntohs(evl->evl_tag);
2177 m->m_flags |= M_VLANTAG;
2180 * Remove the 802.1q header by copying the Ethernet
2181 * addresses over it and adjusting the beginning of
2182 * the data in the mbuf. The encapsulated Ethernet
2183 * type field is already in place.
2185 bcopy((char *)evl, (char *)evl + EVL_ENCAPLEN,
2186 ETHER_HDR_LEN - ETHER_TYPE_LEN);
2187 m_adj(m, EVL_ENCAPLEN);
/*
 * Hand a received frame from the big-buffer ring to the stack.  The
 * received mbuf is replaced before being passed up; if replacement fails
 * the frame is dropped and the old buffer recycled.  Handles VLAN
 * detagging and RX checksum offload marking.
 */
2191 static __inline void
2192 mxge_rx_done_big(mxge_rx_ring_t *rx, uint32_t len, uint32_t csum)
2194 struct ifnet *ifp = rx->sc->ifp;
2196 const struct ether_header *eh;
2197 bus_dmamap_t old_map;
2200 idx = rx->cnt & rx->mask;
2203 /* Save a pointer to the received mbuf */
2204 m = rx->info[idx].m;
2206 /* Try to replace the received mbuf */
2207 if (mxge_get_buf_big(rx, rx->extra_map, idx, FALSE)) {
2208 /* Drop the frame -- the old mbuf is re-cycled */
2209 IFNET_STAT_INC(ifp, ierrors, 1);
2213 /* Unmap the received buffer */
2214 old_map = rx->info[idx].map;
2215 bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2216 bus_dmamap_unload(rx->dmat, old_map);
2218 /* Swap the bus_dmamap_t's */
2219 rx->info[idx].map = rx->extra_map;
2220 rx->extra_map = old_map;
2223 * mcp implicitly skips 1st 2 bytes so that packet is properly
2226 m->m_data += MXGEFW_PAD;
2228 m->m_pkthdr.rcvif = ifp;
2229 m->m_len = m->m_pkthdr.len = len;
2231 IFNET_STAT_INC(ifp, ipackets, 1);
2233 eh = mtod(m, const struct ether_header *);
2234 if (eh->ether_type == htons(ETHERTYPE_VLAN))
2235 mxge_vlan_tag_remove(m, &csum);
2237 /* If the checksum is valid, mark it in the mbuf header */
2238 if ((ifp->if_capenable & IFCAP_RXCSUM) &&
2239 mxge_rx_csum(m, csum) == 0) {
2240 /* Tell the stack that the checksum is good */
2241 m->m_pkthdr.csum_data = 0xffff;
2242 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2245 ifp->if_input(ifp, m);
/*
 * Hand a received frame from the small-buffer ring to the stack.
 * Mirrors mxge_rx_done_big() except that replacement buffers come from
 * mxge_get_buf_small().
 */
2248 static __inline void
2249 mxge_rx_done_small(mxge_rx_ring_t *rx, uint32_t len, uint32_t csum)
2251 struct ifnet *ifp = rx->sc->ifp;
2252 const struct ether_header *eh;
2254 bus_dmamap_t old_map;
2257 idx = rx->cnt & rx->mask;
2260 /* Save a pointer to the received mbuf */
2261 m = rx->info[idx].m;
2263 /* Try to replace the received mbuf */
2264 if (mxge_get_buf_small(rx, rx->extra_map, idx, FALSE)) {
2265 /* Drop the frame -- the old mbuf is re-cycled */
2266 IFNET_STAT_INC(ifp, ierrors, 1);
2270 /* Unmap the received buffer */
2271 old_map = rx->info[idx].map;
2272 bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2273 bus_dmamap_unload(rx->dmat, old_map);
2275 /* Swap the bus_dmamap_t's */
2276 rx->info[idx].map = rx->extra_map;
2277 rx->extra_map = old_map;
2280 * mcp implicitly skips 1st 2 bytes so that packet is properly
2283 m->m_data += MXGEFW_PAD;
2285 m->m_pkthdr.rcvif = ifp;
2286 m->m_len = m->m_pkthdr.len = len;
2288 IFNET_STAT_INC(ifp, ipackets, 1);
2290 eh = mtod(m, const struct ether_header *);
2291 if (eh->ether_type == htons(ETHERTYPE_VLAN))
2292 mxge_vlan_tag_remove(m, &csum);
2294 /* If the checksum is valid, mark it in the mbuf header */
2295 if ((ifp->if_capenable & IFCAP_RXCSUM) &&
2296 mxge_rx_csum(m, csum) == 0) {
2297 /* Tell the stack that the checksum is good */
2298 m->m_pkthdr.csum_data = 0xffff;
2299 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2302 ifp->if_input(ifp, m);
/*
 * Drain the receive-completion ring.  A zero length entry marks the end
 * of valid completions; each consumed entry is zeroed so it can be
 * detected again next pass.  Frames are dispatched to the small or big
 * ring handler based on whether they fit a small (MHLEN) buffer.
 */
2305 static __inline void
2306 mxge_clean_rx_done(mxge_rx_done_t *rx_done)
2308 while (rx_done->entry[rx_done->idx].length != 0) {
2309 uint16_t length, checksum;
2311 length = ntohs(rx_done->entry[rx_done->idx].length);
/* Mark the entry consumed before handing the frame up. */
2312 rx_done->entry[rx_done->idx].length = 0;
/* Checksum stays in network byte order; mxge_rx_csum() expects that. */
2314 checksum = rx_done->entry[rx_done->idx].checksum;
2316 if (length <= (MHLEN - MXGEFW_PAD))
2317 mxge_rx_done_small(rx_done->rx_small, length, checksum);
2319 mxge_rx_done_big(rx_done->rx_big, length, checksum);
2322 rx_done->idx = rx_done->cnt & rx_done->mask;
/*
 * Reap completed transmit descriptors up to the firmware's reported
 * packet-done index (mcp_idx), unloading DMA maps and freeing mbufs.
 * When enough ring space frees up, clears OACTIVE and kicks if_start.
 */
2326 static __inline void
2327 mxge_tx_done(mxge_tx_ring_t *tx, uint32_t mcp_idx)
2329 struct ifnet *ifp = tx->sc->ifp;
2331 ASSERT_SERIALIZED(ifp->if_serializer);
2333 while (tx->pkt_done != mcp_idx) {
2337 idx = tx->done & tx->mask;
2340 m = tx->info[idx].m;
2342 * mbuf and DMA map only attached to the first
2346 IFNET_STAT_INC(ifp, opackets, 1);
2347 tx->info[idx].m = NULL;
2348 bus_dmamap_unload(tx->dmat, tx->info[idx].map);
/* flag marks a packet's last descriptor; advance pkt_done there. */
2351 if (tx->info[idx].flag) {
2352 tx->info[idx].flag = 0;
2358 * If we have space, clear OACTIVE to tell the stack that
2359 * its OK to send packets
2361 if (tx->req - tx->done < (tx->mask + 1) / 4) {
2362 ifq_clr_oactive(&ifp->if_snd);
2363 if (tx->req == tx->done)
2367 if (!ifq_is_empty(&ifp->if_snd))
2370 #ifdef IFNET_BUF_RING
/* NOTE(review): 'ss' is not in scope here and the outer condition
 * already implies the inner tx->req == tx->done check; this block is
 * dead unless IFNET_BUF_RING is defined — confirm before enabling. */
2371 if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2372 /* let the NIC stop polling this queue, since there
2373 * are no more transmits pending */
2374 if (tx->req == tx->done) {
2376 tx->queue_active = 0;
/*
 * XFP module compliance bits (from the module's 10GbE compliance byte)
 * mapped to ifmedia types.  A zero ifmedia value means DragonFly has no
 * matching media type; only the name is reported.
 */
2384 static struct mxge_media_type mxge_xfp_media_types[] = {
2385 {IFM_10G_CX4, 0x7f, "10GBASE-CX4 (module)"},
2386 {IFM_10G_SR, (1 << 7), "10GBASE-SR"},
2387 {IFM_10G_LR, (1 << 6), "10GBASE-LR"},
2388 {0, (1 << 5), "10GBASE-ER"},
2389 {IFM_10G_LRM, (1 << 4), "10GBASE-LRM"},
2390 {0, (1 << 3), "10GBASE-SW"},
2391 {0, (1 << 2), "10GBASE-LW"},
2392 {0, (1 << 1), "10GBASE-EW"},
2393 {0, (1 << 0), "Reserved"}
/*
 * SFP+ module compliance bits mapped to ifmedia types.  The first entry
 * (bitmask 0) is the default when no compliance bit matches: assume
 * Twinax.
 */
2396 static struct mxge_media_type mxge_sfp_media_types[] = {
2397 {IFM_10G_TWINAX, 0, "10GBASE-Twinax"},
2398 {0, (1 << 7), "Reserved"},
2399 {IFM_10G_LRM, (1 << 6), "10GBASE-LRM"},
2400 {IFM_10G_LR, (1 << 5), "10GBASE-LR"},
2401 {IFM_10G_SR, (1 << 4), "10GBASE-SR"},
2402 {IFM_10G_TWINAX,(1 << 0), "10GBASE-Twinax"}
/*
 * Register the given media type (always full-duplex ethernet) as both an
 * available and the currently-selected media, and remember it in
 * sc->current_media so redundant updates can be skipped.
 */
2406 mxge_media_set(mxge_softc_t *sc, int media_type)
2408 ifmedia_add(&sc->media, IFM_ETHER | IFM_FDX | media_type, 0, NULL);
2409 ifmedia_set(&sc->media, IFM_ETHER | IFM_FDX | media_type);
2410 sc->current_media = media_type;
2411 sc->media.ifm_media = sc->media.ifm_cur->ifm_media;
/*
 * Determine the NIC's connector type (CX4, XFP cage, SFP+ cage, or Quad
 * Ribbon Fiber) from the character following the third dash of the EEPROM
 * product code, and initialize the ifmedia state accordingly.
 */
2415 mxge_media_init(mxge_softc_t *sc)
2420 ifmedia_removeall(&sc->media);
2421 mxge_media_set(sc, IFM_AUTO);
2424 * Parse the product code to deterimine the interface type
2425 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2426 * after the 3rd dash in the driver's cached copy of the
2427 * EEPROM's product code string.
2429 ptr = sc->product_code_string;
2431 if_printf(sc->ifp, "Missing product code\n");
/* Advance past the third '-'; bail if the string is malformed. */
2435 for (i = 0; i < 3; i++, ptr++) {
2436 ptr = strchr(ptr, '-');
2438 if_printf(sc->ifp, "only %d dashes in PC?!?\n", i);
2442 if (*ptr == 'C' || *(ptr +1) == 'C') {
2444 sc->connector = MXGE_CX4;
2445 mxge_media_set(sc, IFM_10G_CX4);
2446 } else if (*ptr == 'Q') {
2447 /* -Q is Quad Ribbon Fiber */
2448 sc->connector = MXGE_QRF;
2449 if_printf(sc->ifp, "Quad Ribbon Fiber Media\n");
2450 /* DragonFly has no media type for Quad ribbon fiber */
2451 } else if (*ptr == 'R') {
/* -R is an XFP cage; actual module type probed later via I2C. */
2453 sc->connector = MXGE_XFP;
2454 } else if (*ptr == 'S' || *(ptr +1) == 'S') {
2455 /* -S or -2S is SFP+ */
2456 sc->connector = MXGE_SFP;
2458 if_printf(sc->ifp, "Unknown media type: %c\n", *ptr);
/*
 * Probe the pluggable module in an XFP/SFP+ cage by asking the firmware
 * to read the module's compliance byte over I2C, then map the returned
 * bits to an ifmedia type via the tables above.  For fixed-connector
 * NICs there is nothing to probe.  The I2C read is polled (up to ~50ms)
 * in the watchdog context, not the interrupt handler.
 */
2463 * Determine the media type for a NIC. Some XFPs will identify
2464 * themselves only when their link is up, so this is initiated via a
2465 * link up interrupt. However, this can potentially take up to
2466 * several milliseconds, so it is run via the watchdog routine, rather
2467 * than in the interrupt handler itself.
2470 mxge_media_probe(mxge_softc_t *sc)
2473 const char *cage_type;
2474 struct mxge_media_type *mxge_media_types = NULL;
2475 int i, err, ms, mxge_media_type_entries;
2478 sc->need_media_probe = 0;
2480 if (sc->connector == MXGE_XFP) {
2482 mxge_media_types = mxge_xfp_media_types;
2483 mxge_media_type_entries = sizeof(mxge_xfp_media_types) /
2484 sizeof(mxge_xfp_media_types[0]);
2485 byte = MXGE_XFP_COMPLIANCE_BYTE;
2487 } else if (sc->connector == MXGE_SFP) {
2488 /* -S or -2S is SFP+ */
2489 mxge_media_types = mxge_sfp_media_types;
2490 mxge_media_type_entries = sizeof(mxge_sfp_media_types) /
2491 sizeof(mxge_sfp_media_types[0]);
2495 /* nothing to do; media type cannot change */
2500 * At this point we know the NIC has an XFP cage, so now we
2501 * try to determine what is in the cage by using the
2502 * firmware's XFP I2C commands to read the XFP 10GbE compilance
2503 * register. We read just one byte, which may take over
2507 cmd.data0 = 0; /* just fetch 1 byte, not all 256 */
2509 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2510 if (err == MXGEFW_CMD_ERROR_I2C_FAILURE)
2511 if_printf(sc->ifp, "failed to read XFP\n");
2512 if (err == MXGEFW_CMD_ERROR_I2C_ABSENT)
2513 if_printf(sc->ifp, "Type R/S with no XFP!?!?\n");
2514 if (err != MXGEFW_CMD_OK)
2517 /* Now we wait for the data to be cached */
/* Firmware returns EBUSY until the I2C transaction completes. */
2519 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2520 for (ms = 0; err == EBUSY && ms < 50; ms++) {
2523 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2525 if (err != MXGEFW_CMD_OK) {
2526 if_printf(sc->ifp, "failed to read %s (%d, %dms)\n",
2527 cage_type, err, ms);
/* Entry 0 is an exact-match/default case; test it first. */
2531 if (cmd.data0 == mxge_media_types[0].bitmask) {
2533 if_printf(sc->ifp, "%s:%s\n", cage_type,
2534 mxge_media_types[0].name);
2536 if (sc->current_media != mxge_media_types[0].flag) {
2537 mxge_media_init(sc);
2538 mxge_media_set(sc, mxge_media_types[0].flag);
/* Remaining entries are single compliance bits. */
2542 for (i = 1; i < mxge_media_type_entries; i++) {
2543 if (cmd.data0 & mxge_media_types[i].bitmask) {
2545 if_printf(sc->ifp, "%s:%s\n", cage_type,
2546 mxge_media_types[i].name);
2549 if (sc->current_media != mxge_media_types[i].flag) {
2550 mxge_media_init(sc);
2551 mxge_media_set(sc, mxge_media_types[i].flag);
2557 if_printf(sc->ifp, "%s media 0x%x unknown\n", cage_type,
/*
 * Process the firmware's asynchronous status block: propagate link state
 * changes to the ifnet layer (scheduling a media re-probe on link-up),
 * report RDMA timeouts, and count link-down events.
 */
2563 mxge_intr_status(struct mxge_softc *sc, const mcp_irq_data_t *stats)
2565 if (sc->link_state != stats->link_up) {
2566 sc->link_state = stats->link_up;
2567 if (sc->link_state) {
2568 sc->ifp->if_link_state = LINK_STATE_UP;
2569 if_link_state_change(sc->ifp);
2571 if_printf(sc->ifp, "link up\n");
2573 sc->ifp->if_link_state = LINK_STATE_DOWN;
2574 if_link_state_change(sc->ifp);
2576 if_printf(sc->ifp, "link down\n");
/* Some modules identify themselves only after link-up: re-probe. */
2578 sc->need_media_probe = 1;
2581 if (sc->rdma_tags_available != be32toh(stats->rdma_tags_available)) {
2582 sc->rdma_tags_available = be32toh(stats->rdma_tags_available);
2583 if_printf(sc->ifp, "RDMA timed out! %d tags left\n",
2584 sc->rdma_tags_available);
2587 if (stats->link_down) {
2588 sc->down_cnt += stats->link_down;
2590 sc->ifp->if_link_state = LINK_STATE_DOWN;
2591 if_link_state_change(sc->ifp);
/*
 * Legacy (INTx) interrupt handler.  Snapshots the firmware's valid flag,
 * deasserts the IRQ line, then loops reaping TX completions and RX
 * frames until the firmware marks the status block invalid (or
 * immediately, if mxge_deassert_wait is off).  Writing the irq_claim
 * words returns the interrupt token to the NIC.
 */
2596 mxge_legacy(void *arg)
2598 struct mxge_slice_state *ss = arg;
2599 mxge_softc_t *sc = ss->sc;
2600 mcp_irq_data_t *stats = ss->fw_stats;
2601 mxge_tx_ring_t *tx = &ss->tx;
2602 mxge_rx_done_t *rx_done = &ss->rx_done;
2603 uint32_t send_done_count;
2607 /* an interrupt on a non-zero slice is implicitly valid
2608 since MSI-X irqs are not shared */
2610 mxge_clean_rx_done(rx_done);
2611 *ss->irq_claim = be32toh(3);
2616 /* Make sure the DMA has finished */
2619 valid = stats->valid;
2621 /* Lower legacy IRQ */
2622 *sc->irq_deassert = 0;
2623 if (!mxge_deassert_wait) {
2624 /* Don't wait for conf. that irq is low */
2629 * Loop while waiting for legacy irq deassertion
2630 * XXX do we really want to loop?
2633 /* Check for transmit completes and receives */
2634 send_done_count = be32toh(stats->send_done_count);
2635 while ((send_done_count != tx->pkt_done) ||
2636 (rx_done->entry[rx_done->idx].length != 0)) {
2637 if (send_done_count != tx->pkt_done)
2638 mxge_tx_done(tx, (int)send_done_count);
2639 mxge_clean_rx_done(rx_done);
2640 send_done_count = be32toh(stats->send_done_count);
2642 if (mxge_deassert_wait)
2644 } while (*((volatile uint8_t *)&stats->valid));
2646 /* Fw link & error stats meaningful only on the first slice */
2647 if (__predict_false(stats->stats_updated))
2648 mxge_intr_status(sc, stats);
2650 /* Check to see if we have rx token to pass back */
2652 *ss->irq_claim = be32toh(3);
2653 *(ss->irq_claim + 1) = be32toh(3);
/* NOTE(review): the function signature line is elided from this listing;
 * by structure this is the MSI interrupt handler (single pass: reap RX
 * and TX completions, process firmware status, return the interrupt
 * token) — confirm the name against the full source. */
2659 struct mxge_slice_state *ss = arg;
2660 mxge_softc_t *sc = ss->sc;
2661 mcp_irq_data_t *stats = ss->fw_stats;
2662 mxge_tx_ring_t *tx = &ss->tx;
2663 mxge_rx_done_t *rx_done = &ss->rx_done;
2664 uint32_t send_done_count;
2667 /* Make sure the DMA has finished */
2671 valid = stats->valid;
2674 /* Check for receives */
2675 if (rx_done->entry[rx_done->idx].length != 0)
2676 mxge_clean_rx_done(rx_done);
2678 /* Check for transmit completes */
2679 send_done_count = be32toh(stats->send_done_count);
2680 if (send_done_count != tx->pkt_done)
2681 mxge_tx_done(tx, (int)send_done_count);
/* Link/error stats, pushed asynchronously by the firmware. */
2683 if (__predict_false(stats->stats_updated))
2684 mxge_intr_status(sc, stats);
2686 /* Check to see if we have rx token to pass back */
2688 *ss->irq_claim = be32toh(3);
2689 *(ss->irq_claim + 1) = be32toh(3);
/*
 * if_init handler: (re)open the interface when it is not running.
 * NOTE(review): return type, braces, and the mxge_open() call in the
 * IFF_RUNNING==0 branch are elided in this excerpt.
 */
mxge_init(void *arg)
	struct mxge_softc *sc = arg;

	ASSERT_SERIALIZED(sc->ifp->if_serializer);
	if ((sc->ifp->if_flags & IFF_RUNNING) == 0)
/*
 * Free all mbufs still attached to a slice's big RX, small RX and
 * (first slice only) TX rings, unloading their busdma maps first.
 * NOTE(review): the `continue;` after each NULL check, the early
 * return after the tx.info NULL test, and closing braces are elided
 * in this excerpt.
 */
mxge_free_slice_mbufs(struct mxge_slice_state *ss)
	for (i = 0; i <= ss->rx_big.mask; i++) {
		if (ss->rx_big.info[i].m == NULL)
		bus_dmamap_unload(ss->rx_big.dmat, ss->rx_big.info[i].map);
		m_freem(ss->rx_big.info[i].m);
		ss->rx_big.info[i].m = NULL;

	for (i = 0; i <= ss->rx_small.mask; i++) {
		if (ss->rx_small.info[i].m == NULL)
		bus_dmamap_unload(ss->rx_small.dmat, ss->rx_small.info[i].map);
		m_freem(ss->rx_small.info[i].m);
		ss->rx_small.info[i].m = NULL;

	/* Transmit ring used only on the first slice */
	if (ss->tx.info == NULL)

	for (i = 0; i <= ss->tx.mask; i++) {
		ss->tx.info[i].flag = 0;
		if (ss->tx.info[i].m == NULL)
		bus_dmamap_unload(ss->tx.dmat, ss->tx.info[i].map);
		m_freem(ss->tx.info[i].m);
		ss->tx.info[i].m = NULL;
/* Free queued mbufs on every slice. */
mxge_free_mbufs(mxge_softc_t *sc)
	for (slice = 0; slice < sc->num_slices; slice++)
		mxge_free_slice_mbufs(&sc->ss[slice]);
/*
 * Release all per-slice ring resources: rx_done DMA memory, tx
 * request/segment copy blocks, rx shadow rings, and the info arrays
 * together with their busdma maps and tags.  Safe to call on a
 * partially initialized slice (every pointer is NULL-checked).
 * NOTE(review): some closing braces are elided in this excerpt, and
 * `ss->tx.info = NULL;` after the tx kfree appears to be elided too.
 */
mxge_free_slice_rings(struct mxge_slice_state *ss)
	if (ss->rx_done.entry != NULL) {
		mxge_dma_free(&ss->rx_done.dma);
		ss->rx_done.entry = NULL;

	if (ss->tx.req_list != NULL) {
		kfree(ss->tx.req_list, M_DEVBUF);
		ss->tx.req_list = NULL;

	if (ss->tx.seg_list != NULL) {
		kfree(ss->tx.seg_list, M_DEVBUF);
		ss->tx.seg_list = NULL;

	if (ss->rx_small.shadow != NULL) {
		kfree(ss->rx_small.shadow, M_DEVBUF);
		ss->rx_small.shadow = NULL;

	if (ss->rx_big.shadow != NULL) {
		kfree(ss->rx_big.shadow, M_DEVBUF);
		ss->rx_big.shadow = NULL;

	if (ss->tx.info != NULL) {
		if (ss->tx.dmat != NULL) {
			for (i = 0; i <= ss->tx.mask; i++) {
				bus_dmamap_destroy(ss->tx.dmat,
				    ss->tx.info[i].map);
			bus_dma_tag_destroy(ss->tx.dmat);
		kfree(ss->tx.info, M_DEVBUF);

	if (ss->rx_small.info != NULL) {
		if (ss->rx_small.dmat != NULL) {
			for (i = 0; i <= ss->rx_small.mask; i++) {
				bus_dmamap_destroy(ss->rx_small.dmat,
				    ss->rx_small.info[i].map);
			/* also drop the spare map used during refill */
			bus_dmamap_destroy(ss->rx_small.dmat,
			    ss->rx_small.extra_map);
			bus_dma_tag_destroy(ss->rx_small.dmat);
		kfree(ss->rx_small.info, M_DEVBUF);
		ss->rx_small.info = NULL;

	if (ss->rx_big.info != NULL) {
		if (ss->rx_big.dmat != NULL) {
			for (i = 0; i <= ss->rx_big.mask; i++) {
				bus_dmamap_destroy(ss->rx_big.dmat,
				    ss->rx_big.info[i].map);
			bus_dmamap_destroy(ss->rx_big.dmat,
			    ss->rx_big.extra_map);
			bus_dma_tag_destroy(ss->rx_big.dmat);
		kfree(ss->rx_big.info, M_DEVBUF);
		ss->rx_big.info = NULL;
/* Free ring resources on every slice. */
mxge_free_rings(mxge_softc_t *sc)
	for (slice = 0; slice < sc->num_slices; slice++)
		mxge_free_slice_rings(&sc->ss[slice]);
/*
 * Allocate per-slice ring state: rx shadow/info arrays, rx busdma
 * tags and per-slot maps (small and big rings each get an extra
 * spare map for refill), then — on the first slice only when
 * IFNET_BUF_RING is not defined — the tx copy blocks, info array,
 * busdma tag and per-slot maps.  Unwinds partial allocations on
 * dmamap-create failure.
 * NOTE(review): several lines (alignment/nsegments arguments of
 * bus_dma_tag_create, `if (err != 0)` guards, `return err;`
 * statements, closing braces) are elided in this excerpt.
 */
mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
    int tx_ring_entries)
	mxge_softc_t *sc = ss->sc;

	/*
	 * Allocate per-slice receive resources
	 */
	/* masks assume ring sizes are powers of two */
	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
	ss->rx_done.mask = (2 * rx_ring_entries) - 1;

	/* Allocate the rx shadow rings */
	bytes = rx_ring_entries * sizeof(*ss->rx_small.shadow);
	ss->rx_small.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);

	bytes = rx_ring_entries * sizeof(*ss->rx_big.shadow);
	ss->rx_big.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);

	/* Allocate the rx host info rings */
	bytes = rx_ring_entries * sizeof(*ss->rx_small.info);
	ss->rx_small.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);

	bytes = rx_ring_entries * sizeof(*ss->rx_big.info);
	ss->rx_big.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);

	/* Allocate the rx busdma resources */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 4096,			/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 MHLEN,			/* maxsize */
				 MHLEN,			/* maxsegsize */
				 BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW,
				 &ss->rx_small.dmat);	/* tag */
		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",

	err = bus_dmamap_create(ss->rx_small.dmat, BUS_DMA_WAITOK,
	    &ss->rx_small.extra_map);
		device_printf(sc->dev, "Err %d extra rx_small dmamap\n", err);
		bus_dma_tag_destroy(ss->rx_small.dmat);
		ss->rx_small.dmat = NULL;

	for (i = 0; i <= ss->rx_small.mask; i++) {
		err = bus_dmamap_create(ss->rx_small.dmat, BUS_DMA_WAITOK,
		    &ss->rx_small.info[i].map);
			device_printf(sc->dev, "Err %d rx_small dmamap\n", err);
			/* unwind the maps created so far */
			for (j = 0; j < i; ++j) {
				bus_dmamap_destroy(ss->rx_small.dmat,
				    ss->rx_small.info[j].map);
			bus_dmamap_destroy(ss->rx_small.dmat,
			    ss->rx_small.extra_map);
			bus_dma_tag_destroy(ss->rx_small.dmat);
			ss->rx_small.dmat = NULL;

	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 4096,			/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 4096,			/* maxsegsize*/
				 BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW,
				 &ss->rx_big.dmat);	/* tag */
		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",

	err = bus_dmamap_create(ss->rx_big.dmat, BUS_DMA_WAITOK,
	    &ss->rx_big.extra_map);
		device_printf(sc->dev, "Err %d extra rx_big dmamap\n", err);
		bus_dma_tag_destroy(ss->rx_big.dmat);
		ss->rx_big.dmat = NULL;

	for (i = 0; i <= ss->rx_big.mask; i++) {
		err = bus_dmamap_create(ss->rx_big.dmat, BUS_DMA_WAITOK,
		    &ss->rx_big.info[i].map);
			device_printf(sc->dev, "Err %d rx_big dmamap\n", err);
			for (j = 0; j < i; ++j) {
				bus_dmamap_destroy(ss->rx_big.dmat,
				    ss->rx_big.info[j].map);
			bus_dmamap_destroy(ss->rx_big.dmat,
			    ss->rx_big.extra_map);
			bus_dma_tag_destroy(ss->rx_big.dmat);
			ss->rx_big.dmat = NULL;

	/*
	 * Now allocate TX resources
	 */
#ifndef IFNET_BUF_RING
	/* only use a single TX ring for now */
	if (ss != ss->sc->ss)

	ss->tx.mask = tx_ring_entries - 1;
	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);

	/* Allocate the tx request copy block; MUST be 8 bytes aligned */
	bytes = sizeof(*ss->tx.req_list) * (ss->tx.max_desc + 4);
	ss->tx.req_list = kmalloc(bytes, M_DEVBUF, M_WAITOK);
	/* DragonFly's kmalloc(9) promises at least 8 bytes alignment */
	KASSERT(((uintptr_t)ss->tx.req_list & 0x7) == 0,
	    ("req_list not 8 bytes aligned"));

	/* Allocate the tx busdma segment list */
	bytes = sizeof(*ss->tx.seg_list) * ss->tx.max_desc;
	ss->tx.seg_list = kmalloc(bytes, M_DEVBUF, M_WAITOK);

	/* Allocate the tx host info ring */
	bytes = tx_ring_entries * sizeof(*ss->tx.info);
	ss->tx.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);

	/* Allocate the tx busdma resources */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 sc->tx_boundary,	/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 sizeof(struct ether_vlan_header),
				 ss->tx.max_desc - 2,	/* num segs */
				 sc->tx_boundary,	/* maxsegsz */
				 BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW |
				 BUS_DMA_ONEBPAGE,	/* flags */
				 &ss->tx.dmat);		/* tag */
		device_printf(sc->dev, "Err %d allocating tx dmat\n", err);

	/*
	 * Now use these tags to setup DMA maps for each slot in the ring
	 */
	for (i = 0; i <= ss->tx.mask; i++) {
		err = bus_dmamap_create(ss->tx.dmat,
		    BUS_DMA_WAITOK | BUS_DMA_ONEBPAGE, &ss->tx.info[i].map);
			device_printf(sc->dev, "Err %d tx dmamap\n", err);
			for (j = 0; j < i; ++j) {
				bus_dmamap_destroy(ss->tx.dmat,
				    ss->tx.info[j].map);
			bus_dma_tag_destroy(ss->tx.dmat);
/*
 * Query the firmware for TX ring size, size the ifnet send queue to
 * match, and allocate rings on every slice.
 * NOTE(review): error-check branches and `return`/brace lines are
 * elided in this excerpt.
 */
mxge_alloc_rings(mxge_softc_t *sc)
	int tx_ring_entries, rx_ring_entries;

	/* Get ring sizes */
	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
	tx_ring_size = cmd.data0;

	tx_ring_entries = tx_ring_size / sizeof(mcp_kreq_ether_send_t);
	rx_ring_entries = sc->rx_ring_size / sizeof(mcp_dma_addr_t);
	ifq_set_maxlen(&sc->ifp->if_snd, tx_ring_entries - 1);
	ifq_set_ready(&sc->ifp->if_snd);

	for (slice = 0; slice < sc->num_slices; slice++) {
		err = mxge_alloc_slice_rings(&sc->ss[slice],
		    rx_ring_entries, tx_ring_entries);
			device_printf(sc->dev,
			    "alloc %d slice rings failed\n", slice);
/*
 * Pick the mbuf cluster size for the big RX ring from the MTU:
 * standard clusters when the padded frame fits, MJUMPAGESIZE
 * otherwise (asserting the frame fits that, too).
 * NOTE(review): return type and braces are elided in this excerpt.
 */
mxge_choose_params(int mtu, int *cl_size)
	/* worst-case wire frame: MTU + ethernet + VLAN tag + fw pad */
	int bufsize = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN + MXGEFW_PAD;

	if (bufsize < MCLBYTES) {
		*cl_size = MCLBYTES;
		KASSERT(bufsize < MJUMPAGESIZE, ("invalid MTU %d", mtu));
		*cl_size = MJUMPAGESIZE;
/*
 * Bring one slice online: fetch the lanai (NIC SRAM) pointers for the
 * send and receive rings from firmware, then stock the small and big
 * receive rings with mbufs.
 * NOTE(review): `if (err != 0)` guards, `return` statements, the
 * rx_big.lanai assignment's left-hand line, and closing braces are
 * elided in this excerpt.
 */
mxge_slice_open(struct mxge_slice_state *ss, int cl_size)
	/* slice index derived from pointer arithmetic into sc->ss[] */
	slice = ss - ss->sc->ss;

	/*
	 * Get the lanai pointers to the send and receive rings
	 */
#ifndef IFNET_BUF_RING
	/* We currently only send from the first slice */
	err = mxge_send_cmd(ss->sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
	ss->tx.lanai = (volatile mcp_kreq_ether_send_t *)
	    (ss->sc->sram + cmd.data0);
	/* doorbell registers are spaced 64 bytes apart per slice */
	ss->tx.send_go = (volatile uint32_t *)
	    (ss->sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
	ss->tx.send_stop = (volatile uint32_t *)
	    (ss->sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
#ifndef IFNET_BUF_RING

	err |= mxge_send_cmd(ss->sc, MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
	ss->rx_small.lanai =
	    (volatile mcp_kreq_ether_recv_t *)(ss->sc->sram + cmd.data0);

	err |= mxge_send_cmd(ss->sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
	    (volatile mcp_kreq_ether_recv_t *)(ss->sc->sram + cmd.data0);

		if_printf(ss->sc->ifp,
		    "failed to get ring sizes or locations\n");

	/*
	 * Stock small receive ring
	 */
	for (i = 0; i <= ss->rx_small.mask; i++) {
		err = mxge_get_buf_small(&ss->rx_small,
		    ss->rx_small.info[i].map, i, TRUE);
			if_printf(ss->sc->ifp, "alloced %d/%d smalls\n", i,
			    ss->rx_small.mask + 1);

	/*
	 * Stock big receive ring
	 */
	for (i = 0; i <= ss->rx_big.mask; i++) {
		/* poison addresses so the NIC never DMAs to a stale slot */
		ss->rx_big.shadow[i].addr_low = 0xffffffff;
		ss->rx_big.shadow[i].addr_high = 0xffffffff;

	ss->rx_big.cl_size = cl_size;
	ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
	    EVL_ENCAPLEN + MXGEFW_PAD;

	for (i = 0; i <= ss->rx_big.mask; i++) {
		err = mxge_get_buf_big(&ss->rx_big,
		    ss->rx_big.info[i].map, i, TRUE);
			if_printf(ss->sc->ifp, "alloced %d/%d bigs\n", i,
			    ss->rx_big.mask + 1);
/*
 * Bring the interface up: reset the NIC, program RSS indirection and
 * hash type (multi-slice only), set TSO mode, tell firmware the MTU
 * and buffer sizes, hand it the per-slice stats DMA blocks (falling
 * back to the obsolete stats command — which disables multicast
 * support — on old firmware), open every slice, and finally issue
 * ETHERNET_UP.
 * NOTE(review): many `if (err != 0)` guards, `goto abort*` labels,
 * the for(;;) header around the stats-DMA loop, and closing braces
 * are elided in this excerpt.
 */
mxge_open(mxge_softc_t *sc)
	struct ifnet *ifp = sc->ifp;
	int err, slice, cl_size, i;
	volatile uint8_t *itable;
	struct mxge_slice_state *ss;

	ASSERT_SERIALIZED(ifp->if_serializer);

	/* Copy the MAC address in case it was overridden */
	bcopy(IF_LLADDR(ifp), sc->mac_addr, ETHER_ADDR_LEN);

	err = mxge_reset(sc, 1);
		if_printf(ifp, "failed to reset\n");

	if (sc->num_slices > 1) {
		/* Setup the indirection table */
		cmd.data0 = sc->num_slices;
		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE, &cmd);

		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET, &cmd);
			if_printf(ifp, "failed to setup rss tables\n");

		/* Just enable an identity mapping */
		itable = sc->sram + cmd.data0;
		for (i = 0; i < sc->num_slices; i++)
			itable[i] = (uint8_t)i;

		cmd.data1 = MXGEFW_RSS_HASH_TYPE_TCP_IPV4;
		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
			if_printf(ifp, "failed to enable slices\n");

	cmd.data0 = MXGEFW_TSO_MODE_NDIS;
	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_TSO_MODE, &cmd);
		/*
		 * Can't change TSO mode to NDIS, never allow TSO then
		 */
		if_printf(ifp, "failed to set TSO mode\n");
		ifp->if_capenable &= ~IFCAP_TSO;
		ifp->if_capabilities &= ~IFCAP_TSO;
		ifp->if_hwassist &= ~CSUM_TSO;

	mxge_choose_params(ifp->if_mtu, &cl_size);

	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS, &cmd);
	/*
	 * Error is only meaningful if we're trying to set
	 * MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1
	 */

	/*
	 * Give the firmware the mtu and the big and small buffer
	 * sizes. The firmware wants the big buf size to be a power
	 * of two. Luckily, DragonFly's clusters are powers of two
	 */
	cmd.data0 = ifp->if_mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);

	/* XXX need to cut MXGEFW_PAD here? */
	cmd.data0 = MHLEN - MXGEFW_PAD;
	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE, &cmd);

	cmd.data0 = cl_size;
	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);

		if_printf(ifp, "failed to setup params\n");

	/* Now give him the pointer to the stats block */
#ifdef IFNET_BUF_RING
	slice < sc->num_slices;

		ss = &sc->ss[slice];
		cmd.data0 = MXGE_LOWPART_TO_U32(ss->fw_stats_dma.dmem_busaddr);
		cmd.data1 = MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.dmem_busaddr);
		cmd.data2 = sizeof(struct mcp_irq_data);
		/* slice index rides in the upper half of data2 */
		cmd.data2 |= (slice << 16);
		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);

		/* fallback path for firmware without STATS_DMA_V2 */
		bus = sc->ss->fw_stats_dma.dmem_busaddr;
		bus += offsetof(struct mcp_irq_data, send_done_count);
		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,

		/* Firmware cannot support multicast without STATS_DMA_V2 */
		sc->fw_multicast_support = 0;
		sc->fw_multicast_support = 1;

		if_printf(ifp, "failed to setup params\n");

	for (slice = 0; slice < sc->num_slices; slice++) {
		err = mxge_slice_open(&sc->ss[slice], cl_size);
			if_printf(ifp, "couldn't open slice %d\n", slice);

	/* Finally, start the firmware running */
	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
		if_printf(ifp, "Couldn't bring up link\n");

	ifp->if_flags |= IFF_RUNNING;
	ifq_clr_oactive(&ifp->if_snd);

	/* error unwind: release any mbufs already stocked */
	mxge_free_mbufs(sc);
/*
 * Bring the interface down.  Clears IFF_RUNNING, optionally (when
 * `down` is set — presumably) issues ETHERNET_DOWN and waits, with
 * the serializer dropped, for the "down" interrupt to bump
 * sc->down_cnt before freeing all queued mbufs.
 * NOTE(review): return type, braces, and the branch structure around
 * the `down` parameter are elided in this excerpt.
 */
mxge_close(mxge_softc_t *sc, int down)
	struct ifnet *ifp = sc->ifp;
	int err, old_down_cnt;

	ASSERT_SERIALIZED(ifp->if_serializer);

	ifp->if_flags &= ~IFF_RUNNING;
	ifq_clr_oactive(&ifp->if_snd);

	old_down_cnt = sc->down_cnt;

	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
		if_printf(ifp, "Couldn't bring down link\n");

	if (old_down_cnt == sc->down_cnt) {
		/* Wait for down irq */
		lwkt_serialize_exit(ifp->if_serializer);
		DELAY(10 * sc->intr_coal_delay);
		lwkt_serialize_enter(ifp->if_serializer);

	if (old_down_cnt == sc->down_cnt)
		if_printf(ifp, "never got down irq\n");

	mxge_free_mbufs(sc);
/*
 * Read the PCIe link width from config space and set the max read
 * request size to 4KB (device control register, bits 14:12 = 5).
 * After a watchdog reset, the previously saved pectl value is
 * restored instead.  Finally (re)enables bus mastering.
 * NOTE(review): return type, braces and the else line are elided in
 * this excerpt; register offsets 0x8/0x12 are relative to the PCIe
 * capability found via pci_find_extcap().
 */
mxge_setup_cfg_space(mxge_softc_t *sc)
	device_t dev = sc->dev;
	uint16_t lnk, pectl;

	/* Find the PCIe link width and set max read request to 4KB */
	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
		lnk = pci_read_config(dev, reg + 0x12, 2);
		sc->link_width = (lnk >> 4) & 0x3f;

		if (sc->pectl == 0) {
			pectl = pci_read_config(dev, reg + 0x8, 2);
			pectl = (pectl & ~0x7000) | (5 << 12);
			pci_write_config(dev, reg + 0x8, pectl, 2);
			/* Restore saved pectl after watchdog reset */
			pci_write_config(dev, reg + 0x8, sc->pectl, 2);

	/* Enable DMA and memory space access */
	pci_enable_busmaster(dev);
/*
 * Read the NIC's reboot status register through the vendor-specific
 * PCI capability window.  Returns (uint32_t)-1 when the capability
 * cannot be located.
 * NOTE(review): return type line and braces are elided in this
 * excerpt.
 */
mxge_read_reboot(mxge_softc_t *sc)
	device_t dev = sc->dev;

	/* Find the vendor specific offset */
	if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
		if_printf(sc->ifp, "could not find vendor specific offset\n");
		return (uint32_t)-1;

	/* Enable read32 mode */
	pci_write_config(dev, vs + 0x10, 0x3, 1);
	/* Tell NIC which register to read */
	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
	/* window at vs+0x14 now returns the selected register */
	return pci_read_config(dev, vs + 0x14, 4);
/*
 * Watchdog recovery: detect whether the NIC rebooted (config space
 * reads 0xffff or busmaster bit dropped), and if so restore PCI
 * config space, re-load firmware, and re-open the interface if it
 * was running.  Reschedules the tick callout at the end.
 * NOTE(review): the retry loop around the 100ms wait, `if (err)`
 * guards, close/quiesce calls and closing braces are elided in this
 * excerpt.
 */
mxge_watchdog_reset(mxge_softc_t *sc)
	struct pci_devinfo *dinfo;

	if_printf(sc->ifp, "Watchdog reset!\n");

	/*
	 * Check to see if the NIC rebooted. If it did, then all of
	 * PCI config space has been reset, and things like the
	 * busmaster bit will be zero. If this is the case, then we
	 * must restore PCI config space before the NIC can be used
	 */
	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
	if (cmd == 0xffff) {
		/*
		 * Maybe the watchdog caught the NIC rebooting; wait
		 * up to 100ms for it to finish. If it does not come
		 * back, then give up
		 */
		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
			if_printf(sc->ifp, "NIC disappeared!\n");

	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
		/* Print the reboot status */
		reboot = mxge_read_reboot(sc);
		if_printf(sc->ifp, "NIC rebooted, status = 0x%x\n", reboot);

		running = sc->ifp->if_flags & IFF_RUNNING;

		/*
		 * Quiesce NIC so that TX routines will not try to
		 * xmit after restoration of BAR
		 */
		/* Mark the link as down */
		if (sc->link_state) {
			sc->ifp->if_link_state = LINK_STATE_DOWN;
			if_link_state_change(sc->ifp);

		/* Restore PCI configuration space */
		dinfo = device_get_ivars(sc->dev);
		pci_cfg_restore(sc->dev, dinfo);

		/* And redo any changes we made to our config space */
		mxge_setup_cfg_space(sc);

		err = mxge_load_firmware(sc, 0);
			if_printf(sc->ifp, "Unable to re-load f/w\n");
		if (running && !err) {
			err = mxge_open(sc);
			if_devstart_sched(sc->ifp);

		sc->watchdog_resets++;
		if_printf(sc->ifp, "NIC did not reboot, not resetting\n");

		if_printf(sc->ifp, "watchdog reset failed\n");

	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
/*
 * Dump TX ring state for a slice suspected of being stuck.
 * (The "struck?" wording in the message matches upstream; left
 * untouched since it is a runtime string.)
 */
mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
	if_printf(sc->ifp, "slice %d struck? ring state:\n", slice);
	if_printf(sc->ifp, "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
	    tx->req, tx->done, tx->queue_active);
	if_printf(sc->ifp, "tx.activate=%d tx.deactivate=%d\n",
	    tx->activate, tx->deactivate);
	if_printf(sc->ifp, "pkt_done=%d fw=%d\n",
	    tx->pkt_done, be32toh(sc->ss->fw_stats->send_done_count));
/*
 * Snapshot ifnet packet counters, compute the delta since the last
 * call, and cache the new totals.  NOTE(review): the `return pkts;`
 * line and braces are elided in this excerpt; presumably the delta
 * is returned for idle detection in mxge_tick().
 */
mxge_update_stats(mxge_softc_t *sc)
	u_long ipackets, opackets, pkts;

	IFNET_STAT_GET(sc->ifp, ipackets, ipackets);
	IFNET_STAT_GET(sc->ifp, opackets, opackets);

	/* packets moved since the previous tick */
	pkts = ipackets - sc->ipackets;
	pkts += opackets - sc->opackets;

	sc->ipackets = ipackets;
	sc->opackets = opackets;
/*
 * Periodic callout: under the ifnet serializer, aggregate stats and
 * probe media while running; check the busmaster bit for a hardware
 * fault and trigger a watchdog reset if it dropped.  Re-arms itself.
 * NOTE(review): the if/else structure around the idle check, the
 * computation of the next `ticks` interval, and closing braces are
 * elided in this excerpt — indentation below is approximate.
 */
mxge_tick(void *arg)
	mxge_softc_t *sc = arg;

	lwkt_serialize_enter(sc->ifp->if_serializer);

	if (sc->ifp->if_flags & IFF_RUNNING) {
		/* Aggregate stats from different slices */
		pkts = mxge_update_stats(sc);
		if (sc->need_media_probe)
			mxge_media_probe(sc);

	/* Ensure NIC did not suffer h/w fault while idle */
	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
		mxge_watchdog_reset(sc);

	/* Look less often if NIC is idle */
	callout_reset(&sc->co_hdl, ticks, mxge_tick, sc);

	lwkt_serialize_exit(sc->ifp->if_serializer);
/*
 * ifmedia change handler.  NOTE(review): return type and entire body
 * are elided in this excerpt.
 */
mxge_media_change(struct ifnet *ifp)
/*
 * Validate and apply a new MTU.  Rejects values above sc->max_mtu or
 * whose framed size would be under the ethernet minimum; if the
 * interface is running it is closed and re-opened, rolling the MTU
 * back on failure.
 * NOTE(review): return type, the if_mtu assignment, mxge_close()
 * call, returns and braces are elided in this excerpt.
 */
mxge_change_mtu(mxge_softc_t *sc, int mtu)
	struct ifnet *ifp = sc->ifp;
	int real_mtu, old_mtu;

	/* on-wire size including ethernet + VLAN headers */
	real_mtu = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
	if (mtu > sc->max_mtu || real_mtu < 60)

	old_mtu = ifp->if_mtu;

	if (ifp->if_flags & IFF_RUNNING) {
		err = mxge_open(sc);
			/* roll back on reopen failure */
			ifp->if_mtu = old_mtu;
/*
 * ifmedia status handler: report full-duplex ethernet, link validity,
 * active state from sc->link_state, and the detected media subtype.
 * NOTE(review): return type and braces are elided in this excerpt.
 */
mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
	mxge_softc_t *sc = ifp->if_softc;

	ifmr->ifm_status = IFM_AVALID;
	ifmr->ifm_active = IFM_ETHER | IFM_FDX;
	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
	ifmr->ifm_active |= sc->current_media;
/*
 * ifnet ioctl handler: MTU changes, IFF_UP/RUNNING transitions with
 * PROMISC handling, multicast list updates, capability toggles
 * (TXCSUM/TSO/RXCSUM/VLAN_HWTAGGING with matching if_hwassist bits),
 * media ioctls, and ether_ioctl() fallthrough.
 * NOTE(review): the switch(command) statement, its case labels
 * (SIOCSIFMTU, SIOCSIFFLAGS, SIOCADDMULTI/DELMULTI, SIOCSIFCAP,
 * SIOCGIFMEDIA, default — presumably), `break`s, mxge_close() call
 * and closing braces are elided in this excerpt; indentation below
 * is approximate.
 */
mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data,
    struct ucred *cr __unused)
	mxge_softc_t *sc = ifp->if_softc;
	struct ifreq *ifr = (struct ifreq *)data;

	ASSERT_SERIALIZED(ifp->if_serializer);

	err = mxge_change_mtu(sc, ifr->ifr_mtu);

	if (ifp->if_flags & IFF_UP) {
		if (!(ifp->if_flags & IFF_RUNNING)) {
			err = mxge_open(sc);
			/*
			 * Take care of PROMISC and ALLMULTI
			 */
			mxge_change_promisc(sc,
			    ifp->if_flags & IFF_PROMISC);
			mxge_set_multicast_list(sc);
		if (ifp->if_flags & IFF_RUNNING)

	mxge_set_multicast_list(sc);

	/* toggled capability bits relative to current enables */
	mask = ifr->ifr_reqcap ^ ifp->if_capenable;
	if (mask & IFCAP_TXCSUM) {
		ifp->if_capenable ^= IFCAP_TXCSUM;
		if (ifp->if_capenable & IFCAP_TXCSUM)
			ifp->if_hwassist |= CSUM_TCP | CSUM_UDP;
			ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP);
	if (mask & IFCAP_TSO) {
		ifp->if_capenable ^= IFCAP_TSO;
		if (ifp->if_capenable & IFCAP_TSO)
			ifp->if_hwassist |= CSUM_TSO;
			ifp->if_hwassist &= ~CSUM_TSO;
	if (mask & IFCAP_RXCSUM)
		ifp->if_capenable ^= IFCAP_RXCSUM;
	if (mask & IFCAP_VLAN_HWTAGGING)
		ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;

	mxge_media_probe(sc);
	err = ifmedia_ioctl(ifp, (struct ifreq *)data,
	    &sc->media, command);

	err = ether_ioctl(ifp, command, data);
/*
 * Copy module-level tunables into the softc, clamping each to its
 * valid range (coalesce delay, tick period, flow control, throttle).
 * NOTE(review): return type and braces are elided in this excerpt.
 */
mxge_fetch_tunables(mxge_softc_t *sc)
	sc->intr_coal_delay = mxge_intr_coal_delay;
	if (sc->intr_coal_delay < 0 || sc->intr_coal_delay > (10 * 1000))
		sc->intr_coal_delay = MXGE_INTR_COAL_DELAY;

	/* default tick period: half a second */
	if (mxge_ticks == 0)
		mxge_ticks = hz / 2;

	sc->pause = mxge_flow_control;

	/* 0 means "no throttle"; otherwise clamp to [MIN, MAX] */
	sc->throttle = mxge_throttle;
	if (sc->throttle && sc->throttle > MXGE_MAX_THROTTLE)
		sc->throttle = MXGE_MAX_THROTTLE;
	if (sc->throttle && sc->throttle < MXGE_MIN_THROTTLE)
		sc->throttle = MXGE_MIN_THROTTLE;
/*
 * Free the per-slice firmware-stats and rx_done DMA memory on every
 * slice, then the slice array itself.
 * NOTE(review): the `ss = &sc->ss[i];` line, an early-return NULL
 * check, `sc->ss = NULL;` and closing braces are elided in this
 * excerpt.
 */
mxge_free_slices(mxge_softc_t *sc)
	struct mxge_slice_state *ss;

	for (i = 0; i < sc->num_slices; i++) {
		if (ss->fw_stats != NULL) {
			mxge_dma_free(&ss->fw_stats_dma);
			ss->fw_stats = NULL;

		if (ss->rx_done.entry != NULL) {
			mxge_dma_free(&ss->rx_done.dma);
			ss->rx_done.entry = NULL;

	kfree(sc->ss, M_DEVBUF);
/*
 * Allocate the slice array and, for each slice, its rx interrupt
 * queue DMA memory and (first slice only when IFNET_BUF_RING is not
 * defined) the firmware stats DMA block.
 * NOTE(review): `if (err != 0)` guards, `goto abort`/`return`
 * statements, the `ss = &sc->ss[i];` and `ss->sc = sc;` lines, and
 * closing braces are elided in this excerpt.
 */
mxge_alloc_slices(mxge_softc_t *sc)
	struct mxge_slice_state *ss;
	int err, i, max_intr_slots;

	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
		device_printf(sc->dev, "Cannot determine rx ring size\n");
	sc->rx_ring_size = cmd.data0;
	/* interrupt queue is sized to 2x the rx ring entry count */
	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));

	bytes = sizeof(*sc->ss) * sc->num_slices;
	sc->ss = kmalloc(bytes, M_DEVBUF, M_WAITOK | M_ZERO);

	for (i = 0; i < sc->num_slices; i++) {
		ss->rx_small.sc = sc;
		ss->rx_done.rx_big = &ss->rx_big;
		ss->rx_done.rx_small = &ss->rx_small;

		/*
		 * Allocate per-slice rx interrupt queues
		 */
		bytes = max_intr_slots * sizeof(*ss->rx_done.entry);
		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
			device_printf(sc->dev,
			    "alloc %d slice rx_done failed\n", i);
		ss->rx_done.entry = ss->rx_done.dma.dmem_addr;

		/*
		 * Allocate the per-slice firmware stats; stats
		 * (including tx) are used used only on the first
		 */
#ifndef IFNET_BUF_RING

		bytes = sizeof(*ss->fw_stats);
		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
		    sizeof(*ss->fw_stats), 64);
			device_printf(sc->dev,
			    "alloc %d fw_stats failed\n", i);
		ss->fw_stats = ss->fw_stats_dma.dmem_addr;
/*
 * Decide how many slices (RSS queues) to use: requires multi-slice
 * tunable enabled, SMP, and enough MSI-X vectors; loads the RSS
 * firmware variant, resets, sizes the interrupt queue, queries the
 * firmware's max queue count, then clamps to MSI-X count, CPU count
 * or the tunable and rounds down to a power of two.  On any failure
 * falls back to the original single-slice firmware.
 * NOTE(review): `return` statements, `abort_with_fw:` label,
 * `if (status != 0)` guards, the num_slices decrement inside the
 * power-of-two loop, and closing braces are elided in this excerpt.
 */
mxge_slice_probe(mxge_softc_t *sc)
	int msix_cnt, status, max_intr_slots;

	/*
	 * Don't enable multiple slices if they are not enabled,
	 * or if this is not an SMP system
	 */
	if (mxge_max_slices == 0 || mxge_max_slices == 1 || ncpus < 2)

	/* see how many MSI-X interrupts are available */
	msix_cnt = pci_msix_count(sc->dev);

	/* now load the slice aware firmware see what it supports */
	old_fw = sc->fw_name;
	if (old_fw == mxge_fw_aligned)
		sc->fw_name = mxge_fw_rss_aligned;
		sc->fw_name = mxge_fw_rss_unaligned;
	status = mxge_load_firmware(sc, 0);
		device_printf(sc->dev, "Falling back to a single slice\n");

	/* try to send a reset command to the card to see if it
	 */
	memset(&cmd, 0, sizeof (cmd));
	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
		device_printf(sc->dev, "failed reset\n");

	/* get rx ring size */
	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
		device_printf(sc->dev, "Cannot determine rx ring size\n");
	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));

	/* tell it the size of the interrupt queues */
	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");

	/* ask the maximum number of slices it supports */
	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
		device_printf(sc->dev,
		    "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
	sc->num_slices = cmd.data0;
	if (sc->num_slices > msix_cnt)
		sc->num_slices = msix_cnt;

	if (mxge_max_slices == -1) {
		/* cap to number of CPUs in system */
		if (sc->num_slices > ncpus)
			sc->num_slices = ncpus;
		if (sc->num_slices > mxge_max_slices)
			sc->num_slices = mxge_max_slices;

	/* make sure it is a power of two */
	while (sc->num_slices & (sc->num_slices - 1))

	device_printf(sc->dev, "using %d slices\n",

	/* fallback: restore the non-RSS firmware */
	sc->fw_name = old_fw;
	(void) mxge_load_firmware(sc, 0);
/*
 * Allocate the MSI-X table BAR, the vectors, the per-slice IRQ
 * resources and interrupt handlers, serialized against the ifnet
 * serializer.  Unwinds everything (handlers, resources, vectors,
 * table BAR) on failure via the abort_with_* labels.
 * NOTE(review): several `rid = ...` computations, `return`
 * statements, some labels and closing braces are elided in this
 * excerpt.
 */
mxge_add_msix_irqs(mxge_softc_t *sc)
	int count, err, i, rid;

	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
	if (sc->msix_table_res == NULL) {
		device_printf(sc->dev, "couldn't alloc MSIX table res\n");

	count = sc->num_slices;
	err = pci_alloc_msix(sc->dev, &count);
		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d"
		    "err = %d \n", sc->num_slices, err);
		goto abort_with_msix_table;
	if (count < sc->num_slices) {
		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
		    count, sc->num_slices);
		device_printf(sc->dev,
		    "Try setting hw.mxge.max_slices to %d\n",
		goto abort_with_msix;

	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
	sc->msix_irq_res = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
	if (sc->msix_irq_res == NULL) {
		goto abort_with_msix;

	for (i = 0; i < sc->num_slices; i++) {
		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
		if (sc->msix_irq_res[i] == NULL) {
			device_printf(sc->dev, "couldn't allocate IRQ res"
			    " for message %d\n", i);
			goto abort_with_res;

	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
	sc->msix_ih = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);

	for (i = 0; i < sc->num_slices; i++) {
		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
		    mxge_intr, &sc->ss[i], &sc->msix_ih[i],
		    sc->ifp->if_serializer);
			device_printf(sc->dev, "couldn't setup intr for "
			goto abort_with_intr;

	device_printf(sc->dev, "using %d msix IRQs:",
	for (i = 0; i < sc->num_slices; i++)
		kprintf(" %ld", rman_get_start(sc->msix_irq_res[i]));

	/* error unwind: tear down any handlers already installed */
	for (i = 0; i < sc->num_slices; i++) {
		if (sc->msix_ih[i] != NULL) {
			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
			sc->msix_ih[i] = NULL;
	kfree(sc->msix_ih, M_DEVBUF);

	for (i = 0; i < sc->num_slices; i++) {
		if (sc->msix_irq_res[i] != NULL)
			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
			    sc->msix_irq_res[i]);
		sc->msix_irq_res[i] = NULL;
	kfree(sc->msix_irq_res, M_DEVBUF);

	pci_release_msi(sc->dev);

abort_with_msix_table:
	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
	    sc->msix_table_res);
/*
 * Allocate a single legacy or MSI interrupt and install the matching
 * handler (mxge_legacy needs the deassert dance; mxge_msi does not),
 * serialized against the ifnet serializer.
 * NOTE(review): return type, braces and the else line before
 * mxge_msi are elided in this excerpt.
 */
mxge_add_single_irq(mxge_softc_t *sc)
	driver_intr_t *intr_func;

	sc->irq_type = pci_alloc_1intr(sc->dev, mxge_msi_enable,
	    &sc->irq_rid, &irq_flags);

	sc->irq_res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ,
	    &sc->irq_rid, irq_flags);
	if (sc->irq_res == NULL) {
		device_printf(sc->dev, "could not alloc interrupt\n");

	if (sc->irq_type == PCI_INTR_TYPE_LEGACY)
		intr_func = mxge_legacy;
		intr_func = mxge_msi;

	return bus_setup_intr(sc->dev, sc->irq_res, INTR_MPSAFE,
	    intr_func, &sc->ss[0], &sc->ih, sc->ifp->if_serializer);
/*
 * Tear down all MSI-X interrupt handlers and resources: handlers,
 * per-slice IRQ resources, the MSI-X table BAR, then the vectors.
 * Mirrors the unwind path in mxge_add_msix_irqs().
 * NOTE(review): the `rid = ...` computation and closing braces are
 * elided in this excerpt.
 */
mxge_rem_msix_irqs(mxge_softc_t *sc)
	for (i = 0; i < sc->num_slices; i++) {
		if (sc->msix_ih[i] != NULL) {
			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
			sc->msix_ih[i] = NULL;
	kfree(sc->msix_ih, M_DEVBUF);

	for (i = 0; i < sc->num_slices; i++) {
		if (sc->msix_irq_res[i] != NULL)
			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
			    sc->msix_irq_res[i]);
		sc->msix_irq_res[i] = NULL;
	kfree(sc->msix_irq_res, M_DEVBUF);

	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
	    sc->msix_table_res);

	pci_release_msi(sc->dev);
/*
 * Install interrupts: MSI-X when multi-slice, otherwise a single
 * legacy/MSI vector.  The `0 &&` condition makes the MSI-X retry
 * branch intentionally dead code (kept from upstream); the final
 * single-IRQ fallback is the real recovery path.
 * NOTE(review): return type, braces and intermediate guards are
 * elided in this excerpt.
 */
mxge_add_irq(mxge_softc_t *sc)
	if (sc->num_slices > 1)
		err = mxge_add_msix_irqs(sc);
		err = mxge_add_single_irq(sc);

	if (0 && err == 0 && sc->num_slices > 1) {
		mxge_rem_msix_irqs(sc);
		err = mxge_add_msix_irqs(sc);

	return mxge_add_single_irq(sc);
/*
 * Device attach: create the parent DMA tag, set up PCI config space,
 * map the board SRAM, read the EEPROM strings, allocate out-of-band
 * DMA buffers (command, zero pad, dmabench), select and load
 * firmware, probe/allocate slices, reset, allocate rings, fill in
 * the ifnet (capabilities, callbacks, TSO length, max MTU), attach
 * ethernet, install interrupts, add sysctls and start the tick
 * callout.
 * NOTE(review): most `if (err != 0)` guards, `goto abort_with_*`
 * labels, several bus_dma_tag_create arguments, `sc->ifp = ifp;`,
 * and closing braces are elided in this excerpt.
 */
mxge_attach(device_t dev)
	mxge_softc_t *sc = device_get_softc(dev);
	struct ifnet *ifp = &sc->arpcom.ac_if;

	/*
	 * Avoid rewriting half the lines in this file to use
	 * &sc->arpcom.ac_if instead
	 */
	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
	ifmedia_init(&sc->media, 0, mxge_media_change, mxge_media_status);

	mxge_fetch_tunables(sc);

	err = bus_dma_tag_create(NULL,			/* parent */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 BUS_SPACE_MAXSIZE_32BIT,/* maxsize */
				 BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */
				 &sc->parent_dmat);	/* tag */
		device_printf(dev, "Err %d allocating parent dmat\n", err);

	callout_init_mp(&sc->co_hdl);

	mxge_setup_cfg_space(sc);

	/*
	 * Map the board into the kernel
	 */
	sc->mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
	if (sc->mem_res == NULL) {
		device_printf(dev, "could not map memory\n");

	sc->sram = rman_get_virtual(sc->mem_res);
	/* 2MB SRAM minus firmware/scratch regions and a guard */
	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
	if (sc->sram_size > rman_get_size(sc->mem_res)) {
		device_printf(dev, "impossible memory region size %ld\n",
		    rman_get_size(sc->mem_res));

	/*
	 * Make NULL terminated copy of the EEPROM strings section of
	 */
	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
	    rman_get_bushandle(sc->mem_res),
	    sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
	    sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE - 2);
	err = mxge_parse_strings(sc);
		device_printf(dev, "parse EEPROM string failed\n");

	/*
	 * Enable write combining for efficient use of PCIe bus
	 */

	/*
	 * Allocate the out of band DMA memory
	 */
	err = mxge_dma_alloc(sc, &sc->cmd_dma, sizeof(mxge_cmd_t), 64);
		device_printf(dev, "alloc cmd DMA buf failed\n");
	sc->cmd = sc->cmd_dma.dmem_addr;

	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
		device_printf(dev, "alloc zeropad DMA buf failed\n");

	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
		device_printf(dev, "alloc dmabench DMA buf failed\n");

	/* Select & load the firmware */
	err = mxge_select_firmware(sc);
		device_printf(dev, "select firmware failed\n");

	mxge_slice_probe(sc);
	err = mxge_alloc_slices(sc);
		device_printf(dev, "alloc slices failed\n");

	err = mxge_reset(sc, 0);
		device_printf(dev, "reset failed\n");

	err = mxge_alloc_rings(sc);
		device_printf(dev, "failed to allocate rings\n");

	ifp->if_baudrate = IF_Gbps(10UL);
	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO;
	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;

	ifp->if_capabilities |= IFCAP_VLAN_MTU;

	/* Well, its software, sigh */
	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING;

	ifp->if_capenable = ifp->if_capabilities;

	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
	ifp->if_init = mxge_init;
	ifp->if_ioctl = mxge_ioctl;
	ifp->if_start = mxge_start;
	ifp->if_watchdog = mxge_watchdog;

	/* Increase TSO burst length */
	ifp->if_tsolen = (32 * ETHERMTU);

	/* Initialise the ifmedia structure */
	mxge_media_init(sc);
	mxge_media_probe(sc);

	ether_ifattach(ifp, sc->mac_addr, NULL);

	/*
	 * We are not ready to do "gather" jumbo frame, so
	 * limit MTU to MJUMPAGESIZE
	 */
	sc->max_mtu = MJUMPAGESIZE -
	    ETHER_HDR_LEN - EVL_ENCAPLEN - MXGEFW_PAD - 1;

	/* must come after ether_ifattach() */
	err = mxge_add_irq(sc);
		device_printf(dev, "alloc and setup intr failed\n");
		ether_ifdetach(ifp);

	/* pin the send queue to the interrupt's CPU */
	ifq_set_cpuid(&ifp->if_snd, rman_get_cpuid(sc->irq_res));

	mxge_add_sysctls(sc);

	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
/*
 * Device detach: under the serializer close the interface and tear
 * down the interrupt, terminate the callout, detach ethernet and
 * media, issue a final dummy RDMA to quiesce the NIC, then release
 * sysctls, rings, slices, DMA buffers, IRQ/MSI, memory BAR and the
 * parent DMA tag — in reverse attach order.
 * NOTE(review): mxge_close() call, part of the dummy-rdma condition,
 * the final `return 0;` and closing braces are elided in this
 * excerpt.
 */
mxge_detach(device_t dev)
	mxge_softc_t *sc = device_get_softc(dev);

	if (device_is_attached(dev)) {
		struct ifnet *ifp = sc->ifp;

		lwkt_serialize_enter(ifp->if_serializer);

		if (ifp->if_flags & IFF_RUNNING)

		callout_stop(&sc->co_hdl);

		bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);

		lwkt_serialize_exit(ifp->if_serializer);

		/* must run outside the serializer */
		callout_terminate(&sc->co_hdl);

		ether_ifdetach(ifp);

	ifmedia_removeall(&sc->media);

	if (sc->cmd != NULL && sc->zeropad_dma.dmem_addr != NULL &&
		mxge_dummy_rdma(sc, 0);

	mxge_rem_sysctls(sc);
	mxge_free_rings(sc);

	/* MUST after sysctls and rings are freed */
	mxge_free_slices(sc);

	if (sc->dmabench_dma.dmem_addr != NULL)
		mxge_dma_free(&sc->dmabench_dma);
	if (sc->zeropad_dma.dmem_addr != NULL)
		mxge_dma_free(&sc->zeropad_dma);
	if (sc->cmd_dma.dmem_addr != NULL)
		mxge_dma_free(&sc->cmd_dma);

	if (sc->irq_res != NULL) {
		bus_release_resource(dev, SYS_RES_IRQ, sc->irq_rid,
	if (sc->irq_type == PCI_INTR_TYPE_MSI)
		pci_release_msi(dev);

	if (sc->mem_res != NULL) {
		bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS,

	if (sc->parent_dmat != NULL)
		bus_dma_tag_destroy(sc->parent_dmat);
4275 mxge_shutdown(device_t dev)