1 /******************************************************************************
3 Copyright (c) 2006-2009, Myricom Inc.
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
9 1. Redistributions of source code must retain the above copyright notice,
10 this list of conditions and the following disclaimer.
12 2. Neither the name of the Myricom Inc, nor the names of its
13 contributors may be used to endorse or promote products derived from
14 this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
28 ***************************************************************************/
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD: src/sys/dev/mxge/if_mxge.c,v 1.63 2009/06/26 11:45:06 rwatson Exp $");
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/linker.h>
36 #include <sys/firmware.h>
37 #include <sys/endian.h>
38 #include <sys/sockio.h>
40 #include <sys/malloc.h>
42 #include <sys/kernel.h>
44 #include <sys/module.h>
45 #include <sys/socket.h>
46 #include <sys/sysctl.h>
49 /* count xmits ourselves, rather than via drbr */
52 #include <net/if_arp.h>
53 #include <net/ethernet.h>
54 #include <net/if_dl.h>
55 #include <net/if_media.h>
59 #include <net/if_types.h>
60 #include <net/if_vlan_var.h>
63 #include <netinet/in_systm.h>
64 #include <netinet/in.h>
65 #include <netinet/ip.h>
66 #include <netinet/tcp.h>
68 #include <machine/bus.h>
69 #include <machine/in_cksum.h>
70 #include <machine/resource.h>
75 #include <dev/pci/pcireg.h>
76 #include <dev/pci/pcivar.h>
77 #include <dev/pci/pci_private.h> /* XXX for pci_cfg_restore */
79 #include <vm/vm.h> /* for pmap_mapdev() */
82 #if defined(__i386) || defined(__amd64)
83 #include <machine/specialreg.h>
86 #include <dev/mxge/mxge_mcp.h>
87 #include <dev/mxge/mcp_gen_header.h>
88 /*#define MXGE_FAKE_IFP*/
89 #include <dev/mxge/if_mxge_var.h>
91 #include <sys/buf_ring.h>
97 static int mxge_nvidia_ecrc_enable = 1;
98 static int mxge_force_firmware = 0;
99 static int mxge_intr_coal_delay = 30;
100 static int mxge_deassert_wait = 1;
101 static int mxge_flow_control = 1;
102 static int mxge_verbose = 0;
103 static int mxge_lro_cnt = 8;
104 static int mxge_ticks;
105 static int mxge_max_slices = 1;
106 static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
107 static int mxge_always_promisc = 0;
108 static int mxge_initial_mtu = ETHERMTU_JUMBO;
109 static char *mxge_fw_unaligned = "mxge_ethp_z8e";
110 static char *mxge_fw_aligned = "mxge_eth_z8e";
111 static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
112 static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
114 static int mxge_probe(device_t dev);
115 static int mxge_attach(device_t dev);
116 static int mxge_detach(device_t dev);
117 static int mxge_shutdown(device_t dev);
118 static void mxge_intr(void *arg);
120 static device_method_t mxge_methods[] =
122 /* Device interface */
123 DEVMETHOD(device_probe, mxge_probe),
124 DEVMETHOD(device_attach, mxge_attach),
125 DEVMETHOD(device_detach, mxge_detach),
126 DEVMETHOD(device_shutdown, mxge_shutdown),
130 static driver_t mxge_driver =
134 sizeof(mxge_softc_t),
137 static devclass_t mxge_devclass;
139 /* Declare ourselves to be a child of the PCI bus.*/
140 DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
141 MODULE_DEPEND(mxge, firmware, 1, 1, 1);
142 MODULE_DEPEND(mxge, zlib, 1, 1, 1);
144 static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
145 static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
146 static int mxge_close(mxge_softc_t *sc);
147 static int mxge_open(mxge_softc_t *sc);
148 static void mxge_tick(void *arg);
151 mxge_probe(device_t dev)
156 if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
157 ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
158 (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
159 rev = pci_get_revid(dev);
161 case MXGE_PCI_REV_Z8E:
162 device_set_desc(dev, "Myri10G-PCIE-8A");
164 case MXGE_PCI_REV_Z8ES:
165 device_set_desc(dev, "Myri10G-PCIE-8B");
168 device_set_desc(dev, "Myri10G-PCIE-8??");
169 device_printf(dev, "Unrecognized rev %d NIC\n",
179 mxge_enable_wc(mxge_softc_t *sc)
181 #if defined(__i386) || defined(__amd64)
186 len = rman_get_size(sc->mem_res);
187 err = pmap_change_attr((vm_offset_t) sc->sram,
188 len, PAT_WRITE_COMBINING);
190 device_printf(sc->dev, "pmap_change_attr failed, %d\n",
198 /* callback to get our DMA address */
200 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
204 *(bus_addr_t *) arg = segs->ds_addr;
209 mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
210 bus_size_t alignment)
213 device_t dev = sc->dev;
214 bus_size_t boundary, maxsegsize;
216 if (bytes > 4096 && alignment == 4096) {
224 /* allocate DMAable memory tags */
225 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
226 alignment, /* alignment */
227 boundary, /* boundary */
228 BUS_SPACE_MAXADDR, /* low */
229 BUS_SPACE_MAXADDR, /* high */
230 NULL, NULL, /* filter */
233 maxsegsize, /* maxsegsize */
234 BUS_DMA_COHERENT, /* flags */
235 NULL, NULL, /* lock */
236 &dma->dmat); /* tag */
238 device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
242 /* allocate DMAable memory & map */
243 err = bus_dmamem_alloc(dma->dmat, &dma->addr,
244 (BUS_DMA_WAITOK | BUS_DMA_COHERENT
245 | BUS_DMA_ZERO), &dma->map);
247 device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
248 goto abort_with_dmat;
251 /* load the memory */
252 err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
253 mxge_dmamap_callback,
254 (void *)&dma->bus_addr, 0);
256 device_printf(dev, "couldn't load map (err = %d)\n", err);
262 bus_dmamem_free(dma->dmat, dma->addr, dma->map);
264 (void)bus_dma_tag_destroy(dma->dmat);
270 mxge_dma_free(mxge_dma_t *dma)
272 bus_dmamap_unload(dma->dmat, dma->map);
273 bus_dmamem_free(dma->dmat, dma->addr, dma->map);
274 (void)bus_dma_tag_destroy(dma->dmat);
278 * The eeprom strings on the lanaiX have the format
285 mxge_parse_strings(mxge_softc_t *sc)
287 #define MXGE_NEXT_STRING(p) while(ptr < limit && *ptr++)
292 ptr = sc->eeprom_strings;
293 limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
295 while (ptr < limit && *ptr != '\0') {
296 if (memcmp(ptr, "MAC=", 4) == 0) {
298 sc->mac_addr_string = ptr;
299 for (i = 0; i < 6; i++) {
301 if ((ptr + 2) > limit)
303 sc->mac_addr[i] = strtoul(ptr, NULL, 16);
306 } else if (memcmp(ptr, "PC=", 3) == 0) {
308 strncpy(sc->product_code_string, ptr,
309 sizeof (sc->product_code_string) - 1);
310 } else if (memcmp(ptr, "SN=", 3) == 0) {
312 strncpy(sc->serial_number_string, ptr,
313 sizeof (sc->serial_number_string) - 1);
315 MXGE_NEXT_STRING(ptr);
322 device_printf(sc->dev, "failed to parse eeprom_strings\n");
327 #if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
329 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
332 unsigned long base, off;
334 device_t pdev, mcp55;
335 uint16_t vendor_id, device_id, word;
336 uintptr_t bus, slot, func, ivend, idev;
340 if (!mxge_nvidia_ecrc_enable)
343 pdev = device_get_parent(device_get_parent(sc->dev));
345 device_printf(sc->dev, "could not find parent?\n");
348 vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
349 device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
351 if (vendor_id != 0x10de)
356 if (device_id == 0x005d) {
357 /* ck804, base address is magic */
359 } else if (device_id >= 0x0374 && device_id <= 0x378) {
360 /* mcp55, base address stored in chipset */
361 mcp55 = pci_find_bsf(0, 0, 0);
363 0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
364 0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
365 word = pci_read_config(mcp55, 0x90, 2);
366 base = ((unsigned long)word & 0x7ffeU) << 25;
373 Test below is commented because it is believed that doing
374 config read/write beyond 0xff will access the config space
375 for the next larger function. Uncomment this and remove
376 the hacky pmap_mapdev() way of accessing config space when
377 FreeBSD grows support for extended pcie config space access
380 /* See if we can, by some miracle, access the extended
382 val = pci_read_config(pdev, 0x178, 4);
383 if (val != 0xffffffff) {
385 pci_write_config(pdev, 0x178, val, 4);
389 /* Rather than using normal pci config space writes, we must
390 * map the Nvidia config space ourselves. This is because on
391 * opteron/nvidia class machine the 0xe000000 mapping is
392 * handled by the nvidia chipset, that means the internal PCI
393 * device (the on-chip northbridge), or the amd-8131 bridge
394 * and things behind them are not visible by this method.
397 BUS_READ_IVAR(device_get_parent(pdev), pdev,
399 BUS_READ_IVAR(device_get_parent(pdev), pdev,
400 PCI_IVAR_SLOT, &slot);
401 BUS_READ_IVAR(device_get_parent(pdev), pdev,
402 PCI_IVAR_FUNCTION, &func);
403 BUS_READ_IVAR(device_get_parent(pdev), pdev,
404 PCI_IVAR_VENDOR, &ivend);
405 BUS_READ_IVAR(device_get_parent(pdev), pdev,
406 PCI_IVAR_DEVICE, &idev);
409 + 0x00100000UL * (unsigned long)bus
410 + 0x00001000UL * (unsigned long)(func
413 /* map it into the kernel */
414 va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
418 device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
421 /* get a pointer to the config space mapped into the kernel */
422 cfgptr = va + (off & PAGE_MASK);
424 /* make sure that we can really access it */
425 vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
426 device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
427 if (! (vendor_id == ivend && device_id == idev)) {
428 device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
429 vendor_id, device_id);
430 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
434 ptr32 = (uint32_t*)(cfgptr + 0x178);
437 if (val == 0xffffffff) {
438 device_printf(sc->dev, "extended mapping failed\n");
439 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
443 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
445 device_printf(sc->dev,
446 "Enabled ECRC on upstream Nvidia bridge "
448 (int)bus, (int)slot, (int)func);
453 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
455 device_printf(sc->dev,
456 "Nforce 4 chipset on non-x86/amd64!?!?!\n");
463 mxge_dma_test(mxge_softc_t *sc, int test_type)
466 bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
472 /* Run a small DMA test.
473 * The magic multipliers to the length tell the firmware
474 * to do DMA read, write, or read+write tests. The
475 * results are returned in cmd.data0. The upper 16
476 * bits of the return is the number of transfers completed.
477 * The lower 16 bits is the time in 0.5us ticks that the
478 * transfers took to complete.
481 len = sc->tx_boundary;
483 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
484 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
485 cmd.data2 = len * 0x10000;
486 status = mxge_send_cmd(sc, test_type, &cmd);
491 sc->read_dma = ((cmd.data0>>16) * len * 2) /
492 (cmd.data0 & 0xffff);
493 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
494 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
495 cmd.data2 = len * 0x1;
496 status = mxge_send_cmd(sc, test_type, &cmd);
501 sc->write_dma = ((cmd.data0>>16) * len * 2) /
502 (cmd.data0 & 0xffff);
504 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
505 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
506 cmd.data2 = len * 0x10001;
507 status = mxge_send_cmd(sc, test_type, &cmd);
512 sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
513 (cmd.data0 & 0xffff);
516 if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
517 device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
524 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
525 * when the PCI-E Completion packets are aligned on an 8-byte
526 * boundary. Some PCI-E chip sets always align Completion packets; on
527 * the ones that do not, the alignment can be enforced by enabling
528 * ECRC generation (if supported).
530 * When PCI-E Completion packets are not aligned, it is actually more
531 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
533 * If the driver can neither enable ECRC nor verify that it has
534 * already been enabled, then it must use a firmware image which works
535 * around unaligned completion packets (ethp_z8e.dat), and it should
536 * also ensure that it never gives the device a Read-DMA which is
537 * larger than 2KB by setting the tx_boundary to 2KB. If ECRC is
538 * enabled, then the driver should use the aligned (eth_z8e.dat)
539 * firmware image, and set tx_boundary to 4KB.
543 mxge_firmware_probe(mxge_softc_t *sc)
545 device_t dev = sc->dev;
549 sc->tx_boundary = 4096;
551 * Verify the max read request size was set to 4KB
552 * before trying the test with 4KB.
554 if (pci_find_extcap(dev, PCIY_EXPRESS, ®) == 0) {
555 pectl = pci_read_config(dev, reg + 0x8, 2);
556 if ((pectl & (5 << 12)) != (5 << 12)) {
557 device_printf(dev, "Max Read Req. size != 4k (0x%x\n",
559 sc->tx_boundary = 2048;
564 * load the optimized firmware (which assumes aligned PCIe
565 * completions) in order to see if it works on this host.
567 sc->fw_name = mxge_fw_aligned;
568 status = mxge_load_firmware(sc, 1);
574 * Enable ECRC if possible
576 mxge_enable_nvidia_ecrc(sc);
579 * Run a DMA test which watches for unaligned completions and
580 * aborts on the first one seen.
583 status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
585 return 0; /* keep the aligned firmware */
588 device_printf(dev, "DMA test failed: %d\n", status);
589 if (status == ENOSYS)
590 device_printf(dev, "Falling back to ethp! "
591 "Please install up to date fw\n");
596 mxge_select_firmware(mxge_softc_t *sc)
601 if (mxge_force_firmware != 0) {
602 if (mxge_force_firmware == 1)
607 device_printf(sc->dev,
608 "Assuming %s completions (forced)\n",
609 aligned ? "aligned" : "unaligned");
613 /* if the PCIe link width is 4 or less, we can use the aligned
614 firmware and skip any checks */
615 if (sc->link_width != 0 && sc->link_width <= 4) {
616 device_printf(sc->dev,
617 "PCIe x%d Link, expect reduced performance\n",
623 if (0 == mxge_firmware_probe(sc))
628 sc->fw_name = mxge_fw_aligned;
629 sc->tx_boundary = 4096;
631 sc->fw_name = mxge_fw_unaligned;
632 sc->tx_boundary = 2048;
634 return (mxge_load_firmware(sc, 0));
644 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
648 if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
649 device_printf(sc->dev, "Bad firmware type: 0x%x\n",
650 be32toh(hdr->mcp_type));
654 /* save firmware version for sysctl */
655 strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
657 device_printf(sc->dev, "firmware id: %s\n", hdr->version);
659 sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
660 &sc->fw_ver_minor, &sc->fw_ver_tiny);
662 if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
663 && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
664 device_printf(sc->dev, "Found firmware version %s\n",
666 device_printf(sc->dev, "Driver needs %d.%d\n",
667 MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
675 z_alloc(void *nil, u_int items, u_int size)
679 ptr = malloc(items * size, M_TEMP, M_NOWAIT);
684 z_free(void *nil, void *ptr)
691 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
694 char *inflate_buffer;
695 const struct firmware *fw;
696 const mcp_gen_header_t *hdr;
703 fw = firmware_get(sc->fw_name);
705 device_printf(sc->dev, "Could not find firmware image %s\n",
712 /* setup zlib and decompress f/w */
713 bzero(&zs, sizeof (zs));
716 status = inflateInit(&zs);
717 if (status != Z_OK) {
722 /* the uncompressed size is stored as the firmware version,
723 which would otherwise go unused */
724 fw_len = (size_t) fw->version;
725 inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
726 if (inflate_buffer == NULL)
728 zs.avail_in = fw->datasize;
729 zs.next_in = __DECONST(char *, fw->data);
730 zs.avail_out = fw_len;
731 zs.next_out = inflate_buffer;
732 status = inflate(&zs, Z_FINISH);
733 if (status != Z_STREAM_END) {
734 device_printf(sc->dev, "zlib %d\n", status);
736 goto abort_with_buffer;
740 hdr_offset = htobe32(*(const uint32_t *)
741 (inflate_buffer + MCP_HEADER_PTR_OFFSET));
742 if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
743 device_printf(sc->dev, "Bad firmware file");
745 goto abort_with_buffer;
747 hdr = (const void*)(inflate_buffer + hdr_offset);
749 status = mxge_validate_firmware(sc, hdr);
751 goto abort_with_buffer;
753 /* Copy the inflated firmware to NIC SRAM. */
754 for (i = 0; i < fw_len; i += 256) {
755 mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
757 min(256U, (unsigned)(fw_len - i)));
766 free(inflate_buffer, M_TEMP);
770 firmware_put(fw, FIRMWARE_UNLOAD);
775 * Enable or disable periodic RDMAs from the host to make certain
776 * chipsets resend dropped PCIe messages
780 mxge_dummy_rdma(mxge_softc_t *sc, int enable)
783 volatile uint32_t *confirm;
784 volatile char *submit;
785 uint32_t *buf, dma_low, dma_high;
788 buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
790 /* clear confirmation addr */
791 confirm = (volatile uint32_t *)sc->cmd;
795 /* send an rdma command to the PCIe engine, and wait for the
796 response in the confirmation address. The firmware should
797 write a -1 there to indicate it is alive and well
800 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
801 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
802 buf[0] = htobe32(dma_high); /* confirm addr MSW */
803 buf[1] = htobe32(dma_low); /* confirm addr LSW */
804 buf[2] = htobe32(0xffffffff); /* confirm data */
805 dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
806 dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
807 buf[3] = htobe32(dma_high); /* dummy addr MSW */
808 buf[4] = htobe32(dma_low); /* dummy addr LSW */
809 buf[5] = htobe32(enable); /* enable? */
812 submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
814 mxge_pio_copy(submit, buf, 64);
819 while (*confirm != 0xffffffff && i < 20) {
823 if (*confirm != 0xffffffff) {
824 device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
825 (enable ? "enable" : "disable"), confirm,
832 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
835 char buf_bytes[sizeof(*buf) + 8];
836 volatile mcp_cmd_response_t *response = sc->cmd;
837 volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
838 uint32_t dma_low, dma_high;
839 int err, sleep_total = 0;
841 /* ensure buf is aligned to 8 bytes */
842 buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
844 buf->data0 = htobe32(data->data0);
845 buf->data1 = htobe32(data->data1);
846 buf->data2 = htobe32(data->data2);
847 buf->cmd = htobe32(cmd);
848 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
849 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
851 buf->response_addr.low = htobe32(dma_low);
852 buf->response_addr.high = htobe32(dma_high);
853 mtx_lock(&sc->cmd_mtx);
854 response->result = 0xffffffff;
856 mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
858 /* wait up to 20ms */
860 for (sleep_total = 0; sleep_total < 20; sleep_total++) {
861 bus_dmamap_sync(sc->cmd_dma.dmat,
862 sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
864 switch (be32toh(response->result)) {
866 data->data0 = be32toh(response->data);
872 case MXGEFW_CMD_UNKNOWN:
875 case MXGEFW_CMD_ERROR_UNALIGNED:
878 case MXGEFW_CMD_ERROR_BUSY:
882 device_printf(sc->dev,
884 "failed, result = %d\n",
885 cmd, be32toh(response->result));
893 device_printf(sc->dev, "mxge: command %d timed out"
895 cmd, be32toh(response->result));
896 mtx_unlock(&sc->cmd_mtx);
901 mxge_adopt_running_firmware(mxge_softc_t *sc)
903 struct mcp_gen_header *hdr;
904 const size_t bytes = sizeof (struct mcp_gen_header);
908 /* find running firmware header */
909 hdr_offset = htobe32(*(volatile uint32_t *)
910 (sc->sram + MCP_HEADER_PTR_OFFSET));
912 if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
913 device_printf(sc->dev,
914 "Running firmware has bad header offset (%d)\n",
919 /* copy header of running firmware from SRAM to host memory to
920 * validate firmware */
921 hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
923 device_printf(sc->dev, "could not malloc firmware hdr\n");
926 bus_space_read_region_1(rman_get_bustag(sc->mem_res),
927 rman_get_bushandle(sc->mem_res),
928 hdr_offset, (char *)hdr, bytes);
929 status = mxge_validate_firmware(sc, hdr);
933 * check to see if adopted firmware has bug where adopting
934 * it will cause broadcasts to be filtered unless the NIC
935 * is kept in ALLMULTI mode
937 if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
938 sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
939 sc->adopted_rx_filter_bug = 1;
940 device_printf(sc->dev, "Adopting fw %d.%d.%d: "
941 "working around rx filter bug\n",
942 sc->fw_ver_major, sc->fw_ver_minor,
951 mxge_load_firmware(mxge_softc_t *sc, int adopt)
953 volatile uint32_t *confirm;
954 volatile char *submit;
956 uint32_t *buf, size, dma_low, dma_high;
959 buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
961 size = sc->sram_size;
962 status = mxge_load_firmware_helper(sc, &size);
966 /* Try to use the currently running firmware, if
968 status = mxge_adopt_running_firmware(sc);
970 device_printf(sc->dev,
971 "failed to adopt running firmware\n");
974 device_printf(sc->dev,
975 "Successfully adopted running firmware\n");
976 if (sc->tx_boundary == 4096) {
977 device_printf(sc->dev,
978 "Using firmware currently running on NIC"
980 device_printf(sc->dev,
981 "performance consider loading optimized "
984 sc->fw_name = mxge_fw_unaligned;
985 sc->tx_boundary = 2048;
988 /* clear confirmation addr */
989 confirm = (volatile uint32_t *)sc->cmd;
992 /* send a reload command to the bootstrap MCP, and wait for the
993 response in the confirmation address. The firmware should
994 write a -1 there to indicate it is alive and well
997 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
998 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
1000 buf[0] = htobe32(dma_high); /* confirm addr MSW */
1001 buf[1] = htobe32(dma_low); /* confirm addr LSW */
1002 buf[2] = htobe32(0xffffffff); /* confirm data */
1004 /* FIX: All newest firmware should un-protect the bottom of
1005 the sram before handoff. However, the very first interfaces
1006 do not. Therefore the handoff copy must skip the first 8 bytes
1008 /* where the code starts*/
1009 buf[3] = htobe32(MXGE_FW_OFFSET + 8);
1010 buf[4] = htobe32(size - 8); /* length of code */
1011 buf[5] = htobe32(8); /* where to copy to */
1012 buf[6] = htobe32(0); /* where to jump to */
1014 submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
1015 mxge_pio_copy(submit, buf, 64);
1020 while (*confirm != 0xffffffff && i < 20) {
1023 bus_dmamap_sync(sc->cmd_dma.dmat,
1024 sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
1026 if (*confirm != 0xffffffff) {
1027 device_printf(sc->dev,"handoff failed (%p = 0x%x)",
1036 mxge_update_mac_address(mxge_softc_t *sc)
1039 uint8_t *addr = sc->mac_addr;
1043 cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1044 | (addr[2] << 8) | addr[3]);
1046 cmd.data1 = ((addr[4] << 8) | (addr[5]));
1048 status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
1053 mxge_change_pause(mxge_softc_t *sc, int pause)
1059 status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1062 status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1066 device_printf(sc->dev, "Failed to set flow control mode\n");
1074 mxge_change_promisc(mxge_softc_t *sc, int promisc)
1079 if (mxge_always_promisc)
1083 status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1086 status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1090 device_printf(sc->dev, "Failed to set promisc mode\n");
1095 mxge_set_multicast_list(mxge_softc_t *sc)
1098 struct ifmultiaddr *ifma;
1099 struct ifnet *ifp = sc->ifp;
1102 /* This firmware is known to not support multicast */
1103 if (!sc->fw_multicast_support)
1106 /* Disable multicast filtering while we play with the lists*/
1107 err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1109 device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
1110 " error status: %d\n", err);
1114 if (sc->adopted_rx_filter_bug)
1117 if (ifp->if_flags & IFF_ALLMULTI)
1118 /* request to disable multicast filtering, so quit here */
1121 /* Flush all the filters */
1123 err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1125 device_printf(sc->dev,
1126 "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
1127 ", error status: %d\n", err);
1131 /* Walk the multicast list, and add each address */
1133 if_maddr_rlock(ifp);
1134 TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1135 if (ifma->ifma_addr->sa_family != AF_LINK)
1137 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1139 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
1141 cmd.data0 = htonl(cmd.data0);
1142 cmd.data1 = htonl(cmd.data1);
1143 err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1145 device_printf(sc->dev, "Failed "
1146 "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
1148 /* abort, leaving multicast filtering off */
1149 if_maddr_runlock(ifp);
1153 if_maddr_runlock(ifp);
1154 /* Enable multicast filtering */
1155 err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1157 device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
1158 ", error status: %d\n", err);
1163 mxge_max_mtu(mxge_softc_t *sc)
1168 if (MJUMPAGESIZE - MXGEFW_PAD > MXGEFW_MAX_MTU)
1169 return MXGEFW_MAX_MTU - MXGEFW_PAD;
1171 /* try to set nbufs to see if it we can
1172 use virtually contiguous jumbos */
1174 status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1177 return MXGEFW_MAX_MTU - MXGEFW_PAD;
1179 /* otherwise, we're limited to MJUMPAGESIZE */
1180 return MJUMPAGESIZE - MXGEFW_PAD;
1184 mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1186 struct mxge_slice_state *ss;
1187 mxge_rx_done_t *rx_done;
1188 volatile uint32_t *irq_claim;
1192 /* try to send a reset command to the card to see if it
1194 memset(&cmd, 0, sizeof (cmd));
1195 status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1197 device_printf(sc->dev, "failed reset\n");
1201 mxge_dummy_rdma(sc, 1);
1204 /* set the intrq size */
1205 cmd.data0 = sc->rx_ring_size;
1206 status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1209 * Even though we already know how many slices are supported
1210 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
1211 * has magic side effects, and must be called after a reset.
1212 * It must be called prior to calling any RSS related cmds,
1213 * including assigning an interrupt queue for anything but
1214 * slice 0. It must also be called *after*
1215 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1216 * the firmware to compute offsets.
1219 if (sc->num_slices > 1) {
1220 /* ask the maximum number of slices it supports */
1221 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
1224 device_printf(sc->dev,
1225 "failed to get number of slices\n");
1229 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1230 * to setting up the interrupt queue DMA
1232 cmd.data0 = sc->num_slices;
1233 cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
1234 #ifdef IFNET_BUF_RING
1235 cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1237 status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
1240 device_printf(sc->dev,
1241 "failed to set number of slices\n");
1247 if (interrupts_setup) {
1248 /* Now exchange information about interrupts */
1249 for (slice = 0; slice < sc->num_slices; slice++) {
1250 rx_done = &sc->ss[slice].rx_done;
1251 memset(rx_done->entry, 0, sc->rx_ring_size);
1252 cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
1253 cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
1255 status |= mxge_send_cmd(sc,
1256 MXGEFW_CMD_SET_INTRQ_DMA,
1261 status |= mxge_send_cmd(sc,
1262 MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
1265 sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1267 status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1268 irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1271 status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
1273 sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1275 device_printf(sc->dev, "failed set interrupt parameters\n");
1280 *sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1283 /* run a DMA benchmark */
1284 (void) mxge_dma_test(sc, MXGEFW_DMA_TEST);
1286 for (slice = 0; slice < sc->num_slices; slice++) {
1287 ss = &sc->ss[slice];
1289 ss->irq_claim = irq_claim + (2 * slice);
1290 /* reset mcp/driver shared state back to 0 */
1291 ss->rx_done.idx = 0;
1292 ss->rx_done.cnt = 0;
1295 ss->tx.pkt_done = 0;
1296 ss->tx.queue_active = 0;
1297 ss->tx.activate = 0;
1298 ss->tx.deactivate = 0;
1303 ss->rx_small.cnt = 0;
1304 ss->lro_bad_csum = 0;
1306 ss->lro_flushed = 0;
1307 if (ss->fw_stats != NULL) {
1308 ss->fw_stats->valid = 0;
1309 ss->fw_stats->send_done_count = 0;
1312 sc->rdma_tags_available = 15;
1313 status = mxge_update_mac_address(sc);
1314 mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
1315 mxge_change_pause(sc, sc->pause);
1316 mxge_set_multicast_list(sc);
1321 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1324 unsigned int intr_coal_delay;
1328 intr_coal_delay = sc->intr_coal_delay;
1329 err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1333 if (intr_coal_delay == sc->intr_coal_delay)
1336 if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1339 mtx_lock(&sc->driver_mtx);
1340 *sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1341 sc->intr_coal_delay = intr_coal_delay;
1343 mtx_unlock(&sc->driver_mtx);
1348 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1351 unsigned int enabled;
1355 enabled = sc->pause;
1356 err = sysctl_handle_int(oidp, &enabled, arg2, req);
1360 if (enabled == sc->pause)
1363 mtx_lock(&sc->driver_mtx);
1364 err = mxge_change_pause(sc, enabled);
1365 mtx_unlock(&sc->driver_mtx);
1370 mxge_change_lro_locked(mxge_softc_t *sc, int lro_cnt)
1377 ifp->if_capenable &= ~IFCAP_LRO;
1379 ifp->if_capenable |= IFCAP_LRO;
1380 sc->lro_cnt = lro_cnt;
1381 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
1383 err = mxge_open(sc);
1389 mxge_change_lro(SYSCTL_HANDLER_ARGS)
1392 unsigned int lro_cnt;
1396 lro_cnt = sc->lro_cnt;
1397 err = sysctl_handle_int(oidp, &lro_cnt, arg2, req);
1401 if (lro_cnt == sc->lro_cnt)
1407 mtx_lock(&sc->driver_mtx);
1408 err = mxge_change_lro_locked(sc, lro_cnt);
1409 mtx_unlock(&sc->driver_mtx);
1414 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1420 arg2 = be32toh(*(int *)arg1);
1422 err = sysctl_handle_int(oidp, arg1, arg2, req);
1428 mxge_rem_sysctls(mxge_softc_t *sc)
1430 struct mxge_slice_state *ss;
1433 if (sc->slice_sysctl_tree == NULL)
1436 for (slice = 0; slice < sc->num_slices; slice++) {
1437 ss = &sc->ss[slice];
1438 if (ss == NULL || ss->sysctl_tree == NULL)
1440 sysctl_ctx_free(&ss->sysctl_ctx);
1441 ss->sysctl_tree = NULL;
1443 sysctl_ctx_free(&sc->slice_sysctl_ctx);
1444 sc->slice_sysctl_tree = NULL;
/*
 * Register the driver's sysctl nodes: static device information
 * (firmware version, serial number, DMA benchmark results), tunables
 * (interrupt coalescing, flow control, LRO), read-only big-endian
 * firmware drop counters (via mxge_handle_be32), and a per-slice
 * "slice.N" subtree of ring/debug counters (removed again by
 * mxge_rem_sysctls()).
 *
 * FIX(review): the "flow_control_enabled" node's description was a
 * copy-paste of the intr_coal_delay description ("interrupt coalescing
 * delay in usecs"); corrected to describe flow control.
 *
 * NOTE(review): this listing has gaps in the embedded line numbering;
 * some oid-name argument lines and declarations are elided.
 */
1448 mxge_add_sysctls(mxge_softc_t *sc)
1450 struct sysctl_ctx_list *ctx;
1451 struct sysctl_oid_list *children;
1453 struct mxge_slice_state *ss;
1457 ctx = device_get_sysctl_ctx(sc->dev);
1458 children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
1459 fw = sc->ss[0].fw_stats;
1461 /* random information */
1462 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1464 CTLFLAG_RD, &sc->fw_version,
1465 0, "firmware version");
1466 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1468 CTLFLAG_RD, &sc->serial_number_string,
1469 0, "serial number");
1470 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1472 CTLFLAG_RD, &sc->product_code_string,
1474 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1476 CTLFLAG_RD, &sc->link_width,
1478 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1480 CTLFLAG_RD, &sc->tx_boundary,
1482 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1484 CTLFLAG_RD, &sc->wc,
1485 0, "write combining PIO?");
1486 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1488 CTLFLAG_RD, &sc->read_dma,
1489 0, "DMA Read speed in MB/s");
1490 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1492 CTLFLAG_RD, &sc->write_dma,
1493 0, "DMA Write speed in MB/s");
1494 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1495 "read_write_dma_MBs",
1496 CTLFLAG_RD, &sc->read_write_dma,
1497 0, "DMA concurrent Read/Write speed in MB/s");
1500 /* performance related tunables */
1501 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1503 CTLTYPE_INT|CTLFLAG_RW, sc,
1504 0, mxge_change_intr_coal,
1505 "I", "interrupt coalescing delay in usecs");
1507 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1508 "flow_control_enabled",
1509 CTLTYPE_INT|CTLFLAG_RW, sc,
1510 0, mxge_change_flow_control,
1511 "I", "flow control: Tx and Rx");
1513 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1515 CTLFLAG_RW, &mxge_deassert_wait,
1516 0, "Wait for IRQ line to go low in ihandler");
1518 /* stats block from firmware is in network byte order.
1520 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1522 CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1523 0, mxge_handle_be32,
1525 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1526 "rdma_tags_available",
1527 CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1528 0, mxge_handle_be32,
1529 "I", "rdma_tags_available");
1530 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1531 "dropped_bad_crc32",
1532 CTLTYPE_INT|CTLFLAG_RD,
1533 &fw->dropped_bad_crc32,
1534 0, mxge_handle_be32,
1535 "I", "dropped_bad_crc32");
1536 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1538 CTLTYPE_INT|CTLFLAG_RD,
1539 &fw->dropped_bad_phy,
1540 0, mxge_handle_be32,
1541 "I", "dropped_bad_phy");
1542 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1543 "dropped_link_error_or_filtered",
1544 CTLTYPE_INT|CTLFLAG_RD,
1545 &fw->dropped_link_error_or_filtered,
1546 0, mxge_handle_be32,
1547 "I", "dropped_link_error_or_filtered");
1548 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1549 "dropped_link_overflow",
1550 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1551 0, mxge_handle_be32,
1552 "I", "dropped_link_overflow");
1553 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1554 "dropped_multicast_filtered",
1555 CTLTYPE_INT|CTLFLAG_RD,
1556 &fw->dropped_multicast_filtered,
1557 0, mxge_handle_be32,
1558 "I", "dropped_multicast_filtered");
1559 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1560 "dropped_no_big_buffer",
1561 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1562 0, mxge_handle_be32,
1563 "I", "dropped_no_big_buffer");
1564 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1565 "dropped_no_small_buffer",
1566 CTLTYPE_INT|CTLFLAG_RD,
1567 &fw->dropped_no_small_buffer,
1568 0, mxge_handle_be32,
1569 "I", "dropped_no_small_buffer");
1570 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1572 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1573 0, mxge_handle_be32,
1574 "I", "dropped_overrun");
1575 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1577 CTLTYPE_INT|CTLFLAG_RD,
1579 0, mxge_handle_be32,
1580 "I", "dropped_pause");
1581 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1583 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1584 0, mxge_handle_be32,
1585 "I", "dropped_runt");
1587 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1588 "dropped_unicast_filtered",
1589 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
1590 0, mxge_handle_be32,
1591 "I", "dropped_unicast_filtered");
1593 /* verbose printing? */
1594 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1596 CTLFLAG_RW, &mxge_verbose,
1597 0, "verbose printing");
1600 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1602 CTLTYPE_INT|CTLFLAG_RW, sc,
1604 "I", "number of lro merge queues");
1607 /* add counters exported for debugging from all slices */
1608 sysctl_ctx_init(&sc->slice_sysctl_ctx);
1609 sc->slice_sysctl_tree =
1610 SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1611 "slice", CTLFLAG_RD, 0, "");
1613 for (slice = 0; slice < sc->num_slices; slice++) {
1614 ss = &sc->ss[slice];
1615 sysctl_ctx_init(&ss->sysctl_ctx);
1616 ctx = &ss->sysctl_ctx;
1617 children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1618 sprintf(slice_num, "%d", slice);
1620 SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1622 children = SYSCTL_CHILDREN(ss->sysctl_tree);
1623 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1625 CTLFLAG_RD, &ss->rx_small.cnt,
1627 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1629 CTLFLAG_RD, &ss->rx_big.cnt,
1631 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1632 "lro_flushed", CTLFLAG_RD, &ss->lro_flushed,
1633 0, "number of lro merge queues flushed");
1635 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1636 "lro_queued", CTLFLAG_RD, &ss->lro_queued,
1637 0, "number of frames appended to lro merge"
1640 #ifndef IFNET_BUF_RING
1641 /* only transmit from slice 0 for now */
1645 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1647 CTLFLAG_RD, &ss->tx.req,
1650 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1652 CTLFLAG_RD, &ss->tx.done,
1654 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1656 CTLFLAG_RD, &ss->tx.pkt_done,
1658 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1660 CTLFLAG_RD, &ss->tx.stall,
1662 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1664 CTLFLAG_RD, &ss->tx.wake,
1666 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1668 CTLFLAG_RD, &ss->tx.defrag,
1670 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1672 CTLFLAG_RD, &ss->tx.queue_active,
1673 0, "tx_queue_active");
1674 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1676 CTLFLAG_RD, &ss->tx.activate,
1678 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1680 CTLFLAG_RD, &ss->tx.deactivate,
1681 0, "tx_deactivate");
1685 /* copy an array of mcp_kreq_ether_send_t's to the mcp. Copy
1686 backwards one at a time and handle ring wraps */
/*
 * Slow path used by mxge_submit_req() when the request span would wrap
 * the ring: descriptors are PIO-copied individually from the last to
 * (presumably) the first, each destination slot computed modulo the
 * ring via tx->mask. Loop header/decrement lines are elided in this
 * listing.
 */
1689 mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1690 mcp_kreq_ether_send_t *src, int cnt)
1692 int idx, starting_slot;
1693 starting_slot = tx->req;
1696 idx = (starting_slot + cnt) & tx->mask;
1697 mxge_pio_copy(&tx->lanai[idx],
1698 &src[cnt], sizeof(*src));
1704 * copy an array of mcp_kreq_ether_send_t's to the mcp. Copy
1705 * at most 32 bytes at a time, so as to avoid involving the software
1706 * pio handler in the nic. We re-write the first segment's flags
1707 * to mark them valid only after writing the entire chain
/*
 * Fast path: when the requests fit without wrapping (idx + cnt below
 * tx->mask), copy two descriptors (32 bytes) per iteration with a
 * write barrier between bursts; otherwise fall back to
 * mxge_submit_req_backwards(). In both cases the first descriptor is
 * written last, then its final 32-bit word is rewritten with the saved
 * valid flags so the NIC never sees a partially-written chain.
 */
1711 mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
1716 volatile uint32_t *dst_ints;
1717 mcp_kreq_ether_send_t *srcp;
1718 volatile mcp_kreq_ether_send_t *dstp, *dst;
1721 idx = tx->req & tx->mask;
/* stash the real flags; they are only made visible at the very end */
1723 last_flags = src->flags;
1726 dst = dstp = &tx->lanai[idx];
1729 if ((idx + cnt) < tx->mask) {
1730 for (i = 0; i < (cnt - 1); i += 2) {
1731 mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1732 wmb(); /* force write every 32 bytes */
1737 /* submit all but the first request, and ensure
1738 that it is submitted below */
1739 mxge_submit_req_backwards(tx, src, cnt);
1743 /* submit the first request */
1744 mxge_pio_copy(dstp, srcp, sizeof(*src));
1745 wmb(); /* barrier before setting valid flag */
1748 /* re-write the last 32-bits with the valid flags */
1749 src->flags = last_flags;
1750 src_ints = (uint32_t *)src;
1752 dst_ints = (volatile uint32_t *)dst;
1754 *dst_ints = *src_ints;
/*
 * Build and submit the send-descriptor chain for a TSO packet.
 * Walks the busdma segment list, slicing segments at MSS boundaries
 * ("cuts"), maintaining the firmware's rdma_count bookkeeping (see the
 * long comment below), and finally hands the list to mxge_submit_req().
 * On overflow of tx->max_desc the mapping is unloaded and diagnostics
 * are printed (error-return lines elided in this listing).
 * NOTE(review): IPv4-only — the header parsing assumes struct ip/tcphdr
 * at ip_off.
 */
1762 mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1763 int busdma_seg_cnt, int ip_off)
1766 mcp_kreq_ether_send_t *req;
1767 bus_dma_segment_t *seg;
1770 uint32_t low, high_swapped;
1771 int len, seglen, cum_len, cum_len_next;
1772 int next_is_first, chop, cnt, rdma_count, small;
1773 uint16_t pseudo_hdr_offset, cksum_offset, mss;
1774 uint8_t flags, flags_next;
1777 mss = m->m_pkthdr.tso_segsz;
1779 /* negative cum_len signifies to the
1780 * send loop that we are still in the
1781 * header portion of the TSO packet.
1784 /* ensure we have the ethernet, IP and TCP
1785 header together in the first mbuf, copy
1786 it to a scratch buffer if not */
1787 if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
1788 m_copydata(m, 0, ip_off + sizeof (*ip),
1790 ip = (struct ip *)(ss->scratch + ip_off);
1792 ip = (struct ip *)(mtod(m, char *) + ip_off);
1794 if (__predict_false(m->m_len < ip_off + (ip->ip_hl << 2)
1796 m_copydata(m, 0, ip_off + (ip->ip_hl << 2)
1797 + sizeof (*tcp), ss->scratch);
1798 ip = (struct ip *)(mtod(m, char *) + ip_off);
1801 tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
1802 cum_len = -(ip_off + ((ip->ip_hl + tcp->th_off) << 2));
1804 /* TSO implies checksum offload on this hardware */
1805 cksum_offset = ip_off + (ip->ip_hl << 2);
1806 flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1809 /* for TSO, pseudo_hdr_offset holds mss.
1810 * The firmware figures out where to put
1811 * the checksum by parsing the header. */
1812 pseudo_hdr_offset = htobe16(mss);
1819 /* "rdma_count" is the number of RDMAs belonging to the
1820 * current packet BEFORE the current send request. For
1821 * non-TSO packets, this is equal to "count".
1822 * For TSO packets, rdma_count needs to be reset
1823 * to 0 after a segment cut.
1825 * The rdma_count field of the send request is
1826 * the number of RDMAs of the packet starting at
1827 * that request. For TSO send requests with one ore more cuts
1828 * in the middle, this is the number of RDMAs starting
1829 * after the last cut in the request. All previous
1830 * segments before the last cut implicitly have 1 RDMA.
1832 * Since the number of RDMAs is not known beforehand,
1833 * it must be filled-in retroactively - after each
1834 * segmentation cut or at the end of the entire packet.
1837 while (busdma_seg_cnt) {
1838 /* Break the busdma segment up into pieces*/
1839 low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1840 high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1844 flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1846 cum_len_next = cum_len + seglen;
/* retroactively patch the rdma_count of the request after the last cut */
1847 (req-rdma_count)->rdma_count = rdma_count + 1;
1848 if (__predict_true(cum_len >= 0)) {
/* payload region: cut at MSS boundaries (branch-free flag math) */
1850 chop = (cum_len_next > mss);
1851 cum_len_next = cum_len_next % mss;
1852 next_is_first = (cum_len_next == 0);
1853 flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1854 flags_next |= next_is_first *
1856 rdma_count |= -(chop | next_is_first);
1857 rdma_count += chop & !next_is_first;
1858 } else if (cum_len_next >= 0) {
/* header/payload boundary crossed inside this piece */
1863 small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1864 flags_next = MXGEFW_FLAGS_TSO_PLD |
1865 MXGEFW_FLAGS_FIRST |
1866 (small * MXGEFW_FLAGS_SMALL);
1869 req->addr_high = high_swapped;
1870 req->addr_low = htobe32(low);
1871 req->pseudo_hdr_offset = pseudo_hdr_offset;
1873 req->rdma_count = 1;
1874 req->length = htobe16(seglen);
1875 req->cksum_offset = cksum_offset;
1876 req->flags = flags | ((cum_len & 1) *
1877 MXGEFW_FLAGS_ALIGN_ODD);
1880 cum_len = cum_len_next;
1885 if (__predict_false(cksum_offset > seglen))
1886 cksum_offset -= seglen;
1889 if (__predict_false(cnt > tx->max_desc))
1895 (req-rdma_count)->rdma_count = rdma_count;
/* walk back and mark the last TSO segment of the packet */
1899 req->flags |= MXGEFW_FLAGS_TSO_LAST;
1900 } while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1902 tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1903 mxge_submit_req(tx, tx->req_list, cnt);
1904 #ifdef IFNET_BUF_RING
1905 if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1906 /* tell the NIC to start polling this slice */
1908 tx->queue_active = 1;
/* error path: descriptor budget exceeded */
1916 bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1920 printf("tx->max_desc exceeded via TSO!\n");
1921 printf("mss = %d, %ld, %d!\n", mss,
1922 (long)seg - (long)tx->seg_list, tx->max_desc);
1929 #endif /* IFCAP_TSO4 */
1931 #ifdef MXGE_NEW_VLAN_API
1933 * We reproduce the software vlan tag insertion from
1934 * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
1935 * vlan tag insertion. We need to advertise this in order to have the
1936 * vlan interface respect our csum offload flags.
/*
 * Prepend ETHER_VLAN_ENCAP_LEN bytes, shift the ethernet addresses
 * down, and write an 802.1Q header carrying m->m_pkthdr.ether_vtag.
 * Returns the (possibly reallocated) mbuf, or NULL if allocation or
 * pullup failed (the caller must treat NULL as "mbuf consumed").
 */
1938 static struct mbuf *
1939 mxge_vlan_tag_insert(struct mbuf *m)
1941 struct ether_vlan_header *evl;
1943 M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_DONTWAIT);
1944 if (__predict_false(m == NULL))
1946 if (m->m_len < sizeof(*evl)) {
1947 m = m_pullup(m, sizeof(*evl));
1948 if (__predict_false(m == NULL))
1952 * Transform the Ethernet header into an Ethernet header
1953 * with 802.1Q encapsulation.
1955 evl = mtod(m, struct ether_vlan_header *);
1956 bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
1957 (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
1958 evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
1959 evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
/* tag is now in the frame itself; clear the out-of-band flag */
1960 m->m_flags &= ~M_VLANTAG;
1963 #endif /* MXGE_NEW_VLAN_API */
/*
 * Encapsulate one mbuf chain into firmware send requests and submit it.
 * Steps: (1) software-insert a VLAN tag if needed so csum offload flags
 * survive; (2) DMA-map the chain, defragmenting once on EFBIG;
 * (3) divert to mxge_encap_tso() for TSO frames; (4) set up IPv4
 * checksum-offload fields when CSUM_DELAY_DATA is set; (5) emit one
 * descriptor per busdma segment; (6) pad runt frames to 60 bytes with a
 * zeroed DMA buffer; (7) hand the list to mxge_submit_req().
 * NOTE(review): interior lines (declarations, returns, the debug-print
 * conditional) are elided in this listing — numbering gaps.
 */
1966 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
1969 mcp_kreq_ether_send_t *req;
1970 bus_dma_segment_t *seg;
1975 int cnt, cum_len, err, i, idx, odd_flag, ip_off;
1976 uint16_t pseudo_hdr_offset;
1977 uint8_t flags, cksum_offset;
1984 ip_off = sizeof (struct ether_header);
1985 #ifdef MXGE_NEW_VLAN_API
1986 if (m->m_flags & M_VLANTAG) {
1987 m = mxge_vlan_tag_insert(m);
1988 if (__predict_false(m == NULL))
1990 ip_off += ETHER_VLAN_ENCAP_LEN;
1993 /* (try to) map the frame for DMA */
1994 idx = tx->req & tx->mask;
1995 err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
1996 m, tx->seg_list, &cnt,
1998 if (__predict_false(err == EFBIG)) {
1999 /* Too many segments in the chain. Try
2001 m_tmp = m_defrag(m, M_NOWAIT);
2002 if (m_tmp == NULL) {
2007 err = bus_dmamap_load_mbuf_sg(tx->dmat,
2009 m, tx->seg_list, &cnt,
2012 if (__predict_false(err != 0)) {
2013 device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
2014 " packet len = %d\n", err, m->m_pkthdr.len);
2017 bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2018 BUS_DMASYNC_PREWRITE);
2019 tx->info[idx].m = m;
2022 /* TSO is different enough, we handle it in another routine */
2023 if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2024 mxge_encap_tso(ss, m, cnt, ip_off);
2031 pseudo_hdr_offset = 0;
2032 flags = MXGEFW_FLAGS_NO_TSO;
2034 /* checksum offloading? */
2035 if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
2036 /* ensure ip header is in first mbuf, copy
2037 it to a scratch buffer if not */
2038 if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
2039 m_copydata(m, 0, ip_off + sizeof (*ip),
2041 ip = (struct ip *)(ss->scratch + ip_off);
2043 ip = (struct ip *)(mtod(m, char *) + ip_off);
2045 cksum_offset = ip_off + (ip->ip_hl << 2);
2046 pseudo_hdr_offset = cksum_offset + m->m_pkthdr.csum_data;
2047 pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2048 req->cksum_offset = cksum_offset;
2049 flags |= MXGEFW_FLAGS_CKSUM;
2050 odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2054 if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2055 flags |= MXGEFW_FLAGS_SMALL;
2057 /* convert segments into a request list */
2060 req->flags = MXGEFW_FLAGS_FIRST;
2061 for (i = 0; i < cnt; i++) {
2063 htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2065 htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2066 req->length = htobe16(seg->ds_len);
2067 req->cksum_offset = cksum_offset;
2068 if (cksum_offset > seg->ds_len)
2069 cksum_offset -= seg->ds_len;
2072 req->pseudo_hdr_offset = pseudo_hdr_offset;
2073 req->pad = 0; /* complete solid 16-byte block */
2074 req->rdma_count = 1;
2075 req->flags |= flags | ((cum_len & 1) * odd_flag);
2076 cum_len += seg->ds_len;
2082 /* pad runts to 60 bytes */
2086 htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2088 htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2089 req->length = htobe16(60 - cum_len);
2090 req->cksum_offset = 0;
2091 req->pseudo_hdr_offset = pseudo_hdr_offset;
2092 req->pad = 0; /* complete solid 16-byte block */
2093 req->rdma_count = 1;
2094 req->flags |= flags | ((cum_len & 1) * odd_flag);
2098 tx->req_list[0].rdma_count = cnt;
2100 /* print what the firmware will see */
2101 for (i = 0; i < cnt; i++) {
2102 printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
2103 "cso:%d, flags:0x%x, rdma:%d\n",
2104 i, (int)ntohl(tx->req_list[i].addr_high),
2105 (int)ntohl(tx->req_list[i].addr_low),
2106 (int)ntohs(tx->req_list[i].length),
2107 (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2108 tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2109 tx->req_list[i].rdma_count);
2111 printf("--------------\n");
2113 tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2114 mxge_submit_req(tx, tx->req_list, cnt);
2115 #ifdef IFNET_BUF_RING
2116 if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2117 /* tell the NIC to start polling this slice */
2119 tx->queue_active = 1;
2132 #ifdef IFNET_BUF_RING
/*
 * if_qflush handler (multi-queue build): drain and free every mbuf
 * still queued on each slice's buf_ring. The matching mtx_lock and
 * m_freem lines are elided in this listing.
 */
2134 mxge_qflush(struct ifnet *ifp)
2136 mxge_softc_t *sc = ifp->if_softc;
2141 for (slice = 0; slice < sc->num_slices; slice++) {
2142 tx = &sc->ss[slice].tx;
2144 while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2146 mtx_unlock(&tx->mtx);
/*
 * Transmit path (IFNET_BUF_RING build), tx mutex held: dequeue frames
 * from the slice's drbr while at least max_desc descriptor slots
 * remain, encapsulating each; when the ring fills with work still
 * queued, set the per-slice OACTIVE flag so mxge_tx_done() restarts us.
 */
2152 mxge_start_locked(struct mxge_slice_state *ss)
2163 while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2164 m = drbr_dequeue(ifp, tx->br);
2168 /* let BPF see it */
2171 /* give it to the nic */
2174 /* ran out of transmit slots */
2175 if (((ss->if_drv_flags & IFF_DRV_OACTIVE) == 0)
2176 && (!drbr_empty(ifp, tx->br))) {
2177 ss->if_drv_flags |= IFF_DRV_OACTIVE;
/*
 * Per-slice transmit entry, tx mutex held. If the slice is not both
 * RUNNING and !OACTIVE, just enqueue to the buf_ring. Otherwise, when
 * the ring is empty and descriptor space exists, send the mbuf
 * directly (bypassing the ring); on any enqueue/send, kick
 * mxge_start_locked() if work remains queued.
 */
2183 mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2194 if ((ss->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
2196 err = drbr_enqueue(ifp, tx->br, m);
/* fast path: nothing queued ahead of us and room in the ring */
2200 if (drbr_empty(ifp, tx->br) &&
2201 ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2202 /* let BPF see it */
2204 /* give it to the nic */
2206 } else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2209 if (!drbr_empty(ifp, tx->br))
2210 mxge_start_locked(ss);
/*
 * if_transmit handler: select a slice from the mbuf's flowid (masked
 * by num_slices, which is always a power of 2). Try-lock the slice's
 * tx mutex: on success transmit inline, otherwise enqueue to that
 * slice's buf_ring and let the lock holder drain it.
 */
2215 mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2217 mxge_softc_t *sc = ifp->if_softc;
2218 struct mxge_slice_state *ss;
2223 slice = m->m_pkthdr.flowid;
2224 slice &= (sc->num_slices - 1); /* num_slices always power of 2 */
2226 ss = &sc->ss[slice];
2229 if (mtx_trylock(&tx->mtx)) {
2230 err = mxge_transmit_locked(ss, m);
2231 mtx_unlock(&tx->mtx);
2233 err = drbr_enqueue(ifp, tx->br, m);
/*
 * Transmit path (legacy, non-IFNET_BUF_RING build), tx mutex held:
 * drain the interface send queue while descriptor space remains; when
 * the ring fills, mark the whole interface OACTIVE so the stack stops
 * handing us packets until mxge_tx_done() clears it.
 */
2242 mxge_start_locked(struct mxge_slice_state *ss)
2252 while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2253 IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
2257 /* let BPF see it */
2260 /* give it to the nic */
2263 /* ran out of transmit slots */
2264 if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
2265 sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
/*
 * if_start handler (legacy build): all transmits funnel through
 * slice 0's ring under its tx mutex.
 */
2271 mxge_start(struct ifnet *ifp)
2273 mxge_softc_t *sc = ifp->if_softc;
2274 struct mxge_slice_state *ss;
2276 /* only use the first slice for now */
2278 mtx_lock(&ss->tx.mtx);
2279 mxge_start_locked(ss);
2280 mtx_unlock(&ss->tx.mtx);
2284 * copy an array of mcp_kreq_ether_recv_t's to the mcp. Copy
2285 * at most 32 bytes at a time, so as to avoid involving the software
2286 * pio handler in the nic. We re-write the first segment's low
2287 * DMA address to mark it valid only after we write the entire chunk
/*
 * Push 8 receive descriptors: the first entry's low address is
 * temporarily poisoned to 0xffffffff so the NIC treats the slot as
 * invalid until the whole 8-descriptor burst has been PIO-copied,
 * then the real low address is written last as the "valid" marker.
 * (A write barrier between the copies and the final store is on an
 * elided line in this listing.)
 */
2291 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2292 mcp_kreq_ether_recv_t *src)
2296 low = src->addr_low;
2297 src->addr_low = 0xffffffff;
2298 mxge_pio_copy(dst, src, 4 * sizeof (*src));
2300 mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2302 src->addr_low = low;
2303 dst->addr_low = low;
/*
 * Refill one small-ring receive slot: allocate an mbuf header,
 * DMA-map it, record it in rx->info and the shadow descriptor ring,
 * and submit descriptors to the NIC in bursts of 8 (every time idx
 * hits a multiple-of-8 boundary). Error returns are on elided lines.
 */
2308 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2310 bus_dma_segment_t seg;
2312 mxge_rx_ring_t *rx = &ss->rx_small;
2315 m = m_gethdr(M_DONTWAIT, MT_DATA);
2322 err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2323 &seg, &cnt, BUS_DMA_NOWAIT);
2328 rx->info[idx].m = m;
2329 rx->shadow[idx].addr_low =
2330 htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2331 rx->shadow[idx].addr_high =
2332 htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
/* hand a full group of 8 descriptors to the NIC at once */
2336 mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
/*
 * Refill one big-ring receive slot: allocate a cluster mbuf (MCLBYTES
 * or a jumbo cluster of rx->cl_size), DMA-map it, fill the shadow
 * descriptor(s) — one per physical segment when MXGE_VIRT_JUMBOS is
 * set — and submit to the NIC in groups of 8 slots. Error returns are
 * on elided lines.
 */
2341 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2343 bus_dma_segment_t seg[3];
2345 mxge_rx_ring_t *rx = &ss->rx_big;
2348 if (rx->cl_size == MCLBYTES)
2349 m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
2351 m = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2357 m->m_len = rx->mlen;
2358 err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2359 seg, &cnt, BUS_DMA_NOWAIT);
2364 rx->info[idx].m = m;
2365 rx->shadow[idx].addr_low =
2366 htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2367 rx->shadow[idx].addr_high =
2368 htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2370 #if MXGE_VIRT_JUMBOS
/* one shadow descriptor per additional physical segment */
2371 for (i = 1; i < cnt; i++) {
2372 rx->shadow[idx + i].addr_low =
2373 htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2374 rx->shadow[idx + i].addr_high =
2375 htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2380 for (i = 0; i < rx->nbufs; i++) {
2381 if ((idx & 7) == 7) {
2382 mxge_submit_8rx(&rx->lanai[idx - 7],
2383 &rx->shadow[idx - 7]);
2391 * Myri10GE hardware checksums are not valid if the sender
2392 * padded the frame with non-zero padding. This is because
2393 * the firmware just does a simple 16-bit 1s complement
2394 * checksum across the entire frame, excluding the first 14
2395 * bytes. It is best to simply to check the checksum and
2396 * tell the stack about it only if the checksum is good
/*
 * Validate the firmware's partial checksum for an IPv4 TCP/UDP frame:
 * returns 0 when the checksum verifies (caller then marks the mbuf),
 * non-zero otherwise. Non-IPv4 and non-TCP/UDP frames are rejected
 * early (return lines elided in this listing).
 */
2399 static inline uint16_t
2400 mxge_rx_csum(struct mbuf *m, int csum)
2402 struct ether_header *eh;
2406 eh = mtod(m, struct ether_header *);
2408 /* only deal with IPv4 TCP & UDP for now */
2409 if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2411 ip = (struct ip *)(eh + 1);
2412 if (__predict_false(ip->ip_p != IPPROTO_TCP &&
2413 ip->ip_p != IPPROTO_UDP))
/* fold the pseudo-header into the firmware's whole-frame checksum */
2416 c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2417 htonl(ntohs(csum) + ntohs(ip->ip_len) +
2418 - (ip->ip_hl << 2) + ip->ip_p));
/*
 * Strip an 802.1Q header from a received frame and record the tag
 * out-of-band (pkthdr.ether_vtag or an m_tag, depending on API).
 * Also adjusts *csum: the firmware checksummed everything after the
 * first 14 bytes, so the 4 encapsulation bytes must be subtracted
 * (one's-complement arithmetic below) for the checksum to remain
 * valid for the de-encapsulated frame.
 */
2427 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2429 struct ether_vlan_header *evl;
2430 struct ether_header *eh;
2433 evl = mtod(m, struct ether_vlan_header *);
2434 eh = mtod(m, struct ether_header *);
2437 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
2438 * after what the firmware thought was the end of the ethernet
2442 /* put checksum into host byte order */
2443 *csum = ntohs(*csum);
2444 partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2445 (*csum) += ~partial;
2446 (*csum) += ((*csum) < ~partial);
/* fold carries twice to get back to 16 bits */
2447 (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2448 (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2450 /* restore checksum to network byte order;
2451 later consumers expect this */
2452 *csum = htons(*csum);
2455 #ifdef MXGE_NEW_VLAN_API
2456 m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
/* legacy API: attach the tag as an m_tag instead */
2460 mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2464 VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2465 m_tag_prepend(m, mtag);
2469 m->m_flags |= M_VLANTAG;
2472 * Remove the 802.1q header by copying the Ethernet
2473 * addresses over it and adjusting the beginning of
2474 * the data in the mbuf. The encapsulated Ethernet
2475 * type field is already in place.
2477 bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2478 ETHER_HDR_LEN - ETHER_TYPE_LEN);
2479 m_adj(m, ETHER_VLAN_ENCAP_LEN);
/*
 * Process one completed receive on the big (cluster) ring: replace the
 * filled mbuf with a fresh one (dropping the frame if allocation
 * fails, so the ring never loses a buffer), swap DMA maps, fix up the
 * packet header, strip any VLAN tag, validate the hardware checksum
 * (optionally feeding TCP frames to LRO), attach a flowid for RSS, and
 * pass the frame to if_input.
 */
2484 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2489 struct ether_header *eh;
2491 bus_dmamap_t old_map;
2493 uint16_t tcpudp_csum;
2498 idx = rx->cnt & rx->mask;
2499 rx->cnt += rx->nbufs;
2500 /* save a pointer to the received mbuf */
2501 m = rx->info[idx].m;
2502 /* try to replace the received mbuf */
2503 if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2504 /* drop the frame -- the old mbuf is re-cycled */
2509 /* unmap the received buffer */
2510 old_map = rx->info[idx].map;
2511 bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2512 bus_dmamap_unload(rx->dmat, old_map);
2514 /* swap the bus_dmamap_t's */
2515 rx->info[idx].map = rx->extra_map;
2516 rx->extra_map = old_map;
2518 /* mcp implicitly skips 1st 2 bytes so that packet is properly
2520 m->m_data += MXGEFW_PAD;
2522 m->m_pkthdr.rcvif = ifp;
2523 m->m_len = m->m_pkthdr.len = len;
2525 eh = mtod(m, struct ether_header *);
2526 if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2527 mxge_vlan_tag_remove(m, &csum);
2529 /* if the checksum is valid, mark it in the mbuf header */
2530 if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2531 if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2533 /* otherwise, it was a UDP frame, or a TCP frame which
2534 we could not do LRO on. Tell the stack that the
2536 m->m_pkthdr.csum_data = 0xffff;
2537 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2539 /* flowid only valid if RSS hashing is enabled */
2540 if (sc->num_slices > 1) {
2541 m->m_pkthdr.flowid = (ss - sc->ss);
2542 m->m_flags |= M_FLOWID;
2544 /* pass the frame up the stack */
2545 (*ifp->if_input)(ifp, m);
/*
 * Small-ring counterpart of mxge_rx_done_big(): identical replace/
 * swap/strip/checksum/LRO/if_input sequence, but refills via
 * mxge_get_buf_small() on the small receive ring.
 */
2549 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2553 struct ether_header *eh;
2556 bus_dmamap_t old_map;
2558 uint16_t tcpudp_csum;
2563 idx = rx->cnt & rx->mask;
2565 /* save a pointer to the received mbuf */
2566 m = rx->info[idx].m;
2567 /* try to replace the received mbuf */
2568 if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2569 /* drop the frame -- the old mbuf is re-cycled */
2574 /* unmap the received buffer */
2575 old_map = rx->info[idx].map;
2576 bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2577 bus_dmamap_unload(rx->dmat, old_map);
2579 /* swap the bus_dmamap_t's */
2580 rx->info[idx].map = rx->extra_map;
2581 rx->extra_map = old_map;
2583 /* mcp implicitly skips 1st 2 bytes so that packet is properly
2585 m->m_data += MXGEFW_PAD;
2587 m->m_pkthdr.rcvif = ifp;
2588 m->m_len = m->m_pkthdr.len = len;
2590 eh = mtod(m, struct ether_header *);
2591 if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2592 mxge_vlan_tag_remove(m, &csum);
2594 /* if the checksum is valid, mark it in the mbuf header */
2595 if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2596 if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2598 /* otherwise, it was a UDP frame, or a TCP frame which
2599 we could not do LRO on. Tell the stack that the
2601 m->m_pkthdr.csum_data = 0xffff;
2602 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2604 /* flowid only valid if RSS hashing is enabled */
2605 if (sc->num_slices > 1) {
2606 m->m_pkthdr.flowid = (ss - sc->ss);
2607 m->m_flags |= M_FLOWID;
2609 /* pass the frame up the stack */
2610 (*ifp->if_input)(ifp, m);
/*
 * Drain the slice's receive-completion ring: dispatch each entry to
 * the small or big handler based on length (MHLEN - MXGEFW_PAD
 * threshold), zeroing the entry to mark it consumed. Processing is
 * capped at half the ring per call to bound interrupt-context work,
 * and any LRO merge queues left active are flushed at the end.
 */
2614 mxge_clean_rx_done(struct mxge_slice_state *ss)
2616 mxge_rx_done_t *rx_done = &ss->rx_done;
2622 while (rx_done->entry[rx_done->idx].length != 0) {
2623 length = ntohs(rx_done->entry[rx_done->idx].length);
2624 rx_done->entry[rx_done->idx].length = 0;
2625 checksum = rx_done->entry[rx_done->idx].checksum;
2626 if (length <= (MHLEN - MXGEFW_PAD))
2627 mxge_rx_done_small(ss, length, checksum);
2629 mxge_rx_done_big(ss, length, checksum);
2631 rx_done->idx = rx_done->cnt & rx_done->mask;
2633 /* limit potential for livelock */
2634 if (__predict_false(++limit > rx_done->mask / 2))
/* flush whatever LRO state accumulated during this pass */
2638 while (!SLIST_EMPTY(&ss->lro_active)) {
2639 struct lro_entry *lro = SLIST_FIRST(&ss->lro_active);
2640 SLIST_REMOVE_HEAD(&ss->lro_active, next);
2641 mxge_lro_flush(ss, lro);
/*
 * Reap completed transmits up to the firmware's packet-done index
 * (mcp_idx): free mbufs and unload DMA maps for descriptors the NIC
 * has consumed, accumulate byte/multicast stats, clear OACTIVE when at
 * least 3/4 of the ring is free (restarting the send path), and — in
 * the multi-queue build — tell the NIC to stop polling an idle queue.
 */
2648 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2659 while (tx->pkt_done != mcp_idx) {
2660 idx = tx->done & tx->mask;
2662 m = tx->info[idx].m;
2663 /* mbuf and DMA map only attached to the first
2666 ss->obytes += m->m_pkthdr.len;
2667 if (m->m_flags & M_MCAST)
2670 tx->info[idx].m = NULL;
2671 map = tx->info[idx].map;
2672 bus_dmamap_unload(tx->dmat, map);
/* the .flag marker delimits packet boundaries in the ring */
2675 if (tx->info[idx].flag) {
2676 tx->info[idx].flag = 0;
2681 /* If we have space, clear IFF_OACTIVE to tell the stack that
2682 its OK to send packets */
2683 #ifdef IFNET_BUF_RING
2684 flags = &ss->if_drv_flags;
2686 flags = &ifp->if_drv_flags;
2688 mtx_lock(&ss->tx.mtx);
2689 if ((*flags) & IFF_DRV_OACTIVE &&
2690 tx->req - tx->done < (tx->mask + 1)/4) {
2691 *(flags) &= ~IFF_DRV_OACTIVE;
2693 mxge_start_locked(ss);
2695 #ifdef IFNET_BUF_RING
2696 if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2697 /* let the NIC stop polling this queue, since there
2698 * are no more transmits pending */
2699 if (tx->req == tx->done) {
2701 tx->queue_active = 0;
2707 mtx_unlock(&ss->tx.mtx);
/*
 * Compliance-byte bit -> ifmedia type tables used by
 * mxge_media_probe(). A zero flag means FreeBSD has no corresponding
 * IFM_* media type for that transceiver class.
 */
/* XFP modules: bits of the XFP 10GbE compliance byte */
2711 static struct mxge_media_type mxge_xfp_media_types[] =
2713 {IFM_10G_CX4, 0x7f, "10GBASE-CX4 (module)"},
2714 {IFM_10G_SR, (1 << 7), "10GBASE-SR"},
2715 {IFM_10G_LR, (1 << 6), "10GBASE-LR"},
2716 {0, (1 << 5), "10GBASE-ER"},
2717 {IFM_10G_LRM, (1 << 4), "10GBASE-LRM"},
2718 {0, (1 << 3), "10GBASE-SW"},
2719 {0, (1 << 2), "10GBASE-LW"},
2720 {0, (1 << 1), "10GBASE-EW"},
2721 {0, (1 << 0), "Reserved"}
/* SFP+ modules: bits of the SFP+ compliance byte */
2723 static struct mxge_media_type mxge_sfp_media_types[] =
2725 {0, (1 << 7), "Reserved"},
2726 {IFM_10G_LRM, (1 << 6), "10GBASE-LRM"},
2727 {IFM_10G_LR, (1 << 5), "10GBASE-LR"},
2728 {IFM_10G_SR, (1 << 4), "10GBASE-SR"}
/*
 * Record a detected media type: OR it into sc->media_flags, then
 * register and select the combined media word with ifmedia.
 */
2732 mxge_set_media(mxge_softc_t *sc, int type)
2734 sc->media_flags |= type;
2735 ifmedia_add(&sc->media, sc->media_flags, 0, NULL);
2736 ifmedia_set(&sc->media, sc->media_flags);
2741 * Determine the media type for a NIC. Some XFPs will identify
2742 * themselves only when their link is up, so this is initiated via a
2743 * link up interrupt. However, this can potentially take up to
2744 * several milliseconds, so it is run via the watchdog routine, rather
2745 * than in the interrupt handler itself. This need only be done
2746 * once, not each time the link is up.
/*
 * Flow: (1) bail if media already set; (2) find the character after
 * the 3rd '-' in the EEPROM product code to classify the cage (CX4,
 * Quad Ribbon Fiber, XFP 'R', or SFP+ 'S'/'2S'); (3) for XFP/SFP+,
 * issue firmware I2C commands to read the module's compliance byte,
 * polling up to ~50ms while the firmware caches it; (4) map the byte
 * through the appropriate media table via mxge_set_media().
 */
2749 mxge_media_probe(mxge_softc_t *sc)
2754 struct mxge_media_type *mxge_media_types = NULL;
2755 int i, err, ms, mxge_media_type_entries;
2758 sc->need_media_probe = 0;
2760 /* if we've already set a media type, we're done */
2761 if (sc->media_flags != (IFM_ETHER | IFM_AUTO))
2765 * parse the product code to deterimine the interface type
2766 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2767 * after the 3rd dash in the driver's cached copy of the
2768 * EEPROM's product code string.
2770 ptr = sc->product_code_string;
2772 device_printf(sc->dev, "Missing product code\n");
2775 for (i = 0; i < 3; i++, ptr++) {
2776 ptr = index(ptr, '-');
2778 device_printf(sc->dev,
2779 "only %d dashes in PC?!?\n", i);
/* CX4 needs no module probe */
2785 mxge_set_media(sc, IFM_10G_CX4);
2788 else if (*ptr == 'Q') {
2789 /* -Q is Quad Ribbon Fiber */
2790 device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2791 /* FreeBSD has no media type for Quad ribbon fiber */
/* 'R' cage: XFP module; select the XFP table and compliance byte */
2797 mxge_media_types = mxge_xfp_media_types;
2798 mxge_media_type_entries =
2799 sizeof (mxge_xfp_media_types) /
2800 sizeof (mxge_xfp_media_types[0]);
2801 byte = MXGE_XFP_COMPLIANCE_BYTE;
2805 if (*ptr == 'S' || *(ptr +1) == 'S') {
2806 /* -S or -2S is SFP+ */
2807 mxge_media_types = mxge_sfp_media_types;
2808 mxge_media_type_entries =
2809 sizeof (mxge_sfp_media_types) /
2810 sizeof (mxge_sfp_media_types[0]);
2815 if (mxge_media_types == NULL) {
2816 device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2821 * At this point we know the NIC has an XFP cage, so now we
2822 * try to determine what is in the cage by using the
2823 * firmware's XFP I2C commands to read the XFP 10GbE compilance
2824 * register. We read just one byte, which may take over
2828 cmd.data0 = 0; /* just fetch 1 byte, not all 256 */
2830 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2831 if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
2832 device_printf(sc->dev, "failed to read XFP\n");
2834 if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
2835 device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
2837 if (err != MXGEFW_CMD_OK) {
2841 /* now we wait for the data to be cached */
2843 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
/* poll while the firmware is still fetching the byte (EBUSY) */
2844 for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
2847 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2849 if (err != MXGEFW_CMD_OK) {
2850 device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
2851 cage_type, err, ms);
/* entry 0 is special: exact-match (CX4 module), not a bit test */
2855 if (cmd.data0 == mxge_media_types[0].bitmask) {
2857 device_printf(sc->dev, "%s:%s\n", cage_type,
2858 mxge_media_types[0].name);
2859 mxge_set_media(sc, IFM_10G_CX4);
2862 for (i = 1; i < mxge_media_type_entries; i++) {
2863 if (cmd.data0 & mxge_media_types[i].bitmask) {
2865 device_printf(sc->dev, "%s:%s\n",
2867 mxge_media_types[i].name);
2869 mxge_set_media(sc, mxge_media_types[i].flag);
2873 device_printf(sc->dev, "%s media 0x%x unknown\n", cage_type,
/*
 * Per-slice interrupt handler (also services the shared legacy/MSI
 * interrupt via slice 0).  Drains TX completions and the RX done ring,
 * re-arms the NIC by writing irq_claim, and mirrors firmware link and
 * error statistics into the softc — the latter on the first slice only.
 * NOTE(review): this listing is a sampled extract; statements (braces,
 * returns, #else arms) are missing between many of the lines below.
 */
2880 mxge_intr(void *arg)
2882 	struct mxge_slice_state *ss = arg;
2883 	mxge_softc_t *sc = ss->sc;
2884 	mcp_irq_data_t *stats = ss->fw_stats;
2885 	mxge_tx_ring_t *tx = &ss->tx;
2886 	mxge_rx_done_t *rx_done = &ss->rx_done;
2887 	uint32_t send_done_count;
2891 #ifndef IFNET_BUF_RING
2892 	/* an interrupt on a non-zero slice is implicitly valid
2893 	   since MSI-X irqs are not shared */
2895 		mxge_clean_rx_done(ss);
2896 		*ss->irq_claim = be32toh(3);
2901 	/* make sure the DMA has finished */
2902 	if (!stats->valid) {
2905 	valid = stats->valid;
2907 	if (sc->legacy_irq) {
2908 		/* lower legacy IRQ */
2909 		*sc->irq_deassert = 0;
2910 		if (!mxge_deassert_wait)
2911 			/* don't wait for conf. that irq is low */
2917 	/* loop while waiting for legacy irq deassertion */
2919 		/* check for transmit completes and receives */
2920 		send_done_count = be32toh(stats->send_done_count);
2921 		while ((send_done_count != tx->pkt_done) ||
2922 		       (rx_done->entry[rx_done->idx].length != 0)) {
2923 			if (send_done_count != tx->pkt_done)
2924 				mxge_tx_done(ss, (int)send_done_count);
2925 			mxge_clean_rx_done(ss);
2926 			send_done_count = be32toh(stats->send_done_count);
2928 		if (sc->legacy_irq && mxge_deassert_wait)
		/* re-check the firmware-owned valid byte through a volatile
		   read so the compiler cannot cache it across the loop */
2930 	} while (*((volatile uint8_t *) &stats->valid));
2932 	/* fw link & error stats meaningful only on the first slice */
2933 	if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
2934 		if (sc->link_state != stats->link_up) {
2935 			sc->link_state = stats->link_up;
2936 			if (sc->link_state) {
2937 				if_link_state_change(sc->ifp, LINK_STATE_UP);
2939 					device_printf(sc->dev, "link up\n");
2941 				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
2943 					device_printf(sc->dev, "link down\n");
			/* a link transition warrants re-probing the media type */
2945 			sc->need_media_probe = 1;
2947 		if (sc->rdma_tags_available !=
2948 		    be32toh(stats->rdma_tags_available)) {
2949 			sc->rdma_tags_available =
2950 				be32toh(stats->rdma_tags_available);
2951 			device_printf(sc->dev, "RDMA timed out! %d tags "
2952 				      "left\n", sc->rdma_tags_available);
2955 		if (stats->link_down) {
2956 			sc->down_cnt += stats->link_down;
2958 			if_link_state_change(sc->ifp, LINK_STATE_DOWN);
2962 	/* check to see if we have rx token to pass back */
2964 		*ss->irq_claim = be32toh(3);
	/* second claim word re-arms the interrupt */
2965 	*(ss->irq_claim + 1) = be32toh(3);
/*
 * if_init handler; body not present in this sampled extract.
 * NOTE(review): presumably brings the interface up — confirm against
 * the full source before relying on this.
 */
2969 mxge_init(void *arg)
/*
 * Release every mbuf still attached to one slice: the LRO free list,
 * both receive rings (big and small), and — on slices that own one —
 * the transmit ring.  DMA maps are unloaded before their mbufs are
 * freed, and info[].m slots are NULLed so a later free pass is safe.
 * NOTE(review): sampled extract; the `continue;` after each NULL test
 * and the closing braces are among the dropped lines.
 */
2976 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
2978 	struct lro_entry *lro_entry;
	/* drain and free the pre-allocated LRO descriptors */
2981 	while (!SLIST_EMPTY(&ss->lro_free)) {
2982 		lro_entry = SLIST_FIRST(&ss->lro_free);
2983 		SLIST_REMOVE_HEAD(&ss->lro_free, next);
2984 		free(lro_entry, M_DEVBUF);
2987 	for (i = 0; i <= ss->rx_big.mask; i++) {
2988 		if (ss->rx_big.info[i].m == NULL)
2990 		bus_dmamap_unload(ss->rx_big.dmat,
2991 				  ss->rx_big.info[i].map);
2992 		m_freem(ss->rx_big.info[i].m);
2993 		ss->rx_big.info[i].m = NULL;
2996 	for (i = 0; i <= ss->rx_small.mask; i++) {
2997 		if (ss->rx_small.info[i].m == NULL)
2999 		bus_dmamap_unload(ss->rx_small.dmat,
3000 				  ss->rx_small.info[i].map);
3001 		m_freem(ss->rx_small.info[i].m);
3002 		ss->rx_small.info[i].m = NULL;
3005 	/* transmit ring used only on the first slice */
3006 	if (ss->tx.info == NULL)
3009 	for (i = 0; i <= ss->tx.mask; i++) {
3010 		ss->tx.info[i].flag = 0;
3011 		if (ss->tx.info[i].m == NULL)
3013 		bus_dmamap_unload(ss->tx.dmat,
3014 				  ss->tx.info[i].map);
3015 		m_freem(ss->tx.info[i].m);
3016 		ss->tx.info[i].m = NULL;
/* Free the mbufs of every slice in turn. */
3021 mxge_free_mbufs(mxge_softc_t *sc)
3025 	for (slice = 0; slice < sc->num_slices; slice++)
3026 		mxge_free_slice_mbufs(&sc->ss[slice]);
/*
 * Tear down one slice's ring bookkeeping: the rx_done DMA area, the tx
 * request/segment scratch buffers, the rx shadow rings, and finally
 * the per-slot DMA maps plus their parent tags.  Safe to call on a
 * partially-initialized slice — every branch NULL-checks first and
 * clears the pointer afterwards.
 * NOTE(review): sampled extract; closing braces between sections are
 * among the dropped lines.
 */
3030 mxge_free_slice_rings(struct mxge_slice_state *ss)
3035 	if (ss->rx_done.entry != NULL)
3036 		mxge_dma_free(&ss->rx_done.dma);
3037 	ss->rx_done.entry = NULL;
3039 	if (ss->tx.req_bytes != NULL)
3040 		free(ss->tx.req_bytes, M_DEVBUF);
3041 	ss->tx.req_bytes = NULL;
3043 	if (ss->tx.seg_list != NULL)
3044 		free(ss->tx.seg_list, M_DEVBUF);
3045 	ss->tx.seg_list = NULL;
3047 	if (ss->rx_small.shadow != NULL)
3048 		free(ss->rx_small.shadow, M_DEVBUF);
3049 	ss->rx_small.shadow = NULL;
3051 	if (ss->rx_big.shadow != NULL)
3052 		free(ss->rx_big.shadow, M_DEVBUF);
3053 	ss->rx_big.shadow = NULL;
	/* per-slot maps must be destroyed before their parent tag */
3055 	if (ss->tx.info != NULL) {
3056 		if (ss->tx.dmat != NULL) {
3057 			for (i = 0; i <= ss->tx.mask; i++) {
3058 				bus_dmamap_destroy(ss->tx.dmat,
3059 						   ss->tx.info[i].map);
3061 			bus_dma_tag_destroy(ss->tx.dmat);
3063 		free(ss->tx.info, M_DEVBUF);
3067 	if (ss->rx_small.info != NULL) {
3068 		if (ss->rx_small.dmat != NULL) {
3069 			for (i = 0; i <= ss->rx_small.mask; i++) {
3070 				bus_dmamap_destroy(ss->rx_small.dmat,
3071 						   ss->rx_small.info[i].map);
			/* the extra_map used for buffer swapping */
3073 			bus_dmamap_destroy(ss->rx_small.dmat,
3074 					   ss->rx_small.extra_map);
3075 			bus_dma_tag_destroy(ss->rx_small.dmat);
3077 		free(ss->rx_small.info, M_DEVBUF);
3079 	ss->rx_small.info = NULL;
3081 	if (ss->rx_big.info != NULL) {
3082 		if (ss->rx_big.dmat != NULL) {
3083 			for (i = 0; i <= ss->rx_big.mask; i++) {
3084 				bus_dmamap_destroy(ss->rx_big.dmat,
3085 						   ss->rx_big.info[i].map);
3087 			bus_dmamap_destroy(ss->rx_big.dmat,
3088 					   ss->rx_big.extra_map);
3089 			bus_dma_tag_destroy(ss->rx_big.dmat);
3091 		free(ss->rx_big.info, M_DEVBUF);
3093 	ss->rx_big.info = NULL;
/* Free the ring resources of every slice in turn. */
3097 mxge_free_rings(mxge_softc_t *sc)
3101 	for (slice = 0; slice < sc->num_slices; slice++)
3102 		mxge_free_slice_rings(&sc->ss[slice]);
/*
 * Allocate one slice's host-side ring state: rx shadow and info rings,
 * rx busdma tags/maps (small and big), and — for slices that transmit —
 * the tx request copy block, segment list, info ring, tag and maps.
 * Ring sizes are powers of two, so index masks are size-1.
 * NOTE(review): sampled extract; error gotos, #else arms of the
 * MXGE_VIRT_JUMBOS conditionals, and closing braces were dropped.
 */
3106 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3107 		       int tx_ring_entries)
3109 	mxge_softc_t *sc = ss->sc;
3115 	/* allocate per-slice receive resources */
3117 	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
	/* rx_done holds completions from both rx rings, hence 2x */
3118 	ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3120 	/* allocate the rx shadow rings */
3121 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3122 	ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3123 	if (ss->rx_small.shadow == NULL)
3126 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3127 	ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3128 	if (ss->rx_big.shadow == NULL)
3131 	/* allocate the rx host info rings */
3132 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3133 	ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3134 	if (ss->rx_small.info == NULL)
3137 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3138 	ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3139 	if (ss->rx_big.info == NULL)
3142 	/* allocate the rx busdma resources */
3143 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3145 				 4096,			/* boundary */
3146 				 BUS_SPACE_MAXADDR,	/* low */
3147 				 BUS_SPACE_MAXADDR,	/* high */
3148 				 NULL, NULL,		/* filter */
3149 				 MHLEN,			/* maxsize */
3151 				 MHLEN,			/* maxsegsize */
3152 				 BUS_DMA_ALLOCNOW,	/* flags */
3153 				 NULL, NULL,		/* lock */
3154 				 &ss->rx_small.dmat);	/* tag */
3156 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3161 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3163 #if MXGE_VIRT_JUMBOS
3164 				 4096,			/* boundary */
3168 				 BUS_SPACE_MAXADDR,	/* low */
3169 				 BUS_SPACE_MAXADDR,	/* high */
3170 				 NULL, NULL,		/* filter */
3171 				 3*4096,		/* maxsize */
3172 #if MXGE_VIRT_JUMBOS
3174 				 4096,			/* maxsegsize*/
3177 				 MJUM9BYTES,		/* maxsegsize*/
3179 				 BUS_DMA_ALLOCNOW,	/* flags */
3180 				 NULL, NULL,		/* lock */
3181 				 &ss->rx_big.dmat);	/* tag */
3183 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3187 	for (i = 0; i <= ss->rx_small.mask; i++) {
3188 		err = bus_dmamap_create(ss->rx_small.dmat, 0,
3189 					&ss->rx_small.info[i].map);
3191 			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
	/* spare map, swapped with a slot's map when refilling buffers */
3196 	err = bus_dmamap_create(ss->rx_small.dmat, 0,
3197 				&ss->rx_small.extra_map);
3199 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3204 	for (i = 0; i <= ss->rx_big.mask; i++) {
3205 		err = bus_dmamap_create(ss->rx_big.dmat, 0,
3206 					&ss->rx_big.info[i].map);
3208 			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
3213 	err = bus_dmamap_create(ss->rx_big.dmat, 0,
3214 				&ss->rx_big.extra_map);
3216 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3221 	/* now allocate TX resouces */
3223 #ifndef IFNET_BUF_RING
3224 	/* only use a single TX ring for now */
3225 	if (ss != ss->sc->ss)
3229 	ss->tx.mask = tx_ring_entries - 1;
3230 	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3233 	/* allocate the tx request copy block */
3235 		sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3236 	ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
3237 	if (ss->tx.req_bytes == NULL)
3239 	/* ensure req_list entries are aligned to 8 bytes */
3240 	ss->tx.req_list = (mcp_kreq_ether_send_t *)
3241 		((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
3243 	/* allocate the tx busdma segment list */
3244 	bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3245 	ss->tx.seg_list = (bus_dma_segment_t *)
3246 		malloc(bytes, M_DEVBUF, M_WAITOK);
3247 	if (ss->tx.seg_list == NULL)
3250 	/* allocate the tx host info ring */
3251 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
3252 	ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3253 	if (ss->tx.info == NULL)
3256 	/* allocate the tx busdma resources */
3257 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3259 				 sc->tx_boundary,	/* boundary */
3260 				 BUS_SPACE_MAXADDR,	/* low */
3261 				 BUS_SPACE_MAXADDR,	/* high */
3262 				 NULL, NULL,		/* filter */
3263 				 65536 + 256,		/* maxsize */
3264 				 ss->tx.max_desc - 2,	/* num segs */
3265 				 sc->tx_boundary,	/* maxsegsz */
3266 				 BUS_DMA_ALLOCNOW,	/* flags */
3267 				 NULL, NULL,		/* lock */
3268 				 &ss->tx.dmat);		/* tag */
3271 		device_printf(sc->dev, "Err %d allocating tx dmat\n",
3276 	/* now use these tags to setup dmamaps for each slot
3278 	for (i = 0; i <= ss->tx.mask; i++) {
3279 		err = bus_dmamap_create(ss->tx.dmat, 0,
3280 					&ss->tx.info[i].map);
3282 			device_printf(sc->dev, "Err %d  tx dmamap\n",
/*
 * Query the firmware for send-ring size, derive entry counts for tx
 * and rx, size the ifnet send queue accordingly, then allocate ring
 * state for every slice (freeing everything on partial failure).
 * NOTE(review): sampled extract; the error-check after the send-ring
 * query and the per-slice error branch were dropped.
 */
3292 mxge_alloc_rings(mxge_softc_t *sc)
3296 	int tx_ring_entries, rx_ring_entries;
3299 	/* get ring sizes */
3300 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3301 	tx_ring_size = cmd.data0;
3303 		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3307 	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3308 	rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3309 	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
3310 	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
3311 	IFQ_SET_READY(&sc->ifp->if_snd);
3313 	for (slice = 0; slice < sc->num_slices; slice++) {
3314 		err = mxge_alloc_slice_rings(&sc->ss[slice],
	/* unwind all slices if any allocation failed */
3323 	mxge_free_rings(sc);
/*
 * Pick receive buffer parameters for a given MTU: the cluster size to
 * allocate, the buffer size advertised to the firmware, and how many
 * buffers a frame spans.  Escalates from MCLBYTES to MJUMPAGESIZE to
 * 9k jumbo clusters as the padded frame size grows.
 * NOTE(review): sampled extract; the `*nbufs = 1; return;` statements
 * after the first two cases and the #else arm were dropped.
 */
3330 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
	/* worst-case on-wire size incl. VLAN tag and firmware pad */
3332 	int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3334 	if (bufsize < MCLBYTES) {
3335 		/* easy, everything fits in a single buffer */
3336 		*big_buf_size = MCLBYTES;
3337 		*cl_size = MCLBYTES;
3342 	if (bufsize < MJUMPAGESIZE) {
3343 		/* still easy, everything still fits in a single buffer */
3344 		*big_buf_size = MJUMPAGESIZE;
3345 		*cl_size = MJUMPAGESIZE;
3349 #if MXGE_VIRT_JUMBOS
3350 	/* now we need to use virtually contiguous buffers */
3351 	*cl_size = MJUM9BYTES;
3352 	*big_buf_size = 4096;
3353 	*nbufs = mtu / 4096 + 1;
3354 	/* needs to be a power of two, so round up */
3358 	*cl_size = MJUM9BYTES;
3359 	*big_buf_size = MJUM9BYTES;
/*
 * Bring one slice online: pre-allocate its LRO descriptor pool, fetch
 * the lanai (NIC SRAM) pointers for its send and receive rings from
 * firmware, then stock both receive rings with fresh mbufs.
 * NOTE(review): sampled extract; several assignments (e.g. ss->tx.lanai
 * target on line 3400's preceding line) and error returns were dropped.
 */
3365 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3370 	struct lro_entry *lro_entry;
3375 	slice = ss - sc->ss;
3377 	SLIST_INIT(&ss->lro_free);
3378 	SLIST_INIT(&ss->lro_active);
3380 	for (i = 0; i < sc->lro_cnt; i++) {
3381 		lro_entry = (struct lro_entry *)
3382 			malloc(sizeof (*lro_entry), M_DEVBUF,
3384 		if (lro_entry == NULL) {
3388 		SLIST_INSERT_HEAD(&ss->lro_free, lro_entry, next);
3390 	/* get the lanai pointers to the send and receive rings */
3393 #ifndef IFNET_BUF_RING
3394 	/* We currently only send from the first slice */
3398 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3400 		(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3401 	ss->tx.send_go = (volatile uint32_t *)
3402 		(sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3403 	ss->tx.send_stop = (volatile uint32_t *)
3404 	(sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3405 #ifndef IFNET_BUF_RING
3409 	err |= mxge_send_cmd(sc,
3410 			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3411 	ss->rx_small.lanai =
3412 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3414 	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3416 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3419 		device_printf(sc->dev,
3420 			      "failed to get ring sizes or locations\n");
3424 	/* stock receive rings */
3425 	for (i = 0; i <= ss->rx_small.mask; i++) {
3426 		map = ss->rx_small.info[i].map;
3427 		err = mxge_get_buf_small(ss, map, i);
3429 			device_printf(sc->dev, "alloced %d/%d smalls\n",
3430 				      i, ss->rx_small.mask + 1);
	/* poison big-ring shadow addresses so unused slots are obvious */
3434 	for (i = 0; i <= ss->rx_big.mask; i++) {
3435 		ss->rx_big.shadow[i].addr_low = 0xffffffff;
3436 		ss->rx_big.shadow[i].addr_high = 0xffffffff;
3438 	ss->rx_big.nbufs = nbufs;
3439 	ss->rx_big.cl_size = cl_size;
3440 	ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3441 		ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
	/* stride by nbufs: one cluster may back several ring slots */
3442 	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3443 		map = ss->rx_big.info[i].map;
3444 		err = mxge_get_buf_big(ss, map, i);
3446 			device_printf(sc->dev, "alloced %d/%d bigs\n",
3447 				      i, ss->rx_big.mask + 1);
/*
 * Bring the interface fully up: reset the NIC, program the RSS
 * indirection table when running multiple slices, negotiate buffer
 * sizes and MTU with the firmware, point it at the per-slice stats
 * DMA blocks, open each slice, and finally issue ETHERNET_UP and
 * mark the ifnet running.
 * NOTE(review): sampled extract; error gotos, the #else arm of the
 * stats-DMA loop, and several closing braces were dropped.
 */
3455 mxge_open(mxge_softc_t *sc)
3458 	int err, big_bytes, nbufs, slice, cl_size, i;
3460 	volatile uint8_t *itable;
3461 	struct mxge_slice_state *ss;
3463 	/* Copy the MAC address in case it was overridden */
3464 	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3466 	err = mxge_reset(sc, 1);
3468 		device_printf(sc->dev, "failed to reset\n");
3472 	if (sc->num_slices > 1) {
3473 		/* setup the indirection table */
3474 		cmd.data0 = sc->num_slices;
3475 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3478 		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3481 			device_printf(sc->dev,
3482 				      "failed to setup rss tables\n");
3486 		/* just enable an identity mapping */
3487 		itable = sc->sram + cmd.data0;
3488 		for (i = 0; i < sc->num_slices; i++)
3489 			itable[i] = (uint8_t)i;
3492 		cmd.data1 = mxge_rss_hash_type;
3493 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3495 			device_printf(sc->dev, "failed to enable slices\n");
3501 	mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3504 	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3506 	/* error is only meaningful if we're trying to set
3507 	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3508 	if (err && nbufs > 1) {
3509 		device_printf(sc->dev,
3510 			      "Failed to set alway-use-n to %d\n",
3514 	/* Give the firmware the mtu and the big and small buffer
3515 	   sizes. The firmware wants the big buf size to be a power
3516 	   of two. Luckily, FreeBSD's clusters are powers of two */
3517 	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3518 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3519 	cmd.data0 = MHLEN - MXGEFW_PAD;
3520 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3522 	cmd.data0 = big_bytes;
3523 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3526 		device_printf(sc->dev, "failed to setup params\n");
3530 	/* Now give him the pointer to the stats block */
3532 #ifdef IFNET_BUF_RING
3533 	     slice < sc->num_slices;
3538 		ss = &sc->ss[slice];
3540 			MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3542 			MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3543 		cmd.data2 = sizeof(struct mcp_irq_data);
	/* encode the slice number in the upper half of data2 */
3544 		cmd.data2 |= (slice << 16);
3545 		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
	/* fall back to the obsolete stats DMA command on old firmware */
3549 		bus = sc->ss->fw_stats_dma.bus_addr;
3550 		bus += offsetof(struct mcp_irq_data, send_done_count);
3551 		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3552 		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3553 		err = mxge_send_cmd(sc,
3554 				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3556 		/* Firmware cannot support multicast without STATS_DMA_V2 */
3557 		sc->fw_multicast_support = 0;
3559 		sc->fw_multicast_support = 1;
3563 		device_printf(sc->dev, "failed to setup params\n");
3567 	for (slice = 0; slice < sc->num_slices; slice++) {
3568 		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3570 			device_printf(sc->dev, "couldn't open slice %d\n",
3576 	/* Finally, start the firmware running */
3577 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3579 		device_printf(sc->dev, "Couldn't bring up link\n");
3582 #ifdef IFNET_BUF_RING
3583 	for (slice = 0; slice < sc->num_slices; slice++) {
3584 		ss = &sc->ss[slice];
3585 		ss->if_drv_flags |= IFF_DRV_RUNNING;
3586 		ss->if_drv_flags &= ~IFF_DRV_OACTIVE;
3589 	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
3590 	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
	/* arm the watchdog/stats callout */
3591 	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3597 	mxge_free_mbufs(sc);
/*
 * Bring the interface down: stop the tick callout, clear RUNNING on
 * the ifnet (and per-slice flags when buf_ring transmit is built),
 * send ETHERNET_DOWN, wait for the firmware's "down" interrupt to
 * bump down_cnt, then release all mbufs.
 * NOTE(review): sampled extract; the retry/DELAY loop around the
 * down-irq wait is partially dropped.
 */
3603 mxge_close(mxge_softc_t *sc)
3606 	int err, old_down_cnt;
3607 #ifdef IFNET_BUF_RING
3608 	struct mxge_slice_state *ss;
3612 	callout_stop(&sc->co_hdl);
3613 #ifdef IFNET_BUF_RING
3614 	for (slice = 0; slice < sc->num_slices; slice++) {
3615 		ss = &sc->ss[slice];
3616 		ss->if_drv_flags &= ~IFF_DRV_RUNNING;
3619 	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
3620 	old_down_cnt = sc->down_cnt;
3622 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3624 		device_printf(sc->dev, "Couldn't bring down link\n");
3626 	if (old_down_cnt == sc->down_cnt) {
3627 		/* wait for down irq */
3628 		DELAY(10 * sc->intr_coal_delay);
3631 	if (old_down_cnt == sc->down_cnt) {
3632 		device_printf(sc->dev, "never got down irq\n");
3635 	mxge_free_mbufs(sc);
3641 mxge_setup_cfg_space(mxge_softc_t *sc)
3643 device_t dev = sc->dev;
3645 uint16_t cmd, lnk, pectl;
3647 /* find the PCIe link width and set max read request to 4KB*/
3648 if (pci_find_extcap(dev, PCIY_EXPRESS, ®) == 0) {
3649 lnk = pci_read_config(dev, reg + 0x12, 2);
3650 sc->link_width = (lnk >> 4) & 0x3f;
3652 pectl = pci_read_config(dev, reg + 0x8, 2);
3653 pectl = (pectl & ~0x7000) | (5 << 12);
3654 pci_write_config(dev, reg + 0x8, pectl, 2);
3657 /* Enable DMA and Memory space access */
3658 pci_enable_busmaster(dev);
3659 cmd = pci_read_config(dev, PCIR_COMMAND, 2);
3660 cmd |= PCIM_CMD_MEMEN;
3661 pci_write_config(dev, PCIR_COMMAND, cmd, 2);
/*
 * Read the NIC's reboot-status register through the vendor-specific
 * PCI capability: enable the capability's read32 window mode, point it
 * at register 0xfffffff0, and return the 32-bit value read back.
 * Returns (uint32_t)-1 when the vendor capability cannot be found.
 */
3665 mxge_read_reboot(mxge_softc_t *sc)
3667 	device_t dev = sc->dev;
3670 	/* find the vendor specific offset */
3671 	if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
3672 		device_printf(sc->dev,
3673 			      "could not find vendor specific offset\n");
3674 		return (uint32_t)-1;
3676 	/* enable read32 mode */
3677 	pci_write_config(dev, vs + 0x10, 0x3, 1);
3678 	/* tell NIC which register to read */
3679 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3680 	return (pci_read_config(dev, vs + 0x14, 4));
/*
 * Recover from a watchdog-detected hang.  Detects a NIC reboot by the
 * loss of PCI config state (command register all-ones means the device
 * vanished; a cleared busmaster bit means it rebooted), restores config
 * space and re-opens the interface.  If the NIC did NOT reboot, dumps
 * the stuck slice's tx ring state instead of resetting.
 * NOTE(review): sampled extract; the 100ms retry loop and the final
 * return are partially dropped.
 */
3684 mxge_watchdog_reset(mxge_softc_t *sc, int slice)
3686 	struct pci_devinfo *dinfo;
3694 	device_printf(sc->dev, "Watchdog reset!\n");
3697 	 * check to see if the NIC rebooted.  If it did, then all of
3698 	 * PCI config space has been reset, and things like the
3699 	 * busmaster bit will be zero.  If this is the case, then we
3700 	 * must restore PCI config space before the NIC can be used
3703 	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3704 	if (cmd == 0xffff) {
3706 		 * maybe the watchdog caught the NIC rebooting; wait
3707 		 * up to 100ms for it to finish.  If it does not come
3708 		 * back, then give up
3711 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3712 		if (cmd == 0xffff) {
3713 			device_printf(sc->dev, "NIC disappeared!\n");
3717 	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3718 		/* print the reboot status */
3719 		reboot = mxge_read_reboot(sc);
3720 		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3722 		/* restore PCI configuration space */
3723 		dinfo = device_get_ivars(sc->dev);
3724 		pci_cfg_restore(sc->dev, dinfo);
3726 		/* and redo any changes we made to our config space */
3727 		mxge_setup_cfg_space(sc);
3729 		if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING) {
3731 			err = mxge_open(sc);
	/* NIC alive but hung: log diagnostics and leave it alone */
3734 		tx = &sc->ss[slice].tx;
3735 		device_printf(sc->dev,
3736 			      "NIC did not reboot, slice %d ring state:\n",
3738 		device_printf(sc->dev,
3739 			      "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3740 			      tx->req, tx->done, tx->queue_active);
3741 		device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3742 			      tx->activate, tx->deactivate);
3743 		device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3745 			      be32toh(sc->ss->fw_stats->send_done_count));
3746 		device_printf(sc->dev, "not resetting\n");
/*
 * Periodic transmit-hang check, run from the tick callout.  A slice is
 * considered stuck when it has outstanding requests, no completion
 * progress since the previous tick, and the pause-frame counter also
 * did not move (pause frames legitimately block xmits, so that case
 * only warns).  Also re-probes media when an interrupt requested it.
 * NOTE(review): sampled extract; the loop header's init clause and the
 * tx pointer assignment are among the dropped lines.
 */
3752 mxge_watchdog(mxge_softc_t *sc)
3755 	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3758 	/* see if we have outstanding transmits, which
3759 	   have been pending for more than mxge_ticks */
3761 #ifdef IFNET_BUF_RING
3762 	     (i < sc->num_slices) && (err == 0);
3764 	     (i < 1) && (err == 0);
3768 		if (tx->req != tx->done &&
3769 		    tx->watchdog_req != tx->watchdog_done &&
3770 		    tx->done == tx->watchdog_done) {
3771 			/* check for pause blocking before resetting */
3772 			if (tx->watchdog_rx_pause == rx_pause)
3773 				err = mxge_watchdog_reset(sc, i);
3775 				device_printf(sc->dev, "Flow control blocking "
3776 					      "xmits, check link partner\n");
		/* snapshot state for comparison on the next tick */
3779 		tx->watchdog_req = tx->req;
3780 		tx->watchdog_done = tx->done;
3781 		tx->watchdog_rx_pause = rx_pause;
3784 	if (sc->need_media_probe)
3785 		mxge_media_probe(sc);
/*
 * Aggregate the per-slice software counters into the shared ifnet
 * statistics.  With IFNET_BUF_RING builds, per-slice byte, multicast
 * and drop counters are summed as well.
 */
3790 mxge_update_stats(mxge_softc_t *sc)
3792 	struct mxge_slice_state *ss;
3793 	u_long ipackets = 0;
3794 	u_long opackets = 0;
3795 #ifdef IFNET_BUF_RING
3803 	for (slice = 0; slice < sc->num_slices; slice++) {
3804 		ss = &sc->ss[slice];
3805 		ipackets += ss->ipackets;
3806 		opackets += ss->opackets;
3807 #ifdef IFNET_BUF_RING
3808 		obytes += ss->obytes;
3809 		omcasts += ss->omcasts;
3810 		odrops += ss->tx.br->br_drops;
3812 		oerrors += ss->oerrors;
	/* publish the totals on the ifnet */
3814 	sc->ifp->if_ipackets = ipackets;
3815 	sc->ifp->if_opackets = opackets;
3816 #ifdef IFNET_BUF_RING
3817 	sc->ifp->if_obytes = obytes;
3818 	sc->ifp->if_omcasts = omcasts;
3819 	sc->ifp->if_snd.ifq_drops = odrops;
3821 	sc->ifp->if_oerrors = oerrors;
/*
 * Callout handler run every mxge_ticks: refresh aggregate statistics
 * every tick and run the transmit watchdog every 5th tick (countdown
 * of 4), then re-arm itself.
 */
3825 mxge_tick(void *arg)
3827 	mxge_softc_t *sc = arg;
3830 	/* aggregate stats from different slices */
3831 	mxge_update_stats(sc);
3832 	if (!sc->watchdog_countdown) {
3833 		err = mxge_watchdog(sc);
3834 		sc->watchdog_countdown = 4;
3836 	sc->watchdog_countdown--;
	/* reschedule ourselves */
3838 	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
/*
 * ifmedia change callback; body not present in this sampled extract.
 * NOTE(review): presumably rejects manual media changes — confirm
 * against the full source.
 */
3843 mxge_media_change(struct ifnet *ifp)
/*
 * Validate and apply a new MTU.  The frame size (MTU + Ethernet +
 * VLAN headers) must not exceed the hardware maximum or fall below
 * the 60-byte minimum.  A running interface is closed and re-opened;
 * on re-open failure the old MTU is restored and open retried.
 */
3849 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3851 	struct ifnet *ifp = sc->ifp;
3852 	int real_mtu, old_mtu;
3856 	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3857 	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
3859 	mtx_lock(&sc->driver_mtx);
3860 	old_mtu = ifp->if_mtu;
3862 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3864 		err = mxge_open(sc);
	/* open failed: roll back to the previous MTU */
3866 			ifp->if_mtu = old_mtu;
3868 			(void) mxge_open(sc);
3871 	mtx_unlock(&sc->driver_mtx);
/*
 * ifmedia status callback: report link validity/activity from the
 * cached link_state, and advertise autoselect with full duplex when
 * the link is up.
 */
3876 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3878 	mxge_softc_t *sc = ifp->if_softc;
3883 	ifmr->ifm_status = IFM_AVALID;
3884 	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
3885 	ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
3886 	ifmr->ifm_active |= sc->link_state ? IFM_FDX : 0;
/*
 * ifnet ioctl handler.  Dispatches MTU changes, interface up/down,
 * multicast list updates, capability toggles (TXCSUM/RXCSUM, TSO4 —
 * which requires TXCSUM — LRO, VLAN tagging) and media requests;
 * everything else falls through to ether_ioctl().
 * NOTE(review): sampled extract; the switch statement, case labels
 * and break statements are among the dropped lines.
 */
3890 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
3892 	mxge_softc_t *sc = ifp->if_softc;
3893 	struct ifreq *ifr = (struct ifreq *)data;
3900 		err = ether_ioctl(ifp, command, data);
3904 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
3908 		mtx_lock(&sc->driver_mtx);
3910 			mtx_unlock(&sc->driver_mtx);
3913 		if (ifp->if_flags & IFF_UP) {
3914 			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
3915 				err = mxge_open(sc);
3917 				/* take care of promiscuous and allmulti
				   flags while already running */
3919 				mxge_change_promisc(sc,
3920 						    ifp->if_flags & IFF_PROMISC);
3921 				mxge_set_multicast_list(sc);
3924 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3928 		mtx_unlock(&sc->driver_mtx);
3933 		mtx_lock(&sc->driver_mtx);
3934 		mxge_set_multicast_list(sc);
3935 		mtx_unlock(&sc->driver_mtx);
3939 		mtx_lock(&sc->driver_mtx);
	/* mask = capabilities the caller wants toggled */
3940 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
3941 		if (mask & IFCAP_TXCSUM) {
3942 			if (IFCAP_TXCSUM & ifp->if_capenable) {
			/* disabling TXCSUM also disables TSO4 */
3943 				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
3944 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
3947 				ifp->if_capenable |= IFCAP_TXCSUM;
3948 				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
3950 		} else if (mask & IFCAP_RXCSUM) {
3951 			if (IFCAP_RXCSUM & ifp->if_capenable) {
3952 				ifp->if_capenable &= ~IFCAP_RXCSUM;
3955 				ifp->if_capenable |= IFCAP_RXCSUM;
3959 		if (mask & IFCAP_TSO4) {
3960 			if (IFCAP_TSO4 & ifp->if_capenable) {
3961 				ifp->if_capenable &= ~IFCAP_TSO4;
3962 				ifp->if_hwassist &= ~CSUM_TSO;
3963 			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
3964 				ifp->if_capenable |= IFCAP_TSO4;
3965 				ifp->if_hwassist |= CSUM_TSO;
3967 				printf("mxge requires tx checksum offload"
3968 				       " be enabled to use TSO\n");
3972 		if (mask & IFCAP_LRO) {
3973 			if (IFCAP_LRO & ifp->if_capenable)
3974 				err = mxge_change_lro_locked(sc, 0);
3976 				err = mxge_change_lro_locked(sc, mxge_lro_cnt);
3978 		if (mask & IFCAP_VLAN_HWTAGGING)
3979 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
3980 		mtx_unlock(&sc->driver_mtx);
3981 		VLAN_CAPABILITIES(ifp);
3986 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
3987 				    &sc->media, command);
/*
 * Pull the hw.mxge.* loader tunables into the driver's globals and
 * the softc, then clamp each to a sane range: interrupt coalescing
 * delay to [0, 10000]us (default 30), tick interval to hz/2 when
 * unset, RSS hash type to a valid firmware value, and initial MTU
 * to [ETHER_MIN_LEN, ETHERMTU_JUMBO].
 */
3997 mxge_fetch_tunables(mxge_softc_t *sc)
4000 	TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4001 	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4002 			  &mxge_flow_control);
4003 	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4004 			  &mxge_intr_coal_delay);
4005 	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4006 			  &mxge_nvidia_ecrc_enable);
4007 	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4008 			  &mxge_force_firmware);
4009 	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4010 			  &mxge_deassert_wait);
4011 	TUNABLE_INT_FETCH("hw.mxge.verbose",
4013 	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4014 	TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
4015 	TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
4016 	TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4017 	TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4018 	if (sc->lro_cnt != 0)
4019 		mxge_lro_cnt = sc->lro_cnt;
4023 	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4024 		mxge_intr_coal_delay = 30;
4025 	if (mxge_ticks == 0)
4026 		mxge_ticks = hz / 2;
4027 	sc->pause = mxge_flow_control;
4028 	if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4029 	    || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4030 		mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
4032 	if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4033 	    mxge_initial_mtu < ETHER_MIN_LEN)
4034 		mxge_initial_mtu = ETHERMTU_JUMBO;
/*
 * Free per-slice state allocated by mxge_alloc_slices(): the firmware
 * stats DMA block, the tx buf_ring and mutex (IFNET_BUF_RING builds),
 * and the rx_done ring, then the slice array itself.
 * NOTE(review): sampled extract; the ss iterator assignment and the
 * final sc->ss = NULL are among the dropped lines.
 */
4039 mxge_free_slices(mxge_softc_t *sc)
4041 	struct mxge_slice_state *ss;
4048 	for (i = 0; i < sc->num_slices; i++) {
4050 		if (ss->fw_stats != NULL) {
4051 			mxge_dma_free(&ss->fw_stats_dma);
4052 			ss->fw_stats = NULL;
4053 #ifdef IFNET_BUF_RING
4054 			if (ss->tx.br != NULL) {
4055 				drbr_free(ss->tx.br, M_DEVBUF);
4059 			mtx_destroy(&ss->tx.mtx);
4061 		if (ss->rx_done.entry != NULL) {
4062 			mxge_dma_free(&ss->rx_done.dma);
4063 			ss->rx_done.entry = NULL;
4066 	free(sc->ss, M_DEVBUF);
/*
 * Allocate the slice array and each slice's DMA-backed structures:
 * the rx completion (rx_done) ring sized for both rx rings, and the
 * per-slice firmware stats block plus tx mutex (and a 2048-entry
 * buf_ring on IFNET_BUF_RING builds).  Unwinds fully on failure.
 * NOTE(review): sampled extract; NULL checks after malloc/dma_alloc
 * and the success return are among the dropped lines.
 */
4071 mxge_alloc_slices(mxge_softc_t *sc)
4074 	struct mxge_slice_state *ss;
4076 	int err, i, max_intr_slots;
4078 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4080 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4083 	sc->rx_ring_size = cmd.data0;
	/* completions can arrive from both the big and small rings */
4084 	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4086 	bytes = sizeof (*sc->ss) * sc->num_slices;
4087 	sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4090 	for (i = 0; i < sc->num_slices; i++) {
4095 		/* allocate per-slice rx interrupt queues */
4097 		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4098 		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4101 		ss->rx_done.entry = ss->rx_done.dma.addr;
4102 		bzero(ss->rx_done.entry, bytes);
4105 		 * allocate the per-slice firmware stats; stats
4106 		 * (including tx) are used only on the first
4109 #ifndef IFNET_BUF_RING
4114 		bytes = sizeof (*ss->fw_stats);
4115 		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4116 				     sizeof (*ss->fw_stats), 64);
4119 		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4120 		snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name),
4121 			 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
4122 		mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF);
4123 #ifdef IFNET_BUF_RING
4124 		ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
	/* failure: unwind everything allocated so far */
4132 	mxge_free_slices(sc);
/*
 * Decide how many slices (RSS queues) to use.  Requires the tunable to
 * allow it, an SMP system, and MSI-X vectors.  Loads the RSS-capable
 * firmware, resets it, sizes the interrupt queue, queries the maximum
 * RSS queues, then caps the result by MSI-X count, CPU count (or the
 * tunable), and rounds down to a power of two.  On any failure it
 * restores the original (non-RSS) firmware and stays single-slice.
 * NOTE(review): sampled extract; early returns and the power-of-two
 * rounding body are among the dropped lines.
 */
4137 mxge_slice_probe(mxge_softc_t *sc)
4141 	int msix_cnt, status, max_intr_slots;
4145 	 * don't enable multiple slices if they are not enabled,
4146 	 * or if this is not an SMP system
4149 	if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2)
4152 	/* see how many MSI-X interrupts are available */
4153 	msix_cnt = pci_msix_count(sc->dev);
4157 	/* now load the slice aware firmware see what it supports */
4158 	old_fw = sc->fw_name;
4159 	if (old_fw == mxge_fw_aligned)
4160 		sc->fw_name = mxge_fw_rss_aligned;
4162 		sc->fw_name = mxge_fw_rss_unaligned;
4163 	status = mxge_load_firmware(sc, 0);
4165 		device_printf(sc->dev, "Falling back to a single slice\n");
4169 	/* try to send a reset command to the card to see if it
4171 	memset(&cmd, 0, sizeof (cmd));
4172 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4174 		device_printf(sc->dev, "failed reset\n");
4178 	/* get rx ring size */
4179 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4181 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4184 	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4186 	/* tell it the size of the interrupt queues */
4187 	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4188 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4190 		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4194 	/* ask the maximum number of slices it supports */
4195 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4197 		device_printf(sc->dev,
4198 			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4201 	sc->num_slices = cmd.data0;
4202 	if (sc->num_slices > msix_cnt)
4203 		sc->num_slices = msix_cnt;
4205 	if (mxge_max_slices == -1) {
4206 		/* cap to number of CPUs in system */
4207 		if (sc->num_slices > mp_ncpus)
4208 			sc->num_slices = mp_ncpus;
4210 		if (sc->num_slices > mxge_max_slices)
4211 			sc->num_slices = mxge_max_slices;
4213 	/* make sure it is a power of two */
4214 	while (sc->num_slices & (sc->num_slices - 1))
4218 		device_printf(sc->dev, "using %d slices\n",
	/* failure path: reload the original single-slice firmware */
4224 	sc->fw_name = old_fw;
4225 	(void) mxge_load_firmware(sc, 0);
/*
 * Allocate and wire up one MSI-X vector per slice: map the MSI-X table
 * BAR, allocate num_slices message vectors, then for each slice an IRQ
 * resource and an interrupt handler bound to that slice's state.
 * Unwinds in reverse order through the labelled abort paths.
 * NOTE(review): sampled extract; some error checks, rid assignments
 * and the success return are among the dropped lines.
 */
4229 mxge_add_msix_irqs(mxge_softc_t *sc)
4232 	int count, err, i, rid;
	/* the MSI-X table lives behind BAR 2 */
4235 	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4238 	if (sc->msix_table_res == NULL) {
4239 		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4243 	count = sc->num_slices;
4244 	err = pci_alloc_msix(sc->dev, &count);
4246 		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d"
4247 			      "err = %d \n", sc->num_slices, err);
4248 		goto abort_with_msix_table;
4250 	if (count < sc->num_slices) {
4251 		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4252 			      count, sc->num_slices);
4253 		device_printf(sc->dev,
4254 			      "Try setting hw.mxge.max_slices to %d\n",
4257 		goto abort_with_msix;
4259 	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4260 	sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4261 	if (sc->msix_irq_res == NULL) {
4263 		goto abort_with_msix;
4266 	for (i = 0; i < sc->num_slices; i++) {
4268 		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4271 		if (sc->msix_irq_res[i] == NULL) {
4272 			device_printf(sc->dev, "couldn't allocate IRQ res"
4273 				      " for message %d\n", i);
4275 			goto abort_with_res;
4279 	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4280 	sc->msix_ih =  malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4282 	for (i = 0; i < sc->num_slices; i++) {
4283 		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4284 				     INTR_TYPE_NET | INTR_MPSAFE,
4285 #if __FreeBSD_version > 700030
4288 				     mxge_intr, &sc->ss[i], &sc->msix_ih[i]);
4290 			device_printf(sc->dev, "couldn't setup intr for "
4292 			goto abort_with_intr;
4297 		device_printf(sc->dev, "using %d msix IRQs:",
4299 		for (i = 0; i < sc->num_slices; i++)
4300 			printf(" %ld",  rman_get_start(sc->msix_irq_res[i]));
	/* error unwinding, innermost failure first */
4306 	for (i = 0; i < sc->num_slices; i++) {
4307 		if (sc->msix_ih[i] != NULL) {
4308 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4310 			sc->msix_ih[i] = NULL;
4313 	free(sc->msix_ih, M_DEVBUF);
4317 	for (i = 0; i < sc->num_slices; i++) {
4319 		if (sc->msix_irq_res[i] != NULL)
4320 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4321 					     sc->msix_irq_res[i]);
4322 		sc->msix_irq_res[i] = NULL;
4324 	free(sc->msix_irq_res, M_DEVBUF);
4328 	pci_release_msi(sc->dev);
4330 abort_with_msix_table:
4331 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4332 			     sc->msix_table_res);
/*
 * mxge_add_single_irq: single-interrupt setup used when running one slice.
 * Prefers a single MSI message; the fallback to legacy INTx (setting
 * sc->legacy_irq) is in lines not visible in this excerpt -- TODO confirm.
 */
4338 mxge_add_single_irq(mxge_softc_t *sc)
4340 	int count, err, rid;
/* Try to get one MSI message; on success the rid logic (not visible
 * here) selects 1 for MSI vs 0 for legacy INTx. */
4342 	count = pci_msi_count(sc->dev);
4343 	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4349 	sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0,
4350 					 1, RF_SHAREABLE | RF_ACTIVE);
4351 	if (sc->irq_res == NULL) {
4352 		device_printf(sc->dev, "could not alloc interrupt\n");
4356 		device_printf(sc->dev, "using %s irq %ld\n",
4357 			      sc->legacy_irq ? "INTx" : "MSI",
4358 			      rman_get_start(sc->irq_res));
/* Install mxge_intr with slice 0 state as its argument. */
4359 	err = bus_setup_intr(sc->dev, sc->irq_res,
4360 			     INTR_TYPE_NET | INTR_MPSAFE,
/* Newer bus_setup_intr() takes a filter argument (not visible here). */
4361 #if __FreeBSD_version > 700030
4364 			     mxge_intr, &sc->ss[0], &sc->ih);
/* Error path: release the IRQ resource (rid 1 for MSI, 0 for INTx)
 * and the MSI message if one was allocated. */
4366 		bus_release_resource(sc->dev, SYS_RES_IRQ,
4367 				     sc->legacy_irq ? 0 : 1, sc->irq_res);
4368 		if (!sc->legacy_irq)
4369 			pci_release_msi(sc->dev);
/*
 * mxge_rem_msix_irqs: tear down everything mxge_add_msix_irqs() set up,
 * in reverse order: handlers, IRQ resources, MSI-X table BAR, vectors.
 * (Mirrors the abort_* unwinding path of the add routine.)
 */
4375 mxge_rem_msix_irqs(mxge_softc_t *sc)
/* Detach each installed interrupt handler. */
4379 	for (i = 0; i < sc->num_slices; i++) {
4380 		if (sc->msix_ih[i] != NULL) {
4381 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4383 			sc->msix_ih[i] = NULL;
4386 	free(sc->msix_ih, M_DEVBUF);
/* Release each per-slice IRQ resource and clear the slot. */
4388 	for (i = 0; i < sc->num_slices; i++) {
4390 		if (sc->msix_irq_res[i] != NULL)
4391 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4392 					     sc->msix_irq_res[i]);
4393 		sc->msix_irq_res[i] = NULL;
4395 	free(sc->msix_irq_res, M_DEVBUF);
/* Unmap the MSI-X table BAR and return the vectors to the system. */
4397 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4398 			     sc->msix_table_res);
4400 	pci_release_msi(sc->dev);
/*
 * mxge_rem_single_irq: undo mxge_add_single_irq(): detach the handler,
 * release the IRQ resource (rid 0 for legacy INTx, 1 for MSI), and free
 * the MSI message when one was allocated.
 */
4405 mxge_rem_single_irq(mxge_softc_t *sc)
4407 	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4408 	bus_release_resource(sc->dev, SYS_RES_IRQ,
4409 			     sc->legacy_irq ? 0 : 1, sc->irq_res);
4410 	if (!sc->legacy_irq)
4411 		pci_release_msi(sc->dev);
/*
 * mxge_rem_irq: dispatch interrupt teardown by mode -- MSI-X when
 * running more than one slice, otherwise the single MSI/INTx path.
 */
4415 mxge_rem_irq(mxge_softc_t *sc)
4417 	if (sc->num_slices > 1)
4418 		mxge_rem_msix_irqs(sc);
4420 		mxge_rem_single_irq(sc);
/*
 * mxge_add_irq: dispatch interrupt setup by mode -- MSI-X when running
 * more than one slice, otherwise a single MSI/INTx interrupt.
 */
4424 mxge_add_irq(mxge_softc_t *sc)
4428 	if (sc->num_slices > 1)
4429 		err = mxge_add_msix_irqs(sc);
4431 		err = mxge_add_single_irq(sc);
/* NOTE(review): the leading "0 &&" makes this re-add branch
 * unreachable -- it is deliberately disabled (likely debug/stress
 * code). Consider removing it or documenting why it is kept. */
4433 	if (0 && err == 0 && sc->num_slices > 1) {
4434 		mxge_rem_msix_irqs(sc);
4435 		err = mxge_add_msix_irqs(sc);
/*
 * mxge_attach: device attach entry point. Order of operations:
 * tunables, parent DMA tag, ifnet, locks, config space, BAR mapping,
 * EEPROM strings, out-of-band DMA buffers, firmware selection, slices,
 * rings, interrupts, interface capabilities, media, ether_ifattach.
 * Failures unwind via the abort_* labels in reverse order.
 *
 * NOTE(review): this excerpt has gaps; comments cover visible lines only.
 */
4442 mxge_attach(device_t dev)
4444 	mxge_softc_t *sc = device_get_softc(dev);
4449 	mxge_fetch_tunables(sc);
/* Parent DMA tag all other tags are derived from; no address
 * restriction (full 64-bit BUS_SPACE_MAXADDR range). */
4451 	err = bus_dma_tag_create(NULL,			/* parent */
4454 				 BUS_SPACE_MAXADDR,	/* low */
4455 				 BUS_SPACE_MAXADDR,	/* high */
4456 				 NULL, NULL,		/* filter */
4457 				 65536 + 256,		/* maxsize */
4458 				 MXGE_MAX_SEND_DESC,	/* num segs */
4459 				 65536,			/* maxsegsize */
4461 				 NULL, NULL,		/* lock */
4462 				 &sc->parent_dmat);	/* tag */
4465 		device_printf(sc->dev, "Err %d allocating parent dmat\n",
4467 		goto abort_with_nothing;
/* Allocate the network interface structure. */
4470 	ifp = sc->ifp = if_alloc(IFT_ETHER);
4472 		device_printf(dev, "can not if_alloc()\n");
4474 		goto abort_with_parent_dmat;
4476 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
/* Two driver locks: cmd_mtx serializes firmware commands, driver_mtx
 * is the main driver lock (MTX_NETWORK_LOCK class). */
4478 	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
4479 		 device_get_nameunit(dev));
4480 	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
4481 	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
4482 		 "%s:drv", device_get_nameunit(dev));
4483 	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
4484 		 MTX_NETWORK_LOCK, MTX_DEF);
/* Watchdog/tick callout runs under driver_mtx. */
4486 	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
4488 	mxge_setup_cfg_space(sc);
4490 	/* Map the board into the kernel */
4492 	sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
4494 	if (sc->mem_res == NULL) {
4495 		device_printf(dev, "could not map memory\n");
4497 		goto abort_with_lock;
4499 	sc->sram = rman_get_virtual(sc->mem_res);
/* Usable SRAM: 2MB minus reserved regions minus 0x100 -- presumably
 * firmware/scratch space at the top of the BAR; TODO confirm layout. */
4500 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4501 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4502 		device_printf(dev, "impossible memory region size %ld\n",
4503 			      rman_get_size(sc->mem_res));
4505 		goto abort_with_mem_res;
4508 	/* make NULL terminated copy of the EEPROM strings section of
/* Read size-2 bytes so the buffer (pre-zeroed) stays NUL-terminated. */
4510 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4511 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4512 				rman_get_bushandle(sc->mem_res),
4513 				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4515 				MXGE_EEPROM_STRINGS_SIZE - 2);
4516 	err = mxge_parse_strings(sc);
4518 		goto abort_with_mem_res;
4520 	/* Enable write combining for efficient use of PCIe bus */
4523 	/* Allocate the out of band dma memory */
/* cmd_dma: response area for firmware commands (64-byte aligned). */
4524 	err = mxge_dma_alloc(sc, &sc->cmd_dma,
4525 			     sizeof (mxge_cmd_t), 64);
4527 		goto abort_with_mem_res;
4528 	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
/* zeropad_dma: zero source for padding; dmabench_dma: DMA benchmark. */
4529 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4531 		goto abort_with_cmd_dma;
4533 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4535 		goto abort_with_zeropad_dma;
4537 	/* select & load the firmware */
4538 	err = mxge_select_firmware(sc);
4540 		goto abort_with_dmabench;
4541 	sc->intr_coal_delay = mxge_intr_coal_delay;
/* Decide slice count, then allocate per-slice state and reset NIC. */
4543 	mxge_slice_probe(sc);
4544 	err = mxge_alloc_slices(sc);
4546 		goto abort_with_dmabench;
4548 	err = mxge_reset(sc, 0);
4550 		goto abort_with_slices;
4552 	err = mxge_alloc_rings(sc);
4554 		device_printf(sc->dev, "failed to allocate rings\n");
/* NOTE(review): this jumps past abort_with_slices, so the slices
 * allocated above are not freed on this path -- likely a leak;
 * verify against the full file whether abort_with_slices was meant. */
4555 		goto abort_with_dmabench;
4558 	err = mxge_add_irq(sc);
4560 		device_printf(sc->dev, "failed to add irq\n");
4561 		goto abort_with_rings;
/* Advertise offload capabilities: checksum, TSO, LRO, VLAN, jumbo. */
4564 	ifp->if_baudrate = IF_Gbps(10UL);
4565 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4568 	ifp->if_capabilities |= IFCAP_LRO;
4571 #ifdef MXGE_NEW_VLAN_API
4572 	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
/* Jumbo frames only when the loaded firmware supports >= 9000 MTU. */
4575 	sc->max_mtu = mxge_max_mtu(sc);
4576 	if (sc->max_mtu >= 9000)
4577 		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4579 		device_printf(dev, "MTU limited to %d.  Install "
4580 			      "latest firmware for 9000 byte jumbo support\n",
4581 			      sc->max_mtu - ETHER_HDR_LEN);
4582 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4583 	ifp->if_capenable = ifp->if_capabilities;
4584 	if (sc->lro_cnt == 0)
4585 		ifp->if_capenable &= ~IFCAP_LRO;
/* Wire up ifnet methods and attach to the network stack. */
4587 	ifp->if_init = mxge_init;
4589 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4590 	ifp->if_ioctl = mxge_ioctl;
4591 	ifp->if_start = mxge_start;
4592 	/* Initialise the ifmedia structure */
4593 	ifmedia_init(&sc->media, 0, mxge_media_change,
4595 	mxge_set_media(sc, IFM_ETHER | IFM_AUTO);
4596 	mxge_media_probe(sc);
4598 	ether_ifattach(ifp, sc->mac_addr);
4599 	/* ether_ifattach sets mtu to ETHERMTU */
4600 	if (mxge_initial_mtu != ETHERMTU)
4601 		mxge_change_mtu(sc, mxge_initial_mtu);
4603 	mxge_add_sysctls(sc);
4604 #ifdef IFNET_BUF_RING
/* Multi-queue transmit entry points (per-slice buf_rings). */
4605 	ifp->if_transmit = mxge_transmit;
4606 	ifp->if_qflush = mxge_qflush;
/* --- error unwinding labels, reverse order of acquisition --- */
4611 	mxge_free_rings(sc);
4613 	mxge_free_slices(sc);
4614 abort_with_dmabench:
4615 	mxge_dma_free(&sc->dmabench_dma);
4616 abort_with_zeropad_dma:
4617 	mxge_dma_free(&sc->zeropad_dma);
4619 	mxge_dma_free(&sc->cmd_dma);
4621 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4623 	pci_disable_busmaster(dev);
4624 	mtx_destroy(&sc->cmd_mtx);
4625 	mtx_destroy(&sc->driver_mtx);
4627 abort_with_parent_dmat:
4628 	bus_dma_tag_destroy(sc->parent_dmat);
/*
 * mxge_detach: device detach entry point. Refuses to detach while
 * VLANs are configured; otherwise stops the interface under the driver
 * lock, detaches from the stack, then releases resources in the reverse
 * order of mxge_attach().
 *
 * NOTE(review): excerpt has gaps (return statements etc. not visible).
 */
4635 mxge_detach(device_t dev)
4637 	mxge_softc_t *sc = device_get_softc(dev);
/* Refuse detach while VLAN interfaces still reference us. */
4639 	if (mxge_vlans_active(sc)) {
4640 		device_printf(sc->dev,
4641 			      "Detach vlans before removing module\n");
/* Close the interface (mxge_close call is in a non-visible line). */
4644 	mtx_lock(&sc->driver_mtx);
4646 	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
4648 	mtx_unlock(&sc->driver_mtx);
4649 	ether_ifdetach(sc->ifp);
/* Drain the tick callout after ifdetach so it cannot rearm. */
4650 	callout_drain(&sc->co_hdl);
4651 	ifmedia_removeall(&sc->media);
/* Tell firmware to stop its dummy RDMA before freeing DMA memory. */
4652 	mxge_dummy_rdma(sc, 0);
4653 	mxge_rem_sysctls(sc);
/* Release resources in reverse order of mxge_attach(). */
4655 	mxge_free_rings(sc);
4656 	mxge_free_slices(sc);
4657 	mxge_dma_free(&sc->dmabench_dma);
4658 	mxge_dma_free(&sc->zeropad_dma);
4659 	mxge_dma_free(&sc->cmd_dma);
4660 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4661 	pci_disable_busmaster(dev);
4662 	mtx_destroy(&sc->cmd_mtx);
4663 	mtx_destroy(&sc->driver_mtx);
4665 	bus_dma_tag_destroy(sc->parent_dmat);
4670 mxge_shutdown(device_t dev)
4676 This file uses Myri10GE driver indentation.
4679 c-file-style:"linux"