1 /******************************************************************************
3 Copyright (c) 2006-2009, Myricom Inc.
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
9 1. Redistributions of source code must retain the above copyright notice,
10 this list of conditions and the following disclaimer.
12 2. Neither the name of the Myricom Inc, nor the names of its
13 contributors may be used to endorse or promote products derived from
14 this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
28 ***************************************************************************/
30 #include <sys/cdefs.h>
31 /*__FBSDID("$FreeBSD: src/sys/dev/mxge/if_mxge.c,v 1.63 2009/06/26 11:45:06 rwatson Exp $");*/
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/linker.h>
36 #include <sys/firmware.h>
37 #include <sys/endian.h>
38 #include <sys/in_cksum.h>
39 #include <sys/sockio.h>
41 #include <sys/malloc.h>
42 #include <sys/kernel.h>
44 #include <sys/module.h>
45 #include <sys/socket.h>
46 #include <sys/sysctl.h>
48 /* count xmits ourselves, rather than via drbr */
51 #include <net/if_arp.h>
52 #include <net/ifq_var.h>
53 #include <net/ethernet.h>
54 #include <net/if_dl.h>
55 #include <net/if_media.h>
59 #include <net/if_types.h>
60 #include <net/vlan/if_vlan_var.h>
63 #include <netinet/in_systm.h>
64 #include <netinet/in.h>
65 #include <netinet/ip.h>
66 #include <netinet/tcp.h>
68 #include <machine/resource.h>
72 #include <bus/pci/pcireg.h>
73 #include <bus/pci/pcivar.h>
74 #include <bus/pci/pci_private.h> /* XXX for pci_cfg_restore */
76 #include <vm/vm.h> /* for pmap_mapdev() */
79 #if defined(__i386) || defined(__amd64)
80 #include <machine/specialreg.h>
83 #include <dev/netif/mxge/mxge_mcp.h>
84 #include <dev/netif/mxge/mcp_gen_header.h>
85 /*#define MXGE_FAKE_IFP*/
86 #include <dev/netif/mxge/if_mxge_var.h>
88 #include <sys/buf_ring.h>
/*
 * Module-wide defaults and firmware image names.  Everything here is
 * file-local (static).  How these values are exposed as boot-time
 * tunables is not visible in this part of the file.
 */
94 static int mxge_nvidia_ecrc_enable = 1; /* tested first in mxge_enable_nvidia_ecrc() */
95 static int mxge_force_firmware = 0; /* non-zero forces the aligned/unaligned choice in mxge_select_firmware() */
96 static int mxge_intr_coal_delay = 30;
97 static int mxge_deassert_wait = 1; /* also exported read-write by mxge_add_sysctls() */
98 static int mxge_flow_control = 1;
99 static int mxge_verbose = 0;
100 static int mxge_lro_cnt = 8;
101 static int mxge_ticks;
102 static int mxge_max_slices = 1;
103 static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
104 static int mxge_always_promisc = 0; /* consulted by mxge_change_promisc() */
105 static int mxge_initial_mtu = ETHERMTU_JUMBO;
/* Firmware image names; the non-RSS pair is assigned to sc->fw_name during firmware selection. */
106 static char *mxge_fw_unaligned = "mxge_ethp_z8e";
107 static char *mxge_fw_aligned = "mxge_eth_z8e";
108 static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
109 static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
/*
 * Forward declarations for the bus entry points, the device method
 * table that lists them, the driver description, and the module
 * registration.  Forward declarations for internal helpers used before
 * their definitions follow the registration macros.
 */
111 static int mxge_probe(device_t dev);
112 static int mxge_attach(device_t dev);
113 static int mxge_detach(device_t dev);
114 static int mxge_shutdown(device_t dev);
115 static void mxge_intr(void *arg);
117 static device_method_t mxge_methods[] =
119 /* Device interface */
120 DEVMETHOD(device_probe, mxge_probe),
121 DEVMETHOD(device_attach, mxge_attach),
122 DEVMETHOD(device_detach, mxge_detach),
123 DEVMETHOD(device_shutdown, mxge_shutdown),
/* Driver description; the softc size given here is sizeof(mxge_softc_t). */
127 static driver_t mxge_driver =
131 sizeof(mxge_softc_t),
134 static devclass_t mxge_devclass;
136 /* Declare ourselves to be a child of the PCI bus.*/
137 DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
/* Module dependencies: the firmware loader and zlib (inflate() is used on firmware images). */
138 MODULE_DEPEND(mxge, firmware, 1, 1, 1);
139 MODULE_DEPEND(mxge, zlib, 1, 1, 1);
/* Helpers that are called before they are defined. */
141 static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
142 static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
143 static int mxge_close(mxge_softc_t *sc);
144 static int mxge_open(mxge_softc_t *sc);
145 static void mxge_tick(void *arg);
/*
 * Probe entry point (listed as device_probe in mxge_methods).
 *
 * Matches a PCI device whose vendor id is MXGE_PCI_VENDOR_MYRICOM and
 * whose device id is MXGE_PCI_DEVICE_Z8E or MXGE_PCI_DEVICE_Z8E_9, then
 * sets the device description from the PCI revision id.  The return
 * values are not visible in this excerpt.
 */
148 mxge_probe(device_t dev)
153 if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
154 ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
155 (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
156 rev = pci_get_revid(dev);
/* known revisions get a specific name; anything else gets the "8??" name and a log line */
158 case MXGE_PCI_REV_Z8E:
159 device_set_desc(dev, "Myri10G-PCIE-8A");
161 case MXGE_PCI_REV_Z8ES:
162 device_set_desc(dev, "Myri10G-PCIE-8B");
165 device_set_desc(dev, "Myri10G-PCIE-8??");
166 device_printf(dev, "Unrecognized rev %d NIC\n",
/*
 * Try to switch the mapping of the NIC's memory window to
 * write-combining.
 *
 * In the i386/amd64 section the visible code asks pmap_change_attr()
 * for PAT_WRITE_COMBINING over sc->sram for the size of sc->mem_res and
 * logs the error code on failure.  The final visible statement records
 * sc->wc = 0 with a "TBD: PAT support" note; the preprocessor structure
 * that decides which statements are compiled is not fully visible here.
 */
176 mxge_enable_wc(mxge_softc_t *sc)
179 #if defined(__i386) || defined(__amd64)
184 len = rman_get_size(sc->mem_res);
185 err = pmap_change_attr((vm_offset_t) sc->sram,
186 len, PAT_WRITE_COMBINING);
188 device_printf(sc->dev, "pmap_change_attr failed, %d\n",
194 sc->wc = 0; /* TBD: PAT support */
199 /*
 * bus_dmamap_load() callback used by mxge_dma_alloc() to get our DMA
 * address: stores the address of the first segment (segs->ds_addr)
 * into the bus_addr_t that arg points to.  Only the first segment is
 * read in the visible code.
 */
201 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
205 *(bus_addr_t *) arg = segs->ds_addr;
/*
 * Allocate a DMA-able buffer described by *dma.
 *
 * Steps visible here: create a tag under sc->parent_dmat with the
 * requested alignment, allocate zeroed coherent memory from that tag,
 * then load the map so that mxge_dmamap_callback() stores the bus
 * address in dma->bus_addr.  Each step logs its error code on failure;
 * the unwind code frees the memory and destroys the tag.
 *
 *   sc        - softc; supplies the device and the parent DMA tag
 *   dma       - receives dmat, addr, map and bus_addr
 *   bytes     - size of the region to load
 *   alignment - alignment passed to bus_dma_tag_create()
 *
 * The values chosen for boundary and maxsegsize (which depend on the
 * "bytes > 4096 && alignment == 4096" test) and the return value are
 * not visible in this excerpt.
 */
210 mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
211 bus_size_t alignment)
214 device_t dev = sc->dev;
215 bus_size_t boundary, maxsegsize;
217 if (bytes > 4096 && alignment == 4096) {
225 /* allocate DMAable memory tags */
226 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
227 alignment, /* alignment */
228 boundary, /* boundary */
229 BUS_SPACE_MAXADDR, /* low */
230 BUS_SPACE_MAXADDR, /* high */
231 NULL, NULL, /* filter */
234 maxsegsize, /* maxsegsize */
235 BUS_DMA_COHERENT, /* flags */
236 &dma->dmat); /* tag */
238 device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
242 /* allocate DMAable memory & map */
243 err = bus_dmamem_alloc(dma->dmat, &dma->addr,
244 (BUS_DMA_WAITOK | BUS_DMA_COHERENT
245 | BUS_DMA_ZERO), &dma->map);
247 device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
248 goto abort_with_dmat;
251 /* load the memory */
252 err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
253 mxge_dmamap_callback,
254 (void *)&dma->bus_addr, 0);
256 device_printf(dev, "couldn't load map (err = %d)\n", err);
/* error unwind: release in the reverse order of acquisition */
262 bus_dmamem_free(dma->dmat, dma->addr, dma->map);
264 (void)bus_dma_tag_destroy(dma->dmat);
/*
 * Release a buffer set up by mxge_dma_alloc(): unload the map, free the
 * memory, then destroy the tag (its return value is deliberately
 * ignored).
 */
270 mxge_dma_free(mxge_dma_t *dma)
272 bus_dmamap_unload(dma->dmat, dma->map);
273 bus_dmamem_free(dma->dmat, dma->addr, dma->map);
274 (void)bus_dma_tag_destroy(dma->dmat);
278 * The eeprom strings on the lanaiX have the format
/*
 * Walk the NUL-separated strings in sc->eeprom_strings (at most
 * MXGE_EEPROM_STRINGS_SIZE bytes) and pick out three keys:
 *   "MAC=" - remembered in sc->mac_addr_string and parsed as six
 *            hexadecimal bytes into sc->mac_addr
 *   "PC="  - copied into sc->product_code_string
 *   "SN="  - copied into sc->serial_number_string
 * A failure message is logged on the error path; how ptr is advanced
 * past each key and between MAC bytes is not visible in this excerpt.
 *
 * NOTE(review): both strncpy() calls copy at most sizeof(dest) - 1
 * bytes and never write the final byte, so termination relies on the
 * destination already being zero-filled - confirm the softc is
 * zero-initialised.
 */
285 mxge_parse_strings(mxge_softc_t *sc)
/* skip to the character after the next NUL, without running past limit */
287 #define MXGE_NEXT_STRING(p) while(ptr < limit && *ptr++)
292 ptr = sc->eeprom_strings;
293 limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
/* an empty string (or reaching limit) ends the list */
295 while (ptr < limit && *ptr != '\0') {
296 if (memcmp(ptr, "MAC=", 4) == 0) {
298 sc->mac_addr_string = ptr;
299 for (i = 0; i < 6; i++) {
/* make sure two more characters are available before parsing a byte */
301 if ((ptr + 2) > limit)
303 sc->mac_addr[i] = strtoul(ptr, NULL, 16);
306 } else if (memcmp(ptr, "PC=", 3) == 0) {
308 strncpy(sc->product_code_string, ptr,
309 sizeof (sc->product_code_string) - 1);
310 } else if (memcmp(ptr, "SN=", 3) == 0) {
312 strncpy(sc->serial_number_string, ptr,
313 sizeof (sc->serial_number_string) - 1);
315 MXGE_NEXT_STRING(ptr);
322 device_printf(sc->dev, "failed to parse eeprom_strings\n");
/*
 * mxge_enable_nvidia_ecrc(): attempt to turn on ECRC generation in an
 * upstream Nvidia PCIe bridge.  Two variants are selected by the
 * preprocessor; the non-x86 variant only logs a message.
 *
 * Visible flow of the x86 variant:
 *  - do nothing unless mxge_nvidia_ecrc_enable is set;
 *  - take the grandparent of sc->dev as the bridge and read its vendor
 *    and device ids; only vendor 0x10de is handled;
 *  - pick the base of the memory-mapped config space: device 0x005d
 *    (ck804) uses a fixed base, devices 0x0374..0x0378 (mcp55) derive it
 *    from register 0x90 of the device found at bus 0, slot 0, function 0;
 *  - read the bridge's bus/slot/function/vendor/device ivars, compute
 *    the physical offset of its config space and map one page of it
 *    with pmap_mapdev();
 *  - verify the mapping by comparing the vendor/device ids read through
 *    it, then access the 32-bit register at config offset 0x178; a value
 *    of 0xffffffff is treated as "extended mapping failed";
 *  - unmap the page on every exit path shown and log success.
 * The bit written to register 0x178 is not visible in this excerpt.
 *
 * NOTE(review): the failure message after pmap_mapdev() names
 * pmap_kenter_temporary, which is not the function called here.
 */
327 #if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
329 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
332 unsigned long base, off;
334 device_t pdev, mcp55;
335 uint16_t vendor_id, device_id, word;
336 uintptr_t bus, slot, func, ivend, idev;
340 if (!mxge_nvidia_ecrc_enable)
343 pdev = device_get_parent(device_get_parent(sc->dev));
345 device_printf(sc->dev, "could not find parent?\n");
348 vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
349 device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
351 if (vendor_id != 0x10de)
356 if (device_id == 0x005d) {
357 /* ck804, base address is magic */
359 } else if (device_id >= 0x0374 && device_id <= 0x378) {
360 /* mcp55, base address stored in chipset */
361 mcp55 = pci_find_bsf(0, 0, 0);
363 0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
364 0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
365 word = pci_read_config(mcp55, 0x90, 2);
366 base = ((unsigned long)word & 0x7ffeU) << 25;
373 Test below is commented because it is believed that doing
374 config read/write beyond 0xff will access the config space
375 for the next larger function. Uncomment this and remove
376 the hacky pmap_mapdev() way of accessing config space when
377 FreeBSD grows support for extended pcie config space access
380 /* See if we can, by some miracle, access the extended
382 val = pci_read_config(pdev, 0x178, 4);
383 if (val != 0xffffffff) {
385 pci_write_config(pdev, 0x178, val, 4);
389 /* Rather than using normal pci config space writes, we must
390 * map the Nvidia config space ourselves. This is because on
391 * opteron/nvidia class machine the 0xe000000 mapping is
392 * handled by the nvidia chipset, that means the internal PCI
393 * device (the on-chip northbridge), or the amd-8131 bridge
394 * and things behind them are not visible by this method.
397 BUS_READ_IVAR(device_get_parent(pdev), pdev,
399 BUS_READ_IVAR(device_get_parent(pdev), pdev,
400 PCI_IVAR_SLOT, &slot);
401 BUS_READ_IVAR(device_get_parent(pdev), pdev,
402 PCI_IVAR_FUNCTION, &func);
403 BUS_READ_IVAR(device_get_parent(pdev), pdev,
404 PCI_IVAR_VENDOR, &ivend);
405 BUS_READ_IVAR(device_get_parent(pdev), pdev,
406 PCI_IVAR_DEVICE, &idev);
409 + 0x00100000UL * (unsigned long)bus
410 + 0x00001000UL * (unsigned long)(func
413 /* map it into the kernel */
414 va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
418 device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
421 /* get a pointer to the config space mapped into the kernel */
422 cfgptr = va + (off & PAGE_MASK);
424 /* make sure that we can really access it */
425 vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
426 device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
427 if (! (vendor_id == ivend && device_id == idev)) {
428 device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
429 vendor_id, device_id);
430 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
434 ptr32 = (uint32_t*)(cfgptr + 0x178);
437 if (val == 0xffffffff) {
438 device_printf(sc->dev, "extended mapping failed\n");
439 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
443 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
445 device_printf(sc->dev,
446 "Enabled ECRC on upstream Nvidia bridge "
448 (int)bus, (int)slot, (int)func);
453 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
455 device_printf(sc->dev,
456 "Nforce 4 chipset on non-x86/amd64!?!?!\n");
/*
 * Ask the firmware to run its DMA benchmark against the dmabench
 * buffer and record the results in the softc.
 *
 *   sc        - softc; sc->tx_boundary is used as the transfer length
 *   test_type - firmware command to issue (callers in this file pass
 *               MXGEFW_CMD_UNALIGNED_TEST or MXGEFW_DMA_TEST)
 *
 * Three commands are sent with the buffer's bus address in data0/data1
 * and the length scaled by 0x10000, 0x1 and 0x10001 in data2; their
 * results fill sc->read_dma, sc->write_dma and sc->read_write_dma.
 * Each result is (transfers * len * 2) / ticks, with an extra factor
 * of two for the combined test.  A failure is logged unless test_type
 * is MXGEFW_CMD_UNALIGNED_TEST.
 *
 * NOTE(review): each divisor is the low 16 bits of the firmware reply;
 * a reply with a zero tick count would divide by zero - confirm the
 * firmware guarantees a non-zero time.
 */
463 mxge_dma_test(mxge_softc_t *sc, int test_type)
466 bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
472 /* Run a small DMA test.
473 * The magic multipliers to the length tell the firmware
474 * to do DMA read, write, or read+write tests. The
475 * results are returned in cmd.data0. The upper 16
476 * bits of the return is the number of transfers completed.
477 * The lower 16 bits is the time in 0.5us ticks that the
478 * transfers took to complete.
481 len = sc->tx_boundary;
483 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
484 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
485 cmd.data2 = len * 0x10000;
486 status = mxge_send_cmd(sc, test_type, &cmd);
491 sc->read_dma = ((cmd.data0>>16) * len * 2) /
492 (cmd.data0 & 0xffff);
493 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
494 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
495 cmd.data2 = len * 0x1;
496 status = mxge_send_cmd(sc, test_type, &cmd);
501 sc->write_dma = ((cmd.data0>>16) * len * 2) /
502 (cmd.data0 & 0xffff);
504 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
505 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
506 cmd.data2 = len * 0x10001;
507 status = mxge_send_cmd(sc, test_type, &cmd);
512 sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
513 (cmd.data0 & 0xffff);
516 if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
517 device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
524 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
525 * when the PCI-E Completion packets are aligned on an 8-byte
526 * boundary. Some PCI-E chip sets always align Completion packets; on
527 * the ones that do not, the alignment can be enforced by enabling
528 * ECRC generation (if supported).
530 * When PCI-E Completion packets are not aligned, it is actually more
531 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
533 * If the driver can neither enable ECRC nor verify that it has
534 * already been enabled, then it must use a firmware image which works
535 * around unaligned completion packets (ethp_z8e.dat), and it should
536 * also ensure that it never gives the device a Read-DMA which is
537 * larger than 2KB by setting the tx_boundary to 2KB. If ECRC is
538 * enabled, then the driver should use the aligned (eth_z8e.dat)
539 * firmware image, and set tx_boundary to 4KB.
/*
 * Find out whether the aligned firmware can be used on this host.
 *
 * Visible flow: start with a 4KB tx_boundary, dropping to 2KB (with a
 * log line) when the PCIe control register read at capability offset
 * 0x8 does not have the (5 << 12) bits set; load the aligned firmware
 * with adoption allowed; try to enable ECRC on an Nvidia bridge; then
 * run the unaligned-completion DMA test.  A passing test returns 0 so
 * the aligned firmware is kept.  Otherwise the status is logged, with
 * an extra hint when it is ENOSYS; the remaining return values are not
 * visible in this excerpt.
 *
 * NOTE(review): the third argument of pci_find_extcap() below appears
 * to have been garbled by a character-encoding step (it is most likely
 * the address of the local reg) - confirm against the pristine file.
 */
543 mxge_firmware_probe(mxge_softc_t *sc)
545 device_t dev = sc->dev;
549 sc->tx_boundary = 4096;
551 * Verify the max read request size was set to 4KB
552 * before trying the test with 4KB.
554 if (pci_find_extcap(dev, PCIY_EXPRESS, ®) == 0) {
555 pectl = pci_read_config(dev, reg + 0x8, 2);
556 if ((pectl & (5 << 12)) != (5 << 12)) {
557 device_printf(dev, "Max Read Req. size != 4k (0x%x\n",
559 sc->tx_boundary = 2048;
564 * load the optimized firmware (which assumes aligned PCIe
565 * completions) in order to see if it works on this host.
567 sc->fw_name = mxge_fw_aligned;
568 status = mxge_load_firmware(sc, 1);
574 * Enable ECRC if possible
576 mxge_enable_nvidia_ecrc(sc);
579 * Run a DMA test which watches for unaligned completions and
580 * aborts on the first one seen.
583 status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
585 return 0; /* keep the aligned firmware */
588 device_printf(dev, "DMA test failed: %d\n", status);
589 if (status == ENOSYS)
590 device_printf(dev, "Falling back to ethp! "
591 "Please install up to date fw\n");
/*
 * Choose between the aligned and unaligned firmware images, set
 * sc->fw_name and sc->tx_boundary to match, and load the result.
 *
 * Inputs to the decision that are visible here:
 *  - mxge_force_firmware: when non-zero the choice is forced and
 *    logged (a value of 1 is tested separately);
 *  - sc->link_width: a known width of x4 or less logs a reduced
 *    performance warning (the existing comment says the aligned
 *    firmware can be used without further checks);
 *  - mxge_firmware_probe(): a zero result is tested.
 * Aligned selects mxge_fw_aligned with a 4096-byte boundary; unaligned
 * selects mxge_fw_unaligned with 2048.  Returns the result of
 * mxge_load_firmware(sc, 0).
 */
596 mxge_select_firmware(mxge_softc_t *sc)
601 if (mxge_force_firmware != 0) {
602 if (mxge_force_firmware == 1)
607 device_printf(sc->dev,
608 "Assuming %s completions (forced)\n",
609 aligned ? "aligned" : "unaligned");
613 /* if the PCIe link width is 4 or less, we can use the aligned
614 firmware and skip any checks */
615 if (sc->link_width != 0 && sc->link_width <= 4) {
616 device_printf(sc->dev,
617 "PCIe x%d Link, expect reduced performance\n",
623 if (0 == mxge_firmware_probe(sc))
628 sc->fw_name = mxge_fw_aligned;
629 sc->tx_boundary = 4096;
631 sc->fw_name = mxge_fw_unaligned;
632 sc->tx_boundary = 2048;
634 return (mxge_load_firmware(sc, 0));
/*
 * Check a firmware header and record its version in the softc.
 *
 *   sc  - softc; fw_version, fw_ver_major, fw_ver_minor and
 *         fw_ver_tiny are written
 *   hdr - firmware header; mcp_type is stored big-endian
 *
 * The header must have type MCP_TYPE_ETH, and its "major.minor.tiny"
 * version string must match MXGEFW_VERSION_MAJOR and
 * MXGEFW_VERSION_MINOR; mismatches are logged.  Return values are not
 * visible in this excerpt.
 *
 * NOTE(review): strncpy() is given the full size of sc->fw_version, so
 * a version string that fills the buffer would be left without a
 * terminating NUL before it is parsed - confirm the header field is
 * always shorter than the destination.
 */
644 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
648 if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
649 device_printf(sc->dev, "Bad firmware type: 0x%x\n",
650 be32toh(hdr->mcp_type));
654 /* save firmware version for sysctl */
655 strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
657 device_printf(sc->dev, "firmware id: %s\n", hdr->version);
/* split the saved string into its three numeric components */
659 ksscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
660 &sc->fw_ver_minor, &sc->fw_ver_tiny);
662 if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
663 && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
664 device_printf(sc->dev, "Found firmware version %s\n",
666 device_printf(sc->dev, "Driver needs %d.%d\n",
667 MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
/*
 * Allocation hook: obtains items * size bytes from M_TEMP without
 * sleeping (M_NOWAIT).  The first argument is unused in the visible
 * code.  Presumably installed as the zlib zalloc callback - the
 * assignment is not visible here.
 *
 * NOTE(review): items * size is computed in u_int with no overflow
 * check.
 */
675 z_alloc(void *nil, u_int items, u_int size)
679 ptr = kmalloc(items * size, M_TEMP, M_NOWAIT);
/*
 * Release hook paired with z_alloc(); presumably the zlib zfree
 * callback.  The body is not visible in this excerpt.
 */
684 z_free(void *nil, void *ptr)
/*
 * Load the image named by sc->fw_name, inflate it, validate its header
 * and copy it into NIC SRAM.
 *
 *   sc    - softc; fw_name selects the image, sram is the copy target
 *   limit - pointer supplied by the caller (mxge_load_firmware() passes
 *           the address of a variable initialised to sc->sram_size);
 *           how it is used is not visible in this excerpt
 *
 * Visible flow: firmware_image_load(); inflateInit(); allocate an
 * output buffer whose size is taken from fw->version; inflate() with
 * Z_FINISH, which must end with Z_STREAM_END; read the header offset,
 * which must be 4-byte aligned and leave room for a full header inside
 * fw_len; mxge_validate_firmware(); copy to sc->sram + MXGE_FW_OFFSET
 * in chunks of at most 256 bytes.  The cleanup code frees the inflate
 * buffer and unloads the image.
 *
 * NOTE(review): the header offset and header pointer are taken from
 * fw->fw_image while the bounds check uses fw_len, the size of the
 * inflated buffer - confirm fw_image refers to the inflated data.
 * NOTE(review): the "Bad firmware file" message has no trailing
 * newline.
 */
691 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
694 const mcp_gen_header_t *hdr;
701 fw = firmware_image_load(sc->fw_name, NULL);
703 device_printf(sc->dev, "Could not find firmware image %s\n",
708 /* setup zlib and decompress f/w */
709 bzero(&zs, sizeof (zs));
712 status = inflateInit(&zs);
713 if (status != Z_OK) {
718 /* the uncompressed size is stored as the firmware version,
719 which would otherwise go unused */
720 fw_len = (size_t) fw->version;
721 inflate_buffer = kmalloc(fw_len, M_TEMP, M_NOWAIT);
722 if (inflate_buffer == NULL)
/* a single inflate() call from the whole compressed image into the whole output buffer */
724 zs.avail_in = fw->datasize;
725 zs.next_in = __DECONST(char *, fw->data);
726 zs.avail_out = fw_len;
727 zs.next_out = inflate_buffer;
728 status = inflate(&zs, Z_FINISH);
729 if (status != Z_STREAM_END) {
730 device_printf(sc->dev, "zlib %d\n", status);
732 goto abort_with_buffer;
/* locate the header through the 32-bit pointer stored at MCP_HEADER_PTR_OFFSET */
736 hdr_offset = htobe32(*(const uint32_t *)
737 (fw->fw_image + MCP_HEADER_PTR_OFFSET));
738 if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
739 device_printf(sc->dev, "Bad firmware file");
743 hdr = (const void*)(fw->fw_image + hdr_offset);
745 status = mxge_validate_firmware(sc, hdr);
749 /* Copy the inflated firmware to NIC SRAM. */
750 for (i = 0; i < fw_len; i += 256) {
751 mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
753 min(256U, (unsigned)(fw_len - i)));
763 kfree(inflate_buffer, M_TEMP);
768 firmware_image_unload(fw);
773 * Enable or disable periodic RDMAs from the host to make certain
774 * chipsets resend dropped PCIe messages
/*
 * Send the boot-time "dummy RDMA" request to the NIC.
 *
 *   sc     - softc; cmd/cmd_dma give the confirmation word,
 *            zeropad_dma the dummy DMA target
 *   enable - stored (big-endian) in the request's enable word
 *
 * An 8-byte aligned request is built on the stack with six big-endian
 * words: confirmation address (high, low), confirmation data
 * 0xffffffff, dummy address (high, low) and the enable flag.  64 bytes
 * are then copied to sc->sram + MXGEFW_BOOT_DUMMY_RDMA, and the
 * confirmation word is polled for 0xffffffff for at most 20
 * iterations.  A failure is logged; no status is propagated in the
 * visible code.
 *
 * NOTE(review): the failure message has no trailing newline.
 */
778 mxge_dummy_rdma(mxge_softc_t *sc, int enable)
781 volatile uint32_t *confirm;
782 volatile char *submit;
783 uint32_t *buf, dma_low, dma_high;
786 buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
788 /* clear confirmation addr */
789 confirm = (volatile uint32_t *)sc->cmd;
793 /* send an rdma command to the PCIe engine, and wait for the
794 response in the confirmation address. The firmware should
795 write a -1 there to indicate it is alive and well
798 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
799 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
800 buf[0] = htobe32(dma_high); /* confirm addr MSW */
801 buf[1] = htobe32(dma_low); /* confirm addr LSW */
802 buf[2] = htobe32(0xffffffff); /* confirm data */
803 dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
804 dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
805 buf[3] = htobe32(dma_high); /* dummy addr MSW */
806 buf[4] = htobe32(dma_low); /* dummy addr LSW */
807 buf[5] = htobe32(enable); /* enable? */
810 submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
812 mxge_pio_copy(submit, buf, 64);
817 while (*confirm != 0xffffffff && i < 20) {
821 if (*confirm != 0xffffffff) {
822 device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
823 (enable ? "enable" : "disable"), confirm,
/*
 * Issue one command to the firmware and wait for its response.
 *
 *   sc   - softc; cmd_lock serialises commands, cmd/cmd_dma hold the
 *          response block the firmware writes by DMA
 *   cmd  - firmware command code
 *   data - in: data0..data2 arguments (host byte order);
 *          out: data0 receives the response data on the path shown
 *
 * The request is assembled big-endian in an 8-byte aligned stack
 * buffer and includes the bus address of the response block.  With
 * cmd_lock held, response->result is preset to 0xffffffff, the request
 * is copied to sc->sram + MXGEFW_ETH_CMD, and the result word is
 * polled for up to 20 iterations with a POSTREAD sync of the response
 * map before each check.  MXGEFW_CMD_UNKNOWN, _ERROR_UNALIGNED and
 * _ERROR_BUSY have dedicated cases; other failures and a timeout are
 * logged.  The lock is released after the timeout message.  The error
 * codes returned for each case are not visible in this excerpt.
 */
830 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
833 char buf_bytes[sizeof(*buf) + 8];
834 volatile mcp_cmd_response_t *response = sc->cmd;
835 volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
836 uint32_t dma_low, dma_high;
837 int err, sleep_total = 0;
839 /* ensure buf is aligned to 8 bytes */
840 buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
/* the firmware expects every field in big-endian order */
842 buf->data0 = htobe32(data->data0);
843 buf->data1 = htobe32(data->data1);
844 buf->data2 = htobe32(data->data2);
845 buf->cmd = htobe32(cmd);
846 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
847 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
849 buf->response_addr.low = htobe32(dma_low);
850 buf->response_addr.high = htobe32(dma_high);
851 lockmgr(&sc->cmd_lock, LK_EXCLUSIVE);
/* sentinel: the result word still holds this value until the firmware answers */
852 response->result = 0xffffffff;
854 mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
856 /* wait up to 20ms */
858 for (sleep_total = 0; sleep_total < 20; sleep_total++) {
859 bus_dmamap_sync(sc->cmd_dma.dmat,
860 sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
862 switch (be32toh(response->result)) {
864 data->data0 = be32toh(response->data);
870 case MXGEFW_CMD_UNKNOWN:
873 case MXGEFW_CMD_ERROR_UNALIGNED:
876 case MXGEFW_CMD_ERROR_BUSY:
880 device_printf(sc->dev,
882 "failed, result = %d\n",
883 cmd, be32toh(response->result));
891 device_printf(sc->dev, "mxge: command %d timed out"
893 cmd, be32toh(response->result));
894 lockmgr(&sc->cmd_lock, LK_RELEASE);
/*
 * Validate the firmware that is already running on the NIC so that it
 * can be used without loading a new image.
 *
 * The header offset is read from SRAM at MCP_HEADER_PTR_OFFSET and
 * must be 4-byte aligned and leave room for a full header inside
 * sc->sram_size.  The header is copied from SRAM into a temporary
 * M_DEVBUF allocation, passed to mxge_validate_firmware(), and freed.
 * Firmware versions 1.4.4 through 1.4.11 additionally set
 * sc->adopted_rx_filter_bug and log the workaround.  Return values are
 * not visible in this excerpt.
 */
899 mxge_adopt_running_firmware(mxge_softc_t *sc)
901 struct mcp_gen_header *hdr;
902 const size_t bytes = sizeof (struct mcp_gen_header);
906 /* find running firmware header */
907 hdr_offset = htobe32(*(volatile uint32_t *)
908 (sc->sram + MCP_HEADER_PTR_OFFSET));
910 if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
911 device_printf(sc->dev,
912 "Running firmware has bad header offset (%d)\n",
917 /* copy header of running firmware from SRAM to host memory to
918 * validate firmware */
919 hdr = kmalloc(bytes, M_DEVBUF, M_NOWAIT);
921 device_printf(sc->dev, "could not kmalloc firmware hdr\n");
924 bus_space_read_region_1(rman_get_bustag(sc->mem_res),
925 rman_get_bushandle(sc->mem_res),
926 hdr_offset, (char *)hdr, bytes);
927 status = mxge_validate_firmware(sc, hdr);
/* the version fields are already stored in the softc, so the copy can be released */
928 kfree(hdr, M_DEVBUF);
931 * check to see if adopted firmware has bug where adopting
932 * it will cause broadcasts to be filtered unless the NIC
933 * is kept in ALLMULTI mode
935 if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
936 sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
937 sc->adopted_rx_filter_bug = 1;
938 device_printf(sc->dev, "Adopting fw %d.%d.%d: "
939 "working around rx filter bug\n",
940 sc->fw_ver_major, sc->fw_ver_minor,
/*
 * Put firmware on the NIC and hand control to it.
 *
 *   sc    - softc
 *   adopt - callers in this file pass 1 from mxge_firmware_probe() and
 *           0 from mxge_select_firmware(); the test that uses it is not
 *           visible in this excerpt
 *
 * Visible flow: mxge_load_firmware_helper() is tried first with size
 * initialised to sc->sram_size.  The fallback path calls
 * mxge_adopt_running_firmware(), logs the outcome, and switches to
 * mxge_fw_unaligned with a 2048-byte tx_boundary.  Otherwise a handoff
 * request is built in an 8-byte aligned stack buffer: confirmation
 * address (high, low), confirmation data 0xffffffff, code start
 * MXGE_FW_OFFSET + 8, length size - 8, copy destination 8 and jump
 * address 0, all big-endian.  64 bytes are copied to
 * sc->sram + MXGEFW_BOOT_HANDOFF and the confirmation word is polled
 * for 0xffffffff for at most 20 iterations, syncing the response map.
 * A failed handoff is logged; return values are not visible in this
 * excerpt.
 */
949 mxge_load_firmware(mxge_softc_t *sc, int adopt)
951 volatile uint32_t *confirm;
952 volatile char *submit;
954 uint32_t *buf, size, dma_low, dma_high;
957 buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
959 size = sc->sram_size;
960 status = mxge_load_firmware_helper(sc, &size);
964 /* Try to use the currently running firmware, if
966 status = mxge_adopt_running_firmware(sc);
968 device_printf(sc->dev,
969 "failed to adopt running firmware\n");
972 device_printf(sc->dev,
973 "Successfully adopted running firmware\n");
974 if (sc->tx_boundary == 4096) {
975 device_printf(sc->dev,
976 "Using firmware currently running on NIC"
978 device_printf(sc->dev,
979 "performance consider loading optimized "
982 sc->fw_name = mxge_fw_unaligned;
983 sc->tx_boundary = 2048;
986 /* clear confirmation addr */
987 confirm = (volatile uint32_t *)sc->cmd;
990 /* send a reload command to the bootstrap MCP, and wait for the
991 response in the confirmation address. The firmware should
992 write a -1 there to indicate it is alive and well
995 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
996 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
998 buf[0] = htobe32(dma_high); /* confirm addr MSW */
999 buf[1] = htobe32(dma_low); /* confirm addr LSW */
1000 buf[2] = htobe32(0xffffffff); /* confirm data */
1002 /* FIX: All newest firmware should un-protect the bottom of
1003 the sram before handoff. However, the very first interfaces
1004 do not. Therefore the handoff copy must skip the first 8 bytes
1006 /* where the code starts*/
1007 buf[3] = htobe32(MXGE_FW_OFFSET + 8);
1008 buf[4] = htobe32(size - 8); /* length of code */
1009 buf[5] = htobe32(8); /* where to copy to */
1010 buf[6] = htobe32(0); /* where to jump to */
1012 submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
1013 mxge_pio_copy(submit, buf, 64);
1018 while (*confirm != 0xffffffff && i < 20) {
1021 bus_dmamap_sync(sc->cmd_dma.dmat,
1022 sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
1024 if (*confirm != 0xffffffff) {
1025 device_printf(sc->dev,"handoff failed (%p = 0x%x)",
/*
 * Program the station address from sc->mac_addr into the firmware
 * with MXGEFW_SET_MAC_ADDRESS.
 *
 * The six address bytes are packed most-significant first: bytes 0-3
 * into data0 and bytes 4-5 into the low 16 bits of data1.
 */
1034 mxge_update_mac_address(mxge_softc_t *sc)
1037 uint8_t *addr = sc->mac_addr;
1041 cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1042 | (addr[2] << 8) | addr[3]);
1044 cmd.data1 = ((addr[4] << 8) | (addr[5]));
1046 status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
/*
 * Turn link-level flow control on or off in the firmware by sending
 * MXGEFW_ENABLE_FLOW_CONTROL or MXGEFW_DISABLE_FLOW_CONTROL.  The
 * condition that picks between the two (presumably the pause argument)
 * and the update of sc->pause are not visible in this excerpt.  A
 * failed command is logged.
 */
1051 mxge_change_pause(mxge_softc_t *sc, int pause)
1057 status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1060 status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1064 device_printf(sc->dev, "Failed to set flow control mode\n");
/*
 * Turn promiscuous reception on or off in the firmware by sending
 * MXGEFW_ENABLE_PROMISC or MXGEFW_DISABLE_PROMISC.  The global
 * mxge_always_promisc is checked first; its effect (presumably forcing
 * promisc on) is not visible in this excerpt.  A failed command is
 * logged.
 */
1072 mxge_change_promisc(mxge_softc_t *sc, int promisc)
1077 if (mxge_always_promisc)
1081 status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1084 status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1088 device_printf(sc->dev, "Failed to set promisc mode\n");
/*
 * Re-program the firmware's multicast filter from the interface's
 * multicast address list.
 *
 * Sequence visible here:
 *  1. stop if the firmware has no multicast support;
 *  2. send MXGEFW_ENABLE_ALLMULTI so nothing is filtered during the
 *     update;
 *  3. stop, leaving filtering off, if the adopted firmware has the rx
 *     filter bug or the interface has IFF_ALLMULTI set;
 *  4. send MXGEFW_LEAVE_ALL_MULTICAST_GROUPS to flush the old filters;
 *  5. under the interface's multicast read lock, send
 *     MXGEFW_JOIN_MULTICAST_GROUP for every AF_LINK entry;
 *  6. send MXGEFW_DISABLE_ALLMULTI to turn filtering back on.
 * Every failed command is logged with its status.  A failed join drops
 * the lock and aborts with filtering left off.
 */
1093 mxge_set_multicast_list(mxge_softc_t *sc)
1096 struct ifmultiaddr *ifma;
1097 struct ifnet *ifp = sc->ifp;
1100 /* This firmware is known to not support multicast */
1101 if (!sc->fw_multicast_support)
1104 /* Disable multicast filtering while we play with the lists*/
1105 err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1107 device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
1108 " error status: %d\n", err);
1112 if (sc->adopted_rx_filter_bug)
1115 if (ifp->if_flags & IFF_ALLMULTI)
1116 /* request to disable multicast filtering, so quit here */
1119 /* Flush all the filters */
1121 err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1123 device_printf(sc->dev,
1124 "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
1125 ", error status: %d\n", err);
1129 /* Walk the multicast list, and add each address */
1131 if_maddr_rlock(ifp);
1132 LIST_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1133 if (ifma->ifma_addr->sa_family != AF_LINK)
/* the link-level address is copied in two pieces: from its start, and from offset 4 */
1135 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1137 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
/* htonl() is applied here; mxge_send_cmd() applies htobe32() to the fields again */
1139 cmd.data0 = htonl(cmd.data0);
1140 cmd.data1 = htonl(cmd.data1);
1141 err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1143 device_printf(sc->dev, "Failed "
1144 "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
1146 /* abort, leaving multicast filtering off */
1147 if_maddr_runlock(ifp);
1151 if_maddr_runlock(ifp);
1152 /* Enable multicast filtering */
1153 err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1155 device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
1156 ", error status: %d\n", err);
/*
 * Work out the largest MTU the driver can offer.
 *
 * Returns MXGEFW_MAX_MTU - MXGEFW_PAD when a page-sized jumbo buffer
 * (less the pad) already exceeds the firmware maximum, or when the
 * MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS probe leads to the second return
 * (the test on its status is not visible in this excerpt).  Otherwise
 * returns MJUMPAGESIZE - MXGEFW_PAD.
 */
1161 mxge_max_mtu(mxge_softc_t *sc)
1166 if (MJUMPAGESIZE - MXGEFW_PAD > MXGEFW_MAX_MTU)
1167 return MXGEFW_MAX_MTU - MXGEFW_PAD;
1169 /* try to set nbufs to see if we can
1170 use virtually contiguous jumbos */
1172 status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1175 return MXGEFW_MAX_MTU - MXGEFW_PAD;
1177 /* otherwise, we're limited to MJUMPAGESIZE */
1178 return MJUMPAGESIZE - MXGEFW_PAD;
/*
 * Reset the firmware and re-establish the state shared between driver
 * and NIC.
 *
 *   sc               - softc
 *   interrupts_setup - when non-zero, the per-slice interrupt queue DMA
 *                      addresses are handed to the firmware
 *
 * Order of operations visible here:
 *  1. MXGEFW_CMD_RESET (failure is logged), then mxge_dummy_rdma(sc, 1);
 *  2. MXGEFW_CMD_SET_INTRQ_SIZE with sc->rx_ring_size;
 *  3. with more than one slice: MXGEFW_CMD_GET_MAX_RSS_QUEUES, then
 *     MXGEFW_CMD_ENABLE_RSS_QUEUES with one interrupt per slice (plus
 *     multiple TX queues under IFNET_BUF_RING);
 *  4. if interrupts_setup: clear each slice's rx_done ring and send its
 *     bus address with MXGEFW_CMD_SET_INTRQ_DMA;
 *  5. fetch the SRAM offsets of the interrupt coalescing delay, IRQ ack
 *     and IRQ deassert registers, then write the current coalescing
 *     delay big-endian;
 *  6. run the DMA benchmark, ignoring its result;
 *  7. per slice: set irq_claim to the ack register plus 2 * slice words
 *     and zero the counters shown, including the firmware stats fields
 *     when fw_stats is present;
 *  8. set rdma_tags_available to 15 and re-apply the MAC address,
 *     promisc, pause and multicast settings.
 * Statuses in steps 4-5 are OR-ed together and checked once.  Return
 * values are not visible in this excerpt.
 */
1182 mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1184 struct mxge_slice_state *ss;
1185 mxge_rx_done_t *rx_done;
1186 volatile uint32_t *irq_claim;
1190 /* try to send a reset command to the card to see if it
1192 memset(&cmd, 0, sizeof (cmd));
1193 status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1195 device_printf(sc->dev, "failed reset\n");
1199 mxge_dummy_rdma(sc, 1);
1202 /* set the intrq size */
1203 cmd.data0 = sc->rx_ring_size;
1204 status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1207 * Even though we already know how many slices are supported
1208 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
1209 * has magic side effects, and must be called after a reset.
1210 * It must be called prior to calling any RSS related cmds,
1211 * including assigning an interrupt queue for anything but
1212 * slice 0. It must also be called *after*
1213 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1214 * the firmware to compute offsets.
1217 if (sc->num_slices > 1) {
1218 /* ask the maximum number of slices it supports */
1219 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
1222 device_printf(sc->dev,
1223 "failed to get number of slices\n");
1227 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1228 * to setting up the interrupt queue DMA
1230 cmd.data0 = sc->num_slices;
1231 cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
1232 #ifdef IFNET_BUF_RING
1233 cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1235 status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
1238 device_printf(sc->dev,
1239 "failed to set number of slices\n");
1245 if (interrupts_setup) {
1246 /* Now exchange information about interrupts */
1247 for (slice = 0; slice < sc->num_slices; slice++) {
1248 rx_done = &sc->ss[slice].rx_done;
1249 memset(rx_done->entry, 0, sc->rx_ring_size);
1250 cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
1251 cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
1253 status |= mxge_send_cmd(sc,
1254 MXGEFW_CMD_SET_INTRQ_DMA,
1259 status |= mxge_send_cmd(sc,
1260 MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
1263 sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1265 status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1266 irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1269 status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
1271 sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1273 device_printf(sc->dev, "failed set interrupt parameters\n");
1278 *sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1281 /* run a DMA benchmark */
1282 (void) mxge_dma_test(sc, MXGEFW_DMA_TEST);
1284 for (slice = 0; slice < sc->num_slices; slice++) {
1285 ss = &sc->ss[slice];
1287 ss->irq_claim = irq_claim + (2 * slice);
1288 /* reset mcp/driver shared state back to 0 */
1289 ss->rx_done.idx = 0;
1290 ss->rx_done.cnt = 0;
1293 ss->tx.pkt_done = 0;
1294 ss->tx.queue_active = 0;
1295 ss->tx.activate = 0;
1296 ss->tx.deactivate = 0;
1301 ss->rx_small.cnt = 0;
1302 ss->lro_bad_csum = 0;
1304 ss->lro_flushed = 0;
1305 if (ss->fw_stats != NULL) {
1306 ss->fw_stats->valid = 0;
1307 ss->fw_stats->send_done_count = 0;
1310 sc->rdma_tags_available = 15;
1311 status = mxge_update_mac_address(sc);
1312 mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
1313 mxge_change_pause(sc, sc->pause);
1314 mxge_set_multicast_list(sc);
/*
 * Sysctl handler for the interrupt coalescing delay.
 *
 * The current value is exposed through sysctl_handle_int().  A new
 * value that equals the current one, is zero, or exceeds 1000*1000 is
 * singled out (the resulting return codes are not visible in this
 * excerpt).  An accepted value is written big-endian through
 * sc->intr_coal_delay_ptr and stored in sc->intr_coal_delay while
 * holding sc->driver_lock.
 */
1319 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1322 unsigned int intr_coal_delay;
1326 intr_coal_delay = sc->intr_coal_delay;
1327 err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1331 if (intr_coal_delay == sc->intr_coal_delay)
1334 if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1337 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
1338 *sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1339 sc->intr_coal_delay = intr_coal_delay;
1341 lockmgr(&sc->driver_lock, LK_RELEASE);
/*
 * Sysctl handler for the flow-control switch.
 *
 * Exposes sc->pause through sysctl_handle_int().  When the requested
 * value differs from the current one, mxge_change_pause() is called
 * with sc->driver_lock held and its result becomes err.
 */
1346 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1349 unsigned int enabled;
1353 enabled = sc->pause;
1354 err = sysctl_handle_int(oidp, &enabled, arg2, req);
1358 if (enabled == sc->pause)
1361 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
1362 err = mxge_change_pause(sc, enabled);
1363 lockmgr(&sc->driver_lock, LK_RELEASE);
/*
 * Apply a new LRO count.  The name and the caller (mxge_change_lro())
 * indicate that sc->driver_lock is held on entry.
 *
 * IFCAP_LRO is cleared from or set in the interface's enabled
 * capabilities (the condition, presumably lro_cnt == 0, is not visible
 * in this excerpt), sc->lro_cnt is updated, and a running interface is
 * re-opened with mxge_open() so the change takes effect.
 */
1368 mxge_change_lro_locked(mxge_softc_t *sc, int lro_cnt)
1375 ifp->if_capenable &= ~IFCAP_LRO;
1377 ifp->if_capenable |= IFCAP_LRO;
1378 sc->lro_cnt = lro_cnt;
1379 if (ifp->if_flags & IFF_RUNNING) {
1381 err = mxge_open(sc);
/*
 * Sysctl handler for the LRO count.
 *
 * Exposes sc->lro_cnt through sysctl_handle_int().  When the requested
 * value differs from the current one, mxge_change_lro_locked() is
 * called with sc->driver_lock held.  Any range check on the new value
 * is not visible in this excerpt.
 */
1387 mxge_change_lro(SYSCTL_HANDLER_ARGS)
1390 unsigned int lro_cnt;
1394 lro_cnt = sc->lro_cnt;
1395 err = sysctl_handle_int(oidp, &lro_cnt, arg2, req);
1399 if (lro_cnt == sc->lro_cnt)
1405 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
1406 err = mxge_change_lro_locked(sc, lro_cnt);
1407 lockmgr(&sc->driver_lock, LK_RELEASE);
/*
 * Sysctl handler for 32-bit counters that the firmware keeps in
 * big-endian order: the value arg1 points to is converted to host
 * order into arg2 before sysctl_handle_int() is called.
 */
1412 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1418 arg2 = be32toh(*(int *)arg1);
1420 err = sysctl_handle_int(oidp, arg1, arg2, req);
/*
 * Tear down the per-slice sysctl trees.
 *
 * Nothing is done when the slice parent tree was never created.
 * Otherwise each slice that has a sysctl tree gets its context freed
 * and its tree pointer cleared; the parent slice context is then freed
 * and its tree pointer cleared.
 */
1426 mxge_rem_sysctls(mxge_softc_t *sc)
1428 struct mxge_slice_state *ss;
1431 if (sc->slice_sysctl_tree == NULL)
1434 for (slice = 0; slice < sc->num_slices; slice++) {
1435 ss = &sc->ss[slice];
1436 if (ss == NULL || ss->sysctl_tree == NULL)
1438 sysctl_ctx_free(&ss->sysctl_ctx);
1439 ss->sysctl_tree = NULL;
1441 sysctl_ctx_free(&sc->slice_sysctl_ctx);
1442 sc->slice_sysctl_tree = NULL;
/*
 * Create the sysctl tree for this device.  A node named after the device
 * (device_get_nameunit()) is added under the static "hw" tree and filled
 * with read-only information, tunables that are routed through handler
 * procs, and firmware statistics that are exported through
 * mxge_handle_be32.  A "slice" sub-node is then created, holding one child
 * per slice (named by slice number) with that slice's rx, LRO and tx
 * counters.
 */
1446 mxge_add_sysctls(mxge_softc_t *sc)
1448 struct sysctl_ctx_list *ctx;
1449 struct sysctl_oid_list *children;
1451 struct mxge_slice_state *ss;
1455 ctx = &sc->sysctl_ctx;
1456 sysctl_ctx_init(ctx);
1457 sc->sysctl_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_STATIC_CHILDREN(_hw),
1459 device_get_nameunit(sc->dev),
1461 if (sc->sysctl_tree == NULL) {
1462 device_printf(sc->dev, "can't add sysctl node\n");
1466 children = SYSCTL_CHILDREN(sc->sysctl_tree);
1467 fw = sc->ss[0].fw_stats;
1469 /* random information */
1470 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1472 CTLFLAG_RD, &sc->fw_version,
1473 0, "firmware version");
1474 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1476 CTLFLAG_RD, &sc->serial_number_string,
1477 0, "serial number");
1478 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1480 CTLFLAG_RD, &sc->product_code_string,
1482 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1484 CTLFLAG_RD, &sc->link_width,
1486 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1488 CTLFLAG_RD, &sc->tx_boundary,
1490 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1492 CTLFLAG_RD, &sc->wc,
1493 0, "write combining PIO?");
1494 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1496 CTLFLAG_RD, &sc->read_dma,
1497 0, "DMA Read speed in MB/s");
1498 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1500 CTLFLAG_RD, &sc->write_dma,
1501 0, "DMA Write speed in MB/s");
1502 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1503 "read_write_dma_MBs",
1504 CTLFLAG_RD, &sc->read_write_dma,
1505 0, "DMA concurrent Read/Write speed in MB/s");
1508 /* performance related tunables */
1509 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1511 CTLTYPE_INT|CTLFLAG_RW, sc,
1512 0, mxge_change_intr_coal,
1513 "I", "interrupt coalescing delay in usecs");
1515 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1516 "flow_control_enabled",
1517 CTLTYPE_INT|CTLFLAG_RW, sc,
1518 0, mxge_change_flow_control,
1519 "I", "flow control (pause) enabled");
1521 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1523 CTLFLAG_RW, &mxge_deassert_wait,
1524 0, "Wait for IRQ line to go low in ihandler");
1526 /* stats block from firmware is in network byte order.
1528 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1530 CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1531 0, mxge_handle_be32,
1533 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1534 "rdma_tags_available",
1535 CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1536 0, mxge_handle_be32,
1537 "I", "rdma_tags_available");
1538 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1539 "dropped_bad_crc32",
1540 CTLTYPE_INT|CTLFLAG_RD,
1541 &fw->dropped_bad_crc32,
1542 0, mxge_handle_be32,
1543 "I", "dropped_bad_crc32");
1544 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1546 CTLTYPE_INT|CTLFLAG_RD,
1547 &fw->dropped_bad_phy,
1548 0, mxge_handle_be32,
1549 "I", "dropped_bad_phy");
1550 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1551 "dropped_link_error_or_filtered",
1552 CTLTYPE_INT|CTLFLAG_RD,
1553 &fw->dropped_link_error_or_filtered,
1554 0, mxge_handle_be32,
1555 "I", "dropped_link_error_or_filtered");
1556 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1557 "dropped_link_overflow",
1558 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1559 0, mxge_handle_be32,
1560 "I", "dropped_link_overflow");
1561 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1562 "dropped_multicast_filtered",
1563 CTLTYPE_INT|CTLFLAG_RD,
1564 &fw->dropped_multicast_filtered,
1565 0, mxge_handle_be32,
1566 "I", "dropped_multicast_filtered");
1567 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1568 "dropped_no_big_buffer",
1569 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1570 0, mxge_handle_be32,
1571 "I", "dropped_no_big_buffer");
1572 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1573 "dropped_no_small_buffer",
1574 CTLTYPE_INT|CTLFLAG_RD,
1575 &fw->dropped_no_small_buffer,
1576 0, mxge_handle_be32,
1577 "I", "dropped_no_small_buffer");
1578 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1580 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1581 0, mxge_handle_be32,
1582 "I", "dropped_overrun");
1583 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1585 CTLTYPE_INT|CTLFLAG_RD,
1587 0, mxge_handle_be32,
1588 "I", "dropped_pause");
1589 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1591 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1592 0, mxge_handle_be32,
1593 "I", "dropped_runt");
1595 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1596 "dropped_unicast_filtered",
1597 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
1598 0, mxge_handle_be32,
1599 "I", "dropped_unicast_filtered");
1601 /* verbose printing? */
1602 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1604 CTLFLAG_RW, &mxge_verbose,
1605 0, "verbose printing");
1608 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1610 CTLTYPE_INT|CTLFLAG_RW, sc,
1612 "I", "number of lro merge queues");
1615 /* add counters exported for debugging from all slices */
1616 sysctl_ctx_init(&sc->slice_sysctl_ctx);
1617 sc->slice_sysctl_tree =
1618 SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1619 "slice", CTLFLAG_RD, 0, "");
1621 for (slice = 0; slice < sc->num_slices; slice++) {
1622 ss = &sc->ss[slice];
1623 sysctl_ctx_init(&ss->sysctl_ctx);
1624 ctx = &ss->sysctl_ctx;
1625 children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1626 ksprintf(slice_num, "%d", slice);
1628 SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1630 children = SYSCTL_CHILDREN(ss->sysctl_tree);
1631 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1633 CTLFLAG_RD, &ss->rx_small.cnt,
1635 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1637 CTLFLAG_RD, &ss->rx_big.cnt,
1639 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1640 "lro_flushed", CTLFLAG_RD, &ss->lro_flushed,
1641 0, "number of lro merge queues flushed");
1643 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1644 "lro_queued", CTLFLAG_RD, &ss->lro_queued,
1645 0, "number of frames appended to lro merge"
1648 #ifndef IFNET_BUF_RING
1649 /* only transmit from slice 0 for now */
1653 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1655 CTLFLAG_RD, &ss->tx.req,
1658 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1660 CTLFLAG_RD, &ss->tx.done,
1662 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1664 CTLFLAG_RD, &ss->tx.pkt_done,
1666 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1668 CTLFLAG_RD, &ss->tx.stall,
1670 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1672 CTLFLAG_RD, &ss->tx.wake,
1674 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1676 CTLFLAG_RD, &ss->tx.defrag,
1678 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1680 CTLFLAG_RD, &ss->tx.queue_active,
1681 0, "tx_queue_active");
1682 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1684 CTLFLAG_RD, &ss->tx.activate,
1686 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1688 CTLFLAG_RD, &ss->tx.deactivate,
1689 0, "tx_deactivate");
/*
 * Copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy backwards, one
 * request at a time, and handle ring wraps: the destination slot for each
 * request is (tx->req + cnt) masked with tx->mask, and each request is
 * written to tx->lanai[] with mxge_pio_copy().
 */
1697 mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1698 mcp_kreq_ether_send_t *src, int cnt)
1700 int idx, starting_slot;
1701 starting_slot = tx->req;
1704 idx = (starting_slot + cnt) & tx->mask;
1705 mxge_pio_copy(&tx->lanai[idx],
1706 &src[cnt], sizeof(*src));
1712 * copy an array of mcp_kreq_ether_send_t's to the mcp. Copy
1713 * at most 32 bytes at a time, so as to avoid involving the software
1714 * pio handler in the nic. We re-write the first segment's flags
1715 * to mark them valid only after writing the entire chain
/*
 * Hand cnt send requests to the NIC, starting at ring slot
 * tx->req & tx->mask of tx->lanai[].  The first request's flags are saved in
 * last_flags on entry.  When (idx + cnt) is below tx->mask the requests are
 * copied in pairs with a wmb() after each pair; otherwise
 * mxge_submit_req_backwards() is used.  After a final wmb() the saved flags
 * are put back into src->flags and the 32-bit word containing them is
 * written to the NIC through dst_ints.
 */
1719 mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
1724 volatile uint32_t *dst_ints;
1725 mcp_kreq_ether_send_t *srcp;
1726 volatile mcp_kreq_ether_send_t *dstp, *dst;
1729 idx = tx->req & tx->mask;
1731 last_flags = src->flags;
1734 dst = dstp = &tx->lanai[idx];
1737 if ((idx + cnt) < tx->mask) {
1738 for (i = 0; i < (cnt - 1); i += 2) {
1739 mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1740 wmb(); /* force write every 32 bytes */
1745 /* submit all but the first request, and ensure
1746 that it is submitted below */
1747 mxge_submit_req_backwards(tx, src, cnt);
1751 /* submit the first request */
1752 mxge_pio_copy(dstp, srcp, sizeof(*src));
1753 wmb(); /* barrier before setting valid flag */
1756 /* re-write the last 32-bits with the valid flags */
1757 src->flags = last_flags;
1758 src_ints = (uint32_t *)src;
1760 dst_ints = (volatile uint32_t *)dst;
1762 *dst_ints = *src_ints;
/*
 * Build and submit the send-request list for a TSO packet.
 *
 * ss             - slice whose tx ring and scratch buffer are used
 * m              - packet; m->m_pkthdr.tso_segsz supplies the mss
 * busdma_seg_cnt - number of busdma segments in tx->seg_list
 * ip_off         - offset of the IP header from the start of the frame
 *
 * The IP and TCP headers are read from the first mbuf or, when the first
 * mbuf is too short to hold them, from a copy made into ss->scratch.
 * cum_len starts at minus the total header length so the loop can tell
 * header bytes from payload.  Each busdma segment is turned into send
 * requests, rdma_count is filled in retroactively, and the list is passed to
 * mxge_submit_req().  If more than tx->max_desc requests would be needed the
 * DMA map is unloaded and a message is printed.
 */
1770 mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1771 int busdma_seg_cnt, int ip_off)
1774 mcp_kreq_ether_send_t *req;
1775 bus_dma_segment_t *seg;
1778 uint32_t low, high_swapped;
1779 int len, seglen, cum_len, cum_len_next;
1780 int next_is_first, chop, cnt, rdma_count, small;
1781 uint16_t pseudo_hdr_offset, cksum_offset, mss;
1782 uint8_t flags, flags_next;
1785 mss = m->m_pkthdr.tso_segsz;
1787 /* negative cum_len signifies to the
1788 * send loop that we are still in the
1789 * header portion of the TSO packet.
1792 /* ensure we have the ethernet, IP and TCP
1793 header together in the first mbuf, copy
1794 it to a scratch buffer if not */
1795 if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
1796 m_copydata(m, 0, ip_off + sizeof (*ip),
1798 ip = (struct ip *)(ss->scratch + ip_off);
1800 ip = (struct ip *)(mtod(m, char *) + ip_off);
1802 if (__predict_false(m->m_len < ip_off + (ip->ip_hl << 2)
1804 m_copydata(m, 0, ip_off + (ip->ip_hl << 2)
1805 + sizeof (*tcp), ss->scratch);
/* the headers now live in the scratch copy; the first mbuf is too
   short to hold the TCP header, so it must not be parsed in place */
1806 ip = (struct ip *)(ss->scratch + ip_off);
1809 tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
1810 cum_len = -(ip_off + ((ip->ip_hl + tcp->th_off) << 2));
1812 /* TSO implies checksum offload on this hardware */
1813 cksum_offset = ip_off + (ip->ip_hl << 2);
1814 flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1817 /* for TSO, pseudo_hdr_offset holds mss.
1818 * The firmware figures out where to put
1819 * the checksum by parsing the header. */
1820 pseudo_hdr_offset = htobe16(mss);
1827 /* "rdma_count" is the number of RDMAs belonging to the
1828 * current packet BEFORE the current send request. For
1829 * non-TSO packets, this is equal to "count".
1830 * For TSO packets, rdma_count needs to be reset
1831 * to 0 after a segment cut.
1833 * The rdma_count field of the send request is
1834 * the number of RDMAs of the packet starting at
1835 * that request. For TSO send requests with one or more cuts
1836 * in the middle, this is the number of RDMAs starting
1837 * after the last cut in the request. All previous
1838 * segments before the last cut implicitly have 1 RDMA.
1840 * Since the number of RDMAs is not known beforehand,
1841 * it must be filled-in retroactively - after each
1842 * segmentation cut or at the end of the entire packet.
1845 while (busdma_seg_cnt) {
1846 /* Break the busdma segment up into pieces*/
1847 low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1848 high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1852 flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1854 cum_len_next = cum_len + seglen;
1855 (req-rdma_count)->rdma_count = rdma_count + 1;
1856 if (__predict_true(cum_len >= 0)) {
1858 chop = (cum_len_next > mss);
1859 cum_len_next = cum_len_next % mss;
1860 next_is_first = (cum_len_next == 0);
1861 flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1862 flags_next |= next_is_first *
1864 rdma_count |= -(chop | next_is_first);
1865 rdma_count += chop & !next_is_first;
1866 } else if (cum_len_next >= 0) {
1871 small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1872 flags_next = MXGEFW_FLAGS_TSO_PLD |
1873 MXGEFW_FLAGS_FIRST |
1874 (small * MXGEFW_FLAGS_SMALL);
1877 req->addr_high = high_swapped;
1878 req->addr_low = htobe32(low);
1879 req->pseudo_hdr_offset = pseudo_hdr_offset;
1881 req->rdma_count = 1;
1882 req->length = htobe16(seglen);
1883 req->cksum_offset = cksum_offset;
1884 req->flags = flags | ((cum_len & 1) *
1885 MXGEFW_FLAGS_ALIGN_ODD);
1888 cum_len = cum_len_next;
1893 if (__predict_false(cksum_offset > seglen))
1894 cksum_offset -= seglen;
1897 if (__predict_false(cnt > tx->max_desc))
1903 (req-rdma_count)->rdma_count = rdma_count;
1907 req->flags |= MXGEFW_FLAGS_TSO_LAST;
1908 } while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1910 tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1911 mxge_submit_req(tx, tx->req_list, cnt);
1912 #ifdef IFNET_BUF_RING
1913 if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1914 /* tell the NIC to start polling this slice */
1916 tx->queue_active = 1;
1924 bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1928 kprintf("tx->max_desc exceeded via TSO!\n");
1929 kprintf("mss = %d, %ld, %d!\n", mss,
1930 (long)seg - (long)tx->seg_list, tx->max_desc);
1937 #endif /* IFCAP_TSO4 */
1939 #ifdef MXGE_NEW_VLAN_API
1941 * We reproduce the software vlan tag insertion from
1942 * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
1943 * vlan tag insertion. We need to advertise this in order to have the
1944 * vlan interface respect our csum offload flags.
/*
 * Insert an 802.1Q tag into the frame in software.  EVL_ENCAPLEN bytes are
 * prepended to the mbuf (M_PREPEND, then m_pullup() if the first mbuf is
 * shorter than an ether_vlan_header), the Ethernet addresses are moved to
 * the new front, the encapsulation type is set to ETHERTYPE_VLAN, the tag is
 * taken from m->m_pkthdr.ether_vlantag, and M_VLANTAG is cleared.
 * m is checked for NULL after both M_PREPEND and m_pullup().
 */
1946 static struct mbuf *
1947 mxge_vlan_tag_insert(struct mbuf *m)
1949 struct ether_vlan_header *evl;
1951 M_PREPEND(m, EVL_ENCAPLEN, MB_DONTWAIT);
1952 if (__predict_false(m == NULL))
1954 if (m->m_len < sizeof(*evl)) {
1955 m = m_pullup(m, sizeof(*evl));
1956 if (__predict_false(m == NULL))
1960 * Transform the Ethernet header into an Ethernet header
1961 * with 802.1Q encapsulation.
1963 evl = mtod(m, struct ether_vlan_header *);
1964 bcopy((char *)evl + EVL_ENCAPLEN,
1965 (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
1966 evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
1967 evl->evl_tag = htons(m->m_pkthdr.ether_vlantag);
1968 m->m_flags &= ~M_VLANTAG;
1971 #endif /* MXGE_NEW_VLAN_API */
/*
 * Turn one outgoing mbuf chain into send requests and submit them.
 *
 * With MXGE_NEW_VLAN_API, a frame carrying M_VLANTAG first gets its tag
 * inserted by mxge_vlan_tag_insert() and ip_off grows by EVL_ENCAPLEN.  The
 * frame is then loaded into the DMA map of slot tx->req & tx->mask; on EFBIG
 * the chain is defragmented with m_defrag() and the load is retried.
 * Packets flagged CSUM_TSO are passed to mxge_encap_tso().  Otherwise a
 * request is filled in per DMA segment (with checksum-offload fields when
 * CSUM_DELAY_DATA is set), runts are padded to 60 bytes from
 * sc->zeropad_dma, the first request's rdma_count is set to cnt, and the
 * list goes to mxge_submit_req().
 */
1974 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
1977 mcp_kreq_ether_send_t *req;
1978 bus_dma_segment_t *seg;
1983 int cnt, cum_len, err, i, idx, odd_flag, ip_off;
1984 uint16_t pseudo_hdr_offset;
1985 uint8_t flags, cksum_offset;
1992 ip_off = sizeof (struct ether_header);
1993 #ifdef MXGE_NEW_VLAN_API
1994 if (m->m_flags & M_VLANTAG) {
1995 m = mxge_vlan_tag_insert(m);
1996 if (__predict_false(m == NULL))
1998 ip_off += EVL_ENCAPLEN;
2001 /* (try to) map the frame for DMA */
2002 idx = tx->req & tx->mask;
2003 err = bus_dmamap_load_mbuf_segment(tx->dmat, tx->info[idx].map,
2004 m, tx->seg_list, 1, &cnt,
2006 if (__predict_false(err == EFBIG)) {
2007 /* Too many segments in the chain. Try
2009 m_tmp = m_defrag(m, M_NOWAIT);
2010 if (m_tmp == NULL) {
2015 err = bus_dmamap_load_mbuf_segment(tx->dmat,
2017 m, tx->seg_list, 1, &cnt,
2020 if (__predict_false(err != 0)) {
2021 device_printf(sc->dev, "bus_dmamap_load_mbuf_segment returned %d"
2022 " packet len = %d\n", err, m->m_pkthdr.len);
2025 bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2026 BUS_DMASYNC_PREWRITE);
2027 tx->info[idx].m = m;
2030 /* TSO is different enough, we handle it in another routine */
2031 if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2032 mxge_encap_tso(ss, m, cnt, ip_off);
2039 pseudo_hdr_offset = 0;
2040 flags = MXGEFW_FLAGS_NO_TSO;
2042 /* checksum offloading? */
2043 if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
2044 /* ensure ip header is in first mbuf, copy
2045 it to a scratch buffer if not */
2046 if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
2047 m_copydata(m, 0, ip_off + sizeof (*ip),
2049 ip = (struct ip *)(ss->scratch + ip_off);
2051 ip = (struct ip *)(mtod(m, char *) + ip_off);
2053 cksum_offset = ip_off + (ip->ip_hl << 2);
2054 pseudo_hdr_offset = cksum_offset + m->m_pkthdr.csum_data;
2055 pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2056 req->cksum_offset = cksum_offset;
2057 flags |= MXGEFW_FLAGS_CKSUM;
2058 odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2062 if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2063 flags |= MXGEFW_FLAGS_SMALL;
2065 /* convert segments into a request list */
2068 req->flags = MXGEFW_FLAGS_FIRST;
2069 for (i = 0; i < cnt; i++) {
2071 htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2073 htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2074 req->length = htobe16(seg->ds_len);
2075 req->cksum_offset = cksum_offset;
2076 if (cksum_offset > seg->ds_len)
2077 cksum_offset -= seg->ds_len;
2080 req->pseudo_hdr_offset = pseudo_hdr_offset;
2081 req->pad = 0; /* complete solid 16-byte block */
2082 req->rdma_count = 1;
2083 req->flags |= flags | ((cum_len & 1) * odd_flag);
2084 cum_len += seg->ds_len;
2090 /* pad runts to 60 bytes */
2094 htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2096 htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2097 req->length = htobe16(60 - cum_len);
2098 req->cksum_offset = 0;
2099 req->pseudo_hdr_offset = pseudo_hdr_offset;
2100 req->pad = 0; /* complete solid 16-byte block */
2101 req->rdma_count = 1;
2102 req->flags |= flags | ((cum_len & 1) * odd_flag);
2106 tx->req_list[0].rdma_count = cnt;
2108 /* print what the firmware will see */
2109 for (i = 0; i < cnt; i++) {
2110 kprintf("%d: addr: 0x%x 0x%x len:%d pso%d,"
2111 "cso:%d, flags:0x%x, rdma:%d\n",
2112 i, (int)ntohl(tx->req_list[i].addr_high),
2113 (int)ntohl(tx->req_list[i].addr_low),
2114 (int)ntohs(tx->req_list[i].length),
2115 (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2116 tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2117 tx->req_list[i].rdma_count);
2119 kprintf("--------------\n");
2121 tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2122 mxge_submit_req(tx, tx->req_list, cnt);
2123 #ifdef IFNET_BUF_RING
2124 if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2125 /* tell the NIC to start polling this slice */
2127 tx->queue_active = 1;
2140 #ifdef IFNET_BUF_RING
/*
 * Drain the transmit buf_ring of every slice (IFNET_BUF_RING builds only).
 * For each slice the tx lock is taken exclusively, mbufs are dequeued with
 * buf_ring_dequeue_sc() until the ring is empty, and the lock is released.
 */
2142 mxge_qflush(struct ifnet *ifp)
2144 mxge_softc_t *sc = ifp->if_softc;
2149 for (slice = 0; slice < sc->num_slices; slice++) {
2150 tx = &sc->ss[slice].tx;
2151 lockmgr(&tx->lock, LK_EXCLUSIVE);
2152 while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2154 lockmgr(&tx->lock, LK_RELEASE);
/*
 * buf_ring variant of the transmit pump.  While the ring has more than
 * tx->max_desc free slots (tx->mask - (tx->req - tx->done)), packets are
 * pulled from tx->br with drbr_dequeue().  When slots run out and the ring
 * is not empty, IFF_OACTIVE is set in the slice's own if_flags.
 * NOTE(review): the "_locked" suffix suggests the caller holds the tx lock;
 * confirm against the callers.
 */
2160 mxge_start_locked(struct mxge_slice_state *ss)
2171 while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2172 m = drbr_dequeue(ifp, tx->br);
2176 /* let BPF see it */
2179 /* give it to the nic */
2182 /* ran out of transmit slots */
2183 if (((ss->if_flags & IFF_OACTIVE) == 0)
2184 && (!drbr_empty(ifp, tx->br))) {
2185 ss->if_flags |= IFF_OACTIVE;
/*
 * Transmit or queue one mbuf on a slice.  The slice's IFF_RUNNING and
 * IFF_OACTIVE flags are tested first, with drbr_enqueue() on tx->br as the
 * visible action for that test.  If the ring is empty and more than
 * tx->max_desc slots are free the packet is given to the NIC directly;
 * otherwise it is enqueued on tx->br.  Finally, if the ring is not empty,
 * mxge_start_locked() is called to push queued packets.
 */
2191 mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2202 if ((ss->if_flags & (IFF_RUNNING|IFF_OACTIVE)) !=
2204 err = drbr_enqueue(ifp, tx->br, m);
2208 if (drbr_empty(ifp, tx->br) &&
2209 ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2210 /* let BPF see it */
2212 /* give it to the nic */
2214 } else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2217 if (!drbr_empty(ifp, tx->br))
2218 mxge_start_locked(ss);
/*
 * Transmit entry point for IFNET_BUF_RING builds.
 *
 * ifp - interface; ifp->if_softc is the driver softc
 * m   - packet to send; m->m_pkthdr.flowid selects the slice
 *
 * The slice index is the flowid masked with (num_slices - 1).  The tx lock
 * of that slice is tried without sleeping (LK_NOWAIT).  lockmgr() returns 0
 * when the lock was obtained, so only in that case is the packet passed to
 * mxge_transmit_locked() and the lock released.  The drbr_enqueue() call
 * below is presumably the arm taken when the lock is busy.
 */
2223 mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2225 mxge_softc_t *sc = ifp->if_softc;
2226 struct mxge_slice_state *ss;
2232 slice = m->m_pkthdr.flowid;
2234 slice &= (sc->num_slices - 1); /* num_slices always power of 2 */
2236 ss = &sc->ss[slice];
2239 if (lockmgr(&tx->lock, LK_EXCLUSIVE|LK_NOWAIT) == 0) {
2240 err = mxge_transmit_locked(ss, m);
2241 lockmgr(&tx->lock, LK_RELEASE);
2243 err = drbr_enqueue(ifp, tx->br, m);
/*
 * Interface-queue variant of the transmit pump.  While the ring has more
 * than tx->max_desc free slots (tx->mask - (tx->req - tx->done)), packets
 * are taken from ifp->if_snd with ifq_dequeue().  When slots run out,
 * IFF_OACTIVE is set in the interface flags if it was not set already.
 * The caller visible in this file, mxge_start(), holds ss->tx.lock
 * exclusively around this call.
 */
2252 mxge_start_locked(struct mxge_slice_state *ss)
2262 while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2263 m = ifq_dequeue(&ifp->if_snd, NULL);
2267 /* let BPF see it */
2270 /* give it to the nic */
2273 /* ran out of transmit slots */
2274 if ((sc->ifp->if_flags & IFF_OACTIVE) == 0) {
2275 sc->ifp->if_flags |= IFF_OACTIVE;
/*
 * Start routine for the interface: runs mxge_start_locked() on a single
 * slice (only the first slice is used for now) while holding that slice's
 * tx lock exclusively.
 */
2281 mxge_start(struct ifnet *ifp)
2283 mxge_softc_t *sc = ifp->if_softc;
2284 struct mxge_slice_state *ss;
2286 /* only use the first slice for now */
2288 lockmgr(&ss->tx.lock, LK_EXCLUSIVE);
2289 mxge_start_locked(ss);
2290 lockmgr(&ss->tx.lock, LK_RELEASE);
2294 * copy an array of mcp_kreq_ether_recv_t's to the mcp. Copy
2295 * at most 32 bytes at a time, so as to avoid involving the software
2296 * pio handler in the nic. We re-write the first segment's low
2297 * DMA address to mark it valid only after we write the entire chunk
/*
 * Write eight receive descriptors from src to dst.  The first descriptor's
 * addr_low is saved and temporarily replaced with 0xffffffff, the
 * descriptors are copied in two mxge_pio_copy() calls of four each, and the
 * saved addr_low is then restored in src and written to dst as the last
 * step.
 */
2301 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2302 mcp_kreq_ether_recv_t *src)
2306 low = src->addr_low;
2307 src->addr_low = 0xffffffff;
2308 mxge_pio_copy(dst, src, 4 * sizeof (*src));
2310 mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2312 src->addr_low = low;
2313 dst->addr_low = low;
/*
 * Attach a fresh small receive buffer to slot idx of ss->rx_small.  An mbuf
 * header is obtained with m_gethdr(MB_DONTWAIT), loaded into the given DMA
 * map as a single segment, recorded in rx->info[idx].m, and its bus address
 * is stored big-endian in rx->shadow[idx].  The eight shadow entries ending
 * at idx are handed to the NIC with mxge_submit_8rx().
 */
2318 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2320 bus_dma_segment_t seg;
2322 mxge_rx_ring_t *rx = &ss->rx_small;
2325 m = m_gethdr(MB_DONTWAIT, MT_DATA);
2332 err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2333 &seg, 1, &cnt, BUS_DMA_NOWAIT);
2338 rx->info[idx].m = m;
2339 rx->shadow[idx].addr_low =
2340 htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2341 rx->shadow[idx].addr_high =
2342 htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2346 mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
/*
 * Attach a fresh big receive buffer to slot idx of ss->rx_big.  A cluster
 * mbuf is allocated with m_getcl() when rx->cl_size equals MCLBYTES and with
 * m_getjcl() otherwise, its length is set to rx->mlen, and it is loaded into
 * the given DMA map.  The mbuf is recorded in rx->info[idx].m and the bus
 * address stored big-endian in rx->shadow[idx]; with MXGE_VIRT_JUMBOS the
 * additional segments fill the following shadow entries.  For the
 * rx->nbufs slots involved, mxge_submit_8rx() is called whenever
 * (idx & 7) == 7.
 */
2351 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2353 bus_dma_segment_t seg[3];
2355 mxge_rx_ring_t *rx = &ss->rx_big;
2358 if (rx->cl_size == MCLBYTES)
2359 m = m_getcl(MB_DONTWAIT, MT_DATA, M_PKTHDR);
2361 m = m_getjcl(MB_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2367 m->m_len = rx->mlen;
2368 err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2369 seg, 1, &cnt, BUS_DMA_NOWAIT);
2374 rx->info[idx].m = m;
2375 rx->shadow[idx].addr_low =
2376 htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2377 rx->shadow[idx].addr_high =
2378 htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2380 #if MXGE_VIRT_JUMBOS
2381 for (i = 1; i < cnt; i++) {
2382 rx->shadow[idx + i].addr_low =
2383 htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2384 rx->shadow[idx + i].addr_high =
2385 htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2390 for (i = 0; i < rx->nbufs; i++) {
2391 if ((idx & 7) == 7) {
2392 mxge_submit_8rx(&rx->lanai[idx - 7],
2393 &rx->shadow[idx - 7]);
2401 * Myri10GE hardware checksums are not valid if the sender
2402 * padded the frame with non-zero padding. This is because
2403 * the firmware just does a simple 16-bit 1s complement
2404 * checksum across the entire frame, excluding the first 14
2405 * bytes. It is best to simply to check the checksum and
2406 * tell the stack about it only if the checksum is good
/*
 * Check the firmware-supplied checksum of a received frame.  Only IPv4
 * frames (ether_type ETHERTYPE_IP) carrying TCP or UDP are handled.  For
 * those, in_pseudo() is applied to the source address, the destination
 * address, and the sum of csum, the IP length minus the IP header length,
 * and the protocol number.  Callers in this file treat a return value of 0
 * as "checksum good".
 */
2409 static inline uint16_t
2410 mxge_rx_csum(struct mbuf *m, int csum)
2412 struct ether_header *eh;
2416 eh = mtod(m, struct ether_header *);
2418 /* only deal with IPv4 TCP & UDP for now */
2419 if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2421 ip = (struct ip *)(eh + 1);
2422 if (__predict_false(ip->ip_p != IPPROTO_TCP &&
2423 ip->ip_p != IPPROTO_UDP))
2426 c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2427 htonl(ntohs(csum) + ntohs(ip->ip_len) +
2428 - (ip->ip_hl << 2) + ip->ip_p));
/*
 * Strip the 802.1Q header from a received frame and correct the firmware
 * checksum for the removed bytes.  *csum is converted to host order, the
 * one's complement of the 32-bit word that follows the first ETHER_HDR_LEN
 * bytes is added with end-around carry, the sum is folded to 16 bits twice,
 * and the result is stored back in network order.  The tag value is saved
 * (in m_pkthdr.ether_vlantag with MXGE_NEW_VLAN_API, otherwise in an mbuf
 * tag), M_VLANTAG is set, the Ethernet addresses are copied forward over
 * the VLAN header, and m_adj() trims EVL_ENCAPLEN bytes from the front.
 */
2437 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2439 struct ether_vlan_header *evl;
2440 struct ether_header *eh;
2443 evl = mtod(m, struct ether_vlan_header *);
2444 eh = mtod(m, struct ether_header *);
2447 * fix checksum by subtracting EVL_ENCAPLEN bytes
2448 * after what the firmware thought was the end of the ethernet
2452 /* put checksum into host byte order */
2453 *csum = ntohs(*csum);
2454 partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2455 (*csum) += ~partial;
2456 (*csum) += ((*csum) < ~partial);
2457 (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2458 (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2460 /* restore checksum to network byte order;
2461 later consumers expect this */
2462 *csum = htons(*csum);
2465 #ifdef MXGE_NEW_VLAN_API
2466 m->m_pkthdr.ether_vlantag = ntohs(evl->evl_tag);
2470 mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2474 VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2475 m_tag_prepend(m, mtag);
2479 m->m_flags |= M_VLANTAG;
2482 * Remove the 802.1q header by copying the Ethernet
2483 * addresses over it and adjusting the beginning of
2484 * the data in the mbuf. The encapsulated Ethernet
2485 * type field is already in place.
2487 bcopy((char *)evl, (char *)evl + EVL_ENCAPLEN,
2488 ETHER_HDR_LEN - ETHER_TYPE_LEN);
2489 m_adj(m, EVL_ENCAPLEN);
/*
 * Process one received frame from the big-buffer ring.  The slot is
 * rx->cnt & rx->mask and rx->cnt advances by rx->nbufs.  A replacement
 * buffer is requested with mxge_get_buf_big() on rx->extra_map; if that
 * fails the frame is dropped and the old mbuf re-used.  Otherwise the old
 * map is synced and unloaded, the slot's map and rx->extra_map are swapped,
 * the data pointer is advanced by MXGEFW_PAD, and the lengths and rcvif are
 * set.  A VLAN header is removed with mxge_vlan_tag_remove().  When
 * sc->csum_flag is set and mxge_rx_csum() returns 0, the frame is offered
 * to mxge_lro_rx() (if sc->lro_cnt is non-zero) or marked as checksum-valid.
 * With more than one slice the slice index is stored as the flowid.  The
 * frame is then passed to ifp->if_input.
 */
2494 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2499 struct ether_header *eh;
2501 bus_dmamap_t old_map;
2503 uint16_t tcpudp_csum;
2508 idx = rx->cnt & rx->mask;
2509 rx->cnt += rx->nbufs;
2510 /* save a pointer to the received mbuf */
2511 m = rx->info[idx].m;
2512 /* try to replace the received mbuf */
2513 if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2514 /* drop the frame -- the old mbuf is re-cycled */
2519 /* unmap the received buffer */
2520 old_map = rx->info[idx].map;
2521 bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2522 bus_dmamap_unload(rx->dmat, old_map);
2524 /* swap the bus_dmamap_t's */
2525 rx->info[idx].map = rx->extra_map;
2526 rx->extra_map = old_map;
2528 /* mcp implicitly skips 1st 2 bytes so that packet is properly
2530 m->m_data += MXGEFW_PAD;
2532 m->m_pkthdr.rcvif = ifp;
2533 m->m_len = m->m_pkthdr.len = len;
2535 eh = mtod(m, struct ether_header *);
2536 if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2537 mxge_vlan_tag_remove(m, &csum);
2539 /* if the checksum is valid, mark it in the mbuf header */
2540 if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2541 if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2543 /* otherwise, it was a UDP frame, or a TCP frame which
2544 we could not do LRO on. Tell the stack that the
2546 m->m_pkthdr.csum_data = 0xffff;
2547 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2550 /* flowid only valid if RSS hashing is enabled */
2551 if (sc->num_slices > 1) {
2552 m->m_pkthdr.flowid = (ss - sc->ss);
2553 m->m_flags |= M_FLOWID;
2556 /* pass the frame up the stack */
2557 (*ifp->if_input)(ifp, m);
/*
 * Process one received frame from the small-buffer ring.  The slot is
 * rx->cnt & rx->mask.  A replacement buffer is requested with
 * mxge_get_buf_small() on rx->extra_map; if that fails the frame is dropped
 * and the old mbuf re-used.  Otherwise the old map is synced and unloaded,
 * the slot's map and rx->extra_map are swapped, the data pointer is advanced
 * by MXGEFW_PAD, and the lengths and rcvif are set.  A VLAN header is
 * removed with mxge_vlan_tag_remove().  When sc->csum_flag is set and
 * mxge_rx_csum() returns 0, the frame is offered to mxge_lro_rx() (if
 * sc->lro_cnt is non-zero) or marked as checksum-valid.  With more than one
 * slice the slice index is stored as the flowid.  The frame is then passed
 * to ifp->if_input.
 */
2561 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2565 struct ether_header *eh;
2568 bus_dmamap_t old_map;
2570 uint16_t tcpudp_csum;
2575 idx = rx->cnt & rx->mask;
2577 /* save a pointer to the received mbuf */
2578 m = rx->info[idx].m;
2579 /* try to replace the received mbuf */
2580 if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2581 /* drop the frame -- the old mbuf is re-cycled */
2586 /* unmap the received buffer */
2587 old_map = rx->info[idx].map;
2588 bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2589 bus_dmamap_unload(rx->dmat, old_map);
2591 /* swap the bus_dmamap_t's */
2592 rx->info[idx].map = rx->extra_map;
2593 rx->extra_map = old_map;
2595 /* mcp implicitly skips 1st 2 bytes so that packet is properly
2597 m->m_data += MXGEFW_PAD;
2599 m->m_pkthdr.rcvif = ifp;
2600 m->m_len = m->m_pkthdr.len = len;
2602 eh = mtod(m, struct ether_header *);
2603 if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2604 mxge_vlan_tag_remove(m, &csum);
2606 /* if the checksum is valid, mark it in the mbuf header */
2607 if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2608 if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2610 /* otherwise, it was a UDP frame, or a TCP frame which
2611 we could not do LRO on. Tell the stack that the
2613 m->m_pkthdr.csum_data = 0xffff;
2614 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2617 /* flowid only valid if RSS hashing is enabled */
2618 if (sc->num_slices > 1) {
2619 m->m_pkthdr.flowid = (ss - sc->ss);
2620 m->m_flags |= M_FLOWID;
2623 /* pass the frame up the stack */
2624 (*ifp->if_input)(ifp, m);
/*
 * Drain the receive-completion ring of a slice.  Entries are consumed while
 * the current entry's length is non-zero: the length is read and cleared,
 * the checksum is read, and the frame goes to mxge_rx_done_small() when
 * length <= MHLEN - MXGEFW_PAD, or to mxge_rx_done_big() otherwise.  The
 * loop is bounded by rx_done->mask / 2 iterations to limit livelock.
 * Afterwards every entry on ss->lro_active is removed and flushed with
 * mxge_lro_flush().
 */
2628 mxge_clean_rx_done(struct mxge_slice_state *ss)
2630 mxge_rx_done_t *rx_done = &ss->rx_done;
2636 while (rx_done->entry[rx_done->idx].length != 0) {
2637 length = ntohs(rx_done->entry[rx_done->idx].length);
2638 rx_done->entry[rx_done->idx].length = 0;
2639 checksum = rx_done->entry[rx_done->idx].checksum;
2640 if (length <= (MHLEN - MXGEFW_PAD))
2641 mxge_rx_done_small(ss, length, checksum);
2643 mxge_rx_done_big(ss, length, checksum);
2645 rx_done->idx = rx_done->cnt & rx_done->mask;
2647 /* limit potential for livelock */
2648 if (__predict_false(++limit > rx_done->mask / 2))
2652 while (!SLIST_EMPTY(&ss->lro_active)) {
2653 struct lro_entry *lro = SLIST_FIRST(&ss->lro_active);
2654 SLIST_REMOVE_HEAD(&ss->lro_active, next);
2655 mxge_lro_flush(ss, lro);
/*
 * Reclaim completed transmit descriptors until tx->pkt_done reaches mcp_idx.
 * For each slot (tx->done & tx->mask) that has an mbuf attached, byte and
 * multicast statistics are updated, the mbuf pointer is cleared and its DMA
 * map unloaded; a slot whose flag is set has the flag cleared.  Then, with
 * ss->tx.lock held exclusively, IFF_OACTIVE is cleared once fewer than a
 * quarter of the ring entries are outstanding, and mxge_start_locked() is
 * called.  With IFNET_BUF_RING and more than one slice, tx->queue_active is
 * cleared when no transmits are pending.
 */
2662 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2673 while (tx->pkt_done != mcp_idx) {
2674 idx = tx->done & tx->mask;
2676 m = tx->info[idx].m;
2677 /* mbuf and DMA map only attached to the first
2680 ss->obytes += m->m_pkthdr.len;
2681 if (m->m_flags & M_MCAST)
2684 tx->info[idx].m = NULL;
2685 map = tx->info[idx].map;
2686 bus_dmamap_unload(tx->dmat, map);
2689 if (tx->info[idx].flag) {
2690 tx->info[idx].flag = 0;
2695 /* If we have space, clear IFF_OACTIVE to tell the stack that
2696 its OK to send packets */
2697 #ifdef IFNET_BUF_RING
2698 flags = &ss->if_flags;
2700 flags = &ifp->if_flags;
2702 lockmgr(&ss->tx.lock, LK_EXCLUSIVE);
2703 if ((*flags) & IFF_OACTIVE &&
2704 tx->req - tx->done < (tx->mask + 1)/4) {
2705 *(flags) &= ~IFF_OACTIVE;
2707 mxge_start_locked(ss);
2709 #ifdef IFNET_BUF_RING
2710 if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2711 /* let the NIC stop polling this queue, since there
2712 * are no more transmits pending */
2713 if (tx->req == tx->done) {
2715 tx->queue_active = 0;
2721 lockmgr(&ss->tx.lock, LK_RELEASE);
/*
 * Media table used for XFP cages by mxge_media_probe(): each entry holds an
 * ifmedia flag (0 where no flag is given), the compliance-byte bitmask, and
 * a printable name.  Entry 0 is compared for equality with the byte read
 * from the module; the remaining entries are tested as bit masks.
 */
2725 static struct mxge_media_type mxge_xfp_media_types[] =
2727 {IFM_10G_CX4, 0x7f, "10GBASE-CX4 (module)"},
2728 {IFM_10G_SR, (1 << 7), "10GBASE-SR"},
2729 {IFM_10G_LR, (1 << 6), "10GBASE-LR"},
2730 {0, (1 << 5), "10GBASE-ER"},
2731 {IFM_10G_LRM, (1 << 4), "10GBASE-LRM"},
2732 {0, (1 << 3), "10GBASE-SW"},
2733 {0, (1 << 2), "10GBASE-LW"},
2734 {0, (1 << 1), "10GBASE-EW"},
2735 {0, (1 << 0), "Reserved"}
/*
 * Media table used for SFP+ cages by mxge_media_probe(); same layout as the
 * XFP table: ifmedia flag (0 where no flag is given), compliance-byte
 * bitmask, printable name.
 */
2737 static struct mxge_media_type mxge_sfp_media_types[] =
2739 {0, (1 << 7), "Reserved"},
2740 {IFM_10G_LRM, (1 << 6), "10GBASE-LRM"},
2741 {IFM_10G_LR, (1 << 5), "10GBASE-LR"},
2742 {IFM_10G_SR, (1 << 4), "10GBASE-SR"}
/*
 * Record a media type: ORs `type` into sc->media_flags, then adds the
 * resulting word to sc->media with ifmedia_add() and selects it with
 * ifmedia_set().
 */
2746 mxge_set_media(mxge_softc_t *sc, int type)
2748 sc->media_flags |= type;
2749 ifmedia_add(&sc->media, sc->media_flags, 0, NULL);
2750 ifmedia_set(&sc->media, sc->media_flags);
2755 * Determine the media type for a NIC. Some XFPs will identify
2756 * themselves only when their link is up, so this is initiated via a
2757 * link up interrupt. However, this can potentially take up to
2758 * several milliseconds, so it is run via the watchdog routine, rather
2759 * than in the interrupt handler itself. This need only be done
2760 * once, not each time the link is up.
/*
 * Work out the media type of the NIC and register it with mxge_set_media().
 *
 * sc->need_media_probe is cleared first, and nothing more is done once
 * sc->media_flags differs from (IFM_ETHER | IFM_AUTO).  The character after
 * the third '-' of sc->product_code_string selects the interface type; one
 * branch sets IFM_10G_CX4 directly, 'Q' is reported as Quad Ribbon Fiber,
 * and the XFP and SFP+ ('S' or "2S") cases select the matching media table.
 * For those, one compliance byte is requested from the module with
 * MXGEFW_CMD_I2C_READ and fetched with MXGEFW_CMD_I2C_BYTE, retrying up to
 * 50 times while the command returns EBUSY.  The byte is compared for
 * equality with table entry 0 and then tested against the bitmask of each
 * remaining entry; the first match is printed and set as the media type.
 *
 * NOTE(review): when the byte equals entry 0's bitmask the code always sets
 * IFM_10G_CX4 rather than mxge_media_types[0].flag; for the SFP+ table
 * entry 0 is "Reserved" with flag 0 -- confirm this is intended.
 */
2763 mxge_media_probe(mxge_softc_t *sc)
2768 struct mxge_media_type *mxge_media_types = NULL;
2769 int i, err, ms, mxge_media_type_entries;
2772 sc->need_media_probe = 0;
2774 /* if we've already set a media type, we're done */
2775 if (sc->media_flags != (IFM_ETHER | IFM_AUTO))
2779 * parse the product code to deterimine the interface type
2780 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2781 * after the 3rd dash in the driver's cached copy of the
2782 * EEPROM's product code string.
2784 ptr = sc->product_code_string;
2786 device_printf(sc->dev, "Missing product code\n");
2789 for (i = 0; i < 3; i++, ptr++) {
2790 ptr = index(ptr, '-');
2792 device_printf(sc->dev,
2793 "only %d dashes in PC?!?\n", i);
2799 mxge_set_media(sc, IFM_10G_CX4);
2802 else if (*ptr == 'Q') {
2803 /* -Q is Quad Ribbon Fiber */
2804 device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2805 /* FreeBSD has no media type for Quad ribbon fiber */
2811 mxge_media_types = mxge_xfp_media_types;
2812 mxge_media_type_entries =
2813 sizeof (mxge_xfp_media_types) /
2814 sizeof (mxge_xfp_media_types[0]);
2815 byte = MXGE_XFP_COMPLIANCE_BYTE;
2819 if (*ptr == 'S' || *(ptr +1) == 'S') {
2820 /* -S or -2S is SFP+ */
2821 mxge_media_types = mxge_sfp_media_types;
2822 mxge_media_type_entries =
2823 sizeof (mxge_sfp_media_types) /
2824 sizeof (mxge_sfp_media_types[0]);
2829 if (mxge_media_types == NULL) {
2830 device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2835 * At this point we know the NIC has an XFP cage, so now we
2836 * try to determine what is in the cage by using the
2837 * firmware's XFP I2C commands to read the XFP 10GbE compilance
2838 * register. We read just one byte, which may take over
2842 cmd.data0 = 0; /* just fetch 1 byte, not all 256 */
2844 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2845 if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
2846 device_printf(sc->dev, "failed to read XFP\n");
2848 if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
2849 device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
2851 if (err != MXGEFW_CMD_OK) {
2855 /* now we wait for the data to be cached */
2857 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2858 for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
2861 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2863 if (err != MXGEFW_CMD_OK) {
2864 device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
2865 cage_type, err, ms);
2869 if (cmd.data0 == mxge_media_types[0].bitmask) {
2871 device_printf(sc->dev, "%s:%s\n", cage_type,
2872 mxge_media_types[0].name);
2873 mxge_set_media(sc, IFM_10G_CX4);
2876 for (i = 1; i < mxge_media_type_entries; i++) {
2877 if (cmd.data0 & mxge_media_types[i].bitmask) {
2879 device_printf(sc->dev, "%s:%s\n",
2881 mxge_media_types[i].name);
2883 mxge_set_media(sc, mxge_media_types[i].flag);
2887 device_printf(sc->dev, "%s media 0x%x unknown\n", cage_type,
/*
 * Interrupt handler for one slice; arg is that slice's
 * struct mxge_slice_state.
 *
 * What is visible here: rx completions are cleaned (and tx completions
 * handed to mxge_tx_done) for as long as the stats block reports valid,
 * a legacy INTx line is lowered through sc->irq_deassert, link and
 * RDMA-tag changes found in the stats block are applied when running on
 * the first slice, and finally the slice's two irq_claim words are
 * written.
 *
 * NOTE(review): several short lines of this function (braces, returns,
 * loop heads) are not present in this excerpt, so the exact nesting
 * must be confirmed against the complete file.
 */
2894 mxge_intr(void *arg)
2896 struct mxge_slice_state *ss = arg;
2897 mxge_softc_t *sc = ss->sc;
2898 mcp_irq_data_t *stats = ss->fw_stats;
2899 mxge_tx_ring_t *tx = &ss->tx;
2900 mxge_rx_done_t *rx_done = &ss->rx_done;
2901 uint32_t send_done_count;
2905 #ifndef IFNET_BUF_RING
2906 /* an interrupt on a non-zero slice is implicitly valid
2907 since MSI-X irqs are not shared */
2909 mxge_clean_rx_done(ss);
2910 *ss->irq_claim = be32toh(3);
2915 /* make sure the DMA has finished */
2916 if (!stats->valid) {
2919 valid = stats->valid;
2921 if (sc->legacy_irq) {
2922 /* lower legacy IRQ */
2923 *sc->irq_deassert = 0;
2924 if (!mxge_deassert_wait)
2925 /* don't wait for conf. that irq is low */
2931 /* loop while waiting for legacy irq deassertion */
2933 /* check for transmit completes and receives */
2934 send_done_count = be32toh(stats->send_done_count);
/* keep going until the tx counter matches and the current rx slot is empty */
2935 while ((send_done_count != tx->pkt_done) ||
2936 (rx_done->entry[rx_done->idx].length != 0)) {
2937 if (send_done_count != tx->pkt_done)
2938 mxge_tx_done(ss, (int)send_done_count);
2939 mxge_clean_rx_done(ss);
2940 send_done_count = be32toh(stats->send_done_count);
2942 if (sc->legacy_irq && mxge_deassert_wait)
/* the volatile cast forces stats->valid to be re-read on every pass */
2944 } while (*((volatile uint8_t *) &stats->valid));
2946 /* fw link & error stats meaningful only on the first slice */
2947 if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
2948 if (sc->link_state != stats->link_up) {
2949 sc->link_state = stats->link_up;
2950 if (sc->link_state) {
2951 sc->ifp->if_link_state = LINK_STATE_UP;
2952 if_link_state_change(sc->ifp);
2954 device_printf(sc->dev, "link up\n");
2956 sc->ifp->if_link_state = LINK_STATE_DOWN;
2957 if_link_state_change(sc->ifp);
2959 device_printf(sc->dev, "link down\n");
/* ask mxge_watchdog() to re-run the media probe */
2961 sc->need_media_probe = 1;
2963 if (sc->rdma_tags_available !=
2964 be32toh(stats->rdma_tags_available)) {
2965 sc->rdma_tags_available =
2966 be32toh(stats->rdma_tags_available);
2967 device_printf(sc->dev, "RDMA timed out! %d tags "
2968 "left\n", sc->rdma_tags_available);
2971 if (stats->link_down) {
/* mxge_close() watches down_cnt to learn that the down irq arrived */
2972 sc->down_cnt += stats->link_down;
2974 sc->ifp->if_link_state = LINK_STATE_DOWN;
2975 if_link_state_change(sc->ifp);
2979 /* check to see if we have rx token to pass back */
/*
 * NOTE(review): the claim words are written with be32toh(3); htobe32
 * performs the same byte swap and would state the direction of the
 * conversion more clearly - confirm intent.
 */
2981 *ss->irq_claim = be32toh(3);
2982 *(ss->irq_claim + 1) = be32toh(3);
/*
 * NOTE(review): only the signature of mxge_init is present in this
 * excerpt; its body is not visible, so nothing is asserted about what
 * it does with arg.
 */
2986 mxge_init(void *arg)
/*
 * Release every buffer one slice still holds: the LRO entries queued on
 * ss->lro_free, and, for each occupied slot of the rx_big, rx_small and
 * tx rings, the loaded DMA map and the mbuf attached to it.  Slots whose
 * mbuf pointer is already NULL are tested separately; emptied slots are
 * reset to NULL.
 */
2993 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
2995 struct lro_entry *lro_entry;
/* pop and free LRO entries until the free list is empty */
2998 while (!SLIST_EMPTY(&ss->lro_free)) {
2999 lro_entry = SLIST_FIRST(&ss->lro_free);
3000 SLIST_REMOVE_HEAD(&ss->lro_free, next);
3001 kfree(lro_entry, M_DEVBUF);
/* mask is the ring size minus one, hence the inclusive bound */
3004 for (i = 0; i <= ss->rx_big.mask; i++) {
3005 if (ss->rx_big.info[i].m == NULL)
3007 bus_dmamap_unload(ss->rx_big.dmat,
3008 ss->rx_big.info[i].map);
3009 m_freem(ss->rx_big.info[i].m);
3010 ss->rx_big.info[i].m = NULL;
3013 for (i = 0; i <= ss->rx_small.mask; i++) {
3014 if (ss->rx_small.info[i].m == NULL)
3016 bus_dmamap_unload(ss->rx_small.dmat,
3017 ss->rx_small.info[i].map);
3018 m_freem(ss->rx_small.info[i].m);
3019 ss->rx_small.info[i].m = NULL;
3022 /* transmit ring used only on the first slice */
3023 if (ss->tx.info == NULL)
3026 for (i = 0; i <= ss->tx.mask; i++) {
/* the per-slot flag is cleared even when no mbuf is attached */
3027 ss->tx.info[i].flag = 0;
3028 if (ss->tx.info[i].m == NULL)
3030 bus_dmamap_unload(ss->tx.dmat,
3031 ss->tx.info[i].map);
3032 m_freem(ss->tx.info[i].m);
3033 ss->tx.info[i].m = NULL;
/*
 * Run mxge_free_slice_mbufs() over each of the sc->num_slices slices.
 */
3038 mxge_free_mbufs(mxge_softc_t *sc)
3042 for (slice = 0; slice < sc->num_slices; slice++)
3043 mxge_free_slice_mbufs(&sc->ss[slice]);
/*
 * Tear down the ring bookkeeping of one slice: the rx_done DMA block,
 * the tx request and segment buffers, the two rx shadow rings, and for
 * each of the tx, rx_small and rx_big rings the per-slot DMA maps, the
 * DMA tag and the host info array.  Every pointer is tested against
 * NULL before it is released, and the pointers visible here are reset
 * to NULL afterwards.
 */
3047 mxge_free_slice_rings(struct mxge_slice_state *ss)
3052 if (ss->rx_done.entry != NULL)
3053 mxge_dma_free(&ss->rx_done.dma);
3054 ss->rx_done.entry = NULL;
3056 if (ss->tx.req_bytes != NULL)
3057 kfree(ss->tx.req_bytes, M_DEVBUF);
3058 ss->tx.req_bytes = NULL;
3060 if (ss->tx.seg_list != NULL)
3061 kfree(ss->tx.seg_list, M_DEVBUF);
3062 ss->tx.seg_list = NULL;
3064 if (ss->rx_small.shadow != NULL)
3065 kfree(ss->rx_small.shadow, M_DEVBUF);
3066 ss->rx_small.shadow = NULL;
3068 if (ss->rx_big.shadow != NULL)
3069 kfree(ss->rx_big.shadow, M_DEVBUF);
3070 ss->rx_big.shadow = NULL;
/* maps are destroyed before the tag they were created from */
3072 if (ss->tx.info != NULL) {
3073 if (ss->tx.dmat != NULL) {
3074 for (i = 0; i <= ss->tx.mask; i++) {
3075 bus_dmamap_destroy(ss->tx.dmat,
3076 ss->tx.info[i].map);
3078 bus_dma_tag_destroy(ss->tx.dmat);
3080 kfree(ss->tx.info, M_DEVBUF);
3084 if (ss->rx_small.info != NULL) {
3085 if (ss->rx_small.dmat != NULL) {
3086 for (i = 0; i <= ss->rx_small.mask; i++) {
3087 bus_dmamap_destroy(ss->rx_small.dmat,
3088 ss->rx_small.info[i].map);
/* the rx rings carry one extra map beyond the per-slot ones */
3090 bus_dmamap_destroy(ss->rx_small.dmat,
3091 ss->rx_small.extra_map);
3092 bus_dma_tag_destroy(ss->rx_small.dmat);
3094 kfree(ss->rx_small.info, M_DEVBUF);
3096 ss->rx_small.info = NULL;
3098 if (ss->rx_big.info != NULL) {
3099 if (ss->rx_big.dmat != NULL) {
3100 for (i = 0; i <= ss->rx_big.mask; i++) {
3101 bus_dmamap_destroy(ss->rx_big.dmat,
3102 ss->rx_big.info[i].map);
3104 bus_dmamap_destroy(ss->rx_big.dmat,
3105 ss->rx_big.extra_map);
3106 bus_dma_tag_destroy(ss->rx_big.dmat);
3108 kfree(ss->rx_big.info, M_DEVBUF);
3110 ss->rx_big.info = NULL;
/*
 * Run mxge_free_slice_rings() over each of the sc->num_slices slices.
 */
3114 mxge_free_rings(mxge_softc_t *sc)
3118 for (slice = 0; slice < sc->num_slices; slice++)
3119 mxge_free_slice_rings(&sc->ss[slice]);
/*
 * Allocate the ring bookkeeping for one slice.
 *
 * rx_ring_entries and tx_ring_entries are the slot counts of the receive
 * and transmit rings.  The ring masks are set to (entries - 1), the
 * rx_done mask to (2 * rx_ring_entries - 1).  The function then
 * allocates the rx shadow and host info arrays, creates the rx_small and
 * rx_big DMA tags with one map per slot plus one extra map each, and
 * finally the tx request block, segment list, info array, DMA tag and
 * per-slot maps.
 *
 * NOTE(review): the masks only work as index masks if the entry counts
 * are powers of two - confirm with the caller.
 * NOTE(review): the kmalloc calls pass M_WAITOK; if kmalloc cannot
 * return NULL in that mode the NULL tests that follow them never fire.
 * NOTE(review): the error returns are not visible in this excerpt.
 */
3123 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3124 int tx_ring_entries)
3126 mxge_softc_t *sc = ss->sc;
3132 /* allocate per-slice receive resources */
3134 ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
3135 ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3137 /* allocate the rx shadow rings */
3138 bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3139 ss->rx_small.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3140 if (ss->rx_small.shadow == NULL)
3143 bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3144 ss->rx_big.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3145 if (ss->rx_big.shadow == NULL)
3148 /* allocate the rx host info rings */
3149 bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3150 ss->rx_small.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3151 if (ss->rx_small.info == NULL)
3154 bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3155 ss->rx_big.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3156 if (ss->rx_big.info == NULL)
3159 /* allocate the rx busdma resources */
/* small receive buffers: at most MHLEN bytes each */
3160 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3162 4096, /* boundary */
3163 BUS_SPACE_MAXADDR, /* low */
3164 BUS_SPACE_MAXADDR, /* high */
3165 NULL, NULL, /* filter */
3166 MHLEN, /* maxsize */
3168 MHLEN, /* maxsegsize */
3169 BUS_DMA_ALLOCNOW, /* flags */
3170 &ss->rx_small.dmat); /* tag */
3172 device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
/* big receive buffers: segment size depends on MXGE_VIRT_JUMBOS */
3177 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3179 #if MXGE_VIRT_JUMBOS
3180 4096, /* boundary */
3184 BUS_SPACE_MAXADDR, /* low */
3185 BUS_SPACE_MAXADDR, /* high */
3186 NULL, NULL, /* filter */
3187 3*4096, /* maxsize */
3188 #if MXGE_VIRT_JUMBOS
3190 4096, /* maxsegsize*/
3193 MJUM9BYTES, /* maxsegsize*/
3195 BUS_DMA_ALLOCNOW, /* flags */
3196 &ss->rx_big.dmat); /* tag */
3198 device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
/* one DMA map per ring slot, then the extra map */
3202 for (i = 0; i <= ss->rx_small.mask; i++) {
3203 err = bus_dmamap_create(ss->rx_small.dmat, 0,
3204 &ss->rx_small.info[i].map);
3206 device_printf(sc->dev, "Err %d rx_small dmamap\n",
3211 err = bus_dmamap_create(ss->rx_small.dmat, 0,
3212 &ss->rx_small.extra_map);
3214 device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3219 for (i = 0; i <= ss->rx_big.mask; i++) {
3220 err = bus_dmamap_create(ss->rx_big.dmat, 0,
3221 &ss->rx_big.info[i].map);
3223 device_printf(sc->dev, "Err %d rx_big dmamap\n",
3228 err = bus_dmamap_create(ss->rx_big.dmat, 0,
3229 &ss->rx_big.extra_map);
3231 device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3236 /* now allocate TX resources */
3238 #ifndef IFNET_BUF_RING
3239 /* only use a single TX ring for now */
3240 if (ss != ss->sc->ss)
3244 ss->tx.mask = tx_ring_entries - 1;
/* a single send uses at most a quarter of the ring, capped by MXGE_MAX_SEND_DESC */
3245 ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3248 /* allocate the tx request copy block */
3250 sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3251 ss->tx.req_bytes = kmalloc(bytes, M_DEVBUF, M_WAITOK);
3252 if (ss->tx.req_bytes == NULL)
3254 /* ensure req_list entries are aligned to 8 bytes */
/* req_bytes keeps the raw pointer for kfree; req_list is the rounded-up view */
3255 ss->tx.req_list = (mcp_kreq_ether_send_t *)
3256 ((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
3258 /* allocate the tx busdma segment list */
3259 bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3260 ss->tx.seg_list = (bus_dma_segment_t *)
3261 kmalloc(bytes, M_DEVBUF, M_WAITOK);
3262 if (ss->tx.seg_list == NULL)
3265 /* allocate the tx host info ring */
3266 bytes = tx_ring_entries * sizeof (*ss->tx.info);
3267 ss->tx.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3268 if (ss->tx.info == NULL)
3271 /* allocate the tx busdma resources */
3272 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3274 sc->tx_boundary, /* boundary */
3275 BUS_SPACE_MAXADDR, /* low */
3276 BUS_SPACE_MAXADDR, /* high */
3277 NULL, NULL, /* filter */
3278 65536 + 256, /* maxsize */
3279 ss->tx.max_desc - 2, /* num segs */
3280 sc->tx_boundary, /* maxsegsz */
3281 BUS_DMA_ALLOCNOW, /* flags */
3282 &ss->tx.dmat); /* tag */
3285 device_printf(sc->dev, "Err %d allocating tx dmat\n",
3290 /* now use these tags to setup dmamaps for each slot
3292 for (i = 0; i <= ss->tx.mask; i++) {
3293 err = bus_dmamap_create(ss->tx.dmat, 0,
3294 &ss->tx.info[i].map);
3296 device_printf(sc->dev, "Err %d tx dmamap\n",
/*
 * Size and allocate the rings of every slice.
 *
 * The send ring size is obtained from the firmware with
 * MXGEFW_CMD_GET_SEND_RING_SIZE; the receive ring size comes from
 * sc->rx_ring_size.  Both byte sizes are divided by the size of one
 * descriptor to get entry counts, the interface send queue is limited
 * to (tx_ring_entries - 1) packets, and mxge_alloc_slice_rings() is
 * called for each slice.
 *
 * NOTE(review): the mxge_free_rings() call at the end is presumably the
 * failure path; the surrounding control flow is not visible here.
 */
3306 mxge_alloc_rings(mxge_softc_t *sc)
3310 int tx_ring_entries, rx_ring_entries;
3313 /* get ring sizes */
3314 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3315 tx_ring_size = cmd.data0;
3317 device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3321 tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3322 rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3323 ifq_set_maxlen(&sc->ifp->if_snd, tx_ring_entries - 1);
3324 ifq_set_ready(&sc->ifp->if_snd);
3326 for (slice = 0; slice < sc->num_slices; slice++) {
3327 err = mxge_alloc_slice_rings(&sc->ss[slice],
3336 mxge_free_rings(sc);
/*
 * Choose receive buffer parameters for a given MTU.
 *
 * The required buffer size is mtu plus the Ethernet header, the VLAN
 * encapsulation and the firmware pad.  Results are written through the
 * three pointers:
 *   - below MCLBYTES:     big_buf_size and cl_size are MCLBYTES
 *   - below MJUMPAGESIZE: big_buf_size and cl_size are MJUMPAGESIZE
 *   - otherwise:          cl_size is MJUM9BYTES; big_buf_size is 4096
 *                         when MXGE_VIRT_JUMBOS is set, else MJUM9BYTES
 *
 * NOTE(review): most assignments to *nbufs are not visible in this
 * excerpt; only the MXGE_VIRT_JUMBOS computation is shown.
 */
3343 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3345 int bufsize = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN + MXGEFW_PAD;
3347 if (bufsize < MCLBYTES) {
3348 /* easy, everything fits in a single buffer */
3349 *big_buf_size = MCLBYTES;
3350 *cl_size = MCLBYTES;
3355 if (bufsize < MJUMPAGESIZE) {
3356 /* still easy, everything still fits in a single buffer */
3357 *big_buf_size = MJUMPAGESIZE;
3358 *cl_size = MJUMPAGESIZE;
3362 #if MXGE_VIRT_JUMBOS
3363 /* now we need to use virtually contiguous buffers */
3364 *cl_size = MJUM9BYTES;
3365 *big_buf_size = 4096;
/* number of 4096-byte pieces needed to hold one frame */
3366 *nbufs = mtu / 4096 + 1;
3367 /* needs to be a power of two, so round up */
3371 *cl_size = MJUM9BYTES;
3372 *big_buf_size = MJUM9BYTES;
/*
 * Prepare one slice for traffic.
 *
 * nbufs is the stride between big receive buffers and cl_size the
 * cluster size to use for them; both are stored in ss->rx_big.
 *
 * Visible steps: build the LRO free list with sc->lro_cnt entries, ask
 * the firmware for the SRAM offsets of the send, small-receive and
 * big-receive rings, stock every small receive slot, then stock the big
 * receive ring at every nbufs-th slot.
 *
 * NOTE(review): the error returns and the initialisation of sc and cmd
 * are not visible in this excerpt.
 */
3378 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3383 struct lro_entry *lro_entry;
/* pointer difference gives this slice's index in sc->ss[] */
3388 slice = ss - sc->ss;
3390 SLIST_INIT(&ss->lro_free);
3391 SLIST_INIT(&ss->lro_active);
3393 for (i = 0; i < sc->lro_cnt; i++) {
3394 lro_entry = (struct lro_entry *)
3395 kmalloc(sizeof (*lro_entry), M_DEVBUF,
3397 if (lro_entry == NULL) {
3401 SLIST_INSERT_HEAD(&ss->lro_free, lro_entry, next);
3403 /* get the lanai pointers to the send and receive rings */
3406 #ifndef IFNET_BUF_RING
3407 /* We currently only send from the first slice */
3411 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3413 (volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
/* the go/stop doorbells are spaced 64 bytes apart per slice */
3414 ss->tx.send_go = (volatile uint32_t *)
3415 (sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3416 ss->tx.send_stop = (volatile uint32_t *)
3417 (sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3418 #ifndef IFNET_BUF_RING
/* errors from the offset queries are OR-ed together and reported once */
3422 err |= mxge_send_cmd(sc,
3423 MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3424 ss->rx_small.lanai =
3425 (volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3427 err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3429 (volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3432 device_printf(sc->dev,
3433 "failed to get ring sizes or locations\n");
3437 /* stock receive rings */
3438 for (i = 0; i <= ss->rx_small.mask; i++) {
3439 map = ss->rx_small.info[i].map;
3440 err = mxge_get_buf_small(ss, map, i);
3442 device_printf(sc->dev, "alloced %d/%d smalls\n",
3443 i, ss->rx_small.mask + 1);
/* set every big-ring shadow address to all ones before stocking */
3447 for (i = 0; i <= ss->rx_big.mask; i++) {
3448 ss->rx_big.shadow[i].addr_low = 0xffffffff;
3449 ss->rx_big.shadow[i].addr_high = 0xffffffff;
3451 ss->rx_big.nbufs = nbufs;
3452 ss->rx_big.cl_size = cl_size;
3453 ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3454 EVL_ENCAPLEN + MXGEFW_PAD;
/* one buffer is requested per group of nbufs slots */
3455 for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3456 map = ss->rx_big.info[i].map;
3457 err = mxge_get_buf_big(ss, map, i);
3459 device_printf(sc->dev, "alloced %d/%d bigs\n",
3460 i, ss->rx_big.mask + 1);
/*
 * Bring the interface up.
 *
 * Visible sequence: copy the interface's link-level address into
 * sc->mac_addr, reset the NIC, program an identity RSS indirection
 * table when more than one slice is in use, choose receive buffer
 * sizes for the current MTU, pass MTU and buffer sizes to the firmware,
 * hand over the DMA address of each slice's stats block, open every
 * slice, issue MXGEFW_CMD_ETHERNET_UP, set IFF_RUNNING / clear
 * IFF_OACTIVE and arm the periodic mxge_tick callout.
 *
 * NOTE(review): the mxge_free_mbufs() call at the end is presumably the
 * shared failure path; labels and returns are not visible here.
 */
3468 mxge_open(mxge_softc_t *sc)
3471 int err, big_bytes, nbufs, slice, cl_size, i;
3473 volatile uint8_t *itable;
3474 struct mxge_slice_state *ss;
3476 /* Copy the MAC address in case it was overridden */
3477 bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3479 err = mxge_reset(sc, 1);
3481 device_printf(sc->dev, "failed to reset\n");
3485 if (sc->num_slices > 1) {
3486 /* setup the indirection table */
3487 cmd.data0 = sc->num_slices;
3488 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3491 err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3494 device_printf(sc->dev,
3495 "failed to setup rss tables\n");
3499 /* just enable an identity mapping */
/* the table lives in NIC SRAM at the offset the firmware returned */
3500 itable = sc->sram + cmd.data0;
3501 for (i = 0; i < sc->num_slices; i++)
3502 itable[i] = (uint8_t)i;
3505 cmd.data1 = mxge_rss_hash_type;
3506 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3508 device_printf(sc->dev, "failed to enable slices\n");
3514 mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3517 err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3519 /* error is only meaningful if we're trying to set
3520 MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3521 if (err && nbufs > 1) {
3522 device_printf(sc->dev,
3523 "Failed to set alway-use-n to %d\n",
3527 /* Give the firmware the mtu and the big and small buffer
3528 sizes. The firmware wants the big buf size to be a power
3529 of two. Luckily, FreeBSD's clusters are powers of two */
3530 cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
3531 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3532 cmd.data0 = MHLEN - MXGEFW_PAD;
3533 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3535 cmd.data0 = big_bytes;
3536 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3539 device_printf(sc->dev, "failed to setup params\n");
3543 /* Now give him the pointer to the stats block */
3545 #ifdef IFNET_BUF_RING
3546 slice < sc->num_slices;
3551 ss = &sc->ss[slice];
3553 MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3555 MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
/* data2 carries the block size in its low bits and the slice number from bit 16 up */
3556 cmd.data2 = sizeof(struct mcp_irq_data);
3557 cmd.data2 |= (slice << 16);
3558 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
/* this variant points the firmware at the send_done_count field only */
3562 bus = sc->ss->fw_stats_dma.bus_addr;
3563 bus += offsetof(struct mcp_irq_data, send_done_count);
3564 cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3565 cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3566 err = mxge_send_cmd(sc,
3567 MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3569 /* Firmware cannot support multicast without STATS_DMA_V2 */
3570 sc->fw_multicast_support = 0;
3572 sc->fw_multicast_support = 1;
3576 device_printf(sc->dev, "failed to setup params\n");
3580 for (slice = 0; slice < sc->num_slices; slice++) {
3581 err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3583 device_printf(sc->dev, "couldn't open slice %d\n",
3589 /* Finally, start the firmware running */
3590 err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3592 device_printf(sc->dev, "Couldn't bring up link\n");
3595 #ifdef IFNET_BUF_RING
3596 for (slice = 0; slice < sc->num_slices; slice++) {
3597 ss = &sc->ss[slice];
3598 ss->if_flags |= IFF_RUNNING;
3599 ss->if_flags &= ~IFF_OACTIVE;
3602 sc->ifp->if_flags |= IFF_RUNNING;
3603 sc->ifp->if_flags &= ~IFF_OACTIVE;
/* start the periodic stats / watchdog timer */
3604 callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3610 mxge_free_mbufs(sc);
/*
 * Take the interface down.
 *
 * Visible sequence: stop the tick callout, clear IFF_RUNNING, record the
 * current sc->down_cnt, send MXGEFW_CMD_ETHERNET_DOWN, and if down_cnt
 * has not changed, delay for ten interrupt-coalescing intervals.  A
 * message is logged if the counter still has not moved.  All mbufs are
 * freed at the end.  sc->down_cnt is advanced by the interrupt handler
 * when the stats block reports link_down.
 */
3616 mxge_close(mxge_softc_t *sc)
3619 int err, old_down_cnt;
3620 #ifdef IFNET_BUF_RING
3621 struct mxge_slice_state *ss;
3625 callout_stop(&sc->co_hdl);
3626 #ifdef IFNET_BUF_RING
3627 for (slice = 0; slice < sc->num_slices; slice++) {
3628 ss = &sc->ss[slice];
3629 ss->if_flags &= ~IFF_RUNNING;
3632 sc->ifp->if_flags &= ~IFF_RUNNING;
/* snapshot taken before the command so a change can be detected */
3633 old_down_cnt = sc->down_cnt;
3635 err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3637 device_printf(sc->dev, "Couldn't bring down link\n");
3639 if (old_down_cnt == sc->down_cnt) {
3640 /* wait for down irq */
3641 DELAY(10 * sc->intr_coal_delay);
3644 if (old_down_cnt == sc->down_cnt) {
3645 device_printf(sc->dev, "never got down irq\n");
3648 mxge_free_mbufs(sc);
3654 mxge_setup_cfg_space(mxge_softc_t *sc)
3656 device_t dev = sc->dev;
3658 uint16_t cmd, lnk, pectl;
3660 /* find the PCIe link width and set max read request to 4KB*/
3661 if (pci_find_extcap(dev, PCIY_EXPRESS, ®) == 0) {
3662 lnk = pci_read_config(dev, reg + 0x12, 2);
3663 sc->link_width = (lnk >> 4) & 0x3f;
3665 pectl = pci_read_config(dev, reg + 0x8, 2);
3666 pectl = (pectl & ~0x7000) | (5 << 12);
3667 pci_write_config(dev, reg + 0x8, pectl, 2);
3670 /* Enable DMA and Memory space access */
3671 pci_enable_busmaster(dev);
3672 cmd = pci_read_config(dev, PCIR_COMMAND, 2);
3673 cmd |= PCIM_CMD_MEMEN;
3674 pci_write_config(dev, PCIR_COMMAND, cmd, 2);
/*
 * Read a 32-bit status word from the NIC through its vendor-specific
 * PCI capability.
 *
 * Returns (uint32_t)-1 when the capability cannot be found; otherwise
 * the value read from capability offset 0x14 after selecting read32
 * mode (offset 0x10) and the register address 0xfffffff0 (offset 0x18).
 * mxge_watchdog_reset() prints this value as the reboot status.
 */
3678 mxge_read_reboot(mxge_softc_t *sc)
3680 device_t dev = sc->dev;
3683 /* find the vendor specific offset */
3684 if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
3685 device_printf(sc->dev,
3686 "could not find vendor specific offset\n");
3687 return (uint32_t)-1;
3689 /* enable read32 mode */
3690 pci_write_config(dev, vs + 0x10, 0x3, 1);
3691 /* tell NIC which register to read */
3692 pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3693 return (pci_read_config(dev, vs + 0x14, 4));
/*
 * Recover from a stuck transmit ring on the given slice.
 *
 * The PCI command register is read to find out whether the NIC
 * rebooted.  A value of 0xffff is re-read once before the NIC is
 * declared gone.  If the bus-master bit is clear, the reboot status is
 * printed, the saved PCI configuration is restored, the driver's own
 * config-space settings are reapplied and, when the interface was
 * running, it is reopened.  Otherwise the tx ring state of the slice is
 * dumped and no reset is attempted.
 *
 * NOTE(review): the wait between the two command-register reads and the
 * return values are not visible in this excerpt.
 */
3697 mxge_watchdog_reset(mxge_softc_t *sc, int slice)
3699 struct pci_devinfo *dinfo;
3707 device_printf(sc->dev, "Watchdog reset!\n");
3710 * check to see if the NIC rebooted. If it did, then all of
3711 * PCI config space has been reset, and things like the
3712 * busmaster bit will be zero. If this is the case, then we
3713 * must restore PCI config space before the NIC can be used
3716 cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3717 if (cmd == 0xffff) {
3719 * maybe the watchdog caught the NIC rebooting; wait
3720 * up to 100ms for it to finish. If it does not come
3721 * back, then give up
3724 cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3725 if (cmd == 0xffff) {
3726 device_printf(sc->dev, "NIC disappeared!\n");
3730 if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3731 /* print the reboot status */
3732 reboot = mxge_read_reboot(sc);
3733 device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3735 /* restore PCI configuration space */
3736 dinfo = device_get_ivars(sc->dev);
3737 pci_cfg_restore(sc->dev, dinfo);
3739 /* and redo any changes we made to our config space */
3740 mxge_setup_cfg_space(sc);
3742 if (sc->ifp->if_flags & IFF_RUNNING) {
3744 err = mxge_open(sc);
/* diagnostics only: the bus-master bit is still set, so the NIC did not reboot */
3747 tx = &sc->ss[slice].tx;
3748 device_printf(sc->dev,
3749 "NIC did not reboot, slice %d ring state:\n",
3751 device_printf(sc->dev,
3752 "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3753 tx->req, tx->done, tx->queue_active);
3754 device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3755 tx->activate, tx->deactivate);
3756 device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3758 be32toh(sc->ss->fw_stats->send_done_count));
3759 device_printf(sc->dev, "not resetting\n");
/*
 * Periodic transmit watchdog, called from mxge_tick().
 *
 * For each transmit ring examined (all slices with IFNET_BUF_RING,
 * otherwise only the first) the ring is considered stuck when it has
 * outstanding requests, had outstanding requests at the previous check,
 * and its done counter has not advanced since then.  A stuck ring
 * triggers mxge_watchdog_reset() unless the firmware's dropped_pause
 * counter changed since the previous check, in which case a
 * flow-control warning is printed instead.  The current counters are
 * then saved for the next check, and a pending media probe is run.
 */
3765 mxge_watchdog(mxge_softc_t *sc)
3768 uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3771 /* see if we have outstanding transmits, which
3772 have been pending for more than mxge_ticks */
3774 #ifdef IFNET_BUF_RING
3775 (i < sc->num_slices) && (err == 0);
3777 (i < 1) && (err == 0);
3781 if (tx->req != tx->done &&
3782 tx->watchdog_req != tx->watchdog_done &&
3783 tx->done == tx->watchdog_done) {
3784 /* check for pause blocking before resetting */
3785 if (tx->watchdog_rx_pause == rx_pause)
3786 err = mxge_watchdog_reset(sc, i);
3788 device_printf(sc->dev, "Flow control blocking "
3789 "xmits, check link partner\n");
/* remember this tick's counters for comparison on the next one */
3792 tx->watchdog_req = tx->req;
3793 tx->watchdog_done = tx->done;
3794 tx->watchdog_rx_pause = rx_pause;
/* the flag is raised by the interrupt handler on a link state change */
3797 if (sc->need_media_probe)
3798 mxge_media_probe(sc);
/*
 * Sum the per-slice packet and error counters and publish the totals in
 * the ifnet structure.  With IFNET_BUF_RING the byte, multicast and
 * drop counters are aggregated as well.
 */
3803 mxge_update_stats(mxge_softc_t *sc)
3805 struct mxge_slice_state *ss;
3806 u_long ipackets = 0;
3807 u_long opackets = 0;
3808 #ifdef IFNET_BUF_RING
3816 for (slice = 0; slice < sc->num_slices; slice++) {
3817 ss = &sc->ss[slice];
3818 ipackets += ss->ipackets;
3819 opackets += ss->opackets;
3820 #ifdef IFNET_BUF_RING
3821 obytes += ss->obytes;
3822 omcasts += ss->omcasts;
3823 odrops += ss->tx.br->br_drops;
3825 oerrors += ss->oerrors;
/* the totals overwrite, rather than add to, the ifnet counters */
3827 sc->ifp->if_ipackets = ipackets;
3828 sc->ifp->if_opackets = opackets;
3829 #ifdef IFNET_BUF_RING
3830 sc->ifp->if_obytes = obytes;
3831 sc->ifp->if_omcasts = omcasts;
3832 sc->ifp->if_snd.ifq_drops = odrops;
3834 sc->ifp->if_oerrors = oerrors;
/*
 * Callout handler that runs every mxge_ticks; arg is the softc.
 *
 * Under the driver lock it refreshes the aggregated interface
 * statistics, runs the transmit watchdog when the countdown has reached
 * zero (resetting the countdown to 4) and otherwise decrements the
 * countdown, then re-arms itself.
 *
 * NOTE(review): any use of the watchdog's return value is not visible
 * in this excerpt.
 */
3838 mxge_tick(void *arg)
3840 mxge_softc_t *sc = arg;
3843 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
3844 /* aggregate stats from different slices */
3845 mxge_update_stats(sc);
3846 if (!sc->watchdog_countdown) {
3847 err = mxge_watchdog(sc);
3848 sc->watchdog_countdown = 4;
3850 sc->watchdog_countdown--;
3852 callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3853 lockmgr(&sc->driver_lock, LK_RELEASE);
/*
 * NOTE(review): only the signature of mxge_media_change is present in
 * this excerpt; its body is not visible, so nothing is asserted about
 * its behaviour.
 */
3857 mxge_media_change(struct ifnet *ifp)
/*
 * Change the interface MTU.
 *
 * The requested mtu plus Ethernet header and VLAN encapsulation must be
 * at least 60 and no larger than sc->max_mtu.  Under the driver lock the
 * old MTU is remembered; if the interface is running it is reopened,
 * and on failure the old MTU is put back and the interface reopened
 * once more with its result ignored.
 *
 * NOTE(review): the assignment of the new MTU, the mxge_close() calls
 * and the return values are not visible in this excerpt.
 */
3863 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3865 struct ifnet *ifp = sc->ifp;
3866 int real_mtu, old_mtu;
3870 real_mtu = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
3871 if ((real_mtu > sc->max_mtu) || real_mtu < 60)
3873 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
3874 old_mtu = ifp->if_mtu;
3876 if (ifp->if_flags & IFF_RUNNING) {
3878 err = mxge_open(sc);
/* roll back to the previous MTU */
3880 ifp->if_mtu = old_mtu;
3882 (void) mxge_open(sc);
3885 lockmgr(&sc->driver_lock, LK_RELEASE);
/*
 * Fill in an ifmediareq from the driver's link state: the status is
 * always marked valid and is marked active when sc->link_state is
 * non-zero; the active media is reported as auto-selected Ethernet,
 * with full duplex added while the link is up.
 */
3890 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3892 mxge_softc_t *sc = ifp->if_softc;
3897 ifmr->ifm_status = IFM_AVALID;
3898 ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
3899 ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
3900 ifmr->ifm_active |= sc->link_state ? IFM_FDX : 0;
/*
 * Interface ioctl handler.
 *
 * The visible branches: pass address requests to ether_ioctl(), change
 * the MTU through mxge_change_mtu(), open the interface or refresh the
 * promiscuous / multicast state when the interface flags change,
 * reprogram the multicast list, toggle the checksum, TSO4, LRO and
 * VLAN-tagging capabilities, and forward media requests to
 * ifmedia_ioctl().  State changes are made with sc->driver_lock held.
 *
 * Enabling TSO4 requires IFCAP_TXCSUM to be enabled already; disabling
 * IFCAP_TXCSUM also clears IFCAP_TSO4.
 *
 * NOTE(review): in the capability branch the IFCAP_RXCSUM toggle is the
 * "else if" arm of the IFCAP_TXCSUM test, so a request that flips both
 * bits changes only TXCSUM - confirm whether that is intended.
 * NOTE(review): the switch / case lines and the return are not visible
 * in this excerpt.
 */
3904 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data, struct ucred *cr)
3906 mxge_softc_t *sc = ifp->if_softc;
3907 struct ifreq *ifr = (struct ifreq *)data;
3915 err = ether_ioctl(ifp, command, data);
3919 err = mxge_change_mtu(sc, ifr->ifr_mtu);
3923 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
3925 lockmgr(&sc->driver_lock, LK_RELEASE);
3928 if (ifp->if_flags & IFF_UP) {
3929 if (!(ifp->if_flags & IFF_RUNNING)) {
3930 err = mxge_open(sc);
3932 /* take care of promis can allmulti
3934 mxge_change_promisc(sc,
3935 ifp->if_flags & IFF_PROMISC);
3936 mxge_set_multicast_list(sc);
3939 if (ifp->if_flags & IFF_RUNNING) {
3943 lockmgr(&sc->driver_lock, LK_RELEASE);
3948 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
3949 mxge_set_multicast_list(sc);
3950 lockmgr(&sc->driver_lock, LK_RELEASE);
3954 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
3955 mask = ifr->ifr_reqcap ^ ifp->if_capenable;
3956 if (mask & IFCAP_TXCSUM) {
3957 if (IFCAP_TXCSUM & ifp->if_capenable) {
3958 ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
3959 ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
3962 ifp->if_capenable |= IFCAP_TXCSUM;
3963 ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
3965 } else if (mask & IFCAP_RXCSUM) {
3966 if (IFCAP_RXCSUM & ifp->if_capenable) {
3967 ifp->if_capenable &= ~IFCAP_RXCSUM;
3970 ifp->if_capenable |= IFCAP_RXCSUM;
3974 if (mask & IFCAP_TSO4) {
3975 if (IFCAP_TSO4 & ifp->if_capenable) {
3976 ifp->if_capenable &= ~IFCAP_TSO4;
3977 ifp->if_hwassist &= ~CSUM_TSO;
3978 } else if (IFCAP_TXCSUM & ifp->if_capenable) {
3979 ifp->if_capenable |= IFCAP_TSO4;
3980 ifp->if_hwassist |= CSUM_TSO;
3982 kprintf("mxge requires tx checksum offload"
3983 " be enabled to use TSO\n");
3987 if (mask & IFCAP_LRO) {
3988 if (IFCAP_LRO & ifp->if_capenable)
3989 err = mxge_change_lro_locked(sc, 0);
3991 err = mxge_change_lro_locked(sc, mxge_lro_cnt);
3993 if (mask & IFCAP_VLAN_HWTAGGING)
3994 ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
3995 lockmgr(&sc->driver_lock, LK_RELEASE);
3996 VLAN_CAPABILITIES(ifp);
4001 err = ifmedia_ioctl(ifp, (struct ifreq *)data,
4002 &sc->media, command);
/*
 * Load the hw.mxge.* tunables into the driver's global settings and
 * sanitise them:
 *   - a non-zero sc->lro_cnt is copied to mxge_lro_cnt
 *   - mxge_intr_coal_delay outside 0..10000 becomes 30
 *   - mxge_ticks of 0 becomes hz / 2
 *   - sc->pause takes the flow-control setting
 *   - an out-of-range mxge_rss_hash_type becomes
 *     MXGEFW_RSS_HASH_TYPE_SRC_PORT
 *   - mxge_initial_mtu outside ETHER_MIN_LEN..ETHERMTU_JUMBO becomes
 *     ETHERMTU_JUMBO
 */
4012 mxge_fetch_tunables(mxge_softc_t *sc)
4015 TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4016 TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4017 &mxge_flow_control);
4018 TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4019 &mxge_intr_coal_delay);
4020 TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4021 &mxge_nvidia_ecrc_enable);
4022 TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4023 &mxge_force_firmware);
4024 TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4025 &mxge_deassert_wait);
4026 TUNABLE_INT_FETCH("hw.mxge.verbose",
4028 TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4029 TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
4030 TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
4031 TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4032 TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4033 if (sc->lro_cnt != 0)
4034 mxge_lro_cnt = sc->lro_cnt;
/* everything below replaces out-of-range values with defaults */
4038 if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4039 mxge_intr_coal_delay = 30;
4040 if (mxge_ticks == 0)
4041 mxge_ticks = hz / 2;
4042 sc->pause = mxge_flow_control;
4043 if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4044 || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4045 mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
4047 if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4048 mxge_initial_mtu < ETHER_MIN_LEN)
4049 mxge_initial_mtu = ETHERMTU_JUMBO;
/*
 * Release the per-slice state created by mxge_alloc_slices(): for each
 * slice the firmware stats DMA block (plus, with IFNET_BUF_RING, the
 * transmit buf ring), the tx lock, and the rx_done DMA block; then the
 * sc->ss array itself.
 *
 * NOTE(review): the assignment of ss inside the loop and the exact
 * extent of the fw_stats conditional are not visible in this excerpt.
 */
4054 mxge_free_slices(mxge_softc_t *sc)
4056 struct mxge_slice_state *ss;
4063 for (i = 0; i < sc->num_slices; i++) {
4065 if (ss->fw_stats != NULL) {
4066 mxge_dma_free(&ss->fw_stats_dma);
4067 ss->fw_stats = NULL;
4068 #ifdef IFNET_BUF_RING
4069 if (ss->tx.br != NULL) {
4070 drbr_free(ss->tx.br, M_DEVBUF);
4074 lockuninit(&ss->tx.lock);
4076 if (ss->rx_done.entry != NULL) {
4077 mxge_dma_free(&ss->rx_done.dma);
4078 ss->rx_done.entry = NULL;
4081 kfree(sc->ss, M_DEVBUF);
/*
 * Allocate the per-slice state array and its DMA areas.
 *
 * The receive ring size is read from the firmware and stored in
 * sc->rx_ring_size; the interrupt queue gets twice as many slots as the
 * receive ring has entries.  sc->ss is allocated zeroed with M_NOWAIT.
 * For each slice a 4096-byte-aligned rx_done queue is allocated and
 * cleared, followed by a 64-byte-aligned firmware stats block, the tx
 * lock and, with IFNET_BUF_RING, a 2048-entry buf ring.
 *
 * NOTE(review): the mxge_free_slices() call at the end is presumably
 * the failure path; error checks and returns are not visible here.
 */
4086 mxge_alloc_slices(mxge_softc_t *sc)
4089 struct mxge_slice_state *ss;
4091 int err, i, max_intr_slots;
4093 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4095 device_printf(sc->dev, "Cannot determine rx ring size\n");
4098 sc->rx_ring_size = cmd.data0;
4099 max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4101 bytes = sizeof (*sc->ss) * sc->num_slices;
4102 sc->ss = kmalloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4105 for (i = 0; i < sc->num_slices; i++) {
4110 /* allocate per-slice rx interrupt queues */
4112 bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4113 err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4116 ss->rx_done.entry = ss->rx_done.dma.addr;
4117 bzero(ss->rx_done.entry, bytes);
4120 * allocate the per-slice firmware stats; stats
4121 * (including tx) are used used only on the first
4124 #ifndef IFNET_BUF_RING
4129 bytes = sizeof (*ss->fw_stats);
4130 err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4131 sizeof (*ss->fw_stats), 64);
4134 ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
/* the lock name embeds the device unit name and the slice index */
4135 ksnprintf(ss->tx.lock_name, sizeof(ss->tx.lock_name),
4136 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
4137 lockinit(&ss->tx.lock, ss->tx.lock_name, 0, LK_CANRECURSE);
4138 #ifdef IFNET_BUF_RING
4139 ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4147 mxge_free_slices(sc);
/*
 * Decide how many slices (receive queues) to use and store the result
 * in sc->num_slices.
 *
 * Visible steps: test whether multiple slices are disabled by the
 * mxge_max_slices tunable or by having fewer than two CPUs, count the
 * available MSI-X vectors, switch to the RSS variant of the currently
 * selected firmware and load it, reset the NIC, size the interrupt
 * queue from the receive ring size, and ask the firmware for its
 * maximum queue count.  That count is then capped by the MSI-X vector
 * count and by either the CPU count (mxge_max_slices == -1) or
 * mxge_max_slices, and reduced until it is a power of two.
 *
 * NOTE(review): the last two statements restore and reload the original
 * firmware, presumably on the failure path; labels and returns are not
 * visible in this excerpt.
 */
4152 mxge_slice_probe(mxge_softc_t *sc)
4156 int msix_cnt, status, max_intr_slots;
4160 * don't enable multiple slices if they are not enabled,
4161 * or if this is not an SMP system
4164 if (mxge_max_slices == 0 || mxge_max_slices == 1 || ncpus < 2)
4167 /* see how many MSI-X interrupts are available */
4168 msix_cnt = pci_msix_count(sc->dev);
4172 /* now load the slice aware firmware see what it supports */
/* remember the current firmware name so it can be put back later */
4173 old_fw = sc->fw_name;
4174 if (old_fw == mxge_fw_aligned)
4175 sc->fw_name = mxge_fw_rss_aligned;
4177 sc->fw_name = mxge_fw_rss_unaligned;
4178 status = mxge_load_firmware(sc, 0);
4180 device_printf(sc->dev, "Falling back to a single slice\n");
4184 /* try to send a reset command to the card to see if it
4186 memset(&cmd, 0, sizeof (cmd));
4187 status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4189 device_printf(sc->dev, "failed reset\n");
4193 /* get rx ring size */
4194 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4196 device_printf(sc->dev, "Cannot determine rx ring size\n");
4199 max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4201 /* tell it the size of the interrupt queues */
4202 cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4203 status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4205 device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4209 /* ask the maximum number of slices it supports */
4210 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4212 device_printf(sc->dev,
4213 "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4216 sc->num_slices = cmd.data0;
4217 if (sc->num_slices > msix_cnt)
4218 sc->num_slices = msix_cnt;
4220 if (mxge_max_slices == -1) {
4221 /* cap to number of CPUs in system */
4222 if (sc->num_slices > ncpus)
4223 sc->num_slices = ncpus;
4225 if (sc->num_slices > mxge_max_slices)
4226 sc->num_slices = mxge_max_slices;
4228 /* make sure it is a power of two */
/* n & (n - 1) is non-zero exactly when more than one bit is set */
4229 while (sc->num_slices & (sc->num_slices - 1))
4233 device_printf(sc->dev, "using %d slices\n",
4239 sc->fw_name = old_fw;
4240 (void) mxge_load_firmware(sc, 0);
/*
 * mxge_add_msix_irqs: set up one MSI-X interrupt per slice.
 *
 * Allocates the memory resource reported as the "MSIX table", requests
 * sc->num_slices MSI-X messages, allocates one IRQ resource per slice and
 * installs mxge_intr for each of them with &sc->ss[i] as its argument.
 * The handler cookies are kept in sc->msix_ih[] and the IRQ resources in
 * sc->msix_irq_res[].
 *
 * On failure the code jumps to one of the abort_* targets, which release
 * what was acquired so far in reverse order: handlers, IRQ resources,
 * the MSI-X messages, and finally the table resource.
 */
4244 mxge_add_msix_irqs(mxge_softc_t *sc)
4247 int count, err, i, rid;
4250 sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4253 if (sc->msix_table_res == NULL) {
4254 device_printf(sc->dev, "couldn't alloc MSIX table res\n");
/* ask for exactly one message per slice; count is updated by the call */
4258 count = sc->num_slices;
4259 err = pci_alloc_msix(sc->dev, &count);
4261 device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d, "
4262 "err = %d \n", sc->num_slices, err);
4263 goto abort_with_msix_table;
/* fewer messages than slices is treated as a failure */
4265 if (count < sc->num_slices) {
4266 device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
/* "need" is the number of slices requested, "got" is what was granted */
4267 sc->num_slices, count);
4268 device_printf(sc->dev,
4269 "Try setting hw.mxge.max_slices to %d\n",
4272 goto abort_with_msix;
/* zeroed array, so the cleanup loops can test entries against NULL */
4274 bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4275 sc->msix_irq_res = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4276 if (sc->msix_irq_res == NULL) {
4278 goto abort_with_msix;
4281 for (i = 0; i < sc->num_slices; i++) {
4283 sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4286 if (sc->msix_irq_res[i] == NULL) {
4287 device_printf(sc->dev, "couldn't allocate IRQ res"
4288 " for message %d\n", i);
4290 goto abort_with_res;
4294 bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4295 sc->msix_ih = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
/*
 * An M_NOWAIT allocation can fail; without this check the loop below
 * would index a NULL array.  The IRQ resources are already allocated
 * at this point, so unwind through abort_with_res.
 */
if (sc->msix_ih == NULL) {
err = ENOMEM;
goto abort_with_res;
}
4297 for (i = 0; i < sc->num_slices; i++) {
4298 err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4300 mxge_intr, &sc->ss[i], &sc->msix_ih[i],
4301 XXX /* serializer */);
4303 device_printf(sc->dev, "couldn't setup intr for "
4305 goto abort_with_intr;
4310 device_printf(sc->dev, "using %d msix IRQs:",
4312 for (i = 0; i < sc->num_slices; i++)
4313 kprintf(" %ld", rman_get_start(sc->msix_irq_res[i]));
/* error unwinding: tear down only the handlers that were installed */
4319 for (i = 0; i < sc->num_slices; i++) {
4320 if (sc->msix_ih[i] != NULL) {
4321 bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4323 sc->msix_ih[i] = NULL;
4326 kfree(sc->msix_ih, M_DEVBUF);
/* error unwinding: release the IRQ resources that were allocated */
4330 for (i = 0; i < sc->num_slices; i++) {
4332 if (sc->msix_irq_res[i] != NULL)
4333 bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4334 sc->msix_irq_res[i]);
4335 sc->msix_irq_res[i] = NULL;
4337 kfree(sc->msix_irq_res, M_DEVBUF);
/* error unwinding: hand the MSI-X messages back */
4341 pci_release_msi(sc->dev);
4343 abort_with_msix_table:
4344 bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4345 sc->msix_table_res);
/*
 * mxge_add_single_irq: hook up a single interrupt, serviced by slice 0.
 *
 * MSI is attempted only when the device reports exactly one MSI message
 * and that message can be allocated.  The log message below prints
 * "INTx" when sc->legacy_irq is set and "MSI" otherwise.  mxge_intr is
 * installed with &sc->ss[0] as its argument and the handler cookie is
 * stored in sc->ih.
 */
4351 mxge_add_single_irq(mxge_softc_t *sc)
4353 int count, err, rid;
4355 count = pci_msi_count(sc->dev);
4356 if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
/* a shareable, active IRQ resource identified by rid */
4362 sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0,
4363 1, RF_SHAREABLE | RF_ACTIVE);
4364 if (sc->irq_res == NULL) {
4365 device_printf(sc->dev, "could not alloc interrupt\n");
4369 device_printf(sc->dev, "using %s irq %ld\n",
4370 sc->legacy_irq ? "INTx" : "MSI",
4371 rman_get_start(sc->irq_res));
4372 err = bus_setup_intr(sc->dev, sc->irq_res,
4374 mxge_intr, &sc->ss[0], &sc->ih,
4375 XXX /* serializer */);
/*
 * NOTE(review): presumably this release runs only when bus_setup_intr()
 * failed; confirm the guarding condition.  The resource id is 0 for a
 * legacy interrupt and 1 for MSI, and only MSI has a message to release.
 */
4377 bus_release_resource(sc->dev, SYS_RES_IRQ,
4378 sc->legacy_irq ? 0 : 1, sc->irq_res);
4379 if (!sc->legacy_irq)
4380 pci_release_msi(sc->dev);
/*
 * mxge_rem_msix_irqs: undo the per-slice MSI-X interrupt setup.
 *
 * Tears down every installed handler, releases each IRQ resource, frees
 * the sc->msix_ih and sc->msix_irq_res arrays, releases the BAR 2 memory
 * resource held in sc->msix_table_res and finally calls
 * pci_release_msi().
 */
4386 mxge_rem_msix_irqs(mxge_softc_t *sc)
/* handlers first: entries that were never installed are NULL and skipped */
4390 for (i = 0; i < sc->num_slices; i++) {
4391 if (sc->msix_ih[i] != NULL) {
4392 bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4394 sc->msix_ih[i] = NULL;
4397 kfree(sc->msix_ih, M_DEVBUF);
/* then the IRQ resources themselves */
4399 for (i = 0; i < sc->num_slices; i++) {
4401 if (sc->msix_irq_res[i] != NULL)
4402 bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4403 sc->msix_irq_res[i]);
4404 sc->msix_irq_res[i] = NULL;
4406 kfree(sc->msix_irq_res, M_DEVBUF);
4408 bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4409 sc->msix_table_res);
4411 pci_release_msi(sc->dev);
/*
 * mxge_rem_single_irq: remove the single-interrupt setup.
 *
 * Tears down the handler stored in sc->ih and releases sc->irq_res, using
 * resource id 0 when sc->legacy_irq is set and 1 otherwise.  The MSI
 * message is released only when the interrupt is not a legacy one.
 */
4416 mxge_rem_single_irq(mxge_softc_t *sc)
4418 bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4419 bus_release_resource(sc->dev, SYS_RES_IRQ,
4420 sc->legacy_irq ? 0 : 1, sc->irq_res);
4421 if (!sc->legacy_irq)
4422 pci_release_msi(sc->dev);
/*
 * mxge_rem_irq: remove whichever interrupt setup is in use.
 *
 * More than one slice selects the MSI-X teardown.
 * NOTE(review): presumably mxge_rem_single_irq() is the alternative for
 * the single-slice case; confirm the two calls are mutually exclusive.
 */
4426 mxge_rem_irq(mxge_softc_t *sc)
4428 if (sc->num_slices > 1)
4429 mxge_rem_msix_irqs(sc);
4431 mxge_rem_single_irq(sc);
/*
 * mxge_add_irq: install interrupts for the device.
 *
 * More than one slice selects the MSI-X setup.
 * NOTE(review): presumably mxge_add_single_irq() is the alternative for
 * the single-slice case; confirm the two calls are mutually exclusive.
 */
4435 mxge_add_irq(mxge_softc_t *sc)
4439 if (sc->num_slices > 1)
4440 err = mxge_add_msix_irqs(sc);
4442 err = mxge_add_single_irq(sc);
/*
 * The leading "0 &&" makes this condition constant-false, so the
 * remove-and-re-add sequence below never runs.
 */
4444 if (0 && err == 0 && sc->num_slices > 1) {
4445 mxge_rem_msix_irqs(sc);
4446 err = mxge_add_msix_irqs(sc);
/*
 * mxge_attach: bring up one mxge device.
 *
 * In order: fetch tunables, create the parent DMA tag, initialise the
 * command and driver locks and the callout, map the board memory, copy
 * and parse the EEPROM strings, allocate the command, zero-pad and
 * DMA-benchmark buffers, select the firmware, probe and allocate slices,
 * reset the NIC, allocate rings, add interrupts, and finally fill in and
 * attach the ifnet.
 *
 * Every failure jumps to an abort_* target at the end of the function so
 * that what was acquired up to that point is released in reverse order.
 */
4453 mxge_attach(device_t dev)
4455 mxge_softc_t *sc = device_get_softc(dev);
4456 struct ifnet *ifp = &sc->arpcom.ac_if;
4460 * avoid rewriting half the lines in this file to use
4461 * &sc->arpcom.ac_if instead
4465 mxge_fetch_tunables(sc);
4467 err = bus_dma_tag_create(NULL, /* parent */
4470 BUS_SPACE_MAXADDR, /* low */
4471 BUS_SPACE_MAXADDR, /* high */
4472 NULL, NULL, /* filter */
4473 65536 + 256, /* maxsize */
4474 MXGE_MAX_SEND_DESC, /* num segs */
4475 65536, /* maxsegsize */
4477 &sc->parent_dmat); /* tag */
4480 device_printf(sc->dev, "Err %d allocating parent dmat\n",
4482 goto abort_with_nothing;
4486 if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4488 ksnprintf(sc->cmd_lock_name, sizeof(sc->cmd_lock_name), "%s:cmd",
4489 device_get_nameunit(dev));
4490 lockinit(&sc->cmd_lock, sc->cmd_lock_name, 0, LK_CANRECURSE);
4491 ksnprintf(sc->driver_lock_name, sizeof(sc->driver_lock_name),
4492 "%s:drv", device_get_nameunit(dev));
4493 lockinit(&sc->driver_lock, sc->driver_lock_name,
4496 callout_init(&sc->co_hdl);
4498 mxge_setup_cfg_space(sc);
4500 /* Map the board into the kernel */
4502 sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
4504 if (sc->mem_res == NULL) {
4505 device_printf(dev, "could not map memory\n");
4507 goto abort_with_lock;
4509 sc->sram = rman_get_virtual(sc->mem_res);
/* fixed expected SRAM size; a mapped region smaller than this is rejected */
4510 sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4511 if (sc->sram_size > rman_get_size(sc->mem_res)) {
4512 device_printf(dev, "impossible memory region size %ld\n",
4513 rman_get_size(sc->mem_res));
4515 goto abort_with_mem_res;
4518 /* make NULL terminated copy of the EEPROM strings section of
4520 bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4521 bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4522 rman_get_bushandle(sc->mem_res),
4523 sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4525 MXGE_EEPROM_STRINGS_SIZE - 2);
4526 err = mxge_parse_strings(sc);
4528 goto abort_with_mem_res;
4530 /* Enable write combining for efficient use of PCIe bus */
4533 /* Allocate the out of band dma memory */
4534 err = mxge_dma_alloc(sc, &sc->cmd_dma,
4535 sizeof (mxge_cmd_t), 64);
4537 goto abort_with_mem_res;
4538 sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4539 err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4541 goto abort_with_cmd_dma;
4543 err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4545 goto abort_with_zeropad_dma;
4547 /* select & load the firmware */
4548 err = mxge_select_firmware(sc);
4550 goto abort_with_dmabench;
4551 sc->intr_coal_delay = mxge_intr_coal_delay;
4553 mxge_slice_probe(sc);
4554 err = mxge_alloc_slices(sc);
4556 goto abort_with_dmabench;
4558 err = mxge_reset(sc, 0);
4560 goto abort_with_slices;
4562 err = mxge_alloc_rings(sc);
4564 device_printf(sc->dev, "failed to allocate rings\n");
/*
 * The slices are allocated at this point, so unwind through
 * abort_with_slices (as the mxge_reset() failure above does) rather
 * than abort_with_dmabench, which would skip mxge_free_slices().
 */
4565 goto abort_with_slices;
4568 err = mxge_add_irq(sc);
4570 device_printf(sc->dev, "failed to add irq\n");
4571 goto abort_with_rings;
/* interface capabilities and entry points */
4574 ifp->if_baudrate = IF_Gbps(10UL);
4575 ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4578 ifp->if_capabilities |= IFCAP_LRO;
4581 #ifdef MXGE_NEW_VLAN_API
4582 ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
/* jumbo frames are advertised only when mxge_max_mtu() reports >= 9000 */
4585 sc->max_mtu = mxge_max_mtu(sc);
4586 if (sc->max_mtu >= 9000)
4587 ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4589 device_printf(dev, "MTU limited to %d. Install "
4590 "latest firmware for 9000 byte jumbo support\n",
4591 sc->max_mtu - ETHER_HDR_LEN);
4592 ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
/* everything advertised is enabled, except LRO when sc->lro_cnt is 0 */
4593 ifp->if_capenable = ifp->if_capabilities;
4594 if (sc->lro_cnt == 0)
4595 ifp->if_capenable &= ~IFCAP_LRO;
4597 ifp->if_init = mxge_init;
4599 ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4600 ifp->if_ioctl = mxge_ioctl;
4601 ifp->if_start = mxge_start;
4602 /* Initialise the ifmedia structure */
4603 ifmedia_init(&sc->media, 0, mxge_media_change,
4605 mxge_set_media(sc, IFM_ETHER | IFM_AUTO);
4606 mxge_media_probe(sc);
4608 ether_ifattach(ifp, sc->mac_addr, NULL);
4609 /* ether_ifattach sets mtu to ETHERMTU */
4610 if (mxge_initial_mtu != ETHERMTU)
4611 mxge_change_mtu(sc, mxge_initial_mtu);
4613 mxge_add_sysctls(sc);
4614 #ifdef IFNET_BUF_RING
4615 ifp->if_transmit = mxge_transmit;
4616 ifp->if_qflush = mxge_qflush;
/* error unwinding, newest resource first */
4621 mxge_free_rings(sc);
4623 mxge_free_slices(sc);
4624 abort_with_dmabench:
4625 mxge_dma_free(&sc->dmabench_dma);
4626 abort_with_zeropad_dma:
4627 mxge_dma_free(&sc->zeropad_dma);
4629 mxge_dma_free(&sc->cmd_dma);
4631 bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4633 pci_disable_busmaster(dev);
4634 lockuninit(&sc->cmd_lock);
4635 lockuninit(&sc->driver_lock);
4636 bus_dma_tag_destroy(sc->parent_dmat);
/*
 * mxge_detach: take one mxge device down and release its resources.
 *
 * Complains if mxge_vlans_active() reports active vlans, checks
 * IFF_RUNNING under the driver lock, detaches the ifnet, drains the
 * callout and removes the media entries and sysctls.  It then frees
 * rings, slices and the DMA buffers, releases the board memory, and
 * destroys the locks and the parent DMA tag.
 */
4642 mxge_detach(device_t dev)
4644 mxge_softc_t *sc = device_get_softc(dev);
4646 if (mxge_vlans_active(sc)) {
4647 device_printf(sc->dev,
4648 "Detach vlans before removing module\n");
/* the running check is made with the driver lock held exclusively */
4651 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
4653 if (sc->ifp->if_flags & IFF_RUNNING)
4655 lockmgr(&sc->driver_lock, LK_RELEASE);
4656 ether_ifdetach(sc->ifp);
4657 callout_drain(&sc->co_hdl);
4658 ifmedia_removeall(&sc->media);
4659 mxge_dummy_rdma(sc, 0);
4660 mxge_rem_sysctls(sc);
/* same release order as the error-unwinding tail of mxge_attach() */
4662 mxge_free_rings(sc);
4663 mxge_free_slices(sc);
4664 mxge_dma_free(&sc->dmabench_dma);
4665 mxge_dma_free(&sc->zeropad_dma);
4666 mxge_dma_free(&sc->cmd_dma);
4667 bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4668 pci_disable_busmaster(dev);
4669 lockuninit(&sc->cmd_lock);
4670 lockuninit(&sc->driver_lock);
4671 bus_dma_tag_destroy(sc->parent_dmat);
/*
 * mxge_shutdown: NOTE(review): presumably the driver's device shutdown
 * method for `dev`; confirm against the driver's method table.
 */
4676 mxge_shutdown(device_t dev)
4682 This file uses Myri10GE driver indentation.
4685 c-file-style:"linux"