1 /******************************************************************************
3 Copyright (c) 2006-2009, Myricom Inc.
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
9 1. Redistributions of source code must retain the above copyright notice,
10 this list of conditions and the following disclaimer.
12 2. Neither the name of the Myricom Inc, nor the names of its
13 contributors may be used to endorse or promote products derived from
14 this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
28 ***************************************************************************/
30 #include <sys/cdefs.h>
31 /*__FBSDID("$FreeBSD: src/sys/dev/mxge/if_mxge.c,v 1.63 2009/06/26 11:45:06 rwatson Exp $");*/
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/linker.h>
36 #include <sys/firmware.h>
37 #include <sys/endian.h>
38 #include <sys/sockio.h>
40 #include <sys/malloc.h>
41 #include <sys/kernel.h>
43 #include <sys/module.h>
44 #include <sys/socket.h>
45 #include <sys/sysctl.h>
47 /* count xmits ourselves, rather than via drbr */
50 #include <net/if_arp.h>
51 #include <net/ethernet.h>
52 #include <net/if_dl.h>
53 #include <net/if_media.h>
57 #include <net/if_types.h>
58 #include <net/vlan/if_vlan_var.h>
61 #include <netinet/in_systm.h>
62 #include <netinet/in.h>
63 #include <netinet/ip.h>
64 #include <netinet/tcp.h>
66 #include <machine/resource.h>
70 #include <bus/pci/pcireg.h>
71 #include <bus/pci/pcivar.h>
72 #include <bus/pci/pci_private.h> /* XXX for pci_cfg_restore */
74 #include <vm/vm.h> /* for pmap_mapdev() */
77 #if defined(__i386) || defined(__amd64)
78 #include <machine/specialreg.h>
81 #include <dev/netif/mxge/mxge_mcp.h>
82 #include <dev/netif/mxge/mcp_gen_header.h>
83 /*#define MXGE_FAKE_IFP*/
84 #include <dev/netif/mxge/if_mxge_var.h>
86 #include <sys/buf_ring.h>
/*
 * Driver tunables: defaults for Nvidia-bridge ECRC enabling, firmware
 * selection forcing, interrupt coalescing delay (usecs), flow control,
 * LRO segment count, slice/RSS configuration, initial MTU, and the
 * names of the four firmware images (aligned/unaligned x plain/RSS)
 * chosen by mxge_select_firmware().
 */
92 static int mxge_nvidia_ecrc_enable = 1;
93 static int mxge_force_firmware = 0;
94 static int mxge_intr_coal_delay = 30;
95 static int mxge_deassert_wait = 1;
96 static int mxge_flow_control = 1;
97 static int mxge_verbose = 0;
98 static int mxge_lro_cnt = 8;
99 static int mxge_ticks;
100 static int mxge_max_slices = 1;
101 static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
102 static int mxge_always_promisc = 0;
103 static int mxge_initial_mtu = ETHERMTU_JUMBO;
104 static char *mxge_fw_unaligned = "mxge_ethp_z8e";
105 static char *mxge_fw_aligned = "mxge_eth_z8e";
106 static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
107 static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
109 static int mxge_probe(device_t dev);
110 static int mxge_attach(device_t dev);
111 static int mxge_detach(device_t dev);
112 static int mxge_shutdown(device_t dev);
113 static void mxge_intr(void *arg);
/*
 * newbus glue: device method table, driver descriptor, devclass, and
 * module registration.  The driver attaches as a child of the PCI bus
 * and depends on the firmware(9) and zlib kernel modules (the firmware
 * images are zlib-compressed; see mxge_load_firmware_helper()).
 */
115 static device_method_t mxge_methods[] =
117 /* Device interface */
118 DEVMETHOD(device_probe, mxge_probe),
119 DEVMETHOD(device_attach, mxge_attach),
120 DEVMETHOD(device_detach, mxge_detach),
121 DEVMETHOD(device_shutdown, mxge_shutdown),
125 static driver_t mxge_driver =
129 sizeof(mxge_softc_t),
132 static devclass_t mxge_devclass;
134 /* Declare ourselves to be a child of the PCI bus.*/
135 DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
136 MODULE_DEPEND(mxge, firmware, 1, 1, 1);
137 MODULE_DEPEND(mxge, zlib, 1, 1, 1);
139 static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
140 static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
141 static int mxge_close(mxge_softc_t *sc);
142 static int mxge_open(mxge_softc_t *sc);
143 static void mxge_tick(void *arg);
/*
 * Device probe: match Myricom Z8E / Z8E_9 PCI IDs and set a
 * human-readable description based on the PCI revision ID
 * (Z8E -> 8A, Z8ES -> 8B, anything else -> "8??" with a warning).
 */
146 mxge_probe(device_t dev)
151 if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
152 ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
153 (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
154 rev = pci_get_revid(dev);
156 case MXGE_PCI_REV_Z8E:
157 device_set_desc(dev, "Myri10G-PCIE-8A");
159 case MXGE_PCI_REV_Z8ES:
160 device_set_desc(dev, "Myri10G-PCIE-8B");
163 device_set_desc(dev, "Myri10G-PCIE-8??");
/* Unknown revision: still claim the device, but warn the operator. */
164 device_printf(dev, "Unrecognized rev %d NIC\n",
/*
 * Enable write-combining on the NIC's SRAM mapping (x86/amd64 only)
 * via pmap_change_attr() with PAT_WRITE_COMBINING; reports the error
 * if the attribute change fails.
 */
174 mxge_enable_wc(mxge_softc_t *sc)
176 #if defined(__i386) || defined(__amd64)
181 len = rman_get_size(sc->mem_res);
182 err = pmap_change_attr((vm_offset_t) sc->sram,
183 len, PAT_WRITE_COMBINING);
185 device_printf(sc->dev, "pmap_change_attr failed, %d\n",
193 /* callback to get our DMA address */
/* bus_dmamap_load() callback: stores the first segment's bus address
 * into the caller-supplied bus_addr_t (arg).  Used by mxge_dma_alloc(). */
195 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
199 *(bus_addr_t *) arg = segs->ds_addr;
/*
 * Allocate a coherent DMA region of `bytes` with the given alignment:
 * create a tag under sc->parent_dmat, allocate zeroed DMAable memory,
 * and load the map, capturing the bus address via
 * mxge_dmamap_callback().  On failure, resources acquired so far are
 * released via the goto-style abort path at the bottom.
 */
204 mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
205 bus_size_t alignment)
208 device_t dev = sc->dev;
209 bus_size_t boundary, maxsegsize;
/* NOTE(review): boundary/maxsegsize appear to be derived from the
 * 4KB-alignment special case below — lines setting them are not
 * visible in this chunk. */
211 if (bytes > 4096 && alignment == 4096) {
219 /* allocate DMAable memory tags */
220 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
221 alignment, /* alignment */
222 boundary, /* boundary */
223 BUS_SPACE_MAXADDR, /* low */
224 BUS_SPACE_MAXADDR, /* high */
225 NULL, NULL, /* filter */
228 maxsegsize, /* maxsegsize */
229 BUS_DMA_COHERENT, /* flags */
230 NULL, NULL, /* lock */
231 &dma->dmat); /* tag */
233 device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
237 /* allocate DMAable memory & map */
238 err = bus_dmamem_alloc(dma->dmat, &dma->addr,
239 (BUS_DMA_WAITOK | BUS_DMA_COHERENT
240 | BUS_DMA_ZERO), &dma->map);
242 device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
243 goto abort_with_dmat;
246 /* load the memory */
247 err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
248 mxge_dmamap_callback,
249 (void *)&dma->bus_addr, 0);
251 device_printf(dev, "couldn't load map (err = %d)\n", err);
/* error unwind: free memory, then destroy the tag */
257 bus_dmamem_free(dma->dmat, dma->addr, dma->map);
259 (void)bus_dma_tag_destroy(dma->dmat);
/*
 * Release a region allocated by mxge_dma_alloc(): unload the map,
 * free the memory, and destroy the tag — in the reverse order of
 * acquisition.
 */
265 mxge_dma_free(mxge_dma_t *dma)
267 bus_dmamap_unload(dma->dmat, dma->map);
268 bus_dmamem_free(dma->dmat, dma->addr, dma->map);
269 (void)bus_dma_tag_destroy(dma->dmat);
273 * The eeprom strings on the lanaiX have the format
/*
 * Walk the NUL-separated EEPROM string block looking for "MAC=",
 * "PC=" (product code) and "SN=" (serial number) entries, copying
 * them into the softc.  The MAC is parsed as six hex bytes; every
 * read is bounds-checked against the end of the string area.
 */
280 mxge_parse_strings(mxge_softc_t *sc)
/* advance ptr past the current NUL-terminated string (bounded) */
282 #define MXGE_NEXT_STRING(p) while(ptr < limit && *ptr++)
287 ptr = sc->eeprom_strings;
288 limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
290 while (ptr < limit && *ptr != '\0') {
291 if (memcmp(ptr, "MAC=", 4) == 0) {
293 sc->mac_addr_string = ptr;
294 for (i = 0; i < 6; i++) {
/* need two hex digits per byte; bail if they'd run past the block */
296 if ((ptr + 2) > limit)
298 sc->mac_addr[i] = strtoul(ptr, NULL, 16);
301 } else if (memcmp(ptr, "PC=", 3) == 0) {
303 strncpy(sc->product_code_string, ptr,
304 sizeof (sc->product_code_string) - 1);
305 } else if (memcmp(ptr, "SN=", 3) == 0) {
307 strncpy(sc->serial_number_string, ptr,
308 sizeof (sc->serial_number_string) - 1);
310 MXGE_NEXT_STRING(ptr);
317 device_printf(sc->dev, "failed to parse eeprom_strings\n");
322 #if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
/*
 * Attempt to enable ECRC generation on an upstream Nvidia (CK804 or
 * MCP55) PCIe bridge so that Read-DMA completions arrive 8-byte
 * aligned (see the block comment before mxge_firmware_probe()).
 * Because the chipset's extended config space (offset 0x178) is not
 * reachable through normal config cycles on these machines, the
 * bridge's config space is located by physical address and mapped
 * directly with pmap_mapdev().
 */
324 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
327 unsigned long base, off;
329 device_t pdev, mcp55;
330 uint16_t vendor_id, device_id, word;
331 uintptr_t bus, slot, func, ivend, idev;
335 if (!mxge_nvidia_ecrc_enable)
/* grandparent of the NIC is the upstream bridge */
338 pdev = device_get_parent(device_get_parent(sc->dev));
340 device_printf(sc->dev, "could not find parent?\n");
343 vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
344 device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
/* only Nvidia bridges are handled */
346 if (vendor_id != 0x10de)
351 if (device_id == 0x005d) {
352 /* ck804, base address is magic */
354 } else if (device_id >= 0x0374 && device_id <= 0x378) {
355 /* mcp55, base address stored in chipset */
356 mcp55 = pci_find_bsf(0, 0, 0);
358 0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
359 0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
360 word = pci_read_config(mcp55, 0x90, 2);
361 base = ((unsigned long)word & 0x7ffeU) << 25;
368 Test below is commented because it is believed that doing
369 config read/write beyond 0xff will access the config space
370 for the next larger function. Uncomment this and remove
371 the hacky pmap_mapdev() way of accessing config space when
372 FreeBSD grows support for extended pcie config space access
375 /* See if we can, by some miracle, access the extended
377 val = pci_read_config(pdev, 0x178, 4);
378 if (val != 0xffffffff) {
380 pci_write_config(pdev, 0x178, val, 4);
384 /* Rather than using normal pci config space writes, we must
385 * map the Nvidia config space ourselves. This is because on
386 * opteron/nvidia class machine the 0xe000000 mapping is
387 * handled by the nvidia chipset, that means the internal PCI
388 * device (the on-chip northbridge), or the amd-8131 bridge
389 * and things behind them are not visible by this method.
392 BUS_READ_IVAR(device_get_parent(pdev), pdev,
394 BUS_READ_IVAR(device_get_parent(pdev), pdev,
395 PCI_IVAR_SLOT, &slot);
396 BUS_READ_IVAR(device_get_parent(pdev), pdev,
397 PCI_IVAR_FUNCTION, &func);
398 BUS_READ_IVAR(device_get_parent(pdev), pdev,
399 PCI_IVAR_VENDOR, &ivend);
400 BUS_READ_IVAR(device_get_parent(pdev), pdev,
401 PCI_IVAR_DEVICE, &idev);
/* compute the physical address of the bridge's config space:
 * base + bus/slot/func offsets (ECAM-style layout) */
404 + 0x00100000UL * (unsigned long)bus
405 + 0x00001000UL * (unsigned long)(func
408 /* map it into the kernel */
409 va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
413 device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
416 /* get a pointer to the config space mapped into the kernel */
417 cfgptr = va + (off & PAGE_MASK);
419 /* make sure that we can really access it */
420 vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
421 device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
422 if (! (vendor_id == ivend && device_id == idev)) {
423 device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
424 vendor_id, device_id);
425 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
429 ptr32 = (uint32_t*)(cfgptr + 0x178);
/* 0xffffffff read-back means the extended register is absent */
432 if (val == 0xffffffff) {
433 device_printf(sc->dev, "extended mapping failed\n");
434 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
438 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
440 device_printf(sc->dev,
441 "Enabled ECRC on upstream Nvidia bridge "
443 (int)bus, (int)slot, (int)func);
/* Non-x86 stub: an Nforce4 chipset should never appear here. */
448 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
450 device_printf(sc->dev,
451 "Nforce 4 chipset on non-x86/amd64!?!?!\n");
/*
 * Run the firmware's DMA benchmark against the dmabench buffer and
 * record read, write, and read+write throughput (MB/s) in the softc.
 * test_type is either MXGEFW_DMA_TEST or MXGEFW_CMD_UNALIGNED_TEST;
 * the latter aborts on the first unaligned completion (used by
 * mxge_firmware_probe() to detect aligned-completion chipsets).
 */
458 mxge_dma_test(mxge_softc_t *sc, int test_type)
461 bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
467 /* Run a small DMA test.
468 * The magic multipliers to the length tell the firmware
469 * to do DMA read, write, or read+write tests. The
470 * results are returned in cmd.data0. The upper 16
471 * bits of the return is the number of transfers completed.
472 * The lower 16 bits is the time in 0.5us ticks that the
473 * transfers took to complete.
476 len = sc->tx_boundary;
478 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
479 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
480 cmd.data2 = len * 0x10000;
481 status = mxge_send_cmd(sc, test_type, &cmd);
/* MB/s = (transfers * len * 2 bytes-per-tick-unit) / ticks */
486 sc->read_dma = ((cmd.data0>>16) * len * 2) /
487 (cmd.data0 & 0xffff);
488 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
489 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
490 cmd.data2 = len * 0x1;
491 status = mxge_send_cmd(sc, test_type, &cmd);
496 sc->write_dma = ((cmd.data0>>16) * len * 2) /
497 (cmd.data0 & 0xffff);
499 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
500 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
501 cmd.data2 = len * 0x10001;
502 status = mxge_send_cmd(sc, test_type, &cmd);
/* read+write moves data both ways, hence the extra *2 */
507 sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
508 (cmd.data0 & 0xffff);
/* unaligned-test failures are expected on some chipsets; stay quiet */
511 if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
512 device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
519 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
520 * when the PCI-E Completion packets are aligned on an 8-byte
521 * boundary. Some PCI-E chip sets always align Completion packets; on
522 * the ones that do not, the alignment can be enforced by enabling
523 * ECRC generation (if supported).
525 * When PCI-E Completion packets are not aligned, it is actually more
526 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
528 * If the driver can neither enable ECRC nor verify that it has
529 * already been enabled, then it must use a firmware image which works
530 * around unaligned completion packets (ethp_z8e.dat), and it should
531 * also ensure that it never gives the device a Read-DMA which is
532 * larger than 2KB by setting the tx_boundary to 2KB. If ECRC is
533 * enabled, then the driver should use the aligned (eth_z8e.dat)
534 * firmware image, and set tx_boundary to 4KB.
/*
 * Probe whether the aligned firmware can be used on this host: check
 * the PCIe Max Read Request size, load the aligned image, try to
 * enable ECRC on an Nvidia bridge, and run the unaligned-completion
 * DMA test.  Returns 0 if the aligned firmware is safe to keep.
 */
538 mxge_firmware_probe(mxge_softc_t *sc)
540 device_t dev = sc->dev;
544 sc->tx_boundary = 4096;
546 * Verify the max read request size was set to 4KB
547 * before trying the test with 4KB.
549 if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
550 pectl = pci_read_config(dev, reg + 0x8, 2);
551 if ((pectl & (5 << 12)) != (5 << 12)) {
552 device_printf(dev, "Max Read Req. size != 4k (0x%x\n",
554 sc->tx_boundary = 2048;
559 * load the optimized firmware (which assumes aligned PCIe
560 * completions) in order to see if it works on this host.
562 sc->fw_name = mxge_fw_aligned;
563 status = mxge_load_firmware(sc, 1);
569 * Enable ECRC if possible
571 mxge_enable_nvidia_ecrc(sc);
574 * Run a DMA test which watches for unaligned completions and
575 * aborts on the first one seen.
578 status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
580 return 0; /* keep the aligned firmware */
583 device_printf(dev, "DMA test failed: %d\n", status);
584 if (status == ENOSYS)
585 device_printf(dev, "Falling back to ethp! "
586 "Please install up to date fw\n");
/*
 * Decide between the aligned and unaligned firmware images, then load
 * the chosen one.  The mxge_force_firmware tunable overrides the
 * probe; a narrow (<= x4) PCIe link is always safe for the aligned
 * image; otherwise mxge_firmware_probe() decides.
 */
591 mxge_select_firmware(mxge_softc_t *sc)
596 if (mxge_force_firmware != 0) {
597 if (mxge_force_firmware == 1)
602 device_printf(sc->dev,
603 "Assuming %s completions (forced)\n",
604 aligned ? "aligned" : "unaligned");
608 /* if the PCIe link width is 4 or less, we can use the aligned
609 firmware and skip any checks */
610 if (sc->link_width != 0 && sc->link_width <= 4) {
611 device_printf(sc->dev,
612 "PCIe x%d Link, expect reduced performance\n",
618 if (0 == mxge_firmware_probe(sc))
/* probe verdict: aligned -> 4KB tx boundary, unaligned -> 2KB */
623 sc->fw_name = mxge_fw_aligned;
624 sc->tx_boundary = 4096;
626 sc->fw_name = mxge_fw_unaligned;
627 sc->tx_boundary = 2048;
629 return (mxge_load_firmware(sc, 0));
/*
 * Sanity-check a firmware image header: verify the MCP type is
 * Ethernet, record the version string in the softc for sysctl, and
 * check the parsed major/minor version against what this driver was
 * built for.
 */
639 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
643 if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
644 device_printf(sc->dev, "Bad firmware type: 0x%x\n",
645 be32toh(hdr->mcp_type));
649 /* save firmware version for sysctl */
650 strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
652 device_printf(sc->dev, "firmware id: %s\n", hdr->version);
654 ksscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
655 &sc->fw_ver_minor, &sc->fw_ver_tiny);
/* only the exact driver-supported major.minor is accepted */
657 if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
658 && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
659 device_printf(sc->dev, "Found firmware version %s\n",
661 device_printf(sc->dev, "Driver needs %d.%d\n",
662 MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
/* zlib allocation hook: kmalloc(M_TEMP, M_NOWAIT) wrapper. */
670 z_alloc(void *nil, u_int items, u_int size)
674 ptr = kmalloc(items * size, M_TEMP, M_NOWAIT);
/* zlib free hook: releases memory obtained via z_alloc(). */
679 z_free(void *nil, void *ptr)
/*
 * Fetch the firmware image by name via firmware(9), inflate it with
 * zlib (the uncompressed size is smuggled in fw->version), validate
 * its MCP header, and copy it into NIC SRAM at MXGE_FW_OFFSET in
 * 256-byte PIO chunks.  Cleanup on error follows the usual
 * goto-unwind order: buffer, zlib state, firmware reference.
 */
686 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
689 char *inflate_buffer;
690 const struct firmware *fw;
691 const mcp_gen_header_t *hdr;
698 fw = firmware_get(sc->fw_name);
700 device_printf(sc->dev, "Could not find firmware image %s\n",
707 /* setup zlib and decompress f/w */
708 bzero(&zs, sizeof (zs));
711 status = inflateInit(&zs);
712 if (status != Z_OK) {
717 /* the uncompressed size is stored as the firmware version,
718 which would otherwise go unused */
719 fw_len = (size_t) fw->version;
720 inflate_buffer = kmalloc(fw_len, M_TEMP, M_NOWAIT);
721 if (inflate_buffer == NULL)
723 zs.avail_in = fw->datasize;
724 zs.next_in = __DECONST(char *, fw->data);
725 zs.avail_out = fw_len;
726 zs.next_out = inflate_buffer;
727 status = inflate(&zs, Z_FINISH);
/* a single-shot inflate must end the stream, else the image is bad */
728 if (status != Z_STREAM_END) {
729 device_printf(sc->dev, "zlib %d\n", status);
731 goto abort_with_buffer;
735 hdr_offset = htobe32(*(const uint32_t *)
736 (inflate_buffer + MCP_HEADER_PTR_OFFSET));
/* header pointer must be 4-byte aligned and inside the image */
737 if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
738 device_printf(sc->dev, "Bad firmware file");
740 goto abort_with_buffer;
742 hdr = (const void*)(inflate_buffer + hdr_offset);
744 status = mxge_validate_firmware(sc, hdr);
746 goto abort_with_buffer;
748 /* Copy the inflated firmware to NIC SRAM. */
749 for (i = 0; i < fw_len; i += 256) {
750 mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
752 min(256U, (unsigned)(fw_len - i)));
761 kfree(inflate_buffer, M_TEMP);
765 firmware_put(fw, FIRMWARE_UNLOAD);
770 * Enable or disable periodic RDMAs from the host to make certain
771 * chipsets resend dropped PCIe messages
/*
 * Build a 6-word command block (confirmation address, confirm value
 * 0xffffffff, dummy RDMA address, enable flag) on an 8-byte-aligned
 * stack buffer, PIO it to the boot dummy-RDMA mailbox, and poll the
 * confirmation word (up to 20 iterations) for the firmware's ack.
 */
775 mxge_dummy_rdma(mxge_softc_t *sc, int enable)
778 volatile uint32_t *confirm;
779 volatile char *submit;
780 uint32_t *buf, dma_low, dma_high;
/* round the stack buffer up to the next 8-byte boundary */
783 buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
785 /* clear confirmation addr */
786 confirm = (volatile uint32_t *)sc->cmd;
790 /* send an rdma command to the PCIe engine, and wait for the
791 response in the confirmation address. The firmware should
792 write a -1 there to indicate it is alive and well
795 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
796 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
797 buf[0] = htobe32(dma_high); /* confirm addr MSW */
798 buf[1] = htobe32(dma_low); /* confirm addr LSW */
799 buf[2] = htobe32(0xffffffff); /* confirm data */
800 dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
801 dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
802 buf[3] = htobe32(dma_high); /* dummy addr MSW */
803 buf[4] = htobe32(dma_low); /* dummy addr LSW */
804 buf[5] = htobe32(enable); /* enable? */
807 submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
809 mxge_pio_copy(submit, buf, 64);
/* poll for the firmware's -1 acknowledgement */
814 while (*confirm != 0xffffffff && i < 20) {
818 if (*confirm != 0xffffffff) {
819 device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
820 (enable ? "enable" : "disable"), confirm,
/*
 * Issue one command to the firmware: marshal data0..2 and the command
 * word (big-endian) into an 8-byte-aligned buffer along with the
 * response DMA address, PIO it to the command mailbox under cmd_lock,
 * then poll the DMA'd response for up to 20ms.  Known firmware error
 * codes are translated; data0 is returned to the caller on success.
 */
827 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
830 char buf_bytes[sizeof(*buf) + 8];
831 volatile mcp_cmd_response_t *response = sc->cmd;
832 volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
833 uint32_t dma_low, dma_high;
834 int err, sleep_total = 0;
836 /* ensure buf is aligned to 8 bytes */
837 buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
839 buf->data0 = htobe32(data->data0);
840 buf->data1 = htobe32(data->data1);
841 buf->data2 = htobe32(data->data2);
842 buf->cmd = htobe32(cmd);
843 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
844 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
846 buf->response_addr.low = htobe32(dma_low);
847 buf->response_addr.high = htobe32(dma_high);
/* serialize command submission; result sentinel is all-ones */
848 lockmgr(&sc->cmd_lock, LK_EXCLUSIVE);
849 response->result = 0xffffffff;
851 mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
853 /* wait up to 20ms */
855 for (sleep_total = 0; sleep_total < 20; sleep_total++) {
856 bus_dmamap_sync(sc->cmd_dma.dmat,
857 sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
859 switch (be32toh(response->result)) {
861 data->data0 = be32toh(response->data);
867 case MXGEFW_CMD_UNKNOWN:
870 case MXGEFW_CMD_ERROR_UNALIGNED:
873 case MXGEFW_CMD_ERROR_BUSY:
877 device_printf(sc->dev,
879 "failed, result = %d\n",
880 cmd, be32toh(response->result));
888 device_printf(sc->dev, "mxge: command %d timed out"
890 cmd, be32toh(response->result));
891 lockmgr(&sc->cmd_lock, LK_RELEASE);
/*
 * Adopt the firmware already running on the NIC (e.g. after a warm
 * boot): locate its MCP header pointer in SRAM, bounds-check it, copy
 * the header to host memory, and validate it.  Also detects the
 * 1.4.4–1.4.11 firmware rx-filter bug, which requires keeping the NIC
 * in ALLMULTI mode (see mxge_set_multicast_list()).
 */
896 mxge_adopt_running_firmware(mxge_softc_t *sc)
898 struct mcp_gen_header *hdr;
899 const size_t bytes = sizeof (struct mcp_gen_header);
903 /* find running firmware header */
904 hdr_offset = htobe32(*(volatile uint32_t *)
905 (sc->sram + MCP_HEADER_PTR_OFFSET));
907 if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
908 device_printf(sc->dev,
909 "Running firmware has bad header offset (%d)\n",
914 /* copy header of running firmware from SRAM to host memory to
915 * validate firmware */
916 hdr = kmalloc(bytes, M_DEVBUF, M_NOWAIT);
918 device_printf(sc->dev, "could not kmalloc firmware hdr\n");
921 bus_space_read_region_1(rman_get_bustag(sc->mem_res),
922 rman_get_bushandle(sc->mem_res),
923 hdr_offset, (char *)hdr, bytes);
924 status = mxge_validate_firmware(sc, hdr);
925 kfree(hdr, M_DEVBUF);
928 * check to see if adopted firmware has bug where adopting
929 * it will cause broadcasts to be filtered unless the NIC
930 * is kept in ALLMULTI mode
932 if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
933 sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
934 sc->adopted_rx_filter_bug = 1;
935 device_printf(sc->dev, "Adopting fw %d.%d.%d: "
936 "working around rx filter bug\n",
937 sc->fw_ver_major, sc->fw_ver_minor,
/*
 * Load firmware into the NIC.  First try to inflate and copy our own
 * image into SRAM via mxge_load_firmware_helper(); if that fails and
 * `adopt` is set, fall back to adopting the firmware already running
 * on the NIC (conservatively dropping to the unaligned/2KB settings).
 * Finally hand off to the bootstrap MCP with a 7-word command block
 * and poll the confirmation word for the firmware's -1 ack.
 */
946 mxge_load_firmware(mxge_softc_t *sc, int adopt)
948 volatile uint32_t *confirm;
949 volatile char *submit;
951 uint32_t *buf, size, dma_low, dma_high;
/* 8-byte align the stack command buffer */
954 buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
956 size = sc->sram_size;
957 status = mxge_load_firmware_helper(sc, &size);
961 /* Try to use the currently running firmware, if
963 status = mxge_adopt_running_firmware(sc);
965 device_printf(sc->dev,
966 "failed to adopt running firmware\n");
969 device_printf(sc->dev,
970 "Successfully adopted running firmware\n");
971 if (sc->tx_boundary == 4096) {
972 device_printf(sc->dev,
973 "Using firmware currently running on NIC"
975 device_printf(sc->dev,
976 "performance consider loading optimized "
/* adopted firmware: assume unaligned completions to be safe */
979 sc->fw_name = mxge_fw_unaligned;
980 sc->tx_boundary = 2048;
983 /* clear confirmation addr */
984 confirm = (volatile uint32_t *)sc->cmd;
987 /* send a reload command to the bootstrap MCP, and wait for the
988 response in the confirmation address. The firmware should
989 write a -1 there to indicate it is alive and well
992 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
993 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
995 buf[0] = htobe32(dma_high); /* confirm addr MSW */
996 buf[1] = htobe32(dma_low); /* confirm addr LSW */
997 buf[2] = htobe32(0xffffffff); /* confirm data */
999 /* FIX: All newest firmware should un-protect the bottom of
1000 the sram before handoff. However, the very first interfaces
1001 do not. Therefore the handoff copy must skip the first 8 bytes
1003 /* where the code starts*/
1004 buf[3] = htobe32(MXGE_FW_OFFSET + 8);
1005 buf[4] = htobe32(size - 8); /* length of code */
1006 buf[5] = htobe32(8); /* where to copy to */
1007 buf[6] = htobe32(0); /* where to jump to */
1009 submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
1010 mxge_pio_copy(submit, buf, 64);
/* poll for the firmware's -1 acknowledgement */
1015 while (*confirm != 0xffffffff && i < 20) {
1018 bus_dmamap_sync(sc->cmd_dma.dmat,
1019 sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
1021 if (*confirm != 0xffffffff) {
1022 device_printf(sc->dev,"handoff failed (%p = 0x%x)",
/*
 * Push sc->mac_addr to the firmware: bytes 0-3 packed big-endian into
 * data0, bytes 4-5 into data1, via MXGEFW_SET_MAC_ADDRESS.
 */
1031 mxge_update_mac_address(mxge_softc_t *sc)
1034 uint8_t *addr = sc->mac_addr;
1038 cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1039 | (addr[2] << 8) | addr[3]);
1041 cmd.data1 = ((addr[4] << 8) | (addr[5]));
1043 status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
/*
 * Enable or disable link-level flow control in the firmware,
 * reporting any command failure.
 */
1048 mxge_change_pause(mxge_softc_t *sc, int pause)
1054 status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1057 status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1061 device_printf(sc->dev, "Failed to set flow control mode\n");
/*
 * Enable or disable promiscuous mode in the firmware.  The
 * mxge_always_promisc tunable forces it on regardless of the request.
 */
1069 mxge_change_promisc(mxge_softc_t *sc, int promisc)
1074 if (mxge_always_promisc)
1078 status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1081 status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1085 device_printf(sc->dev, "Failed to set promisc mode\n");
/*
 * Program the firmware's multicast filter from the interface's
 * multicast list.  Sequence: force ALLMULTI on while editing, honor
 * IFF_ALLMULTI and the adopted-firmware rx-filter bug (both leave
 * filtering disabled), flush all existing filters, join each AF_LINK
 * group address, and finally re-enable filtering.  Any firmware error
 * leaves the NIC in ALLMULTI (fail-open).
 */
1090 mxge_set_multicast_list(mxge_softc_t *sc)
1093 struct ifmultiaddr *ifma;
1094 struct ifnet *ifp = sc->ifp;
1097 /* This firmware is known to not support multicast */
1098 if (!sc->fw_multicast_support)
1101 /* Disable multicast filtering while we play with the lists*/
1102 err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1104 device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
1105 " error status: %d\n", err);
/* buggy adopted firmware must stay in ALLMULTI mode */
1109 if (sc->adopted_rx_filter_bug)
1112 if (ifp->if_flags & IFF_ALLMULTI)
1113 /* request to disable multicast filtering, so quit here */
1116 /* Flush all the filters */
1118 err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1120 device_printf(sc->dev,
1121 "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
1122 ", error status: %d\n", err);
1126 /* Walk the multicast list, and add each address */
1128 if_maddr_rlock(ifp);
1129 TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1130 if (ifma->ifma_addr->sa_family != AF_LINK)
/* split the 6-byte MAC across data0 (4 bytes) and data1 (2 bytes) */
1132 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1134 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
1136 cmd.data0 = htonl(cmd.data0);
1137 cmd.data1 = htonl(cmd.data1);
1138 err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1140 device_printf(sc->dev, "Failed "
1141 "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
1143 /* abort, leaving multicast filtering off */
1144 if_maddr_runlock(ifp);
1148 if_maddr_runlock(ifp);
1149 /* Enable multicast filtering */
1150 err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1152 device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
1153 ", error status: %d\n", err);
/*
 * Return the largest MTU this NIC/firmware combination supports.  If
 * page-sized jumbo clusters already cover MXGEFW_MAX_MTU, use that;
 * otherwise ask the firmware whether it can use multiple physically
 * discontiguous big buffers, falling back to MJUMPAGESIZE if not.
 */
1158 mxge_max_mtu(mxge_softc_t *sc)
1163 if (MJUMPAGESIZE - MXGEFW_PAD > MXGEFW_MAX_MTU)
1164 return MXGEFW_MAX_MTU - MXGEFW_PAD;
1166 /* try to set nbufs to see if it we can
1167 use virtually contiguous jumbos */
1169 status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1172 return MXGEFW_MAX_MTU - MXGEFW_PAD;
1174 /* otherwise, we're limited to MJUMPAGESIZE */
1175 return MJUMPAGESIZE - MXGEFW_PAD;
/*
 * Full firmware reset and re-initialization: issue MXGEFW_CMD_RESET,
 * restart the dummy-RDMA keepalive, set the interrupt-queue size,
 * (re)negotiate slices/RSS, optionally re-exchange per-slice interrupt
 * queue DMA addresses, fetch the coalescing/IRQ register offsets, run
 * the DMA benchmark, zero all mcp/driver shared per-slice state, and
 * reapply MAC address, promiscuity, pause, and multicast settings.
 */
1179 mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1181 struct mxge_slice_state *ss;
1182 mxge_rx_done_t *rx_done;
1183 volatile uint32_t *irq_claim;
1187 /* try to send a reset command to the card to see if it
1189 memset(&cmd, 0, sizeof (cmd));
1190 status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1192 device_printf(sc->dev, "failed reset\n");
1196 mxge_dummy_rdma(sc, 1);
1199 /* set the intrq size */
1200 cmd.data0 = sc->rx_ring_size;
1201 status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1204 * Even though we already know how many slices are supported
1205 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
1206 * has magic side effects, and must be called after a reset.
1207 * It must be called prior to calling any RSS related cmds,
1208 * including assigning an interrupt queue for anything but
1209 * slice 0. It must also be called *after*
1210 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1211 * the firmware to compute offsets.
1214 if (sc->num_slices > 1) {
1215 /* ask the maximum number of slices it supports */
1216 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
1219 device_printf(sc->dev,
1220 "failed to get number of slices\n");
1224 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1225 * to setting up the interrupt queue DMA
1227 cmd.data0 = sc->num_slices;
1228 cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
1229 #ifdef IFNET_BUF_RING
1230 cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1232 status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
1235 device_printf(sc->dev,
1236 "failed to set number of slices\n");
1242 if (interrupts_setup) {
1243 /* Now exchange information about interrupts */
1244 for (slice = 0; slice < sc->num_slices; slice++) {
1245 rx_done = &sc->ss[slice].rx_done;
1246 memset(rx_done->entry, 0, sc->rx_ring_size);
1247 cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
1248 cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
1250 status |= mxge_send_cmd(sc,
1251 MXGEFW_CMD_SET_INTRQ_DMA,
/* fetch SRAM offsets for the coalescing, IRQ-ack and IRQ-deassert
 * registers; errors are OR-accumulated and reported once below */
1256 status |= mxge_send_cmd(sc,
1257 MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
1260 sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1262 status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1263 irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1266 status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
1268 sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1270 device_printf(sc->dev, "failed set interrupt parameters\n");
1275 *sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1278 /* run a DMA benchmark */
1279 (void) mxge_dma_test(sc, MXGEFW_DMA_TEST);
1281 for (slice = 0; slice < sc->num_slices; slice++) {
1282 ss = &sc->ss[slice];
/* each slice's IRQ claim register is 2 words apart */
1284 ss->irq_claim = irq_claim + (2 * slice);
1285 /* reset mcp/driver shared state back to 0 */
1286 ss->rx_done.idx = 0;
1287 ss->rx_done.cnt = 0;
1290 ss->tx.pkt_done = 0;
1291 ss->tx.queue_active = 0;
1292 ss->tx.activate = 0;
1293 ss->tx.deactivate = 0;
1298 ss->rx_small.cnt = 0;
1299 ss->lro_bad_csum = 0;
1301 ss->lro_flushed = 0;
1302 if (ss->fw_stats != NULL) {
1303 ss->fw_stats->valid = 0;
1304 ss->fw_stats->send_done_count = 0;
1307 sc->rdma_tags_available = 15;
/* reapply host-side settings lost across the reset */
1308 status = mxge_update_mac_address(sc);
1309 mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
1310 mxge_change_pause(sc, sc->pause);
1311 mxge_set_multicast_list(sc);
/*
 * Sysctl handler for the interrupt coalescing delay: validate the new
 * value (non-zero, <= 1s) and write it (big-endian) to the firmware's
 * coalescing register under driver_lock.
 */
1316 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1319 unsigned int intr_coal_delay;
1323 intr_coal_delay = sc->intr_coal_delay;
1324 err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1328 if (intr_coal_delay == sc->intr_coal_delay)
1331 if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1334 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
1335 *sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1336 sc->intr_coal_delay = intr_coal_delay;
1338 lockmgr(&sc->driver_lock, LK_RELEASE);
/*
 * Sysctl handler for flow control: on change, apply the new pause
 * setting via mxge_change_pause() under driver_lock.
 */
1343 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1346 unsigned int enabled;
1350 enabled = sc->pause;
1351 err = sysctl_handle_int(oidp, &enabled, arg2, req);
1355 if (enabled == sc->pause)
1358 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
1359 err = mxge_change_pause(sc, enabled);
1360 lockmgr(&sc->driver_lock, LK_RELEASE);
/*
 * Apply a new LRO segment count (caller holds driver_lock): toggle
 * IFCAP_LRO accordingly and, if the interface is running, reopen it
 * so the change takes effect.
 */
1365 mxge_change_lro_locked(mxge_softc_t *sc, int lro_cnt)
1372 ifp->if_capenable &= ~IFCAP_LRO;
1374 ifp->if_capenable |= IFCAP_LRO;
1375 sc->lro_cnt = lro_cnt;
1376 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
1378 err = mxge_open(sc);
/*
 * Sysctl handler for the LRO count: on change, delegate to
 * mxge_change_lro_locked() under driver_lock.
 */
1384 mxge_change_lro(SYSCTL_HANDLER_ARGS)
1387 unsigned int lro_cnt;
1391 lro_cnt = sc->lro_cnt;
1392 err = sysctl_handle_int(oidp, &lro_cnt, arg2, req);
1396 if (lro_cnt == sc->lro_cnt)
1402 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
1403 err = mxge_change_lro_locked(sc, lro_cnt);
1404 lockmgr(&sc->driver_lock, LK_RELEASE);
/*
 * Read-only sysctl handler for big-endian firmware counters: convert
 * the value to host order before handing it to sysctl_handle_int().
 */
1409 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1415 arg2 = be32toh(*(int *)arg1);
1417 err = sysctl_handle_int(oidp, arg1, arg2, req);
/*
 * Tear down the per-slice sysctl trees and then the slice container
 * tree itself; a NULL slice_sysctl_tree means nothing was ever added.
 */
1423 mxge_rem_sysctls(mxge_softc_t *sc)
1425 struct mxge_slice_state *ss;
1428 if (sc->slice_sysctl_tree == NULL)
1431 for (slice = 0; slice < sc->num_slices; slice++) {
1432 ss = &sc->ss[slice];
1433 if (ss == NULL || ss->sysctl_tree == NULL)
1435 sysctl_ctx_free(&ss->sysctl_ctx);
1436 ss->sysctl_tree = NULL;
1438 sysctl_ctx_free(&sc->slice_sysctl_ctx);
1439 sc->slice_sysctl_tree = NULL;
/*
 * Register the driver's sysctl tree under hw.<nameunit>:
 *  - static device information (firmware version, serial number, PCIe
 *    link width, DMA benchmark results, write-combining state);
 *  - tunables (interrupt coalescing delay, flow control, LRO count);
 *  - firmware drop counters (exported via mxge_handle_be32 because the
 *    firmware stats block is big-endian);
 *  - per-slice RX/TX debug counters under a "slice" subtree.
 * NOTE(review): fragmentary listing -- several OID name arguments and
 * return statements from the original file are missing between lines.
 */
1443 mxge_add_sysctls(mxge_softc_t *sc)
1445 struct sysctl_ctx_list *ctx;
1446 struct sysctl_oid_list *children;
1448 struct mxge_slice_state *ss;
1452 ctx = &sc->sysctl_ctx;
1453 sysctl_ctx_init(ctx);
1454 sc->sysctl_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_STATIC_CHILDREN(_hw),
1456 device_get_nameunit(sc->dev),
1458 if (sc->sysctl_tree == NULL) {
1459 device_printf(sc->dev, "can't add sysctl node\n");
1463 children = SYSCTL_CHILDREN(sc->sysctl_tree);
/* fw stats live in slice 0's DMA-shared stats block */
1464 fw = sc->ss[0].fw_stats;
1466 /* random information */
1467 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1469 CTLFLAG_RD, &sc->fw_version,
1470 0, "firmware version");
1471 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1473 CTLFLAG_RD, &sc->serial_number_string,
1474 0, "serial number");
1475 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1477 CTLFLAG_RD, &sc->product_code_string,
1479 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1481 CTLFLAG_RD, &sc->link_width,
1483 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1485 CTLFLAG_RD, &sc->tx_boundary,
1487 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1489 CTLFLAG_RD, &sc->wc,
1490 0, "write combining PIO?");
1491 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1493 CTLFLAG_RD, &sc->read_dma,
1494 0, "DMA Read speed in MB/s");
1495 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1497 CTLFLAG_RD, &sc->write_dma,
1498 0, "DMA Write speed in MB/s");
1499 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1500 "read_write_dma_MBs",
1501 CTLFLAG_RD, &sc->read_write_dma,
1502 0, "DMA concurrent Read/Write speed in MB/s");
1505 /* performance related tunables */
1506 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1508 CTLTYPE_INT|CTLFLAG_RW, sc,
1509 0, mxge_change_intr_coal,
1510 "I", "interrupt coalescing delay in usecs");
1512 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1513 "flow_control_enabled",
1514 CTLTYPE_INT|CTLFLAG_RW, sc,
1515 0, mxge_change_flow_control,
/* NOTE(review): description string looks copy-pasted from the
 * intr_coal OID above; should probably read "flow control". */
1516 "I", "interrupt coalescing delay in usecs");
1518 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1520 CTLFLAG_RW, &mxge_deassert_wait,
1521 0, "Wait for IRQ line to go low in ihandler");
1523 /* stats block from firmware is in network byte order.
1525 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1527 CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1528 0, mxge_handle_be32,
1530 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1531 "rdma_tags_available",
1532 CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1533 0, mxge_handle_be32,
1534 "I", "rdma_tags_available");
1535 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1536 "dropped_bad_crc32",
1537 CTLTYPE_INT|CTLFLAG_RD,
1538 &fw->dropped_bad_crc32,
1539 0, mxge_handle_be32,
1540 "I", "dropped_bad_crc32");
1541 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1543 CTLTYPE_INT|CTLFLAG_RD,
1544 &fw->dropped_bad_phy,
1545 0, mxge_handle_be32,
1546 "I", "dropped_bad_phy");
1547 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1548 "dropped_link_error_or_filtered",
1549 CTLTYPE_INT|CTLFLAG_RD,
1550 &fw->dropped_link_error_or_filtered,
1551 0, mxge_handle_be32,
1552 "I", "dropped_link_error_or_filtered");
1553 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1554 "dropped_link_overflow",
1555 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1556 0, mxge_handle_be32,
1557 "I", "dropped_link_overflow");
1558 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1559 "dropped_multicast_filtered",
1560 CTLTYPE_INT|CTLFLAG_RD,
1561 &fw->dropped_multicast_filtered,
1562 0, mxge_handle_be32,
1563 "I", "dropped_multicast_filtered");
1564 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1565 "dropped_no_big_buffer",
1566 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1567 0, mxge_handle_be32,
1568 "I", "dropped_no_big_buffer");
1569 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1570 "dropped_no_small_buffer",
1571 CTLTYPE_INT|CTLFLAG_RD,
1572 &fw->dropped_no_small_buffer,
1573 0, mxge_handle_be32,
1574 "I", "dropped_no_small_buffer");
1575 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1577 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1578 0, mxge_handle_be32,
1579 "I", "dropped_overrun");
1580 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1582 CTLTYPE_INT|CTLFLAG_RD,
1584 0, mxge_handle_be32,
1585 "I", "dropped_pause");
1586 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1588 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1589 0, mxge_handle_be32,
1590 "I", "dropped_runt");
1592 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1593 "dropped_unicast_filtered",
1594 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
1595 0, mxge_handle_be32,
1596 "I", "dropped_unicast_filtered");
1598 /* verbose printing? */
1599 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1601 CTLFLAG_RW, &mxge_verbose,
1602 0, "verbose printing");
1605 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1607 CTLTYPE_INT|CTLFLAG_RW, sc,
1609 "I", "number of lro merge queues");
1612 /* add counters exported for debugging from all slices */
1613 sysctl_ctx_init(&sc->slice_sysctl_ctx);
1614 sc->slice_sysctl_tree =
1615 SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1616 "slice", CTLFLAG_RD, 0, "");
1618 for (slice = 0; slice < sc->num_slices; slice++) {
1619 ss = &sc->ss[slice];
1620 sysctl_ctx_init(&ss->sysctl_ctx);
1621 ctx = &ss->sysctl_ctx;
1622 children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
/* each slice gets a numbered child node: hw.<dev>.slice.<N> */
1623 ksprintf(slice_num, "%d", slice);
1625 SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1627 children = SYSCTL_CHILDREN(ss->sysctl_tree);
1628 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1630 CTLFLAG_RD, &ss->rx_small.cnt,
1632 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1634 CTLFLAG_RD, &ss->rx_big.cnt,
1636 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1637 "lro_flushed", CTLFLAG_RD, &ss->lro_flushed,
1638 0, "number of lro merge queues flushed");
1640 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1641 "lro_queued", CTLFLAG_RD, &ss->lro_queued,
1642 0, "number of frames appended to lro merge"
1645 #ifndef IFNET_BUF_RING
1646 /* only transmit from slice 0 for now */
1650 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1652 CTLFLAG_RD, &ss->tx.req,
1655 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1657 CTLFLAG_RD, &ss->tx.done,
1659 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1661 CTLFLAG_RD, &ss->tx.pkt_done,
1663 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1665 CTLFLAG_RD, &ss->tx.stall,
1667 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1669 CTLFLAG_RD, &ss->tx.wake,
1671 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1673 CTLFLAG_RD, &ss->tx.defrag,
1675 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1677 CTLFLAG_RD, &ss->tx.queue_active,
1678 0, "tx_queue_active");
1679 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1681 CTLFLAG_RD, &ss->tx.activate,
1683 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1685 CTLFLAG_RD, &ss->tx.deactivate,
1686 0, "tx_deactivate");
1690 /* copy an array of mcp_kreq_ether_send_t's to the mcp. Copy
1691 backwards one at a time and handle ring wraps */
/*
 * Fallback submit path used by mxge_submit_req() when the request list
 * would wrap the send ring: requests are PIO-copied one at a time, last
 * to first, so the first (validating) slot is written last.
 */
1694 mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1695 mcp_kreq_ether_send_t *src, int cnt)
1697 int idx, starting_slot;
1698 starting_slot = tx->req;
/* ring slot for request #cnt, with wraparound via the ring mask */
1701 idx = (starting_slot + cnt) & tx->mask;
1702 mxge_pio_copy(&tx->lanai[idx],
1703 &src[cnt], sizeof(*src));
1709 * copy an array of mcp_kreq_ether_send_t's to the mcp. Copy
1710 * at most 32 bytes at a time, so as to avoid involving the software
1711 * pio handler in the nic. We re-write the first segment's flags
1712 * to mark them valid only after writing the entire chain
1716 mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
1721 volatile uint32_t *dst_ints;
1722 mcp_kreq_ether_send_t *srcp;
1723 volatile mcp_kreq_ether_send_t *dstp, *dst;
1726 idx = tx->req & tx->mask;
/* remember the real flags; the copy below goes out without them so the
 * NIC does not see a valid first descriptor until the whole chain is in */
1728 last_flags = src->flags;
1731 dst = dstp = &tx->lanai[idx];
/* fast path: chain fits without wrapping the ring */
1734 if ((idx + cnt) < tx->mask) {
1735 for (i = 0; i < (cnt - 1); i += 2) {
1736 mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1737 wmb(); /* force write every 32 bytes */
1742 /* submit all but the first request, and ensure
1743 that it is submitted below */
1744 mxge_submit_req_backwards(tx, src, cnt);
1748 /* submit the first request */
1749 mxge_pio_copy(dstp, srcp, sizeof(*src));
1750 wmb(); /* barrier before setting valid flag */
1753 /* re-write the last 32-bits with the valid flags */
1754 src->flags = last_flags;
1755 src_ints = (uint32_t *)src;
1757 dst_ints = (volatile uint32_t *)dst;
1759 *dst_ints = *src_ints;
/*
 * Build the firmware send-request list for a TSO packet and hand it to
 * the NIC.  Walks the busdma segment list, cutting requests at every
 * mss boundary (MXGEFW_FLAGS_TSO_CHOP) and back-filling rdma_count
 * retroactively, as described in the long comment below.
 * NOTE(review): fragmentary listing -- loop bodies and the error-unwind
 * labels from the original file are only partially visible.
 */
1767 mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1768 int busdma_seg_cnt, int ip_off)
1771 mcp_kreq_ether_send_t *req;
1772 bus_dma_segment_t *seg;
1775 uint32_t low, high_swapped;
1776 int len, seglen, cum_len, cum_len_next;
1777 int next_is_first, chop, cnt, rdma_count, small;
1778 uint16_t pseudo_hdr_offset, cksum_offset, mss;
1779 uint8_t flags, flags_next;
1782 mss = m->m_pkthdr.tso_segsz;
1784 /* negative cum_len signifies to the
1785 * send loop that we are still in the
1786 * header portion of the TSO packet.
1789 /* ensure we have the ethernet, IP and TCP
1790 header together in the first mbuf, copy
1791 it to a scratch buffer if not */
1792 if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
1793 m_copydata(m, 0, ip_off + sizeof (*ip),
1795 ip = (struct ip *)(ss->scratch + ip_off);
1797 ip = (struct ip *)(mtod(m, char *) + ip_off);
1799 if (__predict_false(m->m_len < ip_off + (ip->ip_hl << 2)
1801 m_copydata(m, 0, ip_off + (ip->ip_hl << 2)
1802 + sizeof (*tcp), ss->scratch);
1803 ip = (struct ip *)(mtod(m, char *) + ip_off);
1806 tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
/* cum_len starts at -(header length); it turns >= 0 once we enter payload */
1807 cum_len = -(ip_off + ((ip->ip_hl + tcp->th_off) << 2));
1809 /* TSO implies checksum offload on this hardware */
1810 cksum_offset = ip_off + (ip->ip_hl << 2);
1811 flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1814 /* for TSO, pseudo_hdr_offset holds mss.
1815 * The firmware figures out where to put
1816 * the checksum by parsing the header. */
1817 pseudo_hdr_offset = htobe16(mss);
1824 /* "rdma_count" is the number of RDMAs belonging to the
1825 * current packet BEFORE the current send request. For
1826 * non-TSO packets, this is equal to "count".
1827 * For TSO packets, rdma_count needs to be reset
1828 * to 0 after a segment cut.
1830 * The rdma_count field of the send request is
1831 * the number of RDMAs of the packet starting at
1832 * that request. For TSO send requests with one ore more cuts
1833 * in the middle, this is the number of RDMAs starting
1834 * after the last cut in the request. All previous
1835 * segments before the last cut implicitly have 1 RDMA.
1837 * Since the number of RDMAs is not known beforehand,
1838 * it must be filled-in retroactively - after each
1839 * segmentation cut or at the end of the entire packet.
1842 while (busdma_seg_cnt) {
1843 /* Break the busdma segment up into pieces*/
1844 low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1845 high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1849 flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1851 cum_len_next = cum_len + seglen;
/* retroactively fill in the rdma_count of the request that started
 * the current run of RDMAs */
1852 (req-rdma_count)->rdma_count = rdma_count + 1;
1853 if (__predict_true(cum_len >= 0)) {
/* payload: cut a new TSO segment whenever we cross an mss boundary */
1855 chop = (cum_len_next > mss);
1856 cum_len_next = cum_len_next % mss;
1857 next_is_first = (cum_len_next == 0);
1858 flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1859 flags_next |= next_is_first *
/* branch-free reset: rdma_count becomes -1 on chop/first so the
 * increment below restarts the count for the next run */
1861 rdma_count |= -(chop | next_is_first);
1862 rdma_count += chop & !next_is_first;
1863 } else if (cum_len_next >= 0) {
/* header/payload boundary falls inside this segment */
1868 small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1869 flags_next = MXGEFW_FLAGS_TSO_PLD |
1870 MXGEFW_FLAGS_FIRST |
1871 (small * MXGEFW_FLAGS_SMALL);
1874 req->addr_high = high_swapped;
1875 req->addr_low = htobe32(low);
1876 req->pseudo_hdr_offset = pseudo_hdr_offset;
1878 req->rdma_count = 1;
1879 req->length = htobe16(seglen);
1880 req->cksum_offset = cksum_offset;
1881 req->flags = flags | ((cum_len & 1) *
1882 MXGEFW_FLAGS_ALIGN_ODD);
1885 cum_len = cum_len_next;
1890 if (__predict_false(cksum_offset > seglen))
1891 cksum_offset -= seglen;
/* descriptor overflow -- bail to the drop path below */
1894 if (__predict_false(cnt > tx->max_desc))
1900 (req-rdma_count)->rdma_count = rdma_count;
/* strip trailing empty CHOP requests and mark the true last one */
1904 req->flags |= MXGEFW_FLAGS_TSO_LAST;
1905 } while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1907 tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1908 mxge_submit_req(tx, tx->req_list, cnt);
1909 #ifdef IFNET_BUF_RING
1910 if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1911 /* tell the NIC to start polling this slice */
1913 tx->queue_active = 1;
/* drop path: unmap and report the oversized TSO chain */
1921 bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1925 printf("tx->max_desc exceeded via TSO!\n");
1926 printf("mss = %d, %ld, %d!\n", mss,
1927 (long)seg - (long)tx->seg_list, tx->max_desc);
1934 #endif /* IFCAP_TSO4 */
1936 #ifdef MXGE_NEW_VLAN_API
1938 * We reproduce the software vlan tag insertion from
1939 * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
1940 * vlan tag insertion. We need to advertise this in order to have the
1941 * vlan interface respect our csum offload flags.
/*
 * Prepend an 802.1Q header carrying m->m_pkthdr.ether_vtag to the frame.
 * Returns the (possibly reallocated) mbuf, or NULL if allocation failed
 * (M_PREPEND/m_pullup freed the chain in that case).
 */
1943 static struct mbuf *
1944 mxge_vlan_tag_insert(struct mbuf *m)
1946 struct ether_vlan_header *evl;
1948 M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_DONTWAIT);
1949 if (__predict_false(m == NULL))
/* make the full vlan header contiguous before touching it */
1951 if (m->m_len < sizeof(*evl)) {
1952 m = m_pullup(m, sizeof(*evl));
1953 if (__predict_false(m == NULL))
1957 * Transform the Ethernet header into an Ethernet header
1958 * with 802.1Q encapsulation.
1960 evl = mtod(m, struct ether_vlan_header *);
1961 bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
1962 (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
1963 evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
1964 evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
/* tag is now in the packet itself; clear the out-of-band flag */
1965 m->m_flags &= ~M_VLANTAG;
1968 #endif /* MXGE_NEW_VLAN_API */
/*
 * Map one outgoing mbuf chain for DMA and build its firmware send-request
 * list (non-TSO path; TSO is delegated to mxge_encap_tso).  Handles
 * software vlan tag insertion, checksum-offload setup, runt padding to
 * 60 bytes via the shared zeropad buffer, and final submission.
 * NOTE(review): fragmentary listing -- error/drop paths and some
 * assignments (e.g. req/seg initialization) are not visible here.
 */
1971 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
1974 mcp_kreq_ether_send_t *req;
1975 bus_dma_segment_t *seg;
1980 int cnt, cum_len, err, i, idx, odd_flag, ip_off;
1981 uint16_t pseudo_hdr_offset;
1982 uint8_t flags, cksum_offset;
1989 ip_off = sizeof (struct ether_header);
1990 #ifdef MXGE_NEW_VLAN_API
1991 if (m->m_flags & M_VLANTAG) {
1992 m = mxge_vlan_tag_insert(m);
1993 if (__predict_false(m == NULL))
/* vlan header shifts the IP header by 4 bytes */
1995 ip_off += ETHER_VLAN_ENCAP_LEN;
1998 /* (try to) map the frame for DMA */
1999 idx = tx->req & tx->mask;
2000 err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
2001 m, tx->seg_list, &cnt,
2003 if (__predict_false(err == EFBIG)) {
2004 /* Too many segments in the chain. Try
2006 m_tmp = m_defrag(m, M_NOWAIT);
2007 if (m_tmp == NULL) {
/* retry the DMA load on the defragmented chain */
2012 err = bus_dmamap_load_mbuf_sg(tx->dmat,
2014 m, tx->seg_list, &cnt,
2017 if (__predict_false(err != 0)) {
2018 device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
2019 " packet len = %d\n", err, m->m_pkthdr.len);
2022 bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2023 BUS_DMASYNC_PREWRITE);
2024 tx->info[idx].m = m;
2027 /* TSO is different enough, we handle it in another routine */
2028 if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2029 mxge_encap_tso(ss, m, cnt, ip_off);
2036 pseudo_hdr_offset = 0;
2037 flags = MXGEFW_FLAGS_NO_TSO;
2039 /* checksum offloading? */
2040 if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
2041 /* ensure ip header is in first mbuf, copy
2042 it to a scratch buffer if not */
2043 if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
2044 m_copydata(m, 0, ip_off + sizeof (*ip),
2046 ip = (struct ip *)(ss->scratch + ip_off);
2048 ip = (struct ip *)(mtod(m, char *) + ip_off);
2050 cksum_offset = ip_off + (ip->ip_hl << 2);
2051 pseudo_hdr_offset = cksum_offset + m->m_pkthdr.csum_data;
2052 pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2053 req->cksum_offset = cksum_offset;
2054 flags |= MXGEFW_FLAGS_CKSUM;
2055 odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2059 if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2060 flags |= MXGEFW_FLAGS_SMALL;
2062 /* convert segments into a request list */
2065 req->flags = MXGEFW_FLAGS_FIRST;
2066 for (i = 0; i < cnt; i++) {
2068 htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2070 htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2071 req->length = htobe16(seg->ds_len);
2072 req->cksum_offset = cksum_offset;
/* cksum_offset only applies to the request containing the header */
2073 if (cksum_offset > seg->ds_len)
2074 cksum_offset -= seg->ds_len;
2077 req->pseudo_hdr_offset = pseudo_hdr_offset;
2078 req->pad = 0; /* complete solid 16-byte block */
2079 req->rdma_count = 1;
2080 req->flags |= flags | ((cum_len & 1) * odd_flag);
2081 cum_len += seg->ds_len;
2087 /* pad runts to 60 bytes */
2091 htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2093 htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2094 req->length = htobe16(60 - cum_len);
2095 req->cksum_offset = 0;
2096 req->pseudo_hdr_offset = pseudo_hdr_offset;
2097 req->pad = 0; /* complete solid 16-byte block */
2098 req->rdma_count = 1;
2099 req->flags |= flags | ((cum_len & 1) * odd_flag);
2103 tx->req_list[0].rdma_count = cnt;
2105 /* print what the firmware will see */
2106 for (i = 0; i < cnt; i++) {
2107 printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
2108 "cso:%d, flags:0x%x, rdma:%d\n",
2109 i, (int)ntohl(tx->req_list[i].addr_high),
2110 (int)ntohl(tx->req_list[i].addr_low),
2111 (int)ntohs(tx->req_list[i].length),
2112 (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2113 tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2114 tx->req_list[i].rdma_count);
2116 printf("--------------\n");
2118 tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2119 mxge_submit_req(tx, tx->req_list, cnt);
2120 #ifdef IFNET_BUF_RING
2121 if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2122 /* tell the NIC to start polling this slice */
2124 tx->queue_active = 1;
2137 #ifdef IFNET_BUF_RING
/*
 * if_qflush method (multi-queue build): drain and free every mbuf queued
 * on each slice's buf_ring, taking the per-ring tx lock around each drain.
 */
2139 mxge_qflush(struct ifnet *ifp)
2141 mxge_softc_t *sc = ifp->if_softc;
2146 for (slice = 0; slice < sc->num_slices; slice++) {
2147 tx = &sc->ss[slice].tx;
2148 lockmgr(&tx->lock, LK_EXCLUSIVE);
2149 while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2151 lockmgr(&tx->lock, LK_RELEASE);
/*
 * Per-slice transmit pump (buf_ring build).  Dequeues frames from the
 * slice's drbr queue while descriptor space remains; sets the per-slice
 * OACTIVE flag when the ring fills with work still pending.
 * Caller holds the slice tx lock.
 */
2157 mxge_start_locked(struct mxge_slice_state *ss)
/* keep going while at least max_desc slots are free in the send ring */
2168 while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2169 m = drbr_dequeue(ifp, tx->br);
2173 /* let BPF see it */
2176 /* give it to the nic */
2179 /* ran out of transmit slots */
2180 if (((ss->if_drv_flags & IFF_DRV_OACTIVE) == 0)
2181 && (!drbr_empty(ifp, tx->br))) {
2182 ss->if_drv_flags |= IFF_DRV_OACTIVE;
/*
 * Transmit one frame on a slice with the tx lock held: fast-path the
 * frame straight to the NIC when the queue is empty and there is ring
 * space; otherwise enqueue it on the slice's buf_ring and pump the queue.
 */
2188 mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
/* interface not up (or flow-blocked): just enqueue */
2199 if ((ss->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
2201 err = drbr_enqueue(ifp, tx->br, m);
2205 if (drbr_empty(ifp, tx->br) &&
2206 ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2207 /* let BPF see it */
2209 /* give it to the nic */
2211 } else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2214 if (!drbr_empty(ifp, tx->br))
2215 mxge_start_locked(ss);
/*
 * if_transmit method: pick a tx slice from the mbuf's flowid (num_slices
 * is a power of 2, so masking suffices), then either transmit directly if
 * the slice lock is uncontended or enqueue for later service.
 */
2220 mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2222 mxge_softc_t *sc = ifp->if_softc;
2223 struct mxge_slice_state *ss;
2228 slice = m->m_pkthdr.flowid;
2229 slice &= (sc->num_slices - 1); /* num_slices always power of 2 */
2231 ss = &sc->ss[slice];
/* trylock: don't block the caller if another CPU owns this slice */
2234 if (lockmgr(&tx->lock, LK_EXCLUSIVE|LK_NOWAIT)) {
2235 err = mxge_transmit_locked(ss, m);
2236 lockmgr(&tx->lock, LK_RELEASE);
2238 err = drbr_enqueue(ifp, tx->br, m);
/*
 * Transmit pump (legacy single-queue build, no IFNET_BUF_RING): dequeues
 * from the shared ifnet send queue while ring space remains; raises the
 * interface-wide OACTIVE flag when the ring is full.
 * Caller holds the tx lock.
 */
2247 mxge_start_locked(struct mxge_slice_state *ss)
2257 while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2258 IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
2262 /* let BPF see it */
2265 /* give it to the nic */
2268 /* ran out of transmit slots */
2269 if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
2270 sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
/*
 * if_start method (legacy build): runs the transmit pump on slice 0
 * under its tx lock.
 */
2276 mxge_start(struct ifnet *ifp)
2278 mxge_softc_t *sc = ifp->if_softc;
2279 struct mxge_slice_state *ss;
2281 /* only use the first slice for now */
2283 lockmgr(&ss->tx.lock, LK_EXCLUSIVE);
2284 mxge_start_locked(ss);
2285 lockmgr(&ss->tx.lock, LK_RELEASE);
2289 * copy an array of mcp_kreq_ether_recv_t's to the mcp. Copy
2290 * at most 32 bytes at a time, so as to avoid involving the software
2291 * pio handler in the nic. We re-write the first segment's low
2292 * DMA address to mark it valid only after we write the entire chunk
2296 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2297 mcp_kreq_ether_recv_t *src)
/* poison the first addr_low so the NIC ignores the batch until the
 * real address is restored at the end */
2301 low = src->addr_low;
2302 src->addr_low = 0xffffffff;
2303 mxge_pio_copy(dst, src, 4 * sizeof (*src));
2305 mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2307 src->addr_low = low;
/* writing the valid low address last makes all 8 descriptors live */
2308 dst->addr_low = low;
/*
 * Allocate and DMA-map a small (MHLEN) receive mbuf for ring slot idx,
 * record it in the shadow ring, and push descriptors to the NIC in
 * batches of 8 via mxge_submit_8rx.
 */
2313 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2315 bus_dma_segment_t seg;
2317 mxge_rx_ring_t *rx = &ss->rx_small;
2320 m = m_gethdr(M_DONTWAIT, MT_DATA);
2327 err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2328 &seg, &cnt, BUS_DMA_NOWAIT);
2333 rx->info[idx].m = m;
2334 rx->shadow[idx].addr_low =
2335 htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2336 rx->shadow[idx].addr_high =
2337 htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
/* every 8th slot, hand the whole batch of 8 descriptors to the NIC */
2341 mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
/*
 * Allocate and DMA-map a big receive buffer (cluster or jumbo cluster,
 * chosen by rx->cl_size) for ring slot idx.  With MXGE_VIRT_JUMBOS a
 * jumbo frame may span several descriptors (rx->nbufs); all of them are
 * recorded in the shadow ring and pushed to the NIC 8 at a time.
 */
2346 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2348 bus_dma_segment_t seg[3];
2350 mxge_rx_ring_t *rx = &ss->rx_big;
2353 if (rx->cl_size == MCLBYTES)
2354 m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
2356 m = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2362 m->m_len = rx->mlen;
2363 err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2364 seg, &cnt, BUS_DMA_NOWAIT);
2369 rx->info[idx].m = m;
2370 rx->shadow[idx].addr_low =
2371 htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2372 rx->shadow[idx].addr_high =
2373 htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2375 #if MXGE_VIRT_JUMBOS
/* additional segments of a virtually-contiguous jumbo buffer */
2376 for (i = 1; i < cnt; i++) {
2377 rx->shadow[idx + i].addr_low =
2378 htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2379 rx->shadow[idx + i].addr_high =
2380 htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2385 for (i = 0; i < rx->nbufs; i++) {
2386 if ((idx & 7) == 7) {
2387 mxge_submit_8rx(&rx->lanai[idx - 7],
2388 &rx->shadow[idx - 7]);
2396 * Myri10GE hardware checksums are not valid if the sender
2397 * padded the frame with non-zero padding. This is because
2398 * the firmware just does a simple 16-bit 1s complement
2399 * checksum across the entire frame, excluding the first 14
2400 * bytes. It is best to simply to check the checksum and
2401 * tell the stack about it only if the checksum is good
/*
 * Validate the firmware's partial checksum for an IPv4 TCP/UDP frame.
 * Returns 0 when the checksum verifies (based on the visible pseudo-
 * header fold); non-IPv4 or non-TCP/UDP frames are rejected early.
 */
2404 static inline uint16_t
2405 mxge_rx_csum(struct mbuf *m, int csum)
2407 struct ether_header *eh;
2411 eh = mtod(m, struct ether_header *);
2413 /* only deal with IPv4 TCP & UDP for now */
2414 if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2416 ip = (struct ip *)(eh + 1);
2417 if (__predict_false(ip->ip_p != IPPROTO_TCP &&
2418 ip->ip_p != IPPROTO_UDP))
/* fold the pseudo-header into the firmware's partial checksum */
2421 c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2422 htonl(ntohs(csum) + ntohs(ip->ip_len) +
2423 - (ip->ip_hl << 2) + ip->ip_p));
/*
 * Strip an 802.1Q header from a received frame: move the tag into mbuf
 * metadata (M_VLANTAG + ether_vtag, or an m_tag on old kernels), adjust
 * the firmware's partial checksum to account for the 4 removed bytes,
 * and slide the Ethernet addresses over the encapsulation header.
 */
2432 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2434 struct ether_vlan_header *evl;
2435 struct ether_header *eh;
2438 evl = mtod(m, struct ether_vlan_header *);
2439 eh = mtod(m, struct ether_header *);
2442 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
2443 * after what the firmware thought was the end of the ethernet
2447 /* put checksum into host byte order */
2448 *csum = ntohs(*csum);
2449 partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
/* ones-complement subtraction of the vlan header bytes, then fold
 * the carries back down to 16 bits (twice, to catch a second carry) */
2450 (*csum) += ~partial;
2451 (*csum) += ((*csum) < ~partial);
2452 (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2453 (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2455 /* restore checksum to network byte order;
2456 later consumers expect this */
2457 *csum = htons(*csum);
2460 #ifdef MXGE_NEW_VLAN_API
2461 m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
/* legacy API: carry the tag as an mbuf tag instead */
2465 mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2469 VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2470 m_tag_prepend(m, mtag);
2474 m->m_flags |= M_VLANTAG;
2477 * Remove the 802.1q header by copying the Ethernet
2478 * addresses over it and adjusting the beginning of
2479 * the data in the mbuf. The encapsulated Ethernet
2480 * type field is already in place.
2482 bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2483 ETHER_HDR_LEN - ETHER_TYPE_LEN);
2484 m_adj(m, ETHER_VLAN_ENCAP_LEN);
/*
 * Deliver one received frame from the big-buffer ring: refill the slot
 * (dropping the frame if allocation fails), recycle the DMA map, strip
 * any vlan header, verify the hardware checksum, attempt LRO, and hand
 * the frame to the stack.
 */
2489 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2494 struct ether_header *eh;
2496 bus_dmamap_t old_map;
2498 uint16_t tcpudp_csum;
2503 idx = rx->cnt & rx->mask;
/* a jumbo frame may have consumed several buffers */
2504 rx->cnt += rx->nbufs;
2505 /* save a pointer to the received mbuf */
2506 m = rx->info[idx].m;
2507 /* try to replace the received mbuf */
2508 if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2509 /* drop the frame -- the old mbuf is re-cycled */
2514 /* unmap the received buffer */
2515 old_map = rx->info[idx].map;
2516 bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2517 bus_dmamap_unload(rx->dmat, old_map);
2519 /* swap the bus_dmamap_t's */
2520 rx->info[idx].map = rx->extra_map;
2521 rx->extra_map = old_map;
2523 /* mcp implicitly skips 1st 2 bytes so that packet is properly
2525 m->m_data += MXGEFW_PAD;
2527 m->m_pkthdr.rcvif = ifp;
2528 m->m_len = m->m_pkthdr.len = len;
2530 eh = mtod(m, struct ether_header *);
2531 if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2532 mxge_vlan_tag_remove(m, &csum);
2534 /* if the checksum is valid, mark it in the mbuf header */
2535 if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
/* good checksum: try LRO first; on success the frame is consumed */
2536 if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2538 /* otherwise, it was a UDP frame, or a TCP frame which
2539 we could not do LRO on. Tell the stack that the
2541 m->m_pkthdr.csum_data = 0xffff;
2542 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2544 /* flowid only valid if RSS hashing is enabled */
2545 if (sc->num_slices > 1) {
2546 m->m_pkthdr.flowid = (ss - sc->ss);
2547 m->m_flags |= M_FLOWID;
2549 /* pass the frame up the stack */
2550 (*ifp->if_input)(ifp, m);
/*
 * Small-buffer twin of mxge_rx_done_big(): identical flow (refill slot,
 * recycle DMA map, strip vlan, validate checksum, try LRO, pass up) but
 * operating on the small receive ring, which advances one slot at a time.
 */
2554 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2558 struct ether_header *eh;
2561 bus_dmamap_t old_map;
2563 uint16_t tcpudp_csum;
2568 idx = rx->cnt & rx->mask;
2570 /* save a pointer to the received mbuf */
2571 m = rx->info[idx].m;
2572 /* try to replace the received mbuf */
2573 if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2574 /* drop the frame -- the old mbuf is re-cycled */
2579 /* unmap the received buffer */
2580 old_map = rx->info[idx].map;
2581 bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2582 bus_dmamap_unload(rx->dmat, old_map);
2584 /* swap the bus_dmamap_t's */
2585 rx->info[idx].map = rx->extra_map;
2586 rx->extra_map = old_map;
2588 /* mcp implicitly skips 1st 2 bytes so that packet is properly
2590 m->m_data += MXGEFW_PAD;
2592 m->m_pkthdr.rcvif = ifp;
2593 m->m_len = m->m_pkthdr.len = len;
2595 eh = mtod(m, struct ether_header *);
2596 if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2597 mxge_vlan_tag_remove(m, &csum);
2599 /* if the checksum is valid, mark it in the mbuf header */
2600 if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2601 if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2603 /* otherwise, it was a UDP frame, or a TCP frame which
2604 we could not do LRO on. Tell the stack that the
2606 m->m_pkthdr.csum_data = 0xffff;
2607 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2609 /* flowid only valid if RSS hashing is enabled */
2610 if (sc->num_slices > 1) {
2611 m->m_pkthdr.flowid = (ss - sc->ss);
2612 m->m_flags |= M_FLOWID;
2614 /* pass the frame up the stack */
2615 (*ifp->if_input)(ifp, m);
/*
 * Drain the slice's receive-completion ring: dispatch each completed
 * frame to the small or big handler by length, bounded by a livelock
 * limit, then flush any active LRO sessions.
 */
2619 mxge_clean_rx_done(struct mxge_slice_state *ss)
2621 mxge_rx_done_t *rx_done = &ss->rx_done;
/* a non-zero length marks a valid completion entry */
2627 while (rx_done->entry[rx_done->idx].length != 0) {
2628 length = ntohs(rx_done->entry[rx_done->idx].length);
/* clear it so the slot reads as empty next pass */
2629 rx_done->entry[rx_done->idx].length = 0;
2630 checksum = rx_done->entry[rx_done->idx].checksum;
2631 if (length <= (MHLEN - MXGEFW_PAD))
2632 mxge_rx_done_small(ss, length, checksum);
2634 mxge_rx_done_big(ss, length, checksum);
2636 rx_done->idx = rx_done->cnt & rx_done->mask;
2638 /* limit potential for livelock */
2639 if (__predict_false(++limit > rx_done->mask / 2))
2643 while (!SLIST_EMPTY(&ss->lro_active)) {
2644 struct lro_entry *lro = SLIST_FIRST(&ss->lro_active);
2645 SLIST_REMOVE_HEAD(&ss->lro_active, next);
2646 mxge_lro_flush(ss, lro);
/*
 * Reclaim completed transmit descriptors up to the firmware's reported
 * packet-done index: free mbufs, unload DMA maps, update byte/mcast
 * counters, then clear OACTIVE (and restart transmission) once at least
 * a quarter of the ring is free.  In the multi-queue build, also tells
 * the NIC to stop polling an idle slice.
 */
2653 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2664 while (tx->pkt_done != mcp_idx) {
2665 idx = tx->done & tx->mask;
2667 m = tx->info[idx].m;
2668 /* mbuf and DMA map only attached to the first
2671 ss->obytes += m->m_pkthdr.len;
2672 if (m->m_flags & M_MCAST)
2675 tx->info[idx].m = NULL;
2676 map = tx->info[idx].map;
2677 bus_dmamap_unload(tx->dmat, map);
/* flag marks the last descriptor of a packet */
2680 if (tx->info[idx].flag) {
2681 tx->info[idx].flag = 0;
2686 /* If we have space, clear IFF_OACTIVE to tell the stack that
2687 its OK to send packets */
2688 #ifdef IFNET_BUF_RING
2689 flags = &ss->if_drv_flags;
2691 flags = &ifp->if_drv_flags;
2693 lockmgr(&ss->tx.lock, LK_EXCLUSIVE);
2694 if ((*flags) & IFF_DRV_OACTIVE &&
2695 tx->req - tx->done < (tx->mask + 1)/4) {
2696 *(flags) &= ~IFF_DRV_OACTIVE;
2698 mxge_start_locked(ss);
2700 #ifdef IFNET_BUF_RING
2701 if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2702 /* let the NIC stop polling this queue, since there
2703 * are no more transmits pending */
2704 if (tx->req == tx->done) {
2706 tx->queue_active = 0;
2712 lockmgr(&ss->tx.lock, LK_RELEASE);
/*
 * Media lookup tables used by mxge_media_probe(): each entry maps one bit
 * of the transceiver's I2C compliance byte to an ifmedia type.  Entries
 * with flag 0 are media FreeBSD/DragonFly has no ifmedia constant for.
 */
2716 static struct mxge_media_type mxge_xfp_media_types[] =
2718 {IFM_10G_CX4, 0x7f, "10GBASE-CX4 (module)"},
2719 {IFM_10G_SR, (1 << 7), "10GBASE-SR"},
2720 {IFM_10G_LR, (1 << 6), "10GBASE-LR"},
2721 {0, (1 << 5), "10GBASE-ER"},
2722 {IFM_10G_LRM, (1 << 4), "10GBASE-LRM"},
2723 {0, (1 << 3), "10GBASE-SW"},
2724 {0, (1 << 2), "10GBASE-LW"},
2725 {0, (1 << 1), "10GBASE-EW"},
2726 {0, (1 << 0), "Reserved"}
/* SFP+ cages use a different compliance-bit layout */
2728 static struct mxge_media_type mxge_sfp_media_types[] =
2730 {0, (1 << 7), "Reserved"},
2731 {IFM_10G_LRM, (1 << 6), "10GBASE-LRM"},
2732 {IFM_10G_LR, (1 << 5), "10GBASE-LR"},
2733 {IFM_10G_SR, (1 << 4), "10GBASE-SR"}
/*
 * Record a detected media type: OR it into the cached media flags and
 * register/select it with the ifmedia layer.
 */
2737 mxge_set_media(mxge_softc_t *sc, int type)
2739 sc->media_flags |= type;
2740 ifmedia_add(&sc->media, sc->media_flags, 0, NULL);
2741 ifmedia_set(&sc->media, sc->media_flags);
2746 * Determine the media type for a NIC. Some XFPs will identify
2747 * themselves only when their link is up, so this is initiated via a
2748 * link up interrupt. However, this can potentially take up to
2749 * several milliseconds, so it is run via the watchdog routine, rather
2750 * than in the interrupt handler itself. This need only be done
2751 * once, not each time the link is up.
/*
 * Parse the EEPROM product code to classify the cage (CX4 / XFP / SFP+ /
 * Quad Ribbon Fiber), then, for pluggable cages, read one byte of the
 * module's I2C compliance register through firmware commands and match
 * it against the table above to pick an ifmedia type.
 * NOTE(review): fragmentary listing -- several returns, else-branches
 * and the cage_type assignments are not visible here.
 */
2754 mxge_media_probe(mxge_softc_t *sc)
2759 struct mxge_media_type *mxge_media_types = NULL;
2760 int i, err, ms, mxge_media_type_entries;
2763 sc->need_media_probe = 0;
2765 /* if we've already set a media type, we're done */
2766 if (sc->media_flags != (IFM_ETHER | IFM_AUTO))
2770 * parse the product code to deterimine the interface type
2771 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2772 * after the 3rd dash in the driver's cached copy of the
2773 * EEPROM's product code string.
2775 ptr = sc->product_code_string;
2777 device_printf(sc->dev, "Missing product code\n");
2780 for (i = 0; i < 3; i++, ptr++) {
2781 ptr = index(ptr, '-');
2783 device_printf(sc->dev,
2784 "only %d dashes in PC?!?\n", i);
/* -C is CX4: media is known without any I2C probing */
2790 mxge_set_media(sc, IFM_10G_CX4);
2793 else if (*ptr == 'Q') {
2794 /* -Q is Quad Ribbon Fiber */
2795 device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2796 /* FreeBSD has no media type for Quad ribbon fiber */
/* XFP cage: select the XFP compliance table */
2802 mxge_media_types = mxge_xfp_media_types;
2803 mxge_media_type_entries =
2804 sizeof (mxge_xfp_media_types) /
2805 sizeof (mxge_xfp_media_types[0]);
2806 byte = MXGE_XFP_COMPLIANCE_BYTE;
2810 if (*ptr == 'S' || *(ptr +1) == 'S') {
2811 /* -S or -2S is SFP+ */
2812 mxge_media_types = mxge_sfp_media_types;
2813 mxge_media_type_entries =
2814 sizeof (mxge_sfp_media_types) /
2815 sizeof (mxge_sfp_media_types[0]);
2820 if (mxge_media_types == NULL) {
2821 device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2826 * At this point we know the NIC has an XFP cage, so now we
2827 * try to determine what is in the cage by using the
2828 * firmware's XFP I2C commands to read the XFP 10GbE compilance
2829 * register. We read just one byte, which may take over
2833 cmd.data0 = 0; /* just fetch 1 byte, not all 256 */
2835 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2836 if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
2837 device_printf(sc->dev, "failed to read XFP\n");
2839 if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
2840 device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
2842 if (err != MXGEFW_CMD_OK) {
2846 /* now we wait for the data to be cached */
2848 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
/* poll up to ~50ms for the firmware's I2C read to complete */
2849 for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
2852 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2854 if (err != MXGEFW_CMD_OK) {
2855 device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
2856 cage_type, err, ms);
/* table entry 0 is a full-byte match (CX4 module), not a bit test */
2860 if (cmd.data0 == mxge_media_types[0].bitmask) {
2862 device_printf(sc->dev, "%s:%s\n", cage_type,
2863 mxge_media_types[0].name);
2864 mxge_set_media(sc, IFM_10G_CX4);
2867 for (i = 1; i < mxge_media_type_entries; i++) {
2868 if (cmd.data0 & mxge_media_types[i].bitmask) {
2870 device_printf(sc->dev, "%s:%s\n",
2872 mxge_media_types[i].name);
2874 mxge_set_media(sc, mxge_media_types[i].flag);
2878 device_printf(sc->dev, "%s media 0x%x unknown\n", cage_type,
/*
 * Per-slice interrupt handler (legacy INTx, MSI, or MSI-X).
 * Drains completed transmits and received frames for the slice, and —
 * on the first slice only — folds firmware link/error statistics into
 * the softc and propagates link-state changes to the ifnet layer.
 * NOTE(review): this SOURCE view is a sampled extract; several original
 * lines (braces, returns, DELAY loops) are missing between the visible ones.
 */
2885 mxge_intr(void *arg)
2887 struct mxge_slice_state *ss = arg;
2888 mxge_softc_t *sc = ss->sc;
2889 mcp_irq_data_t *stats = ss->fw_stats;
2890 mxge_tx_ring_t *tx = &ss->tx;
2891 mxge_rx_done_t *rx_done = &ss->rx_done;
2892 uint32_t send_done_count;
2896 #ifndef IFNET_BUF_RING
2897 /* an interrupt on a non-zero slice is implicitly valid
2898 since MSI-X irqs are not shared */
2900 mxge_clean_rx_done(ss);
/* writing irq_claim hands the rx token back to the NIC for this slice */
2901 *ss->irq_claim = be32toh(3);
2906 /* make sure the DMA has finished */
2907 if (!stats->valid) {
2910 valid = stats->valid;
2912 if (sc->legacy_irq) {
2913 /* lower legacy IRQ */
2914 *sc->irq_deassert = 0;
2915 if (!mxge_deassert_wait)
2916 /* don't wait for conf. that irq is low */
2922 /* loop while waiting for legacy irq deassertion */
2924 /* check for transmit completes and receives */
2925 send_done_count = be32toh(stats->send_done_count);
2926 while ((send_done_count != tx->pkt_done) ||
2927 (rx_done->entry[rx_done->idx].length != 0)) {
2928 if (send_done_count != tx->pkt_done)
2929 mxge_tx_done(ss, (int)send_done_count);
2930 mxge_clean_rx_done(ss);
2931 send_done_count = be32toh(stats->send_done_count);
2933 if (sc->legacy_irq && mxge_deassert_wait)
/* volatile read: stats->valid is DMA'd by the NIC, must not be cached */
2935 } while (*((volatile uint8_t *) &stats->valid));
2937 /* fw link & error stats meaningful only on the first slice */
2938 if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
2939 if (sc->link_state != stats->link_up) {
2940 sc->link_state = stats->link_up;
2941 if (sc->link_state) {
2942 sc->ifp->if_link_state = LINK_STATE_UP;
2943 if_link_state_change(sc->ifp);
2945 device_printf(sc->dev, "link up\n");
2947 sc->ifp->if_link_state = LINK_STATE_DOWN;
2948 if_link_state_change(sc->ifp);
2950 device_printf(sc->dev, "link down\n");
/* trigger a media re-probe from the tick handler on any link change */
2952 sc->need_media_probe = 1;
2954 if (sc->rdma_tags_available !=
2955 be32toh(stats->rdma_tags_available)) {
2956 sc->rdma_tags_available =
2957 be32toh(stats->rdma_tags_available);
2958 device_printf(sc->dev, "RDMA timed out! %d tags "
2959 "left\n", sc->rdma_tags_available);
2962 if (stats->link_down) {
/* down_cnt is polled by mxge_close() to confirm the "down" irq arrived */
2963 sc->down_cnt += stats->link_down;
2965 if_link_state_change(sc->ifp, LINK_STATE_DOWN);
2969 /* check to see if we have rx token to pass back */
2971 *ss->irq_claim = be32toh(3);
2972 *(ss->irq_claim + 1) = be32toh(3);
/*
 * if_init entry point; arg is the softc.
 * NOTE(review): body not visible in this sampled extract — presumably
 * delegates to mxge_open() under the driver lock; confirm in full source.
 */
2976 mxge_init(void *arg)
/*
 * Release all mbufs attached to one slice: the LRO free list, the big
 * and small receive rings, and (first slice only) the transmit ring.
 * Each busdma map is unloaded before its mbuf is freed and the slot
 * pointer is NULLed so a later teardown pass skips it.
 */
2983 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
2985 struct lro_entry *lro_entry;
2988 while (!SLIST_EMPTY(&ss->lro_free)) {
2989 lro_entry = SLIST_FIRST(&ss->lro_free);
2990 SLIST_REMOVE_HEAD(&ss->lro_free, next);
2991 kfree(lro_entry, M_DEVBUF);
2994 for (i = 0; i <= ss->rx_big.mask; i++) {
2995 if (ss->rx_big.info[i].m == NULL)
2997 bus_dmamap_unload(ss->rx_big.dmat,
2998 ss->rx_big.info[i].map);
2999 m_freem(ss->rx_big.info[i].m);
3000 ss->rx_big.info[i].m = NULL;
3003 for (i = 0; i <= ss->rx_small.mask; i++) {
3004 if (ss->rx_small.info[i].m == NULL)
3006 bus_dmamap_unload(ss->rx_small.dmat,
3007 ss->rx_small.info[i].map);
3008 m_freem(ss->rx_small.info[i].m);
3009 ss->rx_small.info[i].m = NULL;
3012 /* transmit ring used only on the first slice */
3013 if (ss->tx.info == NULL)
3016 for (i = 0; i <= ss->tx.mask; i++) {
3017 ss->tx.info[i].flag = 0;
3018 if (ss->tx.info[i].m == NULL)
3020 bus_dmamap_unload(ss->tx.dmat,
3021 ss->tx.info[i].map);
3022 m_freem(ss->tx.info[i].m);
3023 ss->tx.info[i].m = NULL;
/* Free the mbufs of every slice (see mxge_free_slice_mbufs). */
3028 mxge_free_mbufs(mxge_softc_t *sc)
3032 for (slice = 0; slice < sc->num_slices; slice++)
3033 mxge_free_slice_mbufs(&sc->ss[slice]);
/*
 * Tear down one slice's ring bookkeeping: the rx_done DMA area, the tx
 * request/segment scratch buffers, the rx shadow rings, and the per-slot
 * info arrays together with their busdma maps and tags.  Safe to call on
 * a partially-constructed slice — every free is guarded by a NULL check
 * and pointers are cleared afterwards.
 */
3037 mxge_free_slice_rings(struct mxge_slice_state *ss)
3042 if (ss->rx_done.entry != NULL)
3043 mxge_dma_free(&ss->rx_done.dma);
3044 ss->rx_done.entry = NULL;
3046 if (ss->tx.req_bytes != NULL)
3047 kfree(ss->tx.req_bytes, M_DEVBUF);
3048 ss->tx.req_bytes = NULL;
3050 if (ss->tx.seg_list != NULL)
3051 kfree(ss->tx.seg_list, M_DEVBUF);
3052 ss->tx.seg_list = NULL;
3054 if (ss->rx_small.shadow != NULL)
3055 kfree(ss->rx_small.shadow, M_DEVBUF);
3056 ss->rx_small.shadow = NULL;
3058 if (ss->rx_big.shadow != NULL)
3059 kfree(ss->rx_big.shadow, M_DEVBUF);
3060 ss->rx_big.shadow = NULL;
/* maps must be destroyed before their parent tag */
3062 if (ss->tx.info != NULL) {
3063 if (ss->tx.dmat != NULL) {
3064 for (i = 0; i <= ss->tx.mask; i++) {
3065 bus_dmamap_destroy(ss->tx.dmat,
3066 ss->tx.info[i].map);
3068 bus_dma_tag_destroy(ss->tx.dmat);
3070 kfree(ss->tx.info, M_DEVBUF);
3074 if (ss->rx_small.info != NULL) {
3075 if (ss->rx_small.dmat != NULL) {
3076 for (i = 0; i <= ss->rx_small.mask; i++) {
3077 bus_dmamap_destroy(ss->rx_small.dmat,
3078 ss->rx_small.info[i].map);
3080 bus_dmamap_destroy(ss->rx_small.dmat,
3081 ss->rx_small.extra_map);
3082 bus_dma_tag_destroy(ss->rx_small.dmat);
3084 kfree(ss->rx_small.info, M_DEVBUF);
3086 ss->rx_small.info = NULL;
3088 if (ss->rx_big.info != NULL) {
3089 if (ss->rx_big.dmat != NULL) {
3090 for (i = 0; i <= ss->rx_big.mask; i++) {
3091 bus_dmamap_destroy(ss->rx_big.dmat,
3092 ss->rx_big.info[i].map);
3094 bus_dmamap_destroy(ss->rx_big.dmat,
3095 ss->rx_big.extra_map);
3096 bus_dma_tag_destroy(ss->rx_big.dmat);
3098 kfree(ss->rx_big.info, M_DEVBUF);
3100 ss->rx_big.info = NULL;
/* Free the ring state of every slice (see mxge_free_slice_rings). */
3104 mxge_free_rings(mxge_softc_t *sc)
3108 for (slice = 0; slice < sc->num_slices; slice++)
3109 mxge_free_slice_rings(&sc->ss[slice]);
/*
 * Allocate one slice's host-side ring state:
 *  - shadow and info arrays for the small and big receive rings,
 *  - busdma tags plus per-slot maps (and one "extra" spare map per ring),
 *  - on the first slice (single-TX-ring build), the transmit request
 *    scratch block, segment list, info array, tag and per-slot maps.
 * Ring sizes must be powers of two: masks are entries-1.
 * NOTE(review): sampled extract — the error-unwind paths between the
 * visible lines are not shown; confirm cleanup ordering in full source.
 */
3113 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3114 int tx_ring_entries)
3116 mxge_softc_t *sc = ss->sc;
3122 /* allocate per-slice receive resources */
3124 ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
/* rx_done ring holds completions for both rx rings, hence 2x */
3125 ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3127 /* allocate the rx shadow rings */
3128 bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3129 ss->rx_small.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3130 if (ss->rx_small.shadow == NULL)
3133 bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3134 ss->rx_big.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3135 if (ss->rx_big.shadow == NULL)
3138 /* allocate the rx host info rings */
3139 bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3140 ss->rx_small.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3141 if (ss->rx_small.info == NULL)
3144 bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3145 ss->rx_big.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3146 if (ss->rx_big.info == NULL)
3149 /* allocate the rx busdma resources */
3150 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3152 4096, /* boundary */
3153 BUS_SPACE_MAXADDR, /* low */
3154 BUS_SPACE_MAXADDR, /* high */
3155 NULL, NULL, /* filter */
3156 MHLEN, /* maxsize */
3158 MHLEN, /* maxsegsize */
3159 BUS_DMA_ALLOCNOW, /* flags */
3160 NULL, NULL, /* lock */
3161 &ss->rx_small.dmat); /* tag */
3163 device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3168 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3170 #if MXGE_VIRT_JUMBOS
3171 4096, /* boundary */
3175 BUS_SPACE_MAXADDR, /* low */
3176 BUS_SPACE_MAXADDR, /* high */
3177 NULL, NULL, /* filter */
3178 3*4096, /* maxsize */
3179 #if MXGE_VIRT_JUMBOS
3181 4096, /* maxsegsize*/
3184 MJUM9BYTES, /* maxsegsize*/
3186 BUS_DMA_ALLOCNOW, /* flags */
3187 NULL, NULL, /* lock */
3188 &ss->rx_big.dmat); /* tag */
3190 device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3194 for (i = 0; i <= ss->rx_small.mask; i++) {
3195 err = bus_dmamap_create(ss->rx_small.dmat, 0,
3196 &ss->rx_small.info[i].map);
3198 device_printf(sc->dev, "Err %d rx_small dmamap\n",
3203 err = bus_dmamap_create(ss->rx_small.dmat, 0,
3204 &ss->rx_small.extra_map);
3206 device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3211 for (i = 0; i <= ss->rx_big.mask; i++) {
3212 err = bus_dmamap_create(ss->rx_big.dmat, 0,
3213 &ss->rx_big.info[i].map);
3215 device_printf(sc->dev, "Err %d rx_big dmamap\n",
3220 err = bus_dmamap_create(ss->rx_big.dmat, 0,
3221 &ss->rx_big.extra_map);
3223 device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3228 /* now allocate TX resouces */
3230 #ifndef IFNET_BUF_RING
3231 /* only use a single TX ring for now */
3232 if (ss != ss->sc->ss)
3236 ss->tx.mask = tx_ring_entries - 1;
3237 ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3240 /* allocate the tx request copy block */
3242 sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3243 ss->tx.req_bytes = kmalloc(bytes, M_DEVBUF, M_WAITOK);
3244 if (ss->tx.req_bytes == NULL)
3246 /* ensure req_list entries are aligned to 8 bytes */
3247 ss->tx.req_list = (mcp_kreq_ether_send_t *)
3248 ((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
3250 /* allocate the tx busdma segment list */
3251 bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3252 ss->tx.seg_list = (bus_dma_segment_t *)
3253 kmalloc(bytes, M_DEVBUF, M_WAITOK);
3254 if (ss->tx.seg_list == NULL)
3257 /* allocate the tx host info ring */
3258 bytes = tx_ring_entries * sizeof (*ss->tx.info);
3259 ss->tx.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3260 if (ss->tx.info == NULL)
3263 /* allocate the tx busdma resources */
3264 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3266 sc->tx_boundary, /* boundary */
3267 BUS_SPACE_MAXADDR, /* low */
3268 BUS_SPACE_MAXADDR, /* high */
3269 NULL, NULL, /* filter */
3270 65536 + 256, /* maxsize */
3271 ss->tx.max_desc - 2, /* num segs */
3272 sc->tx_boundary, /* maxsegsz */
3273 BUS_DMA_ALLOCNOW, /* flags */
3274 NULL, NULL, /* lock */
3275 &ss->tx.dmat); /* tag */
3278 device_printf(sc->dev, "Err %d allocating tx dmat\n",
3283 /* now use these tags to setup dmamaps for each slot
3285 for (i = 0; i <= ss->tx.mask; i++) {
3286 err = bus_dmamap_create(ss->tx.dmat, 0,
3287 &ss->tx.info[i].map);
3289 device_printf(sc->dev, "Err %d tx dmamap\n",
/*
 * Query the firmware for the send-ring size, derive entry counts for
 * the tx and rx rings, size the ifnet send queue accordingly, then
 * allocate per-slice rings.  On slice allocation failure the visible
 * cleanup path frees all rings.
 */
3299 mxge_alloc_rings(mxge_softc_t *sc)
3303 int tx_ring_entries, rx_ring_entries;
3306 /* get ring sizes */
3307 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3308 tx_ring_size = cmd.data0;
3310 device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3314 tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
/* rx_ring_size was cached earlier (mxge_alloc_slices) */
3315 rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3316 IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
3317 sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
3318 IFQ_SET_READY(&sc->ifp->if_snd);
3320 for (slice = 0; slice < sc->num_slices; slice++) {
3321 err = mxge_alloc_slice_rings(&sc->ss[slice],
3330 mxge_free_rings(sc);
/*
 * Choose receive-buffer parameters for a given MTU: the firmware "big
 * buffer" size, the mbuf cluster size to allocate, and how many buffers
 * a jumbo frame spans.  Escalates cluster size (MCLBYTES ->
 * MJUMPAGESIZE -> MJUM9BYTES) until one frame fits; with
 * MXGE_VIRT_JUMBOS a 9k cluster is carved into 4k chunks instead.
 */
3337 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3339 int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3341 if (bufsize < MCLBYTES) {
3342 /* easy, everything fits in a single buffer */
3343 *big_buf_size = MCLBYTES;
3344 *cl_size = MCLBYTES;
3349 if (bufsize < MJUMPAGESIZE) {
3350 /* still easy, everything still fits in a single buffer */
3351 *big_buf_size = MJUMPAGESIZE;
3352 *cl_size = MJUMPAGESIZE;
3356 #if MXGE_VIRT_JUMBOS
3357 /* now we need to use virtually contiguous buffers */
3358 *cl_size = MJUM9BYTES;
3359 *big_buf_size = 4096;
3360 *nbufs = mtu / 4096 + 1;
3361 /* needs to be a power of two, so round up */
3365 *cl_size = MJUM9BYTES;
3366 *big_buf_size = MJUM9BYTES;
/*
 * Bring one slice online: populate its LRO free list, fetch the lanai
 * (NIC SRAM) pointers for the send and both receive rings from the
 * firmware, then stock the small and big rx rings with mbufs.  Big-ring
 * slots are pre-poisoned with 0xffffffff addresses so unfilled entries
 * are never DMA'd into.
 */
3372 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3377 struct lro_entry *lro_entry;
3382 slice = ss - sc->ss;
3384 SLIST_INIT(&ss->lro_free);
3385 SLIST_INIT(&ss->lro_active);
3387 for (i = 0; i < sc->lro_cnt; i++) {
3388 lro_entry = (struct lro_entry *)
3389 kmalloc(sizeof (*lro_entry), M_DEVBUF,
3391 if (lro_entry == NULL) {
3395 SLIST_INSERT_HEAD(&ss->lro_free, lro_entry, next);
3397 /* get the lanai pointers to the send and receive rings */
3400 #ifndef IFNET_BUF_RING
3401 /* We currently only send from the first slice */
3405 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3407 (volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3408 ss->tx.send_go = (volatile uint32_t *)
3409 (sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3410 ss->tx.send_stop = (volatile uint32_t *)
3411 (sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3412 #ifndef IFNET_BUF_RING
3416 err |= mxge_send_cmd(sc,
3417 MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3418 ss->rx_small.lanai =
3419 (volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3421 err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3423 (volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3426 device_printf(sc->dev,
3427 "failed to get ring sizes or locations\n");
3431 /* stock receive rings */
3432 for (i = 0; i <= ss->rx_small.mask; i++) {
3433 map = ss->rx_small.info[i].map;
3434 err = mxge_get_buf_small(ss, map, i);
3436 device_printf(sc->dev, "alloced %d/%d smalls\n",
3437 i, ss->rx_small.mask + 1);
3441 for (i = 0; i <= ss->rx_big.mask; i++) {
3442 ss->rx_big.shadow[i].addr_low = 0xffffffff;
3443 ss->rx_big.shadow[i].addr_high = 0xffffffff;
3445 ss->rx_big.nbufs = nbufs;
3446 ss->rx_big.cl_size = cl_size;
3447 ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3448 ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
/* step by nbufs: one cluster may back several ring slots */
3449 for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3450 map = ss->rx_big.info[i].map;
3451 err = mxge_get_buf_big(ss, map, i);
3453 device_printf(sc->dev, "alloced %d/%d bigs\n",
3454 i, ss->rx_big.mask + 1);
/*
 * Bring the interface up: reset the NIC, program the RSS indirection
 * table (multi-slice only), tell the firmware the MTU and buffer sizes,
 * register each slice's stats DMA block (falling back to the obsolete
 * single-block command on old firmware, which also disables multicast
 * support), open every slice, and finally issue ETHERNET_UP, mark the
 * ifnet running, and start the periodic tick callout.
 * Caller is expected to hold the driver lock — TODO confirm in callers.
 */
3462 mxge_open(mxge_softc_t *sc)
3465 int err, big_bytes, nbufs, slice, cl_size, i;
3467 volatile uint8_t *itable;
3468 struct mxge_slice_state *ss;
3470 /* Copy the MAC address in case it was overridden */
3471 bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3473 err = mxge_reset(sc, 1);
3475 device_printf(sc->dev, "failed to reset\n");
3479 if (sc->num_slices > 1) {
3480 /* setup the indirection table */
3481 cmd.data0 = sc->num_slices;
3482 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3485 err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3488 device_printf(sc->dev,
3489 "failed to setup rss tables\n");
3493 /* just enable an identity mapping */
3494 itable = sc->sram + cmd.data0;
3495 for (i = 0; i < sc->num_slices; i++)
3496 itable[i] = (uint8_t)i;
3499 cmd.data1 = mxge_rss_hash_type;
3500 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3502 device_printf(sc->dev, "failed to enable slices\n");
3508 mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3511 err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3513 /* error is only meaningful if we're trying to set
3514 MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3515 if (err && nbufs > 1) {
3516 device_printf(sc->dev,
3517 "Failed to set alway-use-n to %d\n",
3521 /* Give the firmware the mtu and the big and small buffer
3522 sizes. The firmware wants the big buf size to be a power
3523 of two. Luckily, FreeBSD's clusters are powers of two */
3524 cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3525 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3526 cmd.data0 = MHLEN - MXGEFW_PAD;
3527 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3529 cmd.data0 = big_bytes;
3530 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3533 device_printf(sc->dev, "failed to setup params\n");
3537 /* Now give him the pointer to the stats block */
3539 #ifdef IFNET_BUF_RING
3540 slice < sc->num_slices;
3545 ss = &sc->ss[slice];
3547 MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3549 MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3550 cmd.data2 = sizeof(struct mcp_irq_data);
/* V2 command encodes the slice number in the upper half of data2 */
3551 cmd.data2 |= (slice << 16);
3552 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3556 bus = sc->ss->fw_stats_dma.bus_addr;
3557 bus += offsetof(struct mcp_irq_data, send_done_count);
3558 cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3559 cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3560 err = mxge_send_cmd(sc,
3561 MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3563 /* Firmware cannot support multicast without STATS_DMA_V2 */
3564 sc->fw_multicast_support = 0;
3566 sc->fw_multicast_support = 1;
3570 device_printf(sc->dev, "failed to setup params\n");
3574 for (slice = 0; slice < sc->num_slices; slice++) {
3575 err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3577 device_printf(sc->dev, "couldn't open slice %d\n",
3583 /* Finally, start the firmware running */
3584 err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3586 device_printf(sc->dev, "Couldn't bring up link\n");
3589 #ifdef IFNET_BUF_RING
3590 for (slice = 0; slice < sc->num_slices; slice++) {
3591 ss = &sc->ss[slice];
3592 ss->if_drv_flags |= IFF_DRV_RUNNING;
3593 ss->if_drv_flags &= ~IFF_DRV_OACTIVE;
3596 sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
3597 sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
3598 callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3604 mxge_free_mbufs(sc);
/*
 * Bring the interface down: stop the tick callout, clear RUNNING,
 * issue ETHERNET_DOWN, then wait briefly for the "link down" interrupt
 * (observed via sc->down_cnt, incremented in mxge_intr) before freeing
 * all queued mbufs.  Warns but proceeds if the down irq never arrives.
 */
3610 mxge_close(mxge_softc_t *sc)
3613 int err, old_down_cnt;
3614 #ifdef IFNET_BUF_RING
3615 struct mxge_slice_state *ss;
3619 callout_stop(&sc->co_hdl);
3620 #ifdef IFNET_BUF_RING
3621 for (slice = 0; slice < sc->num_slices; slice++) {
3622 ss = &sc->ss[slice];
3623 ss->if_drv_flags &= ~IFF_DRV_RUNNING;
3626 sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
3627 old_down_cnt = sc->down_cnt;
3629 err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3631 device_printf(sc->dev, "Couldn't bring down link\n");
3633 if (old_down_cnt == sc->down_cnt) {
3634 /* wait for down irq */
3635 DELAY(10 * sc->intr_coal_delay);
3638 if (old_down_cnt == sc->down_cnt) {
3639 device_printf(sc->dev, "never got down irq\n");
3642 mxge_free_mbufs(sc);
/*
 * (Re)program the device's PCI config space: record the PCIe link
 * width, raise the max read request size to 4KB via the PCIe control
 * register, and enable bus mastering plus memory-space decoding.
 * Also called after a watchdog reset to restore these settings.
 */
3648 mxge_setup_cfg_space(mxge_softc_t *sc)
3650 device_t dev = sc->dev;
3652 uint16_t cmd, lnk, pectl;
3654 /* find the PCIe link width and set max read request to 4KB*/
3655 if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
/* reg+0x12 = PCIe Link Status; bits 9:4 are negotiated width */
3656 lnk = pci_read_config(dev, reg + 0x12, 2);
3657 sc->link_width = (lnk >> 4) & 0x3f;
/* reg+0x8 = Device Control; bits 14:12 = max read request (5 -> 4KB) */
3659 pectl = pci_read_config(dev, reg + 0x8, 2);
3660 pectl = (pectl & ~0x7000) | (5 << 12);
3661 pci_write_config(dev, reg + 0x8, pectl, 2);
3664 /* Enable DMA and Memory space access */
3665 pci_enable_busmaster(dev);
3666 cmd = pci_read_config(dev, PCIR_COMMAND, 2);
3667 cmd |= PCIM_CMD_MEMEN;
3668 pci_write_config(dev, PCIR_COMMAND, cmd, 2);
/*
 * Fetch the NIC's reboot-status register through the vendor-specific
 * PCI capability's indirect read32 window.  Returns (uint32_t)-1 when
 * the capability cannot be located.
 */
3672 mxge_read_reboot(mxge_softc_t *sc)
3674 device_t dev = sc->dev;
3677 /* find the vendor specific offset */
3678 if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
3679 device_printf(sc->dev,
3680 "could not find vendor specific offset\n");
3681 return (uint32_t)-1;
3683 /* enable read32 mode */
3684 pci_write_config(dev, vs + 0x10, 0x3, 1);
3685 /* tell NIC which register to read */
3686 pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
/* vs+0x14 holds the data returned for the requested register */
3687 return (pci_read_config(dev, vs + 0x14, 4));
/*
 * Recover from a transmit watchdog timeout.  Distinguishes three cases:
 *  - PCI config reads 0xffff: device vanished from the bus — wait up to
 *    ~100ms for it to return, else give up;
 *  - busmaster bit cleared: NIC rebooted and wiped config space — print
 *    the reboot status, restore config space, redo our setup, reopen;
 *  - otherwise the NIC did not reboot: dump the slice's tx ring state
 *    for diagnosis and do not reset.
 */
3691 mxge_watchdog_reset(mxge_softc_t *sc, int slice)
3693 struct pci_devinfo *dinfo;
3701 device_printf(sc->dev, "Watchdog reset!\n");
3704 * check to see if the NIC rebooted. If it did, then all of
3705 * PCI config space has been reset, and things like the
3706 * busmaster bit will be zero. If this is the case, then we
3707 * must restore PCI config space before the NIC can be used
3710 cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3711 if (cmd == 0xffff) {
3713 * maybe the watchdog caught the NIC rebooting; wait
3714 * up to 100ms for it to finish. If it does not come
3715 * back, then give up
3718 cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3719 if (cmd == 0xffff) {
3720 device_printf(sc->dev, "NIC disappeared!\n");
3724 if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3725 /* print the reboot status */
3726 reboot = mxge_read_reboot(sc);
3727 device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3729 /* restore PCI configuration space */
3730 dinfo = device_get_ivars(sc->dev);
3731 pci_cfg_restore(sc->dev, dinfo);
3733 /* and redo any changes we made to our config space */
3734 mxge_setup_cfg_space(sc);
3736 if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING) {
3738 err = mxge_open(sc);
3741 tx = &sc->ss[slice].tx;
3742 device_printf(sc->dev,
3743 "NIC did not reboot, slice %d ring state:\n",
3745 device_printf(sc->dev,
3746 "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3747 tx->req, tx->done, tx->queue_active);
3748 device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3749 tx->activate, tx->deactivate);
3750 device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3752 be32toh(sc->ss->fw_stats->send_done_count));
3753 device_printf(sc->dev, "not resetting\n");
/*
 * Periodic watchdog (invoked from mxge_tick).  For each active tx ring,
 * detect a stall: requests pending, no progress since the last check.
 * If the firmware's dropped_pause counter also stalled the link partner
 * isn't flow-control-blocking us, so trigger mxge_watchdog_reset();
 * otherwise just warn about pause frames.  Snapshots req/done/rx_pause
 * for the next round, and re-probes media when mxge_intr flagged a
 * link change.
 */
3759 mxge_watchdog(mxge_softc_t *sc)
3762 uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3765 /* see if we have outstanding transmits, which
3766 have been pending for more than mxge_ticks */
3768 #ifdef IFNET_BUF_RING
3769 (i < sc->num_slices) && (err == 0);
3771 (i < 1) && (err == 0);
3775 if (tx->req != tx->done &&
3776 tx->watchdog_req != tx->watchdog_done &&
3777 tx->done == tx->watchdog_done) {
3778 /* check for pause blocking before resetting */
3779 if (tx->watchdog_rx_pause == rx_pause)
3780 err = mxge_watchdog_reset(sc, i);
3782 device_printf(sc->dev, "Flow control blocking "
3783 "xmits, check link partner\n");
3786 tx->watchdog_req = tx->req;
3787 tx->watchdog_done = tx->done;
3788 tx->watchdog_rx_pause = rx_pause;
3791 if (sc->need_media_probe)
3792 mxge_media_probe(sc);
/*
 * Aggregate per-slice packet/error counters into the shared ifnet
 * statistics.  With IFNET_BUF_RING builds, byte, multicast and
 * buf_ring drop counters are aggregated as well.
 */
3797 mxge_update_stats(mxge_softc_t *sc)
3799 struct mxge_slice_state *ss;
3800 u_long ipackets = 0;
3801 u_long opackets = 0;
3802 #ifdef IFNET_BUF_RING
3810 for (slice = 0; slice < sc->num_slices; slice++) {
3811 ss = &sc->ss[slice];
3812 ipackets += ss->ipackets;
3813 opackets += ss->opackets;
3814 #ifdef IFNET_BUF_RING
3815 obytes += ss->obytes;
3816 omcasts += ss->omcasts;
3817 odrops += ss->tx.br->br_drops;
3819 oerrors += ss->oerrors;
3821 sc->ifp->if_ipackets = ipackets;
3822 sc->ifp->if_opackets = opackets;
3823 #ifdef IFNET_BUF_RING
3824 sc->ifp->if_obytes = obytes;
3825 sc->ifp->if_omcasts = omcasts;
3826 sc->ifp->if_snd.ifq_drops = odrops;
3828 sc->ifp->if_oerrors = oerrors;
/*
 * Periodic callout (rearmed every mxge_ticks while the interface is
 * up).  Under the driver lock: refresh aggregate stats, and run the
 * transmit watchdog every 5th tick via a countdown.
 */
3832 mxge_tick(void *arg)
3834 mxge_softc_t *sc = arg;
3837 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
3838 /* aggregate stats from different slices */
3839 mxge_update_stats(sc);
3840 if (!sc->watchdog_countdown) {
3841 err = mxge_watchdog(sc);
3842 sc->watchdog_countdown = 4;
3844 sc->watchdog_countdown--;
3846 callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3847 lockmgr(&sc->driver_lock, LK_RELEASE);
/*
 * ifmedia change callback.
 * NOTE(review): body not visible in this sampled extract — the NIC's
 * media is auto-detected, so presumably this rejects changes (EINVAL);
 * confirm against the full source.
 */
3851 mxge_media_change(struct ifnet *ifp)
/*
 * Change the interface MTU.  Validates that the on-wire frame size
 * (MTU + Ethernet + VLAN headers) fits in [60, sc->max_mtu], then,
 * under the driver lock, reopens a running interface with the new MTU —
 * restoring the old MTU (and reopening) if mxge_open() fails.
 */
3857 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3859 struct ifnet *ifp = sc->ifp;
3860 int real_mtu, old_mtu;
3864 real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3865 if ((real_mtu > sc->max_mtu) || real_mtu < 60)
3867 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
3868 old_mtu = ifp->if_mtu;
3870 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3872 err = mxge_open(sc);
3874 ifp->if_mtu = old_mtu;
/* best-effort reopen with the previous MTU; result intentionally ignored */
3876 (void) mxge_open(sc);
3879 lockmgr(&sc->driver_lock, LK_RELEASE);
/*
 * ifmedia status callback: report link validity/activity from the
 * cached sc->link_state; media is always IFM_AUTO Ethernet, full
 * duplex when the link is up.
 */
3884 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3886 mxge_softc_t *sc = ifp->if_softc;
3891 ifmr->ifm_status = IFM_AVALID;
3892 ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
3893 ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
3894 ifmr->ifm_active |= sc->link_state ? IFM_FDX : 0;
/*
 * ifnet ioctl handler.  Visible cases: address/ether ioctls delegated
 * to ether_ioctl(); SIOCSIFMTU via mxge_change_mtu(); SIOCSIFFLAGS to
 * open/close and sync promisc/multicast state; multicast list updates;
 * SIOCSIFCAP to toggle TXCSUM/RXCSUM/TSO4/LRO/VLAN-tagging (TSO4
 * requires TXCSUM, and disabling TXCSUM also drops TSO4); media ioctls
 * via ifmedia_ioctl().  State-changing paths run under the driver lock.
 * NOTE(review): sampled extract — case labels and some branches are
 * missing between the visible lines.
 */
3898 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
3900 mxge_softc_t *sc = ifp->if_softc;
3901 struct ifreq *ifr = (struct ifreq *)data;
3908 err = ether_ioctl(ifp, command, data);
3912 err = mxge_change_mtu(sc, ifr->ifr_mtu);
3916 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
3918 lockmgr(&sc->driver_lock, LK_RELEASE);
3921 if (ifp->if_flags & IFF_UP) {
3922 if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
3923 err = mxge_open(sc);
3925 /* take care of promis can allmulti
3927 mxge_change_promisc(sc,
3928 ifp->if_flags & IFF_PROMISC);
3929 mxge_set_multicast_list(sc);
3932 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3936 lockmgr(&sc->driver_lock, LK_RELEASE);
3941 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
3942 mxge_set_multicast_list(sc);
3943 lockmgr(&sc->driver_lock, LK_RELEASE);
3947 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
/* mask = capabilities the caller wants flipped */
3948 mask = ifr->ifr_reqcap ^ ifp->if_capenable;
3949 if (mask & IFCAP_TXCSUM) {
3950 if (IFCAP_TXCSUM & ifp->if_capenable) {
/* disabling tx csum also disables TSO4, which depends on it */
3951 ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
3952 ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
3955 ifp->if_capenable |= IFCAP_TXCSUM;
3956 ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
3958 } else if (mask & IFCAP_RXCSUM) {
3959 if (IFCAP_RXCSUM & ifp->if_capenable) {
3960 ifp->if_capenable &= ~IFCAP_RXCSUM;
3963 ifp->if_capenable |= IFCAP_RXCSUM;
3967 if (mask & IFCAP_TSO4) {
3968 if (IFCAP_TSO4 & ifp->if_capenable) {
3969 ifp->if_capenable &= ~IFCAP_TSO4;
3970 ifp->if_hwassist &= ~CSUM_TSO;
3971 } else if (IFCAP_TXCSUM & ifp->if_capenable) {
3972 ifp->if_capenable |= IFCAP_TSO4;
3973 ifp->if_hwassist |= CSUM_TSO;
3975 printf("mxge requires tx checksum offload"
3976 " be enabled to use TSO\n");
3980 if (mask & IFCAP_LRO) {
3981 if (IFCAP_LRO & ifp->if_capenable)
3982 err = mxge_change_lro_locked(sc, 0);
3984 err = mxge_change_lro_locked(sc, mxge_lro_cnt);
3986 if (mask & IFCAP_VLAN_HWTAGGING)
3987 ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
3988 lockmgr(&sc->driver_lock, LK_RELEASE);
3989 VLAN_CAPABILITIES(ifp);
3994 err = ifmedia_ioctl(ifp, (struct ifreq *)data,
3995 &sc->media, command);
/*
 * Read the hw.mxge.* loader tunables into the module globals (and
 * sc->lro_cnt), then clamp them to sane ranges: interrupt coalescing
 * delay within [0, 10000]us (else 30), tick period defaulting to hz/2,
 * RSS hash type within the firmware-supported range, and initial MTU
 * within [ETHER_MIN_LEN, ETHERMTU_JUMBO].
 */
4005 mxge_fetch_tunables(mxge_softc_t *sc)
4008 TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4009 TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4010 &mxge_flow_control);
4011 TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4012 &mxge_intr_coal_delay);
4013 TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4014 &mxge_nvidia_ecrc_enable);
4015 TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4016 &mxge_force_firmware);
4017 TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4018 &mxge_deassert_wait);
4019 TUNABLE_INT_FETCH("hw.mxge.verbose",
4021 TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4022 TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
4023 TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
4024 TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4025 TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4026 if (sc->lro_cnt != 0)
4027 mxge_lro_cnt = sc->lro_cnt;
4031 if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4032 mxge_intr_coal_delay = 30;
4033 if (mxge_ticks == 0)
4034 mxge_ticks = hz / 2;
4035 sc->pause = mxge_flow_control;
4036 if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4037 || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4038 mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
4040 if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4041 mxge_initial_mtu < ETHER_MIN_LEN)
4042 mxge_initial_mtu = ETHERMTU_JUMBO;
/*
 * Free all per-slice state: firmware stats DMA blocks, buf_rings
 * (IFNET_BUF_RING builds), tx locks, and rx_done interrupt queues,
 * then the slice array itself.  Counterpart of mxge_alloc_slices().
 */
4047 mxge_free_slices(mxge_softc_t *sc)
4049 struct mxge_slice_state *ss;
4056 for (i = 0; i < sc->num_slices; i++) {
4058 if (ss->fw_stats != NULL) {
4059 mxge_dma_free(&ss->fw_stats_dma);
4060 ss->fw_stats = NULL;
4061 #ifdef IFNET_BUF_RING
4062 if (ss->tx.br != NULL) {
4063 drbr_free(ss->tx.br, M_DEVBUF);
4067 lockuninit(&ss->tx.lock);
4069 if (ss->rx_done.entry != NULL) {
4070 mxge_dma_free(&ss->rx_done.dma);
4071 ss->rx_done.entry = NULL;
4074 free(sc->ss, M_DEVBUF);
/*
 * Allocate the slice array and each slice's DMA-backed structures:
 * the rx completion (rx_done) queue sized from the firmware's rx ring
 * size, the firmware stats/irq-data block, the per-slice tx lock, and
 * (IFNET_BUF_RING builds) a 2048-entry transmit buf_ring.  On failure
 * the visible cleanup path frees all slices.
 */
4079 mxge_alloc_slices(mxge_softc_t *sc)
4082 struct mxge_slice_state *ss;
4084 int err, i, max_intr_slots;
4086 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4088 device_printf(sc->dev, "Cannot determine rx ring size\n");
4091 sc->rx_ring_size = cmd.data0;
/* 2x: completions can arrive from both the small and big rx rings */
4092 max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4094 bytes = sizeof (*sc->ss) * sc->num_slices;
4095 sc->ss = kmalloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4098 for (i = 0; i < sc->num_slices; i++) {
4103 /* allocate per-slice rx interrupt queues */
4105 bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4106 err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4109 ss->rx_done.entry = ss->rx_done.dma.addr;
4110 bzero(ss->rx_done.entry, bytes);
4113 * allocate the per-slice firmware stats; stats
4114 * (including tx) are used used only on the first
4117 #ifndef IFNET_BUF_RING
4122 bytes = sizeof (*ss->fw_stats);
4123 err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4124 sizeof (*ss->fw_stats), 64);
4127 ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4128 snprintf(ss->tx.lock_name, sizeof(ss->tx.lock_name),
4129 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
4130 lock_init(&ss->tx.lock, ss->tx.lock_name, 0, LK_CANRECURSE);
4131 #ifdef IFNET_BUF_RING
4132 ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4140 mxge_free_slices(sc);
/*
 * Decide how many slices (RSS queues) to use.  Requires multi-slice to
 * be enabled by tunable, an SMP system, and available MSI-X vectors.
 * Loads the RSS-aware firmware, resets it, programs the interrupt
 * queue size, then queries MXGEFW_CMD_GET_MAX_RSS_QUEUES and caps the
 * result by MSI-X count, CPU count / tunable, and rounds down to a
 * power of two.  Any failure falls back to the original single-slice
 * firmware.
 */
4145 mxge_slice_probe(mxge_softc_t *sc)
4149 int msix_cnt, status, max_intr_slots;
4153 * don't enable multiple slices if they are not enabled,
4154 * or if this is not an SMP system
4157 if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2)
4160 /* see how many MSI-X interrupts are available */
4161 msix_cnt = pci_msix_count(sc->dev);
4165 /* now load the slice aware firmware see what it supports */
4166 old_fw = sc->fw_name;
4167 if (old_fw == mxge_fw_aligned)
4168 sc->fw_name = mxge_fw_rss_aligned;
4170 sc->fw_name = mxge_fw_rss_unaligned;
4171 status = mxge_load_firmware(sc, 0);
4173 device_printf(sc->dev, "Falling back to a single slice\n");
4177 /* try to send a reset command to the card to see if it
4179 memset(&cmd, 0, sizeof (cmd));
4180 status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4182 device_printf(sc->dev, "failed reset\n");
4186 /* get rx ring size */
4187 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4189 device_printf(sc->dev, "Cannot determine rx ring size\n");
4192 max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4194 /* tell it the size of the interrupt queues */
4195 cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4196 status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4198 device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4202 /* ask the maximum number of slices it supports */
4203 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4205 device_printf(sc->dev,
4206 "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4209 sc->num_slices = cmd.data0;
4210 if (sc->num_slices > msix_cnt)
4211 sc->num_slices = msix_cnt;
4213 if (mxge_max_slices == -1) {
4214 /* cap to number of CPUs in system */
4215 if (sc->num_slices > mp_ncpus)
4216 sc->num_slices = mp_ncpus;
4218 if (sc->num_slices > mxge_max_slices)
4219 sc->num_slices = mxge_max_slices;
4221 /* make sure it is a power of two */
4222 while (sc->num_slices & (sc->num_slices - 1))
4226 device_printf(sc->dev, "using %d slices\n",
/* fallback: restore and reload the original single-slice firmware */
4232 sc->fw_name = old_fw;
4233 (void) mxge_load_firmware(sc, 0);
/*
 * Allocate and wire up one MSI-X vector per slice: map the MSI-X table
 * BAR, allocate num_slices messages, allocate an IRQ resource and
 * install mxge_intr (with the slice as its argument) for each.  Uses
 * the classic goto-unwind chain — each abort label releases everything
 * acquired after the corresponding success point.
 */
4237 mxge_add_msix_irqs(mxge_softc_t *sc)
4240 int count, err, i, rid;
4243 sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4246 if (sc->msix_table_res == NULL) {
4247 device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4251 count = sc->num_slices;
4252 err = pci_alloc_msix(sc->dev, &count);
4254 device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d"
4255 "err = %d \n", sc->num_slices, err);
4256 goto abort_with_msix_table;
4258 if (count < sc->num_slices) {
4259 device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4260 count, sc->num_slices);
4261 device_printf(sc->dev,
4262 "Try setting hw.mxge.max_slices to %d\n",
4265 goto abort_with_msix;
4267 bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4268 sc->msix_irq_res = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4269 if (sc->msix_irq_res == NULL) {
4271 goto abort_with_msix;
4274 for (i = 0; i < sc->num_slices; i++) {
4276 sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4279 if (sc->msix_irq_res[i] == NULL) {
4280 device_printf(sc->dev, "couldn't allocate IRQ res"
4281 " for message %d\n", i);
4283 goto abort_with_res;
4287 bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4288 sc->msix_ih = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4290 for (i = 0; i < sc->num_slices; i++) {
4291 err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4292 INTR_TYPE_NET | INTR_MPSAFE,
4293 #if __FreeBSD_version > 700030
4296 mxge_intr, &sc->ss[i], &sc->msix_ih[i]);
4298 device_printf(sc->dev, "couldn't setup intr for "
4300 goto abort_with_intr;
4305 device_printf(sc->dev, "using %d msix IRQs:",
4307 for (i = 0; i < sc->num_slices; i++)
4308 printf(" %ld", rman_get_start(sc->msix_irq_res[i]));
/* unwind: handlers -> irq resources -> msix messages -> table BAR */
4314 for (i = 0; i < sc->num_slices; i++) {
4315 if (sc->msix_ih[i] != NULL) {
4316 bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4318 sc->msix_ih[i] = NULL;
4321 kfree(sc->msix_ih, M_DEVBUF);
4325 for (i = 0; i < sc->num_slices; i++) {
4327 if (sc->msix_irq_res[i] != NULL)
4328 bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4329 sc->msix_irq_res[i]);
4330 sc->msix_irq_res[i] = NULL;
4332 kfree(sc->msix_irq_res, M_DEVBUF);
4336 pci_release_msi(sc->dev);
4338 abort_with_msix_table:
4339 bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4340 sc->msix_table_res);
/*
 * mxge_add_single_irq: non-MSI-X interrupt setup.  Tries to allocate a
 * single MSI message; otherwise falls back to a legacy INTx line
 * (sc->legacy_irq selects rid 0 vs 1 -- assignment elided from this
 * excerpt).  Installs mxge_intr with slice 0 as its argument.
 */
4346 mxge_add_single_irq(mxge_softc_t *sc)
4348 int count, err, rid;
/* Prefer MSI when exactly one message is supported and allocation works. */
4350 count = pci_msi_count(sc->dev);
4351 if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4357 sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0,
4358 1, RF_SHAREABLE | RF_ACTIVE);
4359 if (sc->irq_res == NULL) {
4360 device_printf(sc->dev, "could not alloc interrupt\n");
4364 device_printf(sc->dev, "using %s irq %ld\n",
4365 sc->legacy_irq ? "INTx" : "MSI",
4366 rman_get_start(sc->irq_res));
4367 err = bus_setup_intr(sc->dev, sc->irq_res,
4368 INTR_TYPE_NET | INTR_MPSAFE,
4369 #if __FreeBSD_version > 700030
4372 mxge_intr, &sc->ss[0], &sc->ih);
/* On setup failure: release the IRQ resource (rid 0 for INTx, 1 for MSI)
 * and give back the MSI message if one was allocated. */
4374 bus_release_resource(sc->dev, SYS_RES_IRQ,
4375 sc->legacy_irq ? 0 : 1, sc->irq_res);
4376 if (!sc->legacy_irq)
4377 pci_release_msi(sc->dev);
/*
 * mxge_rem_msix_irqs: undo mxge_add_msix_irqs in reverse order --
 * tear down per-slice interrupt handlers, release per-slice IRQ
 * resources, free both arrays, unmap the MSI-X table BAR, and release
 * the MSI-X vectors.
 */
4383 mxge_rem_msix_irqs(mxge_softc_t *sc)
4387 for (i = 0; i < sc->num_slices; i++) {
4388 if (sc->msix_ih[i] != NULL) {
4389 bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4391 sc->msix_ih[i] = NULL;
4394 kfree(sc->msix_ih, M_DEVBUF);
4396 for (i = 0; i < sc->num_slices; i++) {
4398 if (sc->msix_irq_res[i] != NULL)
4399 bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4400 sc->msix_irq_res[i]);
4401 sc->msix_irq_res[i] = NULL;
4403 kfree(sc->msix_irq_res, M_DEVBUF);
4405 bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4406 sc->msix_table_res);
4408 pci_release_msi(sc->dev);
/*
 * mxge_rem_single_irq: undo mxge_add_single_irq -- tear down the
 * handler, release the IRQ resource (rid 0 for INTx, 1 for MSI), and
 * give back the MSI message if one was in use.
 */
4413 mxge_rem_single_irq(mxge_softc_t *sc)
4415 bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4416 bus_release_resource(sc->dev, SYS_RES_IRQ,
4417 sc->legacy_irq ? 0 : 1, sc->irq_res);
4418 if (!sc->legacy_irq)
4419 pci_release_msi(sc->dev);
/*
 * mxge_rem_irq: dispatch interrupt teardown -- MSI-X path when running
 * multiple slices, single-IRQ path otherwise.
 */
4423 mxge_rem_irq(mxge_softc_t *sc)
4425 if (sc->num_slices > 1)
4426 mxge_rem_msix_irqs(sc);
4428 mxge_rem_single_irq(sc);
/*
 * mxge_add_irq: dispatch interrupt setup -- MSI-X when running multiple
 * slices, otherwise a single MSI/INTx interrupt.
 */
4432 mxge_add_irq(mxge_softc_t *sc)
4436 if (sc->num_slices > 1)
4437 err = mxge_add_msix_irqs(sc);
4439 err = mxge_add_single_irq(sc);
/* Deliberately disabled ("0 &&"): would re-do the MSI-X setup after a
 * successful first pass.  Kept as dead code by the original author. */
4441 if (0 && err == 0 && sc->num_slices > 1) {
4442 mxge_rem_msix_irqs(sc);
4443 err = mxge_add_msix_irqs(sc);
/*
 * mxge_attach: device attach entry point.  Creates the parent DMA tag,
 * allocates the ifnet, initializes locks, maps the board's SRAM BAR,
 * parses the EEPROM strings, allocates command/zeropad/dmabench DMA
 * areas, loads firmware, probes/allocates slices, sets up rings and
 * interrupts, then configures ifnet capabilities and attaches to the
 * network stack.  Failures unwind through the abort_* labels in
 * reverse acquisition order.
 * NOTE(review): this file is an elided excerpt; several original lines
 * (braces, some error checks, sysctl teardown) are not visible here.
 */
4450 mxge_attach(device_t dev)
4452 mxge_softc_t *sc = device_get_softc(dev);
4457 mxge_fetch_tunables(sc);
/* Parent DMA tag: full address range, 64 KiB + 256 max transfer. */
4459 err = bus_dma_tag_create(NULL, /* parent */
4462 BUS_SPACE_MAXADDR, /* low */
4463 BUS_SPACE_MAXADDR, /* high */
4464 NULL, NULL, /* filter */
4465 65536 + 256, /* maxsize */
4466 MXGE_MAX_SEND_DESC, /* num segs */
4467 65536, /* maxsegsize */
4469 NULL, NULL, /* lock */
4470 &sc->parent_dmat); /* tag */
4473 device_printf(sc->dev, "Err %d allocating parent dmat\n",
4475 goto abort_with_nothing;
4478 ifp = sc->ifp = if_alloc(IFT_ETHER);
4480 device_printf(dev, "can not if_alloc()\n");
4482 goto abort_with_parent_dmat;
4484 if_initname(ifp, device_get_name(dev), device_get_unit(dev));
/* Named locks for the command interface and the driver as a whole. */
4486 snprintf(sc->cmd_lock_name, sizeof(sc->cmd_lock_name), "%s:cmd",
4487 device_get_nameunit(dev));
4488 lock_init(&sc->cmd_lock, sc->cmd_lock_name, 0, LK_CANRECURSE);
4489 snprintf(sc->driver_lock_name, sizeof(sc->driver_lock_name),
4490 "%s:drv", device_get_nameunit(dev));
4491 lock_init(&sc->driver_lock, sc->driver_lock_name,
4494 callout_init(&sc->co_hdl);
4496 mxge_setup_cfg_space(sc);
4498 /* Map the board into the kernel */
4500 sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
4502 if (sc->mem_res == NULL) {
4503 device_printf(dev, "could not map memory\n");
4505 goto abort_with_lock;
4507 sc->sram = rman_get_virtual(sc->mem_res);
/* Usable SRAM: 2 MiB minus reserved regions (2x48K + 32K) minus 0x100. */
4508 sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4509 if (sc->sram_size > rman_get_size(sc->mem_res)) {
4510 device_printf(dev, "impossible memory region size %ld\n",
4511 rman_get_size(sc->mem_res));
4513 goto abort_with_mem_res;
4516 /* make NULL terminated copy of the EEPROM strings section of
4518 bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4519 bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4520 rman_get_bushandle(sc->mem_res),
4521 sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4523 MXGE_EEPROM_STRINGS_SIZE - 2);
4524 err = mxge_parse_strings(sc);
4526 goto abort_with_mem_res;
4528 /* Enable write combining for efficient use of PCIe bus */
4531 /* Allocate the out of band dma memory */
4532 err = mxge_dma_alloc(sc, &sc->cmd_dma,
4533 sizeof (mxge_cmd_t), 64);
4535 goto abort_with_mem_res;
4536 sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4537 err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4539 goto abort_with_cmd_dma;
4541 err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4543 goto abort_with_zeropad_dma;
4545 /* select & load the firmware */
4546 err = mxge_select_firmware(sc);
4548 goto abort_with_dmabench;
4549 sc->intr_coal_delay = mxge_intr_coal_delay;
4551 mxge_slice_probe(sc);
4552 err = mxge_alloc_slices(sc);
4554 goto abort_with_dmabench;
4556 err = mxge_reset(sc, 0);
4558 goto abort_with_slices;
4560 err = mxge_alloc_rings(sc);
4562 device_printf(sc->dev, "failed to allocate rings\n");
/* NOTE(review): jumping to abort_with_dmabench here skips freeing the
 * slices allocated above; upstream uses abort_with_slices -- verify. */
4563 goto abort_with_dmabench;
4566 err = mxge_add_irq(sc);
4568 device_printf(sc->dev, "failed to add irq\n");
4569 goto abort_with_rings;
/* Advertise ifnet capabilities: checksum offload, TSO, LRO, VLAN. */
4572 ifp->if_baudrate = IF_Gbps(10UL);
4573 ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4576 ifp->if_capabilities |= IFCAP_LRO;
4579 #ifdef MXGE_NEW_VLAN_API
4580 ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
/* Jumbo frames only if the firmware supports a >= 9000 byte MTU. */
4583 sc->max_mtu = mxge_max_mtu(sc);
4584 if (sc->max_mtu >= 9000)
4585 ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4587 device_printf(dev, "MTU limited to %d. Install "
4588 "latest firmware for 9000 byte jumbo support\n",
4589 sc->max_mtu - ETHER_HDR_LEN);
4590 ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4591 ifp->if_capenable = ifp->if_capabilities;
4592 if (sc->lro_cnt == 0)
4593 ifp->if_capenable &= ~IFCAP_LRO;
/* Wire up the ifnet methods and attach to the network stack. */
4595 ifp->if_init = mxge_init;
4597 ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4598 ifp->if_ioctl = mxge_ioctl;
4599 ifp->if_start = mxge_start;
4600 /* Initialise the ifmedia structure */
4601 ifmedia_init(&sc->media, 0, mxge_media_change,
4603 mxge_set_media(sc, IFM_ETHER | IFM_AUTO);
4604 mxge_media_probe(sc);
4606 ether_ifattach(ifp, sc->mac_addr);
4607 /* ether_ifattach sets mtu to ETHERMTU */
4608 if (mxge_initial_mtu != ETHERMTU)
4609 mxge_change_mtu(sc, mxge_initial_mtu);
4611 mxge_add_sysctls(sc);
4612 #ifdef IFNET_BUF_RING
4613 ifp->if_transmit = mxge_transmit;
4614 ifp->if_qflush = mxge_qflush;
/* Error unwind: release resources in reverse order of acquisition. */
4619 mxge_free_rings(sc);
4621 mxge_free_slices(sc);
4622 abort_with_dmabench:
4623 mxge_dma_free(&sc->dmabench_dma);
4624 abort_with_zeropad_dma:
4625 mxge_dma_free(&sc->zeropad_dma);
4627 mxge_dma_free(&sc->cmd_dma);
4629 bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4631 pci_disable_busmaster(dev);
4632 lockuninit(&sc->cmd_lock);
4633 lockuninit(&sc->driver_lock);
4635 abort_with_parent_dmat:
4636 bus_dma_tag_destroy(sc->parent_dmat);
/*
 * mxge_detach: device detach entry point.  Refuses to detach while
 * VLANs are configured; otherwise stops the interface (under the
 * driver lock), detaches from the network stack, and releases all
 * resources acquired by mxge_attach in reverse order.
 * NOTE(review): this file is an elided excerpt; some original lines
 * (braces, the watchdog close) are not visible here.
 */
4643 mxge_detach(device_t dev)
4645 mxge_softc_t *sc = device_get_softc(dev);
4647 if (mxge_vlans_active(sc)) {
4648 device_printf(sc->dev,
4649 "Detach vlans before removing module\n");
4652 lockmgr(&sc->driver_lock, LK_EXCLUSIVE);
4654 if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
/* fixed: was "lock(...)", which is not the lockmgr API used at
 * acquisition above -- release must also go through lockmgr(). */
4656 lockmgr(&sc->driver_lock, LK_RELEASE);
4657 ether_ifdetach(sc->ifp);
4658 callout_drain(&sc->co_hdl);
4659 ifmedia_removeall(&sc->media);
4660 mxge_dummy_rdma(sc, 0);
4661 mxge_rem_sysctls(sc);
/* Free rings, slices, and the out-of-band DMA areas. */
4663 mxge_free_rings(sc);
4664 mxge_free_slices(sc);
4665 mxge_dma_free(&sc->dmabench_dma);
4666 mxge_dma_free(&sc->zeropad_dma);
4667 mxge_dma_free(&sc->cmd_dma);
/* Release the SRAM mapping, bus mastering, locks, and the DMA tag. */
4668 bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4669 pci_disable_busmaster(dev);
4670 lockuninit(&sc->cmd_lock);
4671 lockuninit(&sc->driver_lock);
4673 bus_dma_tag_destroy(sc->parent_dmat);
4678 mxge_shutdown(device_t dev)
4684 This file uses Myri10GE driver indentation.
4687 c-file-style:"linux"