1 /******************************************************************************
3 Copyright (c) 2006-2009, Myricom Inc.
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
9 1. Redistributions of source code must retain the above copyright notice,
10 this list of conditions and the following disclaimer.
12 2. Neither the name of the Myricom Inc, nor the names of its
13 contributors may be used to endorse or promote products derived from
14 this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
28 ***************************************************************************/
30 #include <sys/cdefs.h>
31 /*__FBSDID("$FreeBSD: src/sys/dev/mxge/if_mxge.c,v 1.63 2009/06/26 11:45:06 rwatson Exp $");*/
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/linker.h>
36 #include <sys/firmware.h>
37 #include <sys/endian.h>
38 #include <sys/in_cksum.h>
39 #include <sys/sockio.h>
41 #include <sys/malloc.h>
42 #include <sys/kernel.h>
43 #include <sys/module.h>
44 #include <sys/serialize.h>
45 #include <sys/socket.h>
46 #include <sys/sysctl.h>
48 /* count xmits ourselves, rather than via drbr */
51 #include <net/if_arp.h>
52 #include <net/ifq_var.h>
53 #include <net/ethernet.h>
54 #include <net/if_dl.h>
55 #include <net/if_media.h>
59 #include <net/if_types.h>
60 #include <net/vlan/if_vlan_var.h>
63 #include <netinet/in_systm.h>
64 #include <netinet/in.h>
65 #include <netinet/ip.h>
66 #include <netinet/tcp.h>
71 #include <bus/pci/pcireg.h>
72 #include <bus/pci/pcivar.h>
73 #include <bus/pci/pci_private.h> /* XXX for pci_cfg_restore */
75 #include <vm/vm.h> /* for pmap_mapdev() */
78 #if defined(__i386) || defined(__amd64)
79 #include <machine/specialreg.h>
82 #include <dev/netif/mxge/mxge_mcp.h>
83 #include <dev/netif/mxge/mcp_gen_header.h>
84 /*#define MXGE_FAKE_IFP*/
85 #include <dev/netif/mxge/if_mxge_var.h>
87 #include <sys/buf_ring.h>
/*
 * Driver tunables (module-scope defaults) and device-method prototypes.
 * NOTE(review): this chunk appears to be a sampled copy with embedded
 * original line numbers; code is preserved byte-for-byte below.
 */
93 static int mxge_nvidia_ecrc_enable = 1;
94 static int mxge_force_firmware = 0;
/* interrupt coalescing delay, presumably in microseconds -- TODO confirm */
95 static int mxge_intr_coal_delay = 30;
96 static int mxge_deassert_wait = 1;
97 static int mxge_flow_control = 1;
98 static int mxge_verbose = 0;
99 static int mxge_lro_cnt = 8;
100 static int mxge_ticks;
101 static int mxge_max_slices = 1;
102 static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
103 static int mxge_always_promisc = 0;
104 static int mxge_initial_mtu = ETHERMTU_JUMBO;
/* firmware image names: aligned vs. unaligned PCIe-completion variants */
105 static char *mxge_fw_unaligned = "mxge_ethp_z8e";
106 static char *mxge_fw_aligned = "mxge_eth_z8e";
107 static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
108 static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
/* newbus device-interface entry points (defined later in the file) */
110 static int mxge_probe(device_t dev);
111 static int mxge_attach(device_t dev);
112 static int mxge_detach(device_t dev);
113 static int mxge_shutdown(device_t dev);
114 static void mxge_intr(void *arg);
/*
 * newbus glue: method table, driver descriptor, and module registration.
 * NOTE(review): initializer braces / terminators are missing from this
 * sampled view; preserved as-is.
 */
116 static device_method_t mxge_methods[] =
118 /* Device interface */
119 DEVMETHOD(device_probe, mxge_probe),
120 DEVMETHOD(device_attach, mxge_attach),
121 DEVMETHOD(device_detach, mxge_detach),
122 DEVMETHOD(device_shutdown, mxge_shutdown),
126 static driver_t mxge_driver =
130 sizeof(mxge_softc_t),
133 static devclass_t mxge_devclass;
135 /* Declare ourselves to be a child of the PCI bus.*/
136 DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
/* module depends on in-kernel firmware(9) and zlib for f/w decompression */
137 MODULE_DEPEND(mxge, firmware, 1, 1, 1);
138 MODULE_DEPEND(mxge, zlib, 1, 1, 1);
/* forward declarations used before their definitions below */
140 static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
141 static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
142 static int mxge_close(mxge_softc_t *sc);
143 static int mxge_open(mxge_softc_t *sc);
144 static void mxge_tick(void *arg);
146 /* XXX: we don't have Large Receive Offload support yet */
148 mxge_lro_rx(struct mxge_slice_state *ss, struct mbuf *m_head, uint32_t csum)
157 mxge_lro_flush(struct mxge_slice_state *ss, struct lro_entry *lro)
/*
 * mxge_probe() -- match Myricom Z8E / Z8E_9 PCI IDs and set a device
 * description based on the PCI revision.  NOTE(review): the switch
 * statement and return paths are missing from this sampled view.
 */
164 mxge_probe(device_t dev)
169 if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
170 ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
171 (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
172 rev = pci_get_revid(dev);
174 case MXGE_PCI_REV_Z8E:
175 device_set_desc(dev, "Myri10G-PCIE-8A");
177 case MXGE_PCI_REV_Z8ES:
178 device_set_desc(dev, "Myri10G-PCIE-8B");
/* default case: unknown revision, still claimed but flagged */
181 device_set_desc(dev, "Myri10G-PCIE-8??");
182 device_printf(dev, "Unrecognized rev %d NIC\n",
/*
 * mxge_enable_wc() -- try to mark the NIC SRAM mapping write-combining
 * (x86/amd64 only, via pmap_change_attr); falls back to sc->wc = 0.
 */
192 mxge_enable_wc(mxge_softc_t *sc)
195 #if defined(__i386) || defined(__amd64)
200 len = rman_get_size(sc->mem_res);
201 err = pmap_change_attr((vm_offset_t) sc->sram,
202 len, PAT_WRITE_COMBINING);
204 device_printf(sc->dev, "pmap_change_attr failed, %d\n",
210 sc->wc = 0; /* TBD: PAT support */
215 /* callback to get our DMA address */
/*
 * mxge_dmamap_callback() -- bus_dmamap_load callback; stores the single
 * segment's bus address into the caller-provided bus_addr_t.
 */
217 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
221 *(bus_addr_t *) arg = segs->ds_addr;
/*
 * mxge_dma_alloc() -- create a DMA tag, allocate zeroed coherent memory,
 * and load the map; on failure unwinds via the abort_* labels below.
 * mxge_dma_free() is its exact inverse.
 */
226 mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
227 bus_size_t alignment)
230 device_t dev = sc->dev;
231 bus_size_t boundary, maxsegsize;
/* special-case large 4KB-aligned buffers -- boundary setup elided in view */
233 if (bytes > 4096 && alignment == 4096) {
241 /* allocate DMAable memory tags */
242 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
243 alignment, /* alignment */
244 boundary, /* boundary */
245 BUS_SPACE_MAXADDR, /* low */
246 BUS_SPACE_MAXADDR, /* high */
247 NULL, NULL, /* filter */
250 maxsegsize, /* maxsegsize */
251 BUS_DMA_COHERENT, /* flags */
252 &dma->dmat); /* tag */
254 device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
258 /* allocate DMAable memory & map */
259 err = bus_dmamem_alloc(dma->dmat, &dma->addr,
260 (BUS_DMA_WAITOK | BUS_DMA_COHERENT
261 | BUS_DMA_ZERO), &dma->map);
263 device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
264 goto abort_with_dmat;
267 /* load the memory */
268 err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
269 mxge_dmamap_callback,
270 (void *)&dma->bus_addr, 0);
272 device_printf(dev, "couldn't load map (err = %d)\n", err);
/* error-unwind path: free memory then destroy the tag */
278 bus_dmamem_free(dma->dmat, dma->addr, dma->map);
280 (void)bus_dma_tag_destroy(dma->dmat);
/* mxge_dma_free() -- unload, free, and destroy a mxge_dma_t */
286 mxge_dma_free(mxge_dma_t *dma)
288 bus_dmamap_unload(dma->dmat, dma->map);
289 bus_dmamem_free(dma->dmat, dma->addr, dma->map);
290 (void)bus_dma_tag_destroy(dma->dmat);
294 * The eeprom strings on the lanaiX have the format
/*
 * mxge_parse_strings() -- walk the NUL-separated EEPROM strings and pull
 * out MAC=, PC= (product code) and SN= (serial number) entries.
 */
301 mxge_parse_strings(mxge_softc_t *sc)
303 #define MXGE_NEXT_STRING(p) while(ptr < limit && *ptr++)
308 ptr = sc->eeprom_strings;
309 limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
311 while (ptr < limit && *ptr != '\0') {
312 if (memcmp(ptr, "MAC=", 4) == 0) {
314 sc->mac_addr_string = ptr;
/* parse six colon-separated hex octets; bounds-checked against limit */
315 for (i = 0; i < 6; i++) {
317 if ((ptr + 2) > limit)
319 sc->mac_addr[i] = strtoul(ptr, NULL, 16);
322 } else if (memcmp(ptr, "PC=", 3) == 0) {
324 strncpy(sc->product_code_string, ptr,
325 sizeof (sc->product_code_string) - 1);
326 } else if (memcmp(ptr, "SN=", 3) == 0) {
328 strncpy(sc->serial_number_string, ptr,
329 sizeof (sc->serial_number_string) - 1);
331 MXGE_NEXT_STRING(ptr);
/* reached only on malformed EEPROM contents */
338 device_printf(sc->dev, "failed to parse eeprom_strings\n");
/*
 * mxge_enable_nvidia_ecrc() -- x86/amd64 only.  Locates an upstream
 * Nvidia CK804/MCP55 bridge and enables ECRC generation by mapping the
 * chipset's extended config space directly with pmap_mapdev(), since
 * normal config accesses beyond 0xff are not usable here (see comment
 * in the body).  The non-x86 variant at the bottom only prints a warning.
 */
343 #if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
345 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
348 unsigned long base, off;
350 device_t pdev, mcp55;
351 uint16_t vendor_id, device_id, word;
352 uintptr_t bus, slot, func, ivend, idev;
356 if (!mxge_nvidia_ecrc_enable)
359 pdev = device_get_parent(device_get_parent(sc->dev));
361 device_printf(sc->dev, "could not find parent?\n");
364 vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
365 device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
/* only Nvidia bridges (vendor 0x10de) are handled */
367 if (vendor_id != 0x10de)
372 if (device_id == 0x005d) {
373 /* ck804, base address is magic */
375 } else if (device_id >= 0x0374 && device_id <= 0x378) {
376 /* mcp55, base address stored in chipset */
377 mcp55 = pci_find_bsf(0, 0, 0);
379 0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
380 0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
381 word = pci_read_config(mcp55, 0x90, 2);
382 base = ((unsigned long)word & 0x7ffeU) << 25;
389 Test below is commented because it is believed that doing
390 config read/write beyond 0xff will access the config space
391 for the next larger function. Uncomment this and remove
392 the hacky pmap_mapdev() way of accessing config space when
393 FreeBSD grows support for extended pcie config space access
396 /* See if we can, by some miracle, access the extended
398 val = pci_read_config(pdev, 0x178, 4);
399 if (val != 0xffffffff) {
401 pci_write_config(pdev, 0x178, val, 4);
405 /* Rather than using normal pci config space writes, we must
406 * map the Nvidia config space ourselves. This is because on
407 * opteron/nvidia class machine the 0xe000000 mapping is
408 * handled by the nvidia chipset, that means the internal PCI
409 * device (the on-chip northbridge), or the amd-8131 bridge
410 * and things behind them are not visible by this method.
/* fetch bus/slot/function and IDs of the bridge via bus IVARs */
413 BUS_READ_IVAR(device_get_parent(pdev), pdev,
415 BUS_READ_IVAR(device_get_parent(pdev), pdev,
416 PCI_IVAR_SLOT, &slot);
417 BUS_READ_IVAR(device_get_parent(pdev), pdev,
418 PCI_IVAR_FUNCTION, &func);
419 BUS_READ_IVAR(device_get_parent(pdev), pdev,
420 PCI_IVAR_VENDOR, &ivend);
421 BUS_READ_IVAR(device_get_parent(pdev), pdev,
422 PCI_IVAR_DEVICE, &idev);
/* compute extended-config-space offset for bus/slot/func */
425 + 0x00100000UL * (unsigned long)bus
426 + 0x00001000UL * (unsigned long)(func
429 /* map it into the kernel */
430 va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
434 device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
437 /* get a pointer to the config space mapped into the kernel */
438 cfgptr = va + (off & PAGE_MASK);
440 /* make sure that we can really access it */
441 vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
442 device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
443 if (! (vendor_id == ivend && device_id == idev)) {
444 device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
445 vendor_id, device_id);
446 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
/* read/modify/write the ECRC enable register at offset 0x178 */
450 ptr32 = (uint32_t*)(cfgptr + 0x178);
453 if (val == 0xffffffff) {
454 device_printf(sc->dev, "extended mapping failed\n");
455 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
459 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
461 device_printf(sc->dev,
462 "Enabled ECRC on upstream Nvidia bridge "
464 (int)bus, (int)slot, (int)func);
/* non-x86 stub: ECRC trick is x86-chipset-specific */
469 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
471 device_printf(sc->dev,
472 "Nforce 4 chipset on non-x86/amd64!?!?!\n");
/*
 * mxge_dma_test() -- ask the firmware to run read / write / read+write
 * DMA benchmarks against the dmabench buffer and record the resulting
 * throughputs (MB/s) in sc->read_dma / write_dma / read_write_dma.
 */
479 mxge_dma_test(mxge_softc_t *sc, int test_type)
482 bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
488 /* Run a small DMA test.
489 * The magic multipliers to the length tell the firmware
490 * to do DMA read, write, or read+write tests. The
491 * results are returned in cmd.data0. The upper 16
492 * bits of the return is the number of transfers completed.
493 * The lower 16 bits is the time in 0.5us ticks that the
494 * transfers took to complete.
497 len = sc->tx_boundary;
/* read test: data2 multiplier 0x10000 */
499 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
500 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
501 cmd.data2 = len * 0x10000;
502 status = mxge_send_cmd(sc, test_type, &cmd);
507 sc->read_dma = ((cmd.data0>>16) * len * 2) /
508 (cmd.data0 & 0xffff);
/* write test: data2 multiplier 0x1 */
509 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
510 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
511 cmd.data2 = len * 0x1;
512 status = mxge_send_cmd(sc, test_type, &cmd);
517 sc->write_dma = ((cmd.data0>>16) * len * 2) /
518 (cmd.data0 & 0xffff);
/* combined read+write test: data2 multiplier 0x10001 */
520 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
521 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
522 cmd.data2 = len * 0x10001;
523 status = mxge_send_cmd(sc, test_type, &cmd);
528 sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
529 (cmd.data0 & 0xffff);
/* the unaligned-completion probe is expected to fail; stay quiet then */
532 if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
533 device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
540 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
541 * when the PCI-E Completion packets are aligned on an 8-byte
542 * boundary. Some PCI-E chip sets always align Completion packets; on
543 * the ones that do not, the alignment can be enforced by enabling
544 * ECRC generation (if supported).
546 * When PCI-E Completion packets are not aligned, it is actually more
547 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
549 * If the driver can neither enable ECRC nor verify that it has
550 * already been enabled, then it must use a firmware image which works
551 * around unaligned completion packets (ethp_z8e.dat), and it should
552 * also ensure that it never gives the device a Read-DMA which is
553 * larger than 2KB by setting the tx_boundary to 2KB. If ECRC is
554 * enabled, then the driver should use the aligned (eth_z8e.dat)
555 * firmware image, and set tx_boundary to 4KB.
/*
 * mxge_firmware_probe() -- try the aligned firmware with a 4KB
 * tx_boundary; verify the PCIe Max Read Request size, enable ECRC on
 * Nvidia bridges if possible, then run the unaligned-completion DMA
 * test.  Returns 0 if the aligned firmware is safe to keep.
 */
559 mxge_firmware_probe(mxge_softc_t *sc)
561 device_t dev = sc->dev;
565 sc->tx_boundary = 4096;
567 * Verify the max read request size was set to 4KB
568 * before trying the test with 4KB.
570 if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
571 pectl = pci_read_config(dev, reg + 0x8, 2);
572 if ((pectl & (5 << 12)) != (5 << 12)) {
573 device_printf(dev, "Max Read Req. size != 4k (0x%x\n",
575 sc->tx_boundary = 2048;
580 * load the optimized firmware (which assumes aligned PCIe
581 * completions) in order to see if it works on this host.
583 sc->fw_name = mxge_fw_aligned;
584 status = mxge_load_firmware(sc, 1);
590 * Enable ECRC if possible
592 mxge_enable_nvidia_ecrc(sc);
595 * Run a DMA test which watches for unaligned completions and
596 * aborts on the first one seen.
599 status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
601 return 0; /* keep the aligned firmware */
604 device_printf(dev, "DMA test failed: %d\n", status);
605 if (status == ENOSYS)
606 device_printf(dev, "Falling back to ethp! "
607 "Please install up to date fw\n");
/*
 * mxge_select_firmware() -- choose between the aligned and unaligned
 * firmware images based on forcing tunable, PCIe link width, and the
 * probe above, then load the selected image.
 */
612 mxge_select_firmware(mxge_softc_t *sc)
617 if (mxge_force_firmware != 0) {
618 if (mxge_force_firmware == 1)
623 device_printf(sc->dev,
624 "Assuming %s completions (forced)\n",
625 aligned ? "aligned" : "unaligned");
629 /* if the PCIe link width is 4 or less, we can use the aligned
630 firmware and skip any checks */
631 if (sc->link_width != 0 && sc->link_width <= 4) {
632 device_printf(sc->dev,
633 "PCIe x%d Link, expect reduced performance\n",
639 if (0 == mxge_firmware_probe(sc))
/* fall back: aligned with 4KB boundary, else unaligned with 2KB */
644 sc->fw_name = mxge_fw_aligned;
645 sc->tx_boundary = 4096;
647 sc->fw_name = mxge_fw_unaligned;
648 sc->tx_boundary = 2048;
650 return (mxge_load_firmware(sc, 0));
/*
 * mxge_validate_firmware() -- sanity-check an MCP header: correct type,
 * and a major/minor version matching what this driver was built against.
 * Also stashes the version string for the sysctl tree.
 */
660 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
664 if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
665 device_printf(sc->dev, "Bad firmware type: 0x%x\n",
666 be32toh(hdr->mcp_type));
670 /* save firmware version for sysctl */
671 strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
673 device_printf(sc->dev, "firmware id: %s\n", hdr->version);
675 ksscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
676 &sc->fw_ver_minor, &sc->fw_ver_tiny);
678 if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
679 && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
680 device_printf(sc->dev, "Found firmware version %s\n",
682 device_printf(sc->dev, "Driver needs %d.%d\n",
683 MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
/* zlib allocator callbacks backed by kmalloc/kfree(M_TEMP) */
692 z_alloc(void *nil, u_int items, u_int size)
696 ptr = kmalloc(items * size, M_TEMP, M_NOWAIT);
701 z_free(void *nil, void *ptr)
/*
 * mxge_load_firmware_helper() -- fetch the firmware image via
 * firmware(9), inflate it with zlib (uncompressed size is smuggled in
 * fw->version), validate its MCP header, then PIO-copy it to NIC SRAM
 * in 256-byte chunks.
 */
708 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
711 const mcp_gen_header_t *hdr;
718 fw = firmware_image_load(sc->fw_name, NULL);
720 device_printf(sc->dev, "Could not find firmware image %s\n",
725 /* setup zlib and decompress f/w */
726 bzero(&zs, sizeof (zs));
729 status = inflateInit(&zs);
730 if (status != Z_OK) {
735 /* the uncompressed size is stored as the firmware version,
736 which would otherwise go unused */
737 fw_len = (size_t) fw->version;
738 inflate_buffer = kmalloc(fw_len, M_TEMP, M_NOWAIT);
739 if (inflate_buffer == NULL)
741 zs.avail_in = fw->datasize;
742 zs.next_in = __DECONST(char *, fw->data);
743 zs.avail_out = fw_len;
744 zs.next_out = inflate_buffer;
745 status = inflate(&zs, Z_FINISH);
746 if (status != Z_STREAM_END) {
747 device_printf(sc->dev, "zlib %d\n", status);
749 goto abort_with_buffer;
752 fw_len = fw->fw_imglen;
/* locate and validate the MCP generation header inside the image */
754 hdr_offset = htobe32(*(const uint32_t *)
755 (fw->fw_image + MCP_HEADER_PTR_OFFSET));
756 if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
757 device_printf(sc->dev, "Bad firmware file");
761 hdr = (const void*)(fw->fw_image + hdr_offset);
763 status = mxge_validate_firmware(sc, hdr);
767 /* Copy the inflated firmware to NIC SRAM. */
768 for (i = 0; i < fw_len; i += 256) {
769 mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
771 min(256U, (unsigned)(fw_len - i)));
/* cleanup labels: free the inflate buffer and release the image */
781 kfree(inflate_buffer, M_TEMP);
786 firmware_image_unload(fw);
791 * Enable or disable periodic RDMAs from the host to make certain
792 * chipsets resend dropped PCIe messages
/*
 * mxge_dummy_rdma() -- hand the boot firmware a confirmation address
 * and dummy-RDMA target, then poll for the firmware's 0xffffffff ack.
 * The local buffer is manually aligned to 8 bytes before use.
 */
796 mxge_dummy_rdma(mxge_softc_t *sc, int enable)
799 volatile uint32_t *confirm;
800 volatile char *submit;
801 uint32_t *buf, dma_low, dma_high;
/* align the stack buffer up to an 8-byte boundary */
804 buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
806 /* clear confirmation addr */
807 confirm = (volatile uint32_t *)sc->cmd;
811 /* send an rdma command to the PCIe engine, and wait for the
812 response in the confirmation address. The firmware should
813 write a -1 there to indicate it is alive and well
816 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
817 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
818 buf[0] = htobe32(dma_high); /* confirm addr MSW */
819 buf[1] = htobe32(dma_low); /* confirm addr LSW */
820 buf[2] = htobe32(0xffffffff); /* confirm data */
821 dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
822 dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
823 buf[3] = htobe32(dma_high); /* dummy addr MSW */
824 buf[4] = htobe32(dma_low); /* dummy addr LSW */
825 buf[5] = htobe32(enable); /* enable? */
828 submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
830 mxge_pio_copy(submit, buf, 64);
/* poll (bounded) for the firmware to write -1 to the confirm word */
835 while (*confirm != 0xffffffff && i < 20) {
839 if (*confirm != 0xffffffff) {
840 device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
841 (enable ? "enable" : "disable"), confirm,
/*
 * mxge_send_cmd() -- issue one command to the running firmware through
 * the MXGEFW_ETH_CMD SRAM mailbox and poll the DMA'd response block for
 * up to ~20ms.  Takes the ifnet serializer when it exists (it may be
 * NULL during attach).
 */
848 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
851 char buf_bytes[sizeof(*buf) + 8];
852 volatile mcp_cmd_response_t *response = sc->cmd;
853 volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
854 uint32_t dma_low, dma_high;
855 int err, sleep_total = 0;
857 /* ensure buf is aligned to 8 bytes */
858 buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
/* marshal the command into big-endian wire format */
860 buf->data0 = htobe32(data->data0);
861 buf->data1 = htobe32(data->data1);
862 buf->data2 = htobe32(data->data2);
863 buf->cmd = htobe32(cmd);
864 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
865 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
867 buf->response_addr.low = htobe32(dma_low);
868 buf->response_addr.high = htobe32(dma_high);
871 * We may be called during attach, before if_serializer is available.
872 * This is not a fast path, just check for NULL
874 if (sc->ifp->if_serializer)
875 lwkt_serialize_enter(sc->ifp->if_serializer);
/* 0xffffffff marks "no response yet"; firmware overwrites it */
877 response->result = 0xffffffff;
879 mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
881 /* wait up to 20ms */
883 for (sleep_total = 0; sleep_total < 20; sleep_total++) {
884 bus_dmamap_sync(sc->cmd_dma.dmat,
885 sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
887 switch (be32toh(response->result)) {
889 data->data0 = be32toh(response->data);
/* map firmware status codes to errno-style returns (bodies elided) */
895 case MXGEFW_CMD_UNKNOWN:
898 case MXGEFW_CMD_ERROR_UNALIGNED:
901 case MXGEFW_CMD_ERROR_BUSY:
905 device_printf(sc->dev,
907 "failed, result = %d\n",
908 cmd, be32toh(response->result));
/* timeout path: report and drop the serializer if we took it */
916 device_printf(sc->dev, "mxge: command %d timed out"
918 cmd, be32toh(response->result));
919 if (sc->ifp->if_serializer)
920 lwkt_serialize_exit(sc->ifp->if_serializer);
/*
 * mxge_adopt_running_firmware() -- validate the firmware already running
 * on the NIC (e.g. loaded by a previous driver) by copying its MCP
 * header out of SRAM, and note the known 1.4.4-1.4.11 rx-filter bug.
 */
925 mxge_adopt_running_firmware(mxge_softc_t *sc)
927 struct mcp_gen_header *hdr;
928 const size_t bytes = sizeof (struct mcp_gen_header);
932 /* find running firmware header */
933 hdr_offset = htobe32(*(volatile uint32_t *)
934 (sc->sram + MCP_HEADER_PTR_OFFSET));
936 if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
937 device_printf(sc->dev,
938 "Running firmware has bad header offset (%d)\n",
943 /* copy header of running firmware from SRAM to host memory to
944 * validate firmware */
945 hdr = kmalloc(bytes, M_DEVBUF, M_NOWAIT);
947 device_printf(sc->dev, "could not kmalloc firmware hdr\n");
950 bus_space_read_region_1(rman_get_bustag(sc->mem_res),
951 rman_get_bushandle(sc->mem_res),
952 hdr_offset, (char *)hdr, bytes);
953 status = mxge_validate_firmware(sc, hdr);
954 kfree(hdr, M_DEVBUF);
957 * check to see if adopted firmware has bug where adopting
958 * it will cause broadcasts to be filtered unless the NIC
959 * is kept in ALLMULTI mode
961 if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
962 sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
963 sc->adopted_rx_filter_bug = 1;
964 device_printf(sc->dev, "Adopting fw %d.%d.%d: "
965 "working around rx filter bug\n",
966 sc->fw_ver_major, sc->fw_ver_minor,
/*
 * mxge_load_firmware() -- load (or, with 'adopt', fall back to adopting)
 * firmware.  On a fresh load it hands the image off to the bootstrap MCP
 * at MXGEFW_BOOT_HANDOFF and polls the confirmation word, skipping the
 * first 8 protected bytes of SRAM (see FIX comment below).
 */
975 mxge_load_firmware(mxge_softc_t *sc, int adopt)
977 volatile uint32_t *confirm;
978 volatile char *submit;
980 uint32_t *buf, size, dma_low, dma_high;
/* 8-byte-align the stack command buffer */
983 buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
985 size = sc->sram_size;
986 status = mxge_load_firmware_helper(sc, &size);
990 /* Try to use the currently running firmware, if
992 status = mxge_adopt_running_firmware(sc);
994 device_printf(sc->dev,
995 "failed to adopt running firmware\n");
998 device_printf(sc->dev,
999 "Successfully adopted running firmware\n");
1000 if (sc->tx_boundary == 4096) {
1001 device_printf(sc->dev,
1002 "Using firmware currently running on NIC"
1004 device_printf(sc->dev,
1005 "performance consider loading optimized "
/* when adopting, report ourselves as the unaligned/2KB configuration */
1008 sc->fw_name = mxge_fw_unaligned;
1009 sc->tx_boundary = 2048;
1012 /* clear confirmation addr */
1013 confirm = (volatile uint32_t *)sc->cmd;
1016 /* send a reload command to the bootstrap MCP, and wait for the
1017 response in the confirmation address. The firmware should
1018 write a -1 there to indicate it is alive and well
1021 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
1022 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
1024 buf[0] = htobe32(dma_high); /* confirm addr MSW */
1025 buf[1] = htobe32(dma_low); /* confirm addr LSW */
1026 buf[2] = htobe32(0xffffffff); /* confirm data */
1028 /* FIX: All newest firmware should un-protect the bottom of
1029 the sram before handoff. However, the very first interfaces
1030 do not. Therefore the handoff copy must skip the first 8 bytes
1032 /* where the code starts*/
1033 buf[3] = htobe32(MXGE_FW_OFFSET + 8);
1034 buf[4] = htobe32(size - 8); /* length of code */
1035 buf[5] = htobe32(8); /* where to copy to */
1036 buf[6] = htobe32(0); /* where to jump to */
1038 submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
1039 mxge_pio_copy(submit, buf, 64);
/* bounded poll for the firmware's -1 ack in the confirm word */
1044 while (*confirm != 0xffffffff && i < 20) {
1047 bus_dmamap_sync(sc->cmd_dma.dmat,
1048 sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
1050 if (*confirm != 0xffffffff) {
1051 device_printf(sc->dev,"handoff failed (%p = 0x%x)",
/*
 * mxge_update_mac_address() -- pack the 6-byte station address into
 * cmd.data0/data1 and push it to the firmware.
 */
1060 mxge_update_mac_address(mxge_softc_t *sc)
1063 uint8_t *addr = sc->mac_addr;
1067 cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1068 | (addr[2] << 8) | addr[3]);
1070 cmd.data1 = ((addr[4] << 8) | (addr[5]));
1072 status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
/* mxge_change_pause() -- enable/disable firmware flow control */
1077 mxge_change_pause(mxge_softc_t *sc, int pause)
1083 status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1086 status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1090 device_printf(sc->dev, "Failed to set flow control mode\n");
/* mxge_change_promisc() -- set promiscuous mode (honors always_promisc) */
1098 mxge_change_promisc(mxge_softc_t *sc, int promisc)
1103 if (mxge_always_promisc)
1107 status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1110 status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1114 device_printf(sc->dev, "Failed to set promisc mode\n");
/*
 * mxge_set_multicast_list() -- reprogram the firmware multicast filter:
 * go ALLMULTI, flush all groups, then re-join each address on the ifnet
 * multicast list under the serializer, finally re-enable filtering.
 * Bails out (leaving ALLMULTI on) on any firmware error or when the
 * adopted-firmware rx filter bug is present.
 */
1119 mxge_set_multicast_list(mxge_softc_t *sc)
1122 struct ifmultiaddr *ifma;
1123 struct ifnet *ifp = sc->ifp;
1126 /* This firmware is known to not support multicast */
1127 if (!sc->fw_multicast_support)
1130 /* Disable multicast filtering while we play with the lists*/
1131 err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1133 device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
1134 " error status: %d\n", err);
1138 if (sc->adopted_rx_filter_bug)
1141 if (ifp->if_flags & IFF_ALLMULTI)
1142 /* request to disable multicast filtering, so quit here */
1145 /* Flush all the filters */
1147 err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1149 device_printf(sc->dev,
1150 "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
1151 ", error status: %d\n", err);
1155 /* Walk the multicast list, and add each address */
1157 lwkt_serialize_enter(ifp->if_serializer);
1158 LIST_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1159 if (ifma->ifma_addr->sa_family != AF_LINK)
/* split the 6-byte LLADDR across data0 (4 bytes) / data1 (2 bytes) */
1161 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1163 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
1165 cmd.data0 = htonl(cmd.data0);
1166 cmd.data1 = htonl(cmd.data1);
1167 err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1169 device_printf(sc->dev, "Failed "
1170 "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
1172 /* abort, leaving multicast filtering off */
1173 lwkt_serialize_exit(ifp->if_serializer);
1177 lwkt_serialize_exit(ifp->if_serializer);
1178 /* Enable multicast filtering */
1179 err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1181 device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
1182 ", error status: %d\n", err);
/*
 * mxge_max_mtu() -- report the largest usable MTU: full MXGEFW_MAX_MTU
 * when page-sized jumbo clusters suffice or the firmware accepts
 * "always use n big buffers", otherwise limited by MJUMPAGESIZE.
 */
1187 mxge_max_mtu(mxge_softc_t *sc)
1192 if (MJUMPAGESIZE - MXGEFW_PAD > MXGEFW_MAX_MTU)
1193 return MXGEFW_MAX_MTU - MXGEFW_PAD;
1195 /* try to set nbufs to see if it we can
1196 use virtually contiguous jumbos */
1198 status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1201 return MXGEFW_MAX_MTU - MXGEFW_PAD;
1203 /* otherwise, we're limited to MJUMPAGESIZE */
1204 return MJUMPAGESIZE - MXGEFW_PAD;
/*
 * mxge_reset() -- full firmware reset and reinitialization: reset cmd,
 * dummy-RDMA enable, intrq sizing, (multi-)slice/RSS setup, interrupt
 * queue DMA exchange, coalescing/claim/deassert pointer discovery,
 * DMA benchmark, per-slice state zeroing, and reapplication of MAC
 * address, promisc, pause and multicast settings.
 */
1208 mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1210 struct mxge_slice_state *ss;
1211 mxge_rx_done_t *rx_done;
1212 volatile uint32_t *irq_claim;
1216 /* try to send a reset command to the card to see if it
1218 memset(&cmd, 0, sizeof (cmd));
1219 status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1221 device_printf(sc->dev, "failed reset\n");
1225 mxge_dummy_rdma(sc, 1);
1228 /* set the intrq size */
1229 cmd.data0 = sc->rx_ring_size;
1230 status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1233 * Even though we already know how many slices are supported
1234 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
1235 * has magic side effects, and must be called after a reset.
1236 * It must be called prior to calling any RSS related cmds,
1237 * including assigning an interrupt queue for anything but
1238 * slice 0. It must also be called *after*
1239 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1240 * the firmware to compute offsets.
1243 if (sc->num_slices > 1) {
1244 /* ask the maximum number of slices it supports */
1245 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
1248 device_printf(sc->dev,
1249 "failed to get number of slices\n");
1253 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1254 * to setting up the interrupt queue DMA
1256 cmd.data0 = sc->num_slices;
1257 cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
1258 #ifdef IFNET_BUF_RING
1259 cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1261 status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
1264 device_printf(sc->dev,
1265 "failed to set number of slices\n");
1271 if (interrupts_setup) {
1272 /* Now exchange information about interrupts */
1273 for (slice = 0; slice < sc->num_slices; slice++) {
1274 rx_done = &sc->ss[slice].rx_done;
1275 memset(rx_done->entry, 0, sc->rx_ring_size);
1276 cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
1277 cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
1279 status |= mxge_send_cmd(sc,
1280 MXGEFW_CMD_SET_INTRQ_DMA,
/* discover SRAM offsets for coalescing delay, irq claim/deassert */
1285 status |= mxge_send_cmd(sc,
1286 MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
1289 sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1291 status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1292 irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1295 status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
1297 sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1299 device_printf(sc->dev, "failed set interrupt parameters\n");
1304 *sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1307 /* run a DMA benchmark */
1308 (void) mxge_dma_test(sc, MXGEFW_DMA_TEST);
1310 for (slice = 0; slice < sc->num_slices; slice++) {
1311 ss = &sc->ss[slice];
/* each slice claims two consecutive irq-claim words */
1313 ss->irq_claim = irq_claim + (2 * slice);
1314 /* reset mcp/driver shared state back to 0 */
1315 ss->rx_done.idx = 0;
1316 ss->rx_done.cnt = 0;
1319 ss->tx.pkt_done = 0;
1320 ss->tx.queue_active = 0;
1321 ss->tx.activate = 0;
1322 ss->tx.deactivate = 0;
1327 ss->rx_small.cnt = 0;
1328 ss->lro_bad_csum = 0;
1330 ss->lro_flushed = 0;
1331 if (ss->fw_stats != NULL) {
1332 ss->fw_stats->valid = 0;
1333 ss->fw_stats->send_done_count = 0;
1336 sc->rdma_tags_available = 15;
/* reapply the host-side configuration to the freshly reset firmware */
1337 status = mxge_update_mac_address(sc);
1338 mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
1339 mxge_change_pause(sc, sc->pause);
1340 mxge_set_multicast_list(sc);
/*
 * mxge_change_intr_coal() -- sysctl handler: validate the new coalescing
 * delay (non-zero, <= 1,000,000) and write it to the firmware pointer
 * under the serializer.
 */
1345 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1348 unsigned int intr_coal_delay;
1352 intr_coal_delay = sc->intr_coal_delay;
1353 err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1357 if (intr_coal_delay == sc->intr_coal_delay)
1360 if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1363 lwkt_serialize_enter(sc->ifp->if_serializer);
1364 *sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1365 sc->intr_coal_delay = intr_coal_delay;
1367 lwkt_serialize_exit(sc->ifp->if_serializer);
/* sysctl handler: toggle firmware flow control via mxge_change_pause() */
1372 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1375 unsigned int enabled;
1379 enabled = sc->pause;
1380 err = sysctl_handle_int(oidp, &enabled, arg2, req);
1384 if (enabled == sc->pause)
1387 lwkt_serialize_enter(sc->ifp->if_serializer);
1388 err = mxge_change_pause(sc, enabled);
1389 lwkt_serialize_exit(sc->ifp->if_serializer);
/*
 * mxge_change_lro_locked() -- update lro_cnt / IFCAP_LRO and, if the
 * interface is running, reopen it so the change takes effect.
 * Caller is expected to hold the serializer -- see mxge_change_lro().
 */
1394 mxge_change_lro_locked(mxge_softc_t *sc, int lro_cnt)
1401 ifp->if_capenable &= ~IFCAP_LRO;
1403 ifp->if_capenable |= IFCAP_LRO;
1404 sc->lro_cnt = lro_cnt;
1405 if (ifp->if_flags & IFF_RUNNING) {
1407 err = mxge_open(sc);
/* sysctl handler wrapping mxge_change_lro_locked() with the serializer */
1413 mxge_change_lro(SYSCTL_HANDLER_ARGS)
1416 unsigned int lro_cnt;
1420 lro_cnt = sc->lro_cnt;
1421 err = sysctl_handle_int(oidp, &lro_cnt, arg2, req);
1425 if (lro_cnt == sc->lro_cnt)
1431 lwkt_serialize_enter(sc->ifp->if_serializer);
1432 err = mxge_change_lro_locked(sc, lro_cnt);
1433 lwkt_serialize_exit(sc->ifp->if_serializer);
/* sysctl handler: present a big-endian 32-bit counter in host order */
1438 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1444 arg2 = be32toh(*(int *)arg1);
1446 err = sysctl_handle_int(oidp, arg1, arg2, req);
/*
 * mxge_rem_sysctls() -- tear down the main and per-slice sysctl trees;
 * safe to call when the slice tree was never created.
 */
1452 mxge_rem_sysctls(mxge_softc_t *sc)
1454 struct mxge_slice_state *ss;
1457 if (sc->sysctl_tree != NULL) {
1458 sysctl_ctx_free(&sc->sysctl_ctx);
1459 sc->sysctl_tree = NULL;
1461 if (sc->slice_sysctl_tree == NULL)
1464 for (slice = 0; slice < sc->num_slices; slice++) {
1465 ss = &sc->ss[slice];
1466 if (ss == NULL || ss->sysctl_tree == NULL)
1468 sysctl_ctx_free(&ss->sysctl_ctx);
1469 ss->sysctl_tree = NULL;
1471 sysctl_ctx_free(&sc->slice_sysctl_ctx);
1472 sc->slice_sysctl_tree = NULL;
/*
 * Create the driver's sysctl tree under hw.<nameunit>:
 *   - static device info (firmware version, serial number, link width, ...)
 *   - performance tunables (intr coalescing, flow control, lro count)
 *   - read-only firmware statistics (exported through mxge_handle_be32
 *     since the stats block is in network byte order)
 *   - a "slice" subtree with per-slice rx/tx debug counters.
 * NOTE(review): lines are elided in this view; several SYSCTL_ADD_*
 * calls are missing their name/description arguments here.
 */
1476 mxge_add_sysctls(mxge_softc_t *sc)
1478 struct sysctl_ctx_list *ctx;
1479 struct sysctl_oid_list *children;
1481 struct mxge_slice_state *ss;
1485 ctx = &sc->sysctl_ctx;
1486 sysctl_ctx_init(ctx);
1487 sc->sysctl_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_STATIC_CHILDREN(_hw),
1489 device_get_nameunit(sc->dev),
1491 if (sc->sysctl_tree == NULL) {
1492 device_printf(sc->dev, "can't add sysctl node\n");
1496 children = SYSCTL_CHILDREN(sc->sysctl_tree);
1497 fw = sc->ss[0].fw_stats;
1499 /* random information */
1500 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1502 CTLFLAG_RD, &sc->fw_version,
1503 0, "firmware version");
1504 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1506 CTLFLAG_RD, &sc->serial_number_string,
1507 0, "serial number");
1508 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1510 CTLFLAG_RD, &sc->product_code_string,
1512 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1514 CTLFLAG_RD, &sc->link_width,
1516 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1518 CTLFLAG_RD, &sc->tx_boundary,
1520 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1522 CTLFLAG_RD, &sc->wc,
1523 0, "write combining PIO?");
1524 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1526 CTLFLAG_RD, &sc->read_dma,
1527 0, "DMA Read speed in MB/s");
1528 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1530 CTLFLAG_RD, &sc->write_dma,
1531 0, "DMA Write speed in MB/s");
1532 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1533 "read_write_dma_MBs",
1534 CTLFLAG_RD, &sc->read_write_dma,
1535 0, "DMA concurrent Read/Write speed in MB/s");
1538 /* performance related tunables */
1539 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1541 CTLTYPE_INT|CTLFLAG_RW, sc,
1542 0, mxge_change_intr_coal,
1543 "I", "interrupt coalescing delay in usecs");
1545 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1546 "flow_control_enabled",
1547 CTLTYPE_INT|CTLFLAG_RW, sc,
1548 0, mxge_change_flow_control,
/* NOTE(review): description below looks copy/pasted from the
   intr_coal entry; it describes flow control, not coalescing */
1549 "I", "interrupt coalescing delay in usecs");
1551 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1553 CTLFLAG_RW, &mxge_deassert_wait,
1554 0, "Wait for IRQ line to go low in ihandler");
1556 /* stats block from firmware is in network byte order.
1558 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1560 CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1561 0, mxge_handle_be32,
1563 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1564 "rdma_tags_available",
1565 CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1566 0, mxge_handle_be32,
1567 "I", "rdma_tags_available");
1568 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1569 "dropped_bad_crc32",
1570 CTLTYPE_INT|CTLFLAG_RD,
1571 &fw->dropped_bad_crc32,
1572 0, mxge_handle_be32,
1573 "I", "dropped_bad_crc32");
1574 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1576 CTLTYPE_INT|CTLFLAG_RD,
1577 &fw->dropped_bad_phy,
1578 0, mxge_handle_be32,
1579 "I", "dropped_bad_phy");
1580 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1581 "dropped_link_error_or_filtered",
1582 CTLTYPE_INT|CTLFLAG_RD,
1583 &fw->dropped_link_error_or_filtered,
1584 0, mxge_handle_be32,
1585 "I", "dropped_link_error_or_filtered");
1586 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1587 "dropped_link_overflow",
1588 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1589 0, mxge_handle_be32,
1590 "I", "dropped_link_overflow");
1591 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1592 "dropped_multicast_filtered",
1593 CTLTYPE_INT|CTLFLAG_RD,
1594 &fw->dropped_multicast_filtered,
1595 0, mxge_handle_be32,
1596 "I", "dropped_multicast_filtered");
1597 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1598 "dropped_no_big_buffer",
1599 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1600 0, mxge_handle_be32,
1601 "I", "dropped_no_big_buffer");
1602 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1603 "dropped_no_small_buffer",
1604 CTLTYPE_INT|CTLFLAG_RD,
1605 &fw->dropped_no_small_buffer,
1606 0, mxge_handle_be32,
1607 "I", "dropped_no_small_buffer");
1608 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1610 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1611 0, mxge_handle_be32,
1612 "I", "dropped_overrun");
1613 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1615 CTLTYPE_INT|CTLFLAG_RD,
1617 0, mxge_handle_be32,
1618 "I", "dropped_pause");
1619 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1621 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1622 0, mxge_handle_be32,
1623 "I", "dropped_runt");
1625 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1626 "dropped_unicast_filtered",
1627 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
1628 0, mxge_handle_be32,
1629 "I", "dropped_unicast_filtered");
1631 /* verbose printing? */
1632 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1634 CTLFLAG_RW, &mxge_verbose,
1635 0, "verbose printing");
1638 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1640 CTLTYPE_INT|CTLFLAG_RW, sc,
1642 "I", "number of lro merge queues");
1645 /* add counters exported for debugging from all slices */
1646 sysctl_ctx_init(&sc->slice_sysctl_ctx);
1647 sc->slice_sysctl_tree =
1648 SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1649 "slice", CTLFLAG_RD, 0, "");
1651 for (slice = 0; slice < sc->num_slices; slice++) {
1652 ss = &sc->ss[slice];
1653 sysctl_ctx_init(&ss->sysctl_ctx);
1654 ctx = &ss->sysctl_ctx;
1655 children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
/* slice index becomes the sub-node name, e.g. slice.0, slice.1, ... */
1656 ksprintf(slice_num, "%d", slice);
1658 SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1660 children = SYSCTL_CHILDREN(ss->sysctl_tree);
1661 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1663 CTLFLAG_RD, &ss->rx_small.cnt,
1665 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1667 CTLFLAG_RD, &ss->rx_big.cnt,
1669 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1670 "lro_flushed", CTLFLAG_RD, &ss->lro_flushed,
1671 0, "number of lro merge queues flushed");
1673 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1674 "lro_queued", CTLFLAG_RD, &ss->lro_queued,
1675 0, "number of frames appended to lro merge"
1678 #ifndef IFNET_BUF_RING
1679 /* only transmit from slice 0 for now */
1683 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1685 CTLFLAG_RD, &ss->tx.req,
1688 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1690 CTLFLAG_RD, &ss->tx.done,
1692 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1694 CTLFLAG_RD, &ss->tx.pkt_done,
1696 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1698 CTLFLAG_RD, &ss->tx.stall,
1700 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1702 CTLFLAG_RD, &ss->tx.wake,
1704 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1706 CTLFLAG_RD, &ss->tx.defrag,
1708 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1710 CTLFLAG_RD, &ss->tx.queue_active,
1711 0, "tx_queue_active");
1712 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1714 CTLFLAG_RD, &ss->tx.activate,
1716 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1718 CTLFLAG_RD, &ss->tx.deactivate,
1719 0, "tx_deactivate");
1723 /* copy an array of mcp_kreq_ether_send_t's to the mcp. Copy
1724 backwards one at a time and handle ring wraps */
/*
 * Slow-path submit used when the request span wraps the tx ring:
 * copy requests to the NIC's lanai window one at a time, from the
 * last request down to (but not including) the first; the caller
 * submits request 0 afterwards to make the chain valid atomically.
 * NOTE(review): loop header and decrement lines are elided here.
 */
1727 mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1728 mcp_kreq_ether_send_t *src, int cnt)
1730 int idx, starting_slot;
1731 starting_slot = tx->req;
/* mask wraps the slot index around the ring */
1734 idx = (starting_slot + cnt) & tx->mask;
1735 mxge_pio_copy(&tx->lanai[idx],
1736 &src[cnt], sizeof(*src));
1742 * copy an array of mcp_kreq_ether_send_t's to the mcp. Copy
1743 * at most 32 bytes at a time, so as to avoid involving the software
1744 * pio handler in the nic. We re-write the first segment's flags
1745 * to mark them valid only after writing the entire chain
1749 mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
1754 volatile uint32_t *dst_ints;
1755 mcp_kreq_ether_send_t *srcp;
1756 volatile mcp_kreq_ether_send_t *dstp, *dst;
1759 idx = tx->req & tx->mask;
/* stash the valid flags; first slot is written invalid, fixed last */
1761 last_flags = src->flags;
1764 dst = dstp = &tx->lanai[idx];
/* fast path: chain fits without wrapping the ring */
1767 if ((idx + cnt) < tx->mask) {
1768 for (i = 0; i < (cnt - 1); i += 2) {
1769 mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1770 wmb(); /* force write every 32 bytes */
1775 /* submit all but the first request, and ensure
1776 that it is submitted below */
1777 mxge_submit_req_backwards(tx, src, cnt);
1781 /* submit the first request */
1782 mxge_pio_copy(dstp, srcp, sizeof(*src));
1783 wmb(); /* barrier before setting valid flag */
1786 /* re-write the last 32-bits with the valid flags */
1787 src->flags = last_flags;
1788 src_ints = (uint32_t *)src;
1790 dst_ints = (volatile uint32_t *)dst;
/* this single 32-bit store makes the whole chain visible to the NIC */
1792 *dst_ints = *src_ints;
/*
 * Build and submit the send-request chain for a TSO packet.
 * Parses the IP/TCP headers (copying them to a scratch buffer if they
 * are not contiguous in the first mbuf), then walks the busdma segment
 * list, breaking segments at MSS boundaries ("cuts") and back-filling
 * each request's rdma_count once the count is known.  Negative cum_len
 * means we are still inside the protocol headers.
 * NOTE(review): many interior lines (declarations, loop bodies, the
 * drop/error epilogue) are elided in this view.
 */
1800 mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1801 int busdma_seg_cnt, int ip_off)
1804 mcp_kreq_ether_send_t *req;
1805 bus_dma_segment_t *seg;
1808 uint32_t low, high_swapped;
1809 int len, seglen, cum_len, cum_len_next;
1810 int next_is_first, chop, cnt, rdma_count, small;
1811 uint16_t pseudo_hdr_offset, cksum_offset, mss;
1812 uint8_t flags, flags_next;
1815 mss = m->m_pkthdr.tso_segsz;
1817 /* negative cum_len signifies to the
1818 * send loop that we are still in the
1819 * header portion of the TSO packet.
1822 /* ensure we have the ethernet, IP and TCP
1823 header together in the first mbuf, copy
1824 it to a scratch buffer if not */
1825 if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
1826 m_copydata(m, 0, ip_off + sizeof (*ip),
1828 ip = (struct ip *)(ss->scratch + ip_off);
1830 ip = (struct ip *)(mtod(m, char *) + ip_off);
1832 if (__predict_false(m->m_len < ip_off + (ip->ip_hl << 2)
1834 m_copydata(m, 0, ip_off + (ip->ip_hl << 2)
1835 + sizeof (*tcp), ss->scratch);
1836 ip = (struct ip *)(mtod(m, char *) + ip_off);
1839 tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
1840 cum_len = -(ip_off + ((ip->ip_hl + tcp->th_off) << 2));
1842 /* TSO implies checksum offload on this hardware */
1843 cksum_offset = ip_off + (ip->ip_hl << 2);
1844 flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1847 /* for TSO, pseudo_hdr_offset holds mss.
1848 * The firmware figures out where to put
1849 * the checksum by parsing the header. */
1850 pseudo_hdr_offset = htobe16(mss);
1857 /* "rdma_count" is the number of RDMAs belonging to the
1858 * current packet BEFORE the current send request. For
1859 * non-TSO packets, this is equal to "count".
1860 * For TSO packets, rdma_count needs to be reset
1861 * to 0 after a segment cut.
1863 * The rdma_count field of the send request is
1864 * the number of RDMAs of the packet starting at
1865 * that request. For TSO send requests with one ore more cuts
1866 * in the middle, this is the number of RDMAs starting
1867 * after the last cut in the request. All previous
1868 * segments before the last cut implicitly have 1 RDMA.
1870 * Since the number of RDMAs is not known beforehand,
1871 * it must be filled-in retroactively - after each
1872 * segmentation cut or at the end of the entire packet.
1875 while (busdma_seg_cnt) {
1876 /* Break the busdma segment up into pieces*/
1877 low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1878 high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1882 flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1884 cum_len_next = cum_len + seglen;
/* retroactively fill in the rdma_count of the previous run */
1885 (req-rdma_count)->rdma_count = rdma_count + 1;
1886 if (__predict_true(cum_len >= 0)) {
/* payload region: cut at each MSS boundary */
1888 chop = (cum_len_next > mss);
1889 cum_len_next = cum_len_next % mss;
1890 next_is_first = (cum_len_next == 0);
1891 flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1892 flags_next |= next_is_first *
/* branchless flag arithmetic: chop/next_is_first are 0 or 1 */
1894 rdma_count |= -(chop | next_is_first);
1895 rdma_count += chop & !next_is_first;
1896 } else if (cum_len_next >= 0) {
/* header/payload boundary crossed inside this piece */
1901 small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1902 flags_next = MXGEFW_FLAGS_TSO_PLD |
1903 MXGEFW_FLAGS_FIRST |
1904 (small * MXGEFW_FLAGS_SMALL);
1907 req->addr_high = high_swapped;
1908 req->addr_low = htobe32(low);
1909 req->pseudo_hdr_offset = pseudo_hdr_offset;
1911 req->rdma_count = 1;
1912 req->length = htobe16(seglen);
1913 req->cksum_offset = cksum_offset;
1914 req->flags = flags | ((cum_len & 1) *
1915 MXGEFW_FLAGS_ALIGN_ODD);
1918 cum_len = cum_len_next;
1923 if (__predict_false(cksum_offset > seglen))
1924 cksum_offset -= seglen;
/* guard against exceeding the per-packet descriptor budget */
1927 if (__predict_false(cnt > tx->max_desc))
1933 (req-rdma_count)->rdma_count = rdma_count;
/* walk backwards marking TSO_LAST on the trailing requests */
1937 req->flags |= MXGEFW_FLAGS_TSO_LAST;
1938 } while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1940 tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1941 mxge_submit_req(tx, tx->req_list, cnt);
1942 #ifdef IFNET_BUF_RING
1943 if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1944 /* tell the NIC to start polling this slice */
1946 tx->queue_active = 1;
1954 bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1958 kprintf("tx->max_desc exceeded via TSO!\n");
1959 kprintf("mss = %d, %ld, %d!\n", mss,
1960 (long)seg - (long)tx->seg_list, tx->max_desc);
1969 #ifdef MXGE_NEW_VLAN_API
1971 * We reproduce the software vlan tag insertion from
1972 * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
1973 * vlan tag insertion. We need to advertise this in order to have the
1974 * vlan interface respect our csum offload flags.
1976 static struct mbuf *
1977 mxge_vlan_tag_insert(struct mbuf *m)
1979 struct ether_vlan_header *evl;
1981 M_PREPEND(m, EVL_ENCAPLEN, MB_DONTWAIT);
1982 if (__predict_false(m == NULL))
1984 if (m->m_len < sizeof(*evl)) {
1985 m = m_pullup(m, sizeof(*evl));
1986 if (__predict_false(m == NULL))
1990 * Transform the Ethernet header into an Ethernet header
1991 * with 802.1Q encapsulation.
1993 evl = mtod(m, struct ether_vlan_header *);
1994 bcopy((char *)evl + EVL_ENCAPLEN,
1995 (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
1996 evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
1997 evl->evl_tag = htons(m->m_pkthdr.ether_vlantag);
1998 m->m_flags &= ~M_VLANTAG;
2001 #endif /* MXGE_NEW_VLAN_API */
/*
 * Encapsulate one mbuf chain for transmission on a slice:
 * software-insert any VLAN tag, DMA-map the chain (defragmenting once
 * on EFBIG), hand TSO packets to mxge_encap_tso(), otherwise build a
 * request per DMA segment with optional checksum offload, pad runts to
 * 60 bytes with the zero page, and submit via mxge_submit_req().
 * NOTE(review): interior lines (drop paths, counters, some
 * req++/seg++ advances) are elided in this view.
 */
2004 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
2007 mcp_kreq_ether_send_t *req;
2008 bus_dma_segment_t *seg;
2013 int cnt, cum_len, err, i, idx, odd_flag, ip_off;
2014 uint16_t pseudo_hdr_offset;
2015 uint8_t flags, cksum_offset;
2022 ip_off = sizeof (struct ether_header);
2023 #ifdef MXGE_NEW_VLAN_API
2024 if (m->m_flags & M_VLANTAG) {
2025 m = mxge_vlan_tag_insert(m);
2026 if (__predict_false(m == NULL))
/* IP header moves 4 bytes deeper once the tag is inline */
2028 ip_off += EVL_ENCAPLEN;
2031 /* (try to) map the frame for DMA */
2032 idx = tx->req & tx->mask;
2033 err = bus_dmamap_load_mbuf_segment(tx->dmat, tx->info[idx].map,
2034 m, tx->seg_list, 1, &cnt,
2036 if (__predict_false(err == EFBIG)) {
2037 /* Too many segments in the chain. Try
2039 m_tmp = m_defrag(m, M_NOWAIT);
2040 if (m_tmp == NULL) {
2045 err = bus_dmamap_load_mbuf_segment(tx->dmat,
2047 m, tx->seg_list, 1, &cnt,
2050 if (__predict_false(err != 0)) {
2051 device_printf(sc->dev, "bus_dmamap_load_mbuf_segment returned %d"
2052 " packet len = %d\n", err, m->m_pkthdr.len);
2055 bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2056 BUS_DMASYNC_PREWRITE);
2057 tx->info[idx].m = m;
2060 /* TSO is different enough, we handle it in another routine */
2061 if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2062 mxge_encap_tso(ss, m, cnt, ip_off);
2069 pseudo_hdr_offset = 0;
2070 flags = MXGEFW_FLAGS_NO_TSO;
2072 /* checksum offloading? */
2073 if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
2074 /* ensure ip header is in first mbuf, copy
2075 it to a scratch buffer if not */
2076 if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
2077 m_copydata(m, 0, ip_off + sizeof (*ip),
2079 ip = (struct ip *)(ss->scratch + ip_off);
2081 ip = (struct ip *)(mtod(m, char *) + ip_off);
2083 cksum_offset = ip_off + (ip->ip_hl << 2);
2084 pseudo_hdr_offset = cksum_offset + m->m_pkthdr.csum_data;
2085 pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2086 req->cksum_offset = cksum_offset;
2087 flags |= MXGEFW_FLAGS_CKSUM;
2088 odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2092 if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2093 flags |= MXGEFW_FLAGS_SMALL;
2095 /* convert segments into a request list */
2098 req->flags = MXGEFW_FLAGS_FIRST;
2099 for (i = 0; i < cnt; i++) {
2101 htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2103 htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2104 req->length = htobe16(seg->ds_len);
2105 req->cksum_offset = cksum_offset;
/* checksum start only applies to the segment containing it */
2106 if (cksum_offset > seg->ds_len)
2107 cksum_offset -= seg->ds_len;
2110 req->pseudo_hdr_offset = pseudo_hdr_offset;
2111 req->pad = 0; /* complete solid 16-byte block */
2112 req->rdma_count = 1;
2113 req->flags |= flags | ((cum_len & 1) * odd_flag);
2114 cum_len += seg->ds_len;
2120 /* pad runts to 60 bytes */
/* the zeropad DMA page supplies the padding bytes */
2124 htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2126 htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2127 req->length = htobe16(60 - cum_len);
2128 req->cksum_offset = 0;
2129 req->pseudo_hdr_offset = pseudo_hdr_offset;
2130 req->pad = 0; /* complete solid 16-byte block */
2131 req->rdma_count = 1;
2132 req->flags |= flags | ((cum_len & 1) * odd_flag);
2136 tx->req_list[0].rdma_count = cnt;
2138 /* print what the firmware will see */
2139 for (i = 0; i < cnt; i++) {
2140 kprintf("%d: addr: 0x%x 0x%x len:%d pso%d,"
2141 "cso:%d, flags:0x%x, rdma:%d\n",
2142 i, (int)ntohl(tx->req_list[i].addr_high),
2143 (int)ntohl(tx->req_list[i].addr_low),
2144 (int)ntohs(tx->req_list[i].length),
2145 (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2146 tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2147 tx->req_list[i].rdma_count);
2149 kprintf("--------------\n");
2151 tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2152 mxge_submit_req(tx, tx->req_list, cnt);
2153 #ifdef IFNET_BUF_RING
2154 if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2155 /* tell the NIC to start polling this slice */
2157 tx->queue_active = 1;
2170 #ifdef IFNET_BUF_RING
/*
 * if_qflush implementation (multi-queue build): drain and free every
 * slice's software buf_ring under the serializer.
 * NOTE(review): the m_freem() of each dequeued mbuf is elided here.
 */
2172 mxge_qflush(struct ifnet *ifp)
2174 mxge_softc_t *sc = ifp->if_softc;
2179 for (slice = 0; slice < sc->num_slices; slice++) {
2180 tx = &sc->ss[slice].tx;
2181 lwkt_serialize_enter(sc->ifp->if_serializer);
2182 while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2184 lwkt_serialize_exit(sc->ifp->if_serializer);
/*
 * Serializer-held transmit pump (IFNET_BUF_RING build): dequeue from
 * the per-slice drbr and encapsulate while descriptor space remains;
 * set OACTIVE if packets are still queued when the ring fills.
 */
2190 mxge_start_locked(struct mxge_slice_state *ss)
/* tx->mask - (req - done) is the free descriptor count */
2201 while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2202 m = drbr_dequeue(ifp, tx->br);
2206 /* let BPF see it */
2209 /* give it to the nic */
2212 /* ran out of transmit slots */
2213 if (((ss->if_flags & IFF_OACTIVE) == 0)
2214 && (!drbr_empty(ifp, tx->br))) {
2215 ss->if_flags |= IFF_OACTIVE;
/*
 * Serializer-held if_transmit helper: if the slice is running and not
 * OACTIVE, send directly when the drbr is empty and descriptors are
 * free; otherwise enqueue on the drbr and kick the start routine.
 */
2221 mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2232 if ((ss->if_flags & (IFF_RUNNING|IFF_OACTIVE)) !=
/* not ready to transmit: just queue the mbuf */
2234 err = drbr_enqueue(ifp, tx->br, m);
2238 if (drbr_empty(ifp, tx->br) &&
2239 ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2240 /* let BPF see it */
2242 /* give it to the nic */
2244 } else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2247 if (!drbr_empty(ifp, tx->br))
2248 mxge_start_locked(ss);
/*
 * if_transmit entry point: pick the tx slice from the mbuf flowid
 * (num_slices is a power of two, so masking suffices), then either
 * transmit under the serializer or, if it is contended, fall back to
 * enqueueing on the slice's drbr.
 */
2253 mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2255 mxge_softc_t *sc = ifp->if_softc;
2256 struct mxge_slice_state *ss;
2262 slice = m->m_pkthdr.flowid;
2264 slice &= (sc->num_slices - 1); /* num_slices always power of 2 */
2266 ss = &sc->ss[slice];
2269 if(lwkt_serialize_try(ifp->if_serializer)) {
2270 err = mxge_transmit_locked(ss, m);
2271 lwkt_serialize_exit(ifp->if_serializer);
/* serializer busy: defer via the software ring */
2273 err = drbr_enqueue(ifp, tx->br, m);
/*
 * Serializer-held transmit pump (legacy single-queue build): drain
 * the ifnet send queue while descriptor space remains; mark the
 * interface OACTIVE when the ring fills.
 */
2282 mxge_start_locked(struct mxge_slice_state *ss)
2292 while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2293 m = ifq_dequeue(&ifp->if_snd, NULL);
2297 /* let BPF see it */
2300 /* give it to the nic */
2303 /* ran out of transmit slots */
2304 if ((sc->ifp->if_flags & IFF_OACTIVE) == 0) {
2305 sc->ifp->if_flags |= IFF_OACTIVE;
/*
 * if_start entry point (legacy build): serialize and pump slice 0.
 */
2311 mxge_start(struct ifnet *ifp)
2313 mxge_softc_t *sc = ifp->if_softc;
2314 struct mxge_slice_state *ss;
2316 /* only use the first slice for now */
2318 lwkt_serialize_enter(ifp->if_serializer);
2319 mxge_start_locked(ss);
2320 lwkt_serialize_exit(ifp->if_serializer);
2324 * copy an array of mcp_kreq_ether_recv_t's to the mcp. Copy
2325 * at most 32 bytes at a time, so as to avoid involving the software
2326 * pio handler in the nic. We re-write the first segment's low
2327 * DMA address to mark it valid only after we write the entire chunk
2331 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2332 mcp_kreq_ether_recv_t *src)
/* temporarily poison addr_low so the NIC ignores the chunk */
2336 low = src->addr_low;
2337 src->addr_low = 0xffffffff;
2338 mxge_pio_copy(dst, src, 4 * sizeof (*src));
2340 mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
/* restore the real address last: this store validates all 8 entries */
2342 src->addr_low = low;
2343 dst->addr_low = low;
/*
 * Allocate and DMA-map a small (MHLEN) receive mbuf for ring slot idx,
 * recording its address in the shadow ring.  Every 8th slot, the
 * accumulated batch of 8 descriptors is pushed to the NIC at once.
 * NOTE(review): allocation-failure handling is elided in this view.
 */
2348 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2350 bus_dma_segment_t seg;
2352 mxge_rx_ring_t *rx = &ss->rx_small;
2355 m = m_gethdr(MB_DONTWAIT, MT_DATA);
2362 err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2363 &seg, 1, &cnt, BUS_DMA_NOWAIT);
2368 rx->info[idx].m = m;
/* shadow ring keeps host copies of the big-endian descriptors */
2369 rx->shadow[idx].addr_low =
2370 htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2371 rx->shadow[idx].addr_high =
2372 htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
/* batch submit once 8 consecutive slots are filled */
2376 mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
/*
 * Allocate and DMA-map a big receive buffer (cluster or jumbo cluster
 * depending on rx->cl_size) for ring slot idx, fill the shadow
 * descriptors for each buffer, and batch-submit every 8th slot.
 * NOTE(review): failure paths and the map swap are elided in this view.
 */
2382 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2384 bus_dma_segment_t seg[3];
2386 mxge_rx_ring_t *rx = &ss->rx_big;
2389 if (rx->cl_size == MCLBYTES)
2390 m = m_getcl(MB_DONTWAIT, MT_DATA, M_PKTHDR);
2393 m = m_getjcl(MB_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2396 * XXX: allocate normal sized buffers for big buffers.
2397 * We should be fine as long as we don't get any jumbo frames
2399 m = m_getcl(MB_DONTWAIT, MT_DATA, M_PKTHDR);
2407 m->m_len = rx->mlen;
2408 err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2409 seg, 1, &cnt, BUS_DMA_NOWAIT);
2414 rx->info[idx].m = m;
2415 rx->shadow[idx].addr_low =
2416 htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2417 rx->shadow[idx].addr_high =
2418 htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2420 #if MXGE_VIRT_JUMBOS
/* virtual jumbos: one shadow descriptor per DMA segment */
2421 for (i = 1; i < cnt; i++) {
2422 rx->shadow[idx + i].addr_low =
2423 htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2424 rx->shadow[idx + i].addr_high =
2425 htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2430 for (i = 0; i < rx->nbufs; i++) {
2431 if ((idx & 7) == 7) {
2432 mxge_submit_8rx(&rx->lanai[idx - 7],
2433 &rx->shadow[idx - 7]);
2441 * Myri10GE hardware checksums are not valid if the sender
2442 * padded the frame with non-zero padding. This is because
2443 * the firmware just does a simple 16-bit 1s complement
2444 * checksum across the entire frame, excluding the first 14
2445 * bytes. It is best to simply to check the checksum and
2446 * tell the stack about it only if the checksum is good
2449 static inline uint16_t
2450 mxge_rx_csum(struct mbuf *m, int csum)
2452 struct ether_header *eh;
2456 eh = mtod(m, struct ether_header *);
2458 /* only deal with IPv4 TCP & UDP for now */
2459 if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2461 ip = (struct ip *)(eh + 1);
2462 if (__predict_false(ip->ip_p != IPPROTO_TCP &&
2463 ip->ip_p != IPPROTO_UDP))
/* fold the firmware's whole-frame checksum with the pseudo-header;
   a result of 0 (at the elided return) means the checksum is good */
2466 c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2467 htonl(ntohs(csum) + ntohs(ip->ip_len) +
2468 - (ip->ip_hl << 2) + ip->ip_p));
/*
 * Strip an inline 802.1Q header from a received frame: correct the
 * firmware's whole-frame checksum by subtracting the 4 encapsulation
 * bytes (one's-complement arithmetic), record the tag in the mbuf
 * packet header, then remove the header bytes from the data.
 */
2477 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2479 struct ether_vlan_header *evl;
2480 struct ether_header *eh;
2483 evl = mtod(m, struct ether_vlan_header *);
2484 eh = mtod(m, struct ether_header *);
2487 * fix checksum by subtracting EVL_ENCAPLEN bytes
2488 * after what the firmware thought was the end of the ethernet
2492 /* put checksum into host byte order */
2493 *csum = ntohs(*csum);
2494 partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
/* one's-complement subtraction of the 4 tag bytes, with end-around
   carry folds */
2495 (*csum) += ~partial;
2496 (*csum) += ((*csum) < ~partial);
2497 (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2498 (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2500 /* restore checksum to network byte order;
2501 later consumers expect this */
2502 *csum = htons(*csum);
2505 #ifdef MXGE_NEW_VLAN_API
2506 m->m_pkthdr.ether_vlantag = ntohs(evl->evl_tag);
/* legacy API: carry the tag as an mbuf tag instead */
2510 mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2514 VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2515 m_tag_prepend(m, mtag);
2519 m->m_flags |= M_VLANTAG;
2522 * Remove the 802.1q header by copying the Ethernet
2523 * addresses over it and adjusting the beginning of
2524 * the data in the mbuf. The encapsulated Ethernet
2525 * type field is already in place.
2527 bcopy((char *)evl, (char *)evl + EVL_ENCAPLEN,
2528 ETHER_HDR_LEN - ETHER_TYPE_LEN);
2529 m_adj(m, EVL_ENCAPLEN);
/*
 * Process one received frame that landed in a big buffer: replace the
 * ring mbuf (dropping the frame if allocation fails), strip any VLAN
 * tag, validate the hardware checksum, attempt LRO, and finally pass
 * the frame to the stack via if_input.
 * NOTE(review): drop-path and some declarations are elided here.
 */
2534 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2539 struct ether_header *eh;
2541 bus_dmamap_t old_map;
2543 uint16_t tcpudp_csum;
2548 idx = rx->cnt & rx->mask;
2549 rx->cnt += rx->nbufs;
2550 /* save a pointer to the received mbuf */
2551 m = rx->info[idx].m;
2552 /* try to replace the received mbuf */
2553 if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2554 /* drop the frame -- the old mbuf is re-cycled */
2559 /* unmap the received buffer */
2560 old_map = rx->info[idx].map;
2561 bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2562 bus_dmamap_unload(rx->dmat, old_map);
2564 /* swap the bus_dmamap_t's */
2565 rx->info[idx].map = rx->extra_map;
2566 rx->extra_map = old_map;
2568 /* mcp implicitly skips 1st 2 bytes so that packet is properly
2570 m->m_data += MXGEFW_PAD;
2572 m->m_pkthdr.rcvif = ifp;
2573 m->m_len = m->m_pkthdr.len = len;
2575 eh = mtod(m, struct ether_header *);
2576 if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2577 mxge_vlan_tag_remove(m, &csum);
2579 /* if the checksum is valid, mark it in the mbuf header */
2580 if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2581 if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2583 /* otherwise, it was a UDP frame, or a TCP frame which
2584 we could not do LRO on. Tell the stack that the
2586 m->m_pkthdr.csum_data = 0xffff;
2587 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2590 /* flowid only valid if RSS hashing is enabled */
2591 if (sc->num_slices > 1) {
2592 m->m_pkthdr.flowid = (ss - sc->ss);
2593 m->m_flags |= M_FLOWID;
2596 /* pass the frame up the stack */
2597 (*ifp->if_input)(ifp, m);
/*
 * Small-buffer twin of mxge_rx_done_big(): identical flow (replace
 * mbuf, unmap, VLAN strip, checksum/LRO, if_input) but operating on
 * the rx_small ring and advancing rx->cnt by one slot.
 */
2601 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2605 struct ether_header *eh;
2608 bus_dmamap_t old_map;
2610 uint16_t tcpudp_csum;
2615 idx = rx->cnt & rx->mask;
2617 /* save a pointer to the received mbuf */
2618 m = rx->info[idx].m;
2619 /* try to replace the received mbuf */
2620 if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2621 /* drop the frame -- the old mbuf is re-cycled */
2626 /* unmap the received buffer */
2627 old_map = rx->info[idx].map;
2628 bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2629 bus_dmamap_unload(rx->dmat, old_map);
2631 /* swap the bus_dmamap_t's */
2632 rx->info[idx].map = rx->extra_map;
2633 rx->extra_map = old_map;
2635 /* mcp implicitly skips 1st 2 bytes so that packet is properly
2637 m->m_data += MXGEFW_PAD;
2639 m->m_pkthdr.rcvif = ifp;
2640 m->m_len = m->m_pkthdr.len = len;
2642 eh = mtod(m, struct ether_header *);
2643 if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2644 mxge_vlan_tag_remove(m, &csum);
2646 /* if the checksum is valid, mark it in the mbuf header */
2647 if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2648 if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2650 /* otherwise, it was a UDP frame, or a TCP frame which
2651 we could not do LRO on. Tell the stack that the
2653 m->m_pkthdr.csum_data = 0xffff;
2654 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2657 /* flowid only valid if RSS hashing is enabled */
2658 if (sc->num_slices > 1) {
2659 m->m_pkthdr.flowid = (ss - sc->ss);
2660 m->m_flags |= M_FLOWID;
2663 /* pass the frame up the stack */
2664 (*ifp->if_input)(ifp, m);
/*
 * Drain the slice's rx completion ring: dispatch each nonzero-length
 * entry to the small or big handler based on whether it fits in
 * MHLEN - MXGEFW_PAD, bounded to half the ring per call to avoid
 * livelock, then flush any active LRO sessions.
 */
2668 mxge_clean_rx_done(struct mxge_slice_state *ss)
2670 mxge_rx_done_t *rx_done = &ss->rx_done;
2676 while (rx_done->entry[rx_done->idx].length != 0) {
2677 length = ntohs(rx_done->entry[rx_done->idx].length);
/* zero the entry so we recognize it as consumed next pass */
2678 rx_done->entry[rx_done->idx].length = 0;
2679 checksum = rx_done->entry[rx_done->idx].checksum;
2680 if (length <= (MHLEN - MXGEFW_PAD))
2681 mxge_rx_done_small(ss, length, checksum);
2683 mxge_rx_done_big(ss, length, checksum);
2685 rx_done->idx = rx_done->cnt & rx_done->mask;
2687 /* limit potential for livelock */
2688 if (__predict_false(++limit > rx_done->mask / 2))
/* flush partially-merged LRO state back to the stack */
2692 while (!SLIST_EMPTY(&ss->lro_active)) {
2693 struct lro_entry *lro = SLIST_FIRST(&ss->lro_active);
2694 SLIST_REMOVE_HEAD(&ss->lro_active, next);
2695 mxge_lro_flush(ss, lro);
/*
 * Reclaim transmit descriptors up to the firmware's completed index
 * mcp_idx: free mbufs and unload DMA maps for head-of-packet slots,
 * update byte/multicast counters, then clear OACTIVE and restart
 * transmission once at least a quarter of the ring is free.  In the
 * multi-queue build, also tell the NIC to stop polling an idle queue.
 * NOTE(review): counter updates and tx->done advancement are elided.
 */
2702 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2713 while (tx->pkt_done != mcp_idx) {
2714 idx = tx->done & tx->mask;
2716 m = tx->info[idx].m;
2717 /* mbuf and DMA map only attached to the first
2720 ss->obytes += m->m_pkthdr.len;
2721 if (m->m_flags & M_MCAST)
2724 tx->info[idx].m = NULL;
2725 map = tx->info[idx].map;
2726 bus_dmamap_unload(tx->dmat, map);
/* flag marks the last descriptor of a packet */
2729 if (tx->info[idx].flag) {
2730 tx->info[idx].flag = 0;
2735 /* If we have space, clear IFF_OACTIVE to tell the stack that
2736 its OK to send packets */
2737 #ifdef IFNET_BUF_RING
2738 flags = &ss->if_flags;
2740 flags = &ifp->if_flags;
2742 lwkt_serialize_enter(ifp->if_serializer);
2743 if ((*flags) & IFF_OACTIVE &&
2744 tx->req - tx->done < (tx->mask + 1)/4) {
2745 *(flags) &= ~IFF_OACTIVE;
2747 mxge_start_locked(ss);
2749 #ifdef IFNET_BUF_RING
2750 if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2751 /* let the NIC stop polling this queue, since there
2752 * are no more transmits pending */
2753 if (tx->req == tx->done) {
2755 tx->queue_active = 0;
2761 lwkt_serialize_exit(ifp->if_serializer);
/*
 * Media-type lookup tables mapping XFP/SFP+ compliance-byte bits to
 * ifmedia types and human-readable names; consumed by
 * mxge_media_probe().  A zero ifmedia value means "no FreeBSD media
 * type for this bit".
 */
2765 static struct mxge_media_type mxge_xfp_media_types[] =
2767 {IFM_10G_CX4, 0x7f, "10GBASE-CX4 (module)"},
2768 {IFM_10G_SR, (1 << 7), "10GBASE-SR"},
2769 {IFM_10G_LR, (1 << 6), "10GBASE-LR"},
2770 {0, (1 << 5), "10GBASE-ER"},
2771 {IFM_10G_LRM, (1 << 4), "10GBASE-LRM"},
2772 {0, (1 << 3), "10GBASE-SW"},
2773 {0, (1 << 2), "10GBASE-LW"},
2774 {0, (1 << 1), "10GBASE-EW"},
2775 {0, (1 << 0), "Reserved"}
2777 static struct mxge_media_type mxge_sfp_media_types[] =
2779 {0, (1 << 7), "Reserved"},
2780 {IFM_10G_LRM, (1 << 6), "10GBASE-LRM"},
2781 {IFM_10G_LR, (1 << 5), "10GBASE-LR"},
2782 {IFM_10G_SR, (1 << 4), "10GBASE-SR"}
/*
 * Record a detected media type: fold it into the cached media flags
 * and register/select it with the ifmedia layer.
 */
2786 mxge_set_media(mxge_softc_t *sc, int type)
2788 sc->media_flags |= type;
2789 ifmedia_add(&sc->media, sc->media_flags, 0, NULL);
2790 ifmedia_set(&sc->media, sc->media_flags);
2795 * Determine the media type for a NIC. Some XFPs will identify
2796 * themselves only when their link is up, so this is initiated via a
2797 * link up interrupt. However, this can potentially take up to
2798 * several milliseconds, so it is run via the watchdog routine, rather
2799 * than in the interrupt handler itself. This need only be done
2800 * once, not each time the link is up.
2803 mxge_media_probe(mxge_softc_t *sc)
2808 struct mxge_media_type *mxge_media_types = NULL;
2809 int i, err, ms, mxge_media_type_entries;
2812 sc->need_media_probe = 0;
2814 /* if we've already set a media type, we're done */
2815 if (sc->media_flags != (IFM_ETHER | IFM_AUTO))
2819 * parse the product code to deterimine the interface type
2820 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2821 * after the 3rd dash in the driver's cached copy of the
2822 * EEPROM's product code string.
2824 ptr = sc->product_code_string;
2826 device_printf(sc->dev, "Missing product code\n");
/* advance ptr past the third '-' in the product code */
2829 for (i = 0; i < 3; i++, ptr++) {
2830 ptr = index(ptr, '-');
2832 device_printf(sc->dev,
2833 "only %d dashes in PC?!?\n", i);
2839 mxge_set_media(sc, IFM_10G_CX4);
2842 else if (*ptr == 'Q') {
2843 /* -Q is Quad Ribbon Fiber */
2844 device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2845 /* FreeBSD has no media type for Quad ribbon fiber */
2851 mxge_media_types = mxge_xfp_media_types;
2852 mxge_media_type_entries =
2853 sizeof (mxge_xfp_media_types) /
2854 sizeof (mxge_xfp_media_types[0]);
2855 byte = MXGE_XFP_COMPLIANCE_BYTE;
2859 if (*ptr == 'S' || *(ptr +1) == 'S') {
2860 /* -S or -2S is SFP+ */
2861 mxge_media_types = mxge_sfp_media_types;
2862 mxge_media_type_entries =
2863 sizeof (mxge_sfp_media_types) /
2864 sizeof (mxge_sfp_media_types[0]);
2869 if (mxge_media_types == NULL) {
2870 device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2875 * At this point we know the NIC has an XFP cage, so now we
2876 * try to determine what is in the cage by using the
2877 * firmware's XFP I2C commands to read the XFP 10GbE compilance
2878 * register. We read just one byte, which may take over
2882 cmd.data0 = 0; /* just fetch 1 byte, not all 256 */
2884 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2885 if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
2886 device_printf(sc->dev, "failed to read XFP\n");
2888 if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
2889 device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
2891 if (err != MXGEFW_CMD_OK) {
2895 /* now we wait for the data to be cached */
2897 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
/* poll up to ~50ms for the firmware to finish the I2C read */
2898 for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
2901 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2903 if (err != MXGEFW_CMD_OK) {
2904 device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
2905 cage_type, err, ms);
/* entry 0 is special: must match the whole compliance byte exactly */
2909 if (cmd.data0 == mxge_media_types[0].bitmask) {
2911 device_printf(sc->dev, "%s:%s\n", cage_type,
2912 mxge_media_types[0].name);
2913 mxge_set_media(sc, IFM_10G_CX4);
2916 for (i = 1; i < mxge_media_type_entries; i++) {
2917 if (cmd.data0 & mxge_media_types[i].bitmask) {
2919 device_printf(sc->dev, "%s:%s\n",
2921 mxge_media_types[i].name);
2923 mxge_set_media(sc, mxge_media_types[i].flag);
2927 device_printf(sc->dev, "%s media 0x%x unknown\n", cage_type,
/*
 * mxge_intr: per-slice interrupt handler.
 * Drains TX completions and RX descriptors while the firmware's
 * DMA'd stats block reports work pending; for legacy (INTx) IRQs it
 * deasserts the line and optionally spins until the firmware
 * confirms deassertion. Link-state and error statistics are only
 * meaningful on slice 0 (ss == sc->ss). Finally writes the irq_claim
 * tokens back to the NIC to re-arm the interrupt.
 * NOTE(review): decimated listing — interior lines are elided.
 */
2934 mxge_intr(void *arg)
2936 struct mxge_slice_state *ss = arg;
2937 mxge_softc_t *sc = ss->sc;
2938 mcp_irq_data_t *stats = ss->fw_stats;
2939 mxge_tx_ring_t *tx = &ss->tx;
2940 mxge_rx_done_t *rx_done = &ss->rx_done;
2941 uint32_t send_done_count;
2945 #ifndef IFNET_BUF_RING
2946 /* an interrupt on a non-zero slice is implicitly valid
2947 since MSI-X irqs are not shared */
2949 mxge_clean_rx_done(ss);
2950 *ss->irq_claim = be32toh(3);
2955 /* make sure the DMA has finished */
2956 if (!stats->valid) {
2959 valid = stats->valid;
2961 if (sc->legacy_irq) {
2962 /* lower legacy IRQ */
2963 *sc->irq_deassert = 0;
2964 if (!mxge_deassert_wait)
2965 /* don't wait for conf. that irq is low */
2971 /* loop while waiting for legacy irq deassertion */
2973 /* check for transmit completes and receives */
2974 send_done_count = be32toh(stats->send_done_count);
2975 while ((send_done_count != tx->pkt_done) ||
2976 (rx_done->entry[rx_done->idx].length != 0)) {
2977 if (send_done_count != tx->pkt_done)
2978 mxge_tx_done(ss, (int)send_done_count);
2979 mxge_clean_rx_done(ss);
2980 send_done_count = be32toh(stats->send_done_count);
2982 if (sc->legacy_irq && mxge_deassert_wait)
/* volatile re-read: stats->valid is written by the NIC via DMA */
2984 } while (*((volatile uint8_t *) &stats->valid));
2986 /* fw link & error stats meaningful only on the first slice */
2987 if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
2988 if (sc->link_state != stats->link_up) {
2989 sc->link_state = stats->link_up;
2990 if (sc->link_state) {
2991 sc->ifp->if_link_state = LINK_STATE_UP;
2992 if_link_state_change(sc->ifp);
2994 device_printf(sc->dev, "link up\n");
2996 sc->ifp->if_link_state = LINK_STATE_DOWN;
2997 if_link_state_change(sc->ifp);
2999 device_printf(sc->dev, "link down\n");
/* a link transition is the trigger for the (watchdog-run) media probe */
3001 sc->need_media_probe = 1;
3003 if (sc->rdma_tags_available !=
3004 be32toh(stats->rdma_tags_available)) {
3005 sc->rdma_tags_available =
3006 be32toh(stats->rdma_tags_available);
3007 device_printf(sc->dev, "RDMA timed out! %d tags "
3008 "left\n", sc->rdma_tags_available);
3011 if (stats->link_down) {
3012 sc->down_cnt += stats->link_down;
3014 sc->ifp->if_link_state = LINK_STATE_DOWN;
3015 if_link_state_change(sc->ifp);
3019 /* check to see if we have rx token to pass back */
3021 *ss->irq_claim = be32toh(3);
3022 *(ss->irq_claim + 1) = be32toh(3);
3026 mxge_init(void *arg)
/*
 * mxge_free_slice_mbufs: release all per-slice mbuf state.
 * Frees the LRO free-list entries, unloads the DMA maps and frees
 * the mbufs of the big and small RX rings, and (on the slice that
 * owns a TX ring) drops TX-ring mbufs as well. Ring masks are
 * (size-1), hence the inclusive "<=" loops.
 */
3033 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
3035 struct lro_entry *lro_entry;
3038 while (!SLIST_EMPTY(&ss->lro_free)) {
3039 lro_entry = SLIST_FIRST(&ss->lro_free);
3040 SLIST_REMOVE_HEAD(&ss->lro_free, next);
3041 kfree(lro_entry, M_DEVBUF);
3044 for (i = 0; i <= ss->rx_big.mask; i++) {
3045 if (ss->rx_big.info[i].m == NULL)
3047 bus_dmamap_unload(ss->rx_big.dmat,
3048 ss->rx_big.info[i].map);
3049 m_freem(ss->rx_big.info[i].m);
3050 ss->rx_big.info[i].m = NULL;
3053 for (i = 0; i <= ss->rx_small.mask; i++) {
3054 if (ss->rx_small.info[i].m == NULL)
3056 bus_dmamap_unload(ss->rx_small.dmat,
3057 ss->rx_small.info[i].map);
3058 m_freem(ss->rx_small.info[i].m);
3059 ss->rx_small.info[i].m = NULL;
3062 /* transmit ring used only on the first slice */
3063 if (ss->tx.info == NULL)
3066 for (i = 0; i <= ss->tx.mask; i++) {
3067 ss->tx.info[i].flag = 0;
3068 if (ss->tx.info[i].m == NULL)
3070 bus_dmamap_unload(ss->tx.dmat,
3071 ss->tx.info[i].map);
3072 m_freem(ss->tx.info[i].m);
3073 ss->tx.info[i].m = NULL;
/* mxge_free_mbufs: free the mbufs of every slice in the softc. */
3078 mxge_free_mbufs(mxge_softc_t *sc)
3082 for (slice = 0; slice < sc->num_slices; slice++)
3083 mxge_free_slice_mbufs(&sc->ss[slice]);
/*
 * mxge_free_slice_rings: tear down all per-slice ring resources,
 * the inverse of mxge_alloc_slice_rings. Frees the rx_done DMA
 * region, TX request/segment scratch buffers, the RX shadow rings,
 * and for each of tx / rx_small / rx_big: every per-slot dmamap,
 * the extra_map (RX only), the bus_dma tag, and the info array.
 * Pointers are NULLed after free so repeated teardown is safe.
 */
3087 mxge_free_slice_rings(struct mxge_slice_state *ss)
3092 if (ss->rx_done.entry != NULL)
3093 mxge_dma_free(&ss->rx_done.dma);
3094 ss->rx_done.entry = NULL;
3096 if (ss->tx.req_bytes != NULL)
3097 kfree(ss->tx.req_bytes, M_DEVBUF);
3098 ss->tx.req_bytes = NULL;
3100 if (ss->tx.seg_list != NULL)
3101 kfree(ss->tx.seg_list, M_DEVBUF);
3102 ss->tx.seg_list = NULL;
3104 if (ss->rx_small.shadow != NULL)
3105 kfree(ss->rx_small.shadow, M_DEVBUF);
3106 ss->rx_small.shadow = NULL;
3108 if (ss->rx_big.shadow != NULL)
3109 kfree(ss->rx_big.shadow, M_DEVBUF);
3110 ss->rx_big.shadow = NULL;
3112 if (ss->tx.info != NULL) {
3113 if (ss->tx.dmat != NULL) {
3114 for (i = 0; i <= ss->tx.mask; i++) {
3115 bus_dmamap_destroy(ss->tx.dmat,
3116 ss->tx.info[i].map);
3118 bus_dma_tag_destroy(ss->tx.dmat);
3120 kfree(ss->tx.info, M_DEVBUF);
3124 if (ss->rx_small.info != NULL) {
3125 if (ss->rx_small.dmat != NULL) {
3126 for (i = 0; i <= ss->rx_small.mask; i++) {
3127 bus_dmamap_destroy(ss->rx_small.dmat,
3128 ss->rx_small.info[i].map);
3130 bus_dmamap_destroy(ss->rx_small.dmat,
3131 ss->rx_small.extra_map);
3132 bus_dma_tag_destroy(ss->rx_small.dmat);
3134 kfree(ss->rx_small.info, M_DEVBUF);
3136 ss->rx_small.info = NULL;
3138 if (ss->rx_big.info != NULL) {
3139 if (ss->rx_big.dmat != NULL) {
3140 for (i = 0; i <= ss->rx_big.mask; i++) {
3141 bus_dmamap_destroy(ss->rx_big.dmat,
3142 ss->rx_big.info[i].map);
3144 bus_dmamap_destroy(ss->rx_big.dmat,
3145 ss->rx_big.extra_map);
3146 bus_dma_tag_destroy(ss->rx_big.dmat);
3148 kfree(ss->rx_big.info, M_DEVBUF);
3150 ss->rx_big.info = NULL;
/* mxge_free_rings: free the ring resources of every slice. */
3154 mxge_free_rings(mxge_softc_t *sc)
3158 for (slice = 0; slice < sc->num_slices; slice++)
3159 mxge_free_slice_rings(&sc->ss[slice]);
/*
 * mxge_alloc_slice_rings: allocate all per-slice ring resources.
 * RX side: shadow rings, host info arrays, small/big bus_dma tags
 * (big tag parameters differ under MXGE_VIRT_JUMBOS), one dmamap per
 * slot plus an extra_map each. TX side (only slice 0 unless
 * IFNET_BUF_RING): the 8-byte-aligned request copy block, segment
 * list, info ring, dma tag, and per-slot dmamaps.
 * Paired with mxge_free_slice_rings.
 * NOTE(review): decimated listing — error-path gotos and some
 * parameters are elided between the numbered rows.
 */
3163 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3164 int tx_ring_entries)
3166 mxge_softc_t *sc = ss->sc;
3172 /* allocate per-slice receive resources */
/* ring masks are size-1 (ring sizes are powers of two) */
3174 ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
3175 ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3177 /* allocate the rx shadow rings */
3178 bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3179 ss->rx_small.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
/* NOTE(review): kmalloc with M_WAITOK cannot return NULL; these
 * checks look vestigial but are preserved verbatim */
3180 if (ss->rx_small.shadow == NULL)
3183 bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3184 ss->rx_big.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3185 if (ss->rx_big.shadow == NULL)
3188 /* allocate the rx host info rings */
3189 bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3190 ss->rx_small.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3191 if (ss->rx_small.info == NULL)
3194 bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3195 ss->rx_big.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3196 if (ss->rx_big.info == NULL)
3199 /* allocate the rx busdma resources */
3200 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3202 4096, /* boundary */
3203 BUS_SPACE_MAXADDR, /* low */
3204 BUS_SPACE_MAXADDR, /* high */
3205 NULL, NULL, /* filter */
3206 MHLEN, /* maxsize */
3208 MHLEN, /* maxsegsize */
3209 BUS_DMA_ALLOCNOW, /* flags */
3210 &ss->rx_small.dmat); /* tag */
3212 device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3217 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3219 #if MXGE_VIRT_JUMBOS
3220 4096, /* boundary */
3224 BUS_SPACE_MAXADDR, /* low */
3225 BUS_SPACE_MAXADDR, /* high */
3226 NULL, NULL, /* filter */
3227 3*4096, /* maxsize */
3228 #if MXGE_VIRT_JUMBOS
3230 4096, /* maxsegsize*/
3233 MJUM9BYTES, /* maxsegsize*/
3235 BUS_DMA_ALLOCNOW, /* flags */
3236 &ss->rx_big.dmat); /* tag */
3238 device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3242 for (i = 0; i <= ss->rx_small.mask; i++) {
3243 err = bus_dmamap_create(ss->rx_small.dmat, 0,
3244 &ss->rx_small.info[i].map);
3246 device_printf(sc->dev, "Err %d rx_small dmamap\n",
3251 err = bus_dmamap_create(ss->rx_small.dmat, 0,
3252 &ss->rx_small.extra_map);
3254 device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3259 for (i = 0; i <= ss->rx_big.mask; i++) {
3260 err = bus_dmamap_create(ss->rx_big.dmat, 0,
3261 &ss->rx_big.info[i].map);
3263 device_printf(sc->dev, "Err %d rx_big dmamap\n",
3268 err = bus_dmamap_create(ss->rx_big.dmat, 0,
3269 &ss->rx_big.extra_map);
3271 device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3276 /* now allocate TX resouces */
3278 #ifndef IFNET_BUF_RING
3279 /* only use a single TX ring for now */
3280 if (ss != ss->sc->ss)
3284 ss->tx.mask = tx_ring_entries - 1;
3285 ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3288 /* allocate the tx request copy block */
3290 sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3291 ss->tx.req_bytes = kmalloc(bytes, M_DEVBUF, M_WAITOK);
3292 if (ss->tx.req_bytes == NULL)
3294 /* ensure req_list entries are aligned to 8 bytes */
3295 ss->tx.req_list = (mcp_kreq_ether_send_t *)
3296 ((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
3298 /* allocate the tx busdma segment list */
3299 bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3300 ss->tx.seg_list = (bus_dma_segment_t *)
3301 kmalloc(bytes, M_DEVBUF, M_WAITOK);
3302 if (ss->tx.seg_list == NULL)
3305 /* allocate the tx host info ring */
3306 bytes = tx_ring_entries * sizeof (*ss->tx.info);
3307 ss->tx.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3308 if (ss->tx.info == NULL)
3311 /* allocate the tx busdma resources */
3312 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3314 sc->tx_boundary, /* boundary */
3315 BUS_SPACE_MAXADDR, /* low */
3316 BUS_SPACE_MAXADDR, /* high */
3317 NULL, NULL, /* filter */
3318 65536 + 256, /* maxsize */
3319 ss->tx.max_desc - 2, /* num segs */
3320 sc->tx_boundary, /* maxsegsz */
3321 BUS_DMA_ALLOCNOW, /* flags */
3322 &ss->tx.dmat); /* tag */
3325 device_printf(sc->dev, "Err %d allocating tx dmat\n",
3330 /* now use these tags to setup dmamaps for each slot
3332 for (i = 0; i <= ss->tx.mask; i++) {
3333 err = bus_dmamap_create(ss->tx.dmat, 0,
3334 &ss->tx.info[i].map);
3336 device_printf(sc->dev, "Err %d tx dmamap\n",
/*
 * mxge_alloc_rings: query the firmware for the TX ring size
 * (RX size was cached in sc->rx_ring_size), size the ifnet send
 * queue accordingly, and allocate per-slice rings for every slice.
 * On per-slice failure the elided error path frees via
 * mxge_free_rings.
 */
3346 mxge_alloc_rings(mxge_softc_t *sc)
3350 int tx_ring_entries, rx_ring_entries;
3353 /* get ring sizes */
3354 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3355 tx_ring_size = cmd.data0;
3357 device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3361 tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3362 rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3363 ifq_set_maxlen(&sc->ifp->if_snd, tx_ring_entries - 1);
3364 ifq_set_ready(&sc->ifp->if_snd);
3366 for (slice = 0; slice < sc->num_slices; slice++) {
3367 err = mxge_alloc_slice_rings(&sc->ss[slice],
3376 mxge_free_rings(sc);
/*
 * mxge_choose_params: pick RX buffer parameters for a given MTU.
 * Computes the on-wire frame size (MTU + Ethernet header + VLAN
 * encap + firmware pad) and selects the smallest cluster size that
 * holds it: MCLBYTES, then MJUMPAGESIZE, else 9K jumbo clusters.
 * With MXGE_VIRT_JUMBOS a 9K cluster is presented to the NIC as
 * multiple 4KB buffers (*nbufs > 1); otherwise one buffer per frame.
 * Outputs via *big_buf_size, *cl_size, *nbufs.
 */
3383 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3385 int bufsize = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN + MXGEFW_PAD;
3387 if (bufsize < MCLBYTES) {
3388 /* easy, everything fits in a single buffer */
3389 *big_buf_size = MCLBYTES;
3390 *cl_size = MCLBYTES;
3395 if (bufsize < MJUMPAGESIZE) {
3396 /* still easy, everything still fits in a single buffer */
3397 *big_buf_size = MJUMPAGESIZE;
3398 *cl_size = MJUMPAGESIZE;
3402 #if MXGE_VIRT_JUMBOS
3403 /* now we need to use virtually contiguous buffers */
3404 *cl_size = MJUM9BYTES;
3405 *big_buf_size = 4096;
3406 *nbufs = mtu / 4096 + 1;
3407 /* needs to be a power of two, so round up */
3411 *cl_size = MJUM9BYTES;
3412 *big_buf_size = MJUM9BYTES;
/*
 * mxge_slice_open: bring one slice online.
 * Builds the LRO free list (sc->lro_cnt entries), fetches the lanai
 * (NIC SRAM) pointers for the send and small/big receive rings via
 * firmware commands, then stocks both RX rings. Big-ring shadow
 * addresses are poisoned to 0xffffffff before stocking; big buffers
 * are posted nbufs slots at a time.
 * NOTE(review): decimated listing — error returns are elided.
 */
3418 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3423 struct lro_entry *lro_entry;
3428 slice = ss - sc->ss;
3430 SLIST_INIT(&ss->lro_free);
3431 SLIST_INIT(&ss->lro_active);
3433 for (i = 0; i < sc->lro_cnt; i++) {
3434 lro_entry = (struct lro_entry *)
3435 kmalloc(sizeof (*lro_entry), M_DEVBUF,
3437 if (lro_entry == NULL) {
3441 SLIST_INSERT_HEAD(&ss->lro_free, lro_entry, next);
3443 /* get the lanai pointers to the send and receive rings */
3446 #ifndef IFNET_BUF_RING
3447 /* We currently only send from the first slice */
3451 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3453 (volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
/* per-slice doorbells live at fixed 64-byte strides in SRAM */
3454 ss->tx.send_go = (volatile uint32_t *)
3455 (sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3456 ss->tx.send_stop = (volatile uint32_t *)
3457 (sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3458 #ifndef IFNET_BUF_RING
3462 err |= mxge_send_cmd(sc,
3463 MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3464 ss->rx_small.lanai =
3465 (volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3467 err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3469 (volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3472 device_printf(sc->dev,
3473 "failed to get ring sizes or locations\n");
3477 /* stock receive rings */
3478 for (i = 0; i <= ss->rx_small.mask; i++) {
3479 map = ss->rx_small.info[i].map;
3480 err = mxge_get_buf_small(ss, map, i);
3482 device_printf(sc->dev, "alloced %d/%d smalls\n",
3483 i, ss->rx_small.mask + 1);
3487 for (i = 0; i <= ss->rx_big.mask; i++) {
3488 ss->rx_big.shadow[i].addr_low = 0xffffffff;
3489 ss->rx_big.shadow[i].addr_high = 0xffffffff;
3491 ss->rx_big.nbufs = nbufs;
3492 ss->rx_big.cl_size = cl_size;
3493 ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3494 EVL_ENCAPLEN + MXGEFW_PAD;
3495 for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3496 map = ss->rx_big.info[i].map;
3497 err = mxge_get_buf_big(ss, map, i);
3499 device_printf(sc->dev, "alloced %d/%d bigs\n",
3500 i, ss->rx_big.mask + 1);
/*
 * mxge_open: bring the interface up.
 * Sequence: copy MAC, reset firmware, program the RSS indirection
 * table + hash type when multi-slice, choose and program MTU and
 * small/big buffer sizes, point the firmware at the per-slice stats
 * DMA blocks (falling back to the obsolete single-stats command and
 * disabling multicast support if STATS_DMA_V2 is unavailable), open
 * each slice, issue ETHERNET_UP, mark IFF_RUNNING and start the
 * tick callout.
 * NOTE(review): decimated listing — error gotos/returns are elided;
 * the visible tail frees mbufs on failure.
 */
3508 mxge_open(mxge_softc_t *sc)
3511 int err, big_bytes, nbufs, slice, cl_size, i;
3513 volatile uint8_t *itable;
3514 struct mxge_slice_state *ss;
3516 /* Copy the MAC address in case it was overridden */
3517 bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3519 err = mxge_reset(sc, 1);
3521 device_printf(sc->dev, "failed to reset\n");
3525 if (sc->num_slices > 1) {
3526 /* setup the indirection table */
3527 cmd.data0 = sc->num_slices;
3528 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3531 err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3534 device_printf(sc->dev,
3535 "failed to setup rss tables\n");
3539 /* just enable an identity mapping */
3540 itable = sc->sram + cmd.data0;
3541 for (i = 0; i < sc->num_slices; i++)
3542 itable[i] = (uint8_t)i;
3545 cmd.data1 = mxge_rss_hash_type;
3546 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3548 device_printf(sc->dev, "failed to enable slices\n");
3554 mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3557 err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3559 /* error is only meaningful if we're trying to set
3560 MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3561 if (err && nbufs > 1) {
3562 device_printf(sc->dev,
3563 "Failed to set alway-use-n to %d\n",
3567 /* Give the firmware the mtu and the big and small buffer
3568 sizes. The firmware wants the big buf size to be a power
3569 of two. Luckily, FreeBSD's clusters are powers of two */
3570 cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
3571 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3572 cmd.data0 = MHLEN - MXGEFW_PAD;
3573 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3575 cmd.data0 = big_bytes;
3576 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3579 device_printf(sc->dev, "failed to setup params\n");
3583 /* Now give him the pointer to the stats block */
3585 #ifdef IFNET_BUF_RING
3586 slice < sc->num_slices;
3591 ss = &sc->ss[slice];
3593 MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3595 MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3596 cmd.data2 = sizeof(struct mcp_irq_data);
/* slice index is packed into the high 16 bits of data2 */
3597 cmd.data2 |= (slice << 16);
3598 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3602 bus = sc->ss->fw_stats_dma.bus_addr;
3603 bus += offsetof(struct mcp_irq_data, send_done_count);
3604 cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3605 cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3606 err = mxge_send_cmd(sc,
3607 MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3609 /* Firmware cannot support multicast without STATS_DMA_V2 */
3610 sc->fw_multicast_support = 0;
3612 sc->fw_multicast_support = 1;
3616 device_printf(sc->dev, "failed to setup params\n");
3620 for (slice = 0; slice < sc->num_slices; slice++) {
3621 err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3623 device_printf(sc->dev, "couldn't open slice %d\n",
3629 /* Finally, start the firmware running */
3630 err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3632 device_printf(sc->dev, "Couldn't bring up link\n");
3635 #ifdef IFNET_BUF_RING
3636 for (slice = 0; slice < sc->num_slices; slice++) {
3637 ss = &sc->ss[slice];
3638 ss->if_flags |= IFF_RUNNING;
3639 ss->if_flags &= ~IFF_OACTIVE;
3642 sc->ifp->if_flags |= IFF_RUNNING;
3643 sc->ifp->if_flags &= ~IFF_OACTIVE;
3644 callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
/* error path (label elided above): release everything stocked */
3650 mxge_free_mbufs(sc);
/*
 * mxge_close: bring the interface down.
 * Stops the tick callout, clears IFF_RUNNING (per slice when
 * IFNET_BUF_RING), issues ETHERNET_DOWN, then waits up to
 * 10 * intr_coal_delay for the "link down" interrupt to bump
 * sc->down_cnt before freeing all mbufs.
 */
3656 mxge_close(mxge_softc_t *sc)
3659 int err, old_down_cnt;
3660 #ifdef IFNET_BUF_RING
3661 struct mxge_slice_state *ss;
3665 callout_stop(&sc->co_hdl);
3666 #ifdef IFNET_BUF_RING
3667 for (slice = 0; slice < sc->num_slices; slice++) {
3668 ss = &sc->ss[slice];
3669 ss->if_flags &= ~IFF_RUNNING;
3672 sc->ifp->if_flags &= ~IFF_RUNNING;
3673 old_down_cnt = sc->down_cnt;
3675 err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3677 device_printf(sc->dev, "Couldn't bring down link\n");
3679 if (old_down_cnt == sc->down_cnt) {
3680 /* wait for down irq */
3681 DELAY(10 * sc->intr_coal_delay);
3684 if (old_down_cnt == sc->down_cnt) {
3685 device_printf(sc->dev, "never got down irq\n");
3688 mxge_free_mbufs(sc);
/*
 * mxge_setup_cfg_space: program PCI config space for the device.
 * Reads the PCIe link width from the express capability (offset
 * 0x12 = Link Status), sets Max Read Request Size to 4KB (value 5
 * in bits 14:12 of Device Control at offset 0x8), then enables bus
 * mastering and memory-space decoding.
 */
3694 mxge_setup_cfg_space(mxge_softc_t *sc)
3696 device_t dev = sc->dev;
3698 uint16_t cmd, lnk, pectl;
3700 /* find the PCIe link width and set max read request to 4KB*/
3701 if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
3702 lnk = pci_read_config(dev, reg + 0x12, 2);
3703 sc->link_width = (lnk >> 4) & 0x3f;
3705 pectl = pci_read_config(dev, reg + 0x8, 2);
3706 pectl = (pectl & ~0x7000) | (5 << 12);
3707 pci_write_config(dev, reg + 0x8, pectl, 2);
3710 /* Enable DMA and Memory space access */
3711 pci_enable_busmaster(dev);
3712 cmd = pci_read_config(dev, PCIR_COMMAND, 2);
3713 cmd |= PCIM_CMD_MEMEN;
3714 pci_write_config(dev, PCIR_COMMAND, cmd, 2);
/*
 * mxge_read_reboot: fetch the NIC's reboot status register through
 * the PCI vendor-specific capability window (enable read32 mode at
 * vs+0x10, select register 0xfffffff0 at vs+0x18, read data at
 * vs+0x14). Returns (uint32_t)-1 if the capability is absent.
 */
3718 mxge_read_reboot(mxge_softc_t *sc)
3720 device_t dev = sc->dev;
3723 /* find the vendor specific offset */
3724 if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
3725 device_printf(sc->dev,
3726 "could not find vendor specific offset\n");
3727 return (uint32_t)-1;
3729 /* enable read32 mode */
3730 pci_write_config(dev, vs + 0x10, 0x3, 1);
3731 /* tell NIC which register to read */
3732 pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3733 return (pci_read_config(dev, vs + 0x14, 4));
/*
 * mxge_watchdog_reset: recover from a wedged transmit ring.
 * If the PCI command register reads 0xffff the device vanished
 * (wait briefly for a reboot to finish). If bus mastering is off,
 * the NIC rebooted: print the reboot status, restore PCI config
 * space (pci_cfg_restore + mxge_setup_cfg_space) and reopen if the
 * interface was running. Otherwise the NIC did not reboot — dump
 * the given slice's TX ring state and do not reset.
 * NOTE(review): decimated listing — return statements are elided.
 */
3737 mxge_watchdog_reset(mxge_softc_t *sc, int slice)
3739 struct pci_devinfo *dinfo;
3747 device_printf(sc->dev, "Watchdog reset!\n");
3750 * check to see if the NIC rebooted. If it did, then all of
3751 * PCI config space has been reset, and things like the
3752 * busmaster bit will be zero. If this is the case, then we
3753 * must restore PCI config space before the NIC can be used
3756 cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3757 if (cmd == 0xffff) {
3759 * maybe the watchdog caught the NIC rebooting; wait
3760 * up to 100ms for it to finish. If it does not come
3761 * back, then give up
3764 cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3765 if (cmd == 0xffff) {
3766 device_printf(sc->dev, "NIC disappeared!\n");
3770 if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3771 /* print the reboot status */
3772 reboot = mxge_read_reboot(sc);
3773 device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3775 /* restore PCI configuration space */
3776 dinfo = device_get_ivars(sc->dev);
3777 pci_cfg_restore(sc->dev, dinfo);
3779 /* and redo any changes we made to our config space */
3780 mxge_setup_cfg_space(sc);
3782 if (sc->ifp->if_flags & IFF_RUNNING) {
3784 err = mxge_open(sc);
3787 tx = &sc->ss[slice].tx;
3788 device_printf(sc->dev,
3789 "NIC did not reboot, slice %d ring state:\n",
3791 device_printf(sc->dev,
3792 "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3793 tx->req, tx->done, tx->queue_active);
3794 device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3795 tx->activate, tx->deactivate);
3796 device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3798 be32toh(sc->ss->fw_stats->send_done_count));
3799 device_printf(sc->dev, "not resetting\n");
/*
 * mxge_watchdog: periodic health check, run from mxge_tick.
 * For each TX ring, a ring is considered wedged when requests are
 * outstanding (req != done) and no completions landed since the
 * previous tick (done == watchdog_done with watchdog state seen).
 * If the rx pause counter also did not move, the stall is not
 * flow-control induced, so reset via mxge_watchdog_reset; otherwise
 * blame the link partner. Also kicks off a deferred media probe
 * when mxge_intr flagged need_media_probe.
 */
3805 mxge_watchdog(mxge_softc_t *sc)
3808 uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3811 /* see if we have outstanding transmits, which
3812 have been pending for more than mxge_ticks */
3814 #ifdef IFNET_BUF_RING
3815 (i < sc->num_slices) && (err == 0);
3817 (i < 1) && (err == 0);
3821 if (tx->req != tx->done &&
3822 tx->watchdog_req != tx->watchdog_done &&
3823 tx->done == tx->watchdog_done) {
3824 /* check for pause blocking before resetting */
3825 if (tx->watchdog_rx_pause == rx_pause)
3826 err = mxge_watchdog_reset(sc, i);
3828 device_printf(sc->dev, "Flow control blocking "
3829 "xmits, check link partner\n");
/* snapshot ring + pause state for next tick's comparison */
3832 tx->watchdog_req = tx->req;
3833 tx->watchdog_done = tx->done;
3834 tx->watchdog_rx_pause = rx_pause;
3837 if (sc->need_media_probe)
3838 mxge_media_probe(sc);
/*
 * mxge_update_stats: aggregate per-slice packet counters into the
 * ifnet statistics. With IFNET_BUF_RING it also sums output bytes,
 * multicasts, buf_ring drops, and errors.
 */
3845 mxge_update_stats(mxge_softc_t *sc)
3847 struct mxge_slice_state *ss;
3846 u_long ipackets = 0;
3847 u_long opackets = 0;
3848 #ifdef IFNET_BUF_RING
3856 for (slice = 0; slice < sc->num_slices; slice++) {
3857 ss = &sc->ss[slice];
3858 ipackets += ss->ipackets;
3859 opackets += ss->opackets;
3860 #ifdef IFNET_BUF_RING
3861 obytes += ss->obytes;
3862 omcasts += ss->omcasts;
3863 odrops += ss->tx.br->br_drops;
3865 oerrors += ss->oerrors;
3867 sc->ifp->if_ipackets = ipackets;
3868 sc->ifp->if_opackets = opackets;
3869 #ifdef IFNET_BUF_RING
3870 sc->ifp->if_obytes = obytes;
3871 sc->ifp->if_omcasts = omcasts;
3872 sc->ifp->if_snd.ifq_drops = odrops;
3874 sc->ifp->if_oerrors = oerrors;
/*
 * mxge_tick: periodic callout (every mxge_ticks).
 * Under the ifnet serializer: aggregates slice stats and, every 5th
 * tick (watchdog_countdown), runs the watchdog; then re-arms itself.
 */
3878 mxge_tick(void *arg)
3880 mxge_softc_t *sc = arg;
3883 lwkt_serialize_enter(sc->ifp->if_serializer);
3884 /* aggregate stats from different slices */
3885 mxge_update_stats(sc);
3886 if (!sc->watchdog_countdown) {
3887 err = mxge_watchdog(sc);
3888 sc->watchdog_countdown = 4;
3890 sc->watchdog_countdown--;
3892 callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3893 lwkt_serialize_exit(sc->ifp->if_serializer);
3897 mxge_media_change(struct ifnet *ifp)
/*
 * mxge_change_mtu: validate and apply a new MTU.
 * Rejects real frame sizes (mtu + header + VLAN encap) above
 * sc->max_mtu or below 60. Under the serializer: if running, the
 * interface is reopened with the new MTU; on failure the old MTU is
 * restored and the interface reopened with it (best-effort).
 */
3903 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3905 struct ifnet *ifp = sc->ifp;
3906 int real_mtu, old_mtu;
3910 real_mtu = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
3911 if ((real_mtu > sc->max_mtu) || real_mtu < 60)
3913 lwkt_serialize_enter(ifp->if_serializer);
3914 old_mtu = ifp->if_mtu;
3916 if (ifp->if_flags & IFF_RUNNING) {
3918 err = mxge_open(sc);
3920 ifp->if_mtu = old_mtu;
3922 (void) mxge_open(sc);
3925 lwkt_serialize_exit(ifp->if_serializer);
/*
 * mxge_media_status: ifmedia status callback.
 * Reports IFM_AVALID always; IFM_ACTIVE and full-duplex only when
 * the cached link state is up. Active media is IFM_AUTO|IFM_ETHER.
 */
3930 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3932 mxge_softc_t *sc = ifp->if_softc;
3937 ifmr->ifm_status = IFM_AVALID;
3938 ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
3939 ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
3940 ifmr->ifm_active |= sc->link_state ? IFM_FDX : 0;
/*
 * mxge_ioctl: interface ioctl handler (serializer asserted held).
 * Visible cases: MTU change, IFF_UP/IFF_RUNNING open/close with
 * promisc/multicast refresh, multicast list updates, capability
 * toggles (TXCSUM implies dropping TSO4 when disabled; TSO4
 * requires TXCSUM; RXCSUM; LRO via mxge_change_lro_locked; VLAN
 * hwtagging), and media ioctls via ifmedia_ioctl. Everything else
 * falls through to ether_ioctl.
 * NOTE(review): decimated listing — switch labels and breaks are
 * elided between the numbered rows.
 */
3944 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data, struct ucred *cr)
3946 mxge_softc_t *sc = ifp->if_softc;
3947 struct ifreq *ifr = (struct ifreq *)data;
3952 ASSERT_SERIALIZED(ifp->if_serializer);
3956 err = ether_ioctl(ifp, command, data);
3960 err = mxge_change_mtu(sc, ifr->ifr_mtu);
3967 if (ifp->if_flags & IFF_UP) {
3968 if (!(ifp->if_flags & IFF_RUNNING)) {
3969 err = mxge_open(sc);
3971 /* take care of promis can allmulti
3973 mxge_change_promisc(sc,
3974 ifp->if_flags & IFF_PROMISC);
3975 mxge_set_multicast_list(sc);
3978 if (ifp->if_flags & IFF_RUNNING) {
3986 mxge_set_multicast_list(sc);
3990 mask = ifr->ifr_reqcap ^ ifp->if_capenable;
3991 if (mask & IFCAP_TXCSUM) {
3992 if (IFCAP_TXCSUM & ifp->if_capenable) {
/* disabling TX csum also disables TSO4, which depends on it */
3993 ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
3994 ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
3997 ifp->if_capenable |= IFCAP_TXCSUM;
3998 ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
4000 } else if (mask & IFCAP_RXCSUM) {
4001 if (IFCAP_RXCSUM & ifp->if_capenable) {
4002 ifp->if_capenable &= ~IFCAP_RXCSUM;
4005 ifp->if_capenable |= IFCAP_RXCSUM;
4009 if (mask & IFCAP_TSO4) {
4010 if (IFCAP_TSO4 & ifp->if_capenable) {
4011 ifp->if_capenable &= ~IFCAP_TSO4;
4012 ifp->if_hwassist &= ~CSUM_TSO;
4013 } else if (IFCAP_TXCSUM & ifp->if_capenable) {
4014 ifp->if_capenable |= IFCAP_TSO4;
4015 ifp->if_hwassist |= CSUM_TSO;
4017 kprintf("mxge requires tx checksum offload"
4018 " be enabled to use TSO\n");
4022 if (mask & IFCAP_LRO) {
4023 if (IFCAP_LRO & ifp->if_capenable)
4024 err = mxge_change_lro_locked(sc, 0);
4026 err = mxge_change_lro_locked(sc, mxge_lro_cnt);
4028 if (mask & IFCAP_VLAN_HWTAGGING)
4029 ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
4030 VLAN_CAPABILITIES(ifp);
4035 err = ifmedia_ioctl(ifp, (struct ifreq *)data,
4036 &sc->media, command);
/*
 * mxge_fetch_tunables: load hw.mxge.* tunables and clamp them to
 * sane values: intr_coal_delay forced to 30 if outside [0, 10000],
 * ticks defaults to hz/2, rss_hash_type clamped to SRC_PORT when
 * out of range, initial_mtu clamped to ETHERMTU_JUMBO. lro_cnt from
 * the softc overrides the global mxge_lro_cnt when nonzero.
 */
4046 mxge_fetch_tunables(mxge_softc_t *sc)
4049 TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4050 TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4051 &mxge_flow_control);
4052 TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4053 &mxge_intr_coal_delay);
4054 TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4055 &mxge_nvidia_ecrc_enable);
4056 TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4057 &mxge_force_firmware);
4058 TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4059 &mxge_deassert_wait);
4060 TUNABLE_INT_FETCH("hw.mxge.verbose",
4062 TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4063 TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
4064 TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
4065 TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4066 TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4067 if (sc->lro_cnt != 0)
4068 mxge_lro_cnt = sc->lro_cnt;
4072 if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4073 mxge_intr_coal_delay = 30;
4074 if (mxge_ticks == 0)
4075 mxge_ticks = hz / 2;
4076 sc->pause = mxge_flow_control;
4077 if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4078 || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4079 mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
4081 if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4082 mxge_initial_mtu < ETHER_MIN_LEN)
4083 mxge_initial_mtu = ETHERMTU_JUMBO;
/*
 * mxge_free_slices: free per-slice firmware-stats and rx_done DMA
 * regions (plus the buf_ring under IFNET_BUF_RING), then the slice
 * array itself. Inverse of mxge_alloc_slices.
 */
4088 mxge_free_slices(mxge_softc_t *sc)
4090 struct mxge_slice_state *ss;
4097 for (i = 0; i < sc->num_slices; i++) {
4099 if (ss->fw_stats != NULL) {
4100 mxge_dma_free(&ss->fw_stats_dma);
4101 ss->fw_stats = NULL;
4102 #ifdef IFNET_BUF_RING
4103 if (ss->tx.br != NULL) {
4104 drbr_free(ss->tx.br, M_DEVBUF);
4109 if (ss->rx_done.entry != NULL) {
4110 mxge_dma_free(&ss->rx_done.dma);
4111 ss->rx_done.entry = NULL;
4114 kfree(sc->ss, M_DEVBUF);
/*
 * mxge_alloc_slices: allocate the slice array and per-slice DMA
 * resources. Queries the firmware RX ring size to size the
 * interrupt queue (2 entries per RX descriptor), then for each
 * slice allocates a 4KB-aligned rx_done DMA region and a 64-byte
 * aligned firmware stats block (stats only on slice 0 unless
 * IFNET_BUF_RING, which also allocates a 2048-entry buf_ring).
 * On failure (elided label) everything is released via
 * mxge_free_slices.
 */
4119 mxge_alloc_slices(mxge_softc_t *sc)
4122 struct mxge_slice_state *ss;
4124 int err, i, max_intr_slots;
4126 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4128 device_printf(sc->dev, "Cannot determine rx ring size\n");
4131 sc->rx_ring_size = cmd.data0;
4132 max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4134 bytes = sizeof (*sc->ss) * sc->num_slices;
4135 sc->ss = kmalloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4138 for (i = 0; i < sc->num_slices; i++) {
4143 /* allocate per-slice rx interrupt queues */
4145 bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4146 err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4149 ss->rx_done.entry = ss->rx_done.dma.addr;
4150 bzero(ss->rx_done.entry, bytes);
4153 * allocate the per-slice firmware stats; stats
4154 * (including tx) are used used only on the first
4157 #ifndef IFNET_BUF_RING
4162 bytes = sizeof (*ss->fw_stats);
4163 err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4164 sizeof (*ss->fw_stats), 64);
4167 ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4168 #ifdef IFNET_BUF_RING
4169 ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4177 mxge_free_slices(sc);
/*
 * mxge_slice_probe: decide how many slices (RSS queues) to use.
 * Bails to one slice when multi-slice is disabled, the system is
 * not SMP, or too few MSI-X vectors exist. Loads the RSS-capable
 * firmware variant, resets, sizes the interrupt queue, asks the
 * firmware for its max RSS queues, then caps num_slices by MSI-X
 * count, CPU count (when max_slices == -1) or mxge_max_slices, and
 * rounds down to a power of two. On any failure, restores the
 * original (non-RSS) firmware.
 */
4182 mxge_slice_probe(mxge_softc_t *sc)
4186 int msix_cnt, status, max_intr_slots;
4190 * don't enable multiple slices if they are not enabled,
4191 * or if this is not an SMP system
4194 if (mxge_max_slices == 0 || mxge_max_slices == 1 || ncpus < 2)
4197 /* see how many MSI-X interrupts are available */
4198 msix_cnt = pci_msix_count(sc->dev);
4202 /* now load the slice aware firmware see what it supports */
4203 old_fw = sc->fw_name;
4204 if (old_fw == mxge_fw_aligned)
4205 sc->fw_name = mxge_fw_rss_aligned;
4207 sc->fw_name = mxge_fw_rss_unaligned;
4208 status = mxge_load_firmware(sc, 0);
4210 device_printf(sc->dev, "Falling back to a single slice\n");
4214 /* try to send a reset command to the card to see if it
4216 memset(&cmd, 0, sizeof (cmd));
4217 status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4219 device_printf(sc->dev, "failed reset\n");
4223 /* get rx ring size */
4224 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4226 device_printf(sc->dev, "Cannot determine rx ring size\n");
4229 max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4231 /* tell it the size of the interrupt queues */
4232 cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4233 status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4235 device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4239 /* ask the maximum number of slices it supports */
4240 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4242 device_printf(sc->dev,
4243 "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4246 sc->num_slices = cmd.data0;
4247 if (sc->num_slices > msix_cnt)
4248 sc->num_slices = msix_cnt;
4250 if (mxge_max_slices == -1) {
4251 /* cap to number of CPUs in system */
4252 if (sc->num_slices > ncpus)
4253 sc->num_slices = ncpus;
4255 if (sc->num_slices > mxge_max_slices)
4256 sc->num_slices = mxge_max_slices;
4258 /* make sure it is a power of two */
4259 while (sc->num_slices & (sc->num_slices - 1))
4263 device_printf(sc->dev, "using %d slices\n",
/* fallback (label elided): restore the non-RSS firmware */
4269 sc->fw_name = old_fw;
4270 (void) mxge_load_firmware(sc, 0);
/*
 * mxge_add_msix_irqs: allocate and wire one MSI-X vector per slice.
 * Maps the MSI-X table BAR, allocates num_slices messages (failing
 * if fewer are granted, with a hint to lower hw.mxge.max_slices),
 * allocates an IRQ resource and sets up mxge_intr for each slice
 * under the ifnet serializer. Unwinds fully on failure via the
 * abort_with_* labels (teardown intrs, release IRQ resources,
 * release MSI, release the table resource).
 * NOTE(review): decimated listing — rid setup, return statements
 * and some labels are elided.
 */
4274 mxge_add_msix_irqs(mxge_softc_t *sc)
4277 int count, err, i, rid;
4280 sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4283 if (sc->msix_table_res == NULL) {
4284 device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4288 count = sc->num_slices;
4289 err = pci_alloc_msix(sc->dev, &count);
4291 device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d"
4292 "err = %d \n", sc->num_slices, err);
4293 goto abort_with_msix_table;
4295 if (count < sc->num_slices) {
4296 device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4297 count, sc->num_slices);
4298 device_printf(sc->dev,
4299 "Try setting hw.mxge.max_slices to %d\n",
4302 goto abort_with_msix;
4304 bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4305 sc->msix_irq_res = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4306 if (sc->msix_irq_res == NULL) {
4308 goto abort_with_msix;
4311 for (i = 0; i < sc->num_slices; i++) {
4313 sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4316 if (sc->msix_irq_res[i] == NULL) {
4317 device_printf(sc->dev, "couldn't allocate IRQ res"
4318 " for message %d\n", i);
4320 goto abort_with_res;
4324 bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4325 sc->msix_ih = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4327 for (i = 0; i < sc->num_slices; i++) {
4328 err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4330 mxge_intr, &sc->ss[i], &sc->msix_ih[i],
4331 sc->ifp->if_serializer);
4333 device_printf(sc->dev, "couldn't setup intr for "
4335 goto abort_with_intr;
4340 device_printf(sc->dev, "using %d msix IRQs:",
4342 for (i = 0; i < sc->num_slices; i++)
4343 kprintf(" %ld", rman_get_start(sc->msix_irq_res[i]));
/* --- error unwind --- */
4349 for (i = 0; i < sc->num_slices; i++) {
4350 if (sc->msix_ih[i] != NULL) {
4351 bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4353 sc->msix_ih[i] = NULL;
4356 kfree(sc->msix_ih, M_DEVBUF);
4360 for (i = 0; i < sc->num_slices; i++) {
4362 if (sc->msix_irq_res[i] != NULL)
4363 bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4364 sc->msix_irq_res[i]);
4365 sc->msix_irq_res[i] = NULL;
4367 kfree(sc->msix_irq_res, M_DEVBUF);
4371 pci_release_msi(sc->dev);
4373 abort_with_msix_table:
4374 bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4375 sc->msix_table_res);
4381 mxge_add_single_irq(mxge_softc_t *sc)
4383 int count, err, rid;
4385 count = pci_msi_count(sc->dev);
4386 if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4392 sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0,
4393 1, RF_SHAREABLE | RF_ACTIVE);
4394 if (sc->irq_res == NULL) {
4395 device_printf(sc->dev, "could not alloc interrupt\n");
4399 device_printf(sc->dev, "using %s irq %ld\n",
4400 sc->legacy_irq ? "INTx" : "MSI",
4401 rman_get_start(sc->irq_res));
4402 err = bus_setup_intr(sc->dev, sc->irq_res,
4404 mxge_intr, &sc->ss[0], &sc->ih,
4405 sc->ifp->if_serializer);
4407 bus_release_resource(sc->dev, SYS_RES_IRQ,
4408 sc->legacy_irq ? 0 : 1, sc->irq_res);
4409 if (!sc->legacy_irq)
4410 pci_release_msi(sc->dev);
4416 mxge_rem_msix_irqs(mxge_softc_t *sc)
4420 for (i = 0; i < sc->num_slices; i++) {
4421 if (sc->msix_ih[i] != NULL) {
4422 bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4424 sc->msix_ih[i] = NULL;
4427 kfree(sc->msix_ih, M_DEVBUF);
4429 for (i = 0; i < sc->num_slices; i++) {
4431 if (sc->msix_irq_res[i] != NULL)
4432 bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4433 sc->msix_irq_res[i]);
4434 sc->msix_irq_res[i] = NULL;
4436 kfree(sc->msix_irq_res, M_DEVBUF);
4438 bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4439 sc->msix_table_res);
4441 pci_release_msi(sc->dev);
4446 mxge_rem_single_irq(mxge_softc_t *sc)
4448 bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4449 bus_release_resource(sc->dev, SYS_RES_IRQ,
4450 sc->legacy_irq ? 0 : 1, sc->irq_res);
4451 if (!sc->legacy_irq)
4452 pci_release_msi(sc->dev);
4456 mxge_rem_irq(mxge_softc_t *sc)
4458 if (sc->num_slices > 1)
4459 mxge_rem_msix_irqs(sc);
4461 mxge_rem_single_irq(sc);
4465 mxge_add_irq(mxge_softc_t *sc)
4469 if (sc->num_slices > 1)
4470 err = mxge_add_msix_irqs(sc);
4472 err = mxge_add_single_irq(sc);
4474 if (0 && err == 0 && sc->num_slices > 1) {
4475 mxge_rem_msix_irqs(sc);
4476 err = mxge_add_msix_irqs(sc);
4483 mxge_attach(device_t dev)
4485 mxge_softc_t *sc = device_get_softc(dev);
4486 struct ifnet *ifp = &sc->arpcom.ac_if;
4490 * avoid rewriting half the lines in this file to use
4491 * &sc->arpcom.ac_if instead
4495 mxge_fetch_tunables(sc);
4497 err = bus_dma_tag_create(NULL, /* parent */
4500 BUS_SPACE_MAXADDR, /* low */
4501 BUS_SPACE_MAXADDR, /* high */
4502 NULL, NULL, /* filter */
4503 65536 + 256, /* maxsize */
4504 MXGE_MAX_SEND_DESC, /* num segs */
4505 65536, /* maxsegsize */
4507 &sc->parent_dmat); /* tag */
4510 device_printf(sc->dev, "Err %d allocating parent dmat\n",
4512 goto abort_with_nothing;
4516 if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4518 callout_init_mp(&sc->co_hdl);
4520 mxge_setup_cfg_space(sc);
4522 /* Map the board into the kernel */
4524 sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
4526 if (sc->mem_res == NULL) {
4527 device_printf(dev, "could not map memory\n");
4529 goto abort_with_nothing;
4531 sc->sram = rman_get_virtual(sc->mem_res);
4532 sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4533 if (sc->sram_size > rman_get_size(sc->mem_res)) {
4534 device_printf(dev, "impossible memory region size %ld\n",
4535 rman_get_size(sc->mem_res));
4537 goto abort_with_mem_res;
4540 /* make NULL terminated copy of the EEPROM strings section of
4542 bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4543 bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4544 rman_get_bushandle(sc->mem_res),
4545 sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4547 MXGE_EEPROM_STRINGS_SIZE - 2);
4548 err = mxge_parse_strings(sc);
4550 goto abort_with_mem_res;
4552 /* Enable write combining for efficient use of PCIe bus */
4555 /* Allocate the out of band dma memory */
4556 err = mxge_dma_alloc(sc, &sc->cmd_dma,
4557 sizeof (mxge_cmd_t), 64);
4559 goto abort_with_mem_res;
4560 sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4561 err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4563 goto abort_with_cmd_dma;
4565 err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4567 goto abort_with_zeropad_dma;
4569 /* select & load the firmware */
4570 err = mxge_select_firmware(sc);
4572 goto abort_with_dmabench;
4573 sc->intr_coal_delay = mxge_intr_coal_delay;
4575 mxge_slice_probe(sc);
4576 err = mxge_alloc_slices(sc);
4578 goto abort_with_dmabench;
4580 err = mxge_reset(sc, 0);
4582 goto abort_with_slices;
4584 err = mxge_alloc_rings(sc);
4586 device_printf(sc->dev, "failed to allocate rings\n");
4587 goto abort_with_dmabench;
4590 err = mxge_add_irq(sc);
4592 device_printf(sc->dev, "failed to add irq\n");
4593 goto abort_with_rings;
4596 ifp->if_baudrate = IF_Gbps(10UL);
4597 ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4600 ifp->if_capabilities |= IFCAP_LRO;
4603 #ifdef MXGE_NEW_VLAN_API
4604 ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4607 sc->max_mtu = mxge_max_mtu(sc);
4608 if (sc->max_mtu >= 9000)
4609 ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4611 device_printf(dev, "MTU limited to %d. Install "
4612 "latest firmware for 9000 byte jumbo support\n",
4613 sc->max_mtu - ETHER_HDR_LEN);
4614 ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4615 ifp->if_capenable = ifp->if_capabilities;
4616 if (sc->lro_cnt == 0)
4617 ifp->if_capenable &= ~IFCAP_LRO;
4619 ifp->if_init = mxge_init;
4621 ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4622 ifp->if_ioctl = mxge_ioctl;
4623 ifp->if_start = mxge_start;
4624 /* Initialise the ifmedia structure */
4625 ifmedia_init(&sc->media, 0, mxge_media_change,
4627 mxge_set_media(sc, IFM_ETHER | IFM_AUTO);
4628 mxge_media_probe(sc);
4630 ether_ifattach(ifp, sc->mac_addr, NULL);
4631 /* ether_ifattach sets mtu to ETHERMTU */
4632 if (mxge_initial_mtu != ETHERMTU)
4633 mxge_change_mtu(sc, mxge_initial_mtu);
4635 mxge_add_sysctls(sc);
4636 #ifdef IFNET_BUF_RING
4637 ifp->if_transmit = mxge_transmit;
4638 ifp->if_qflush = mxge_qflush;
4643 mxge_free_rings(sc);
4645 mxge_free_slices(sc);
4646 abort_with_dmabench:
4647 mxge_dma_free(&sc->dmabench_dma);
4648 abort_with_zeropad_dma:
4649 mxge_dma_free(&sc->zeropad_dma);
4651 mxge_dma_free(&sc->cmd_dma);
4653 bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4654 pci_disable_busmaster(dev);
4655 bus_dma_tag_destroy(sc->parent_dmat);
4661 mxge_detach(device_t dev)
4663 mxge_softc_t *sc = device_get_softc(dev);
4665 lwkt_serialize_enter(sc->ifp->if_serializer);
4667 if (sc->ifp->if_flags & IFF_RUNNING)
4670 * XXX: race: the callout callback could be spinning on
4671 * the serializer and run anyway
4673 callout_stop(&sc->co_hdl);
4674 lwkt_serialize_exit(sc->ifp->if_serializer);
4676 ether_ifdetach(sc->ifp);
4677 ifmedia_removeall(&sc->media);
4678 mxge_dummy_rdma(sc, 0);
4679 mxge_rem_sysctls(sc);
4681 mxge_free_rings(sc);
4682 mxge_free_slices(sc);
4683 mxge_dma_free(&sc->dmabench_dma);
4684 mxge_dma_free(&sc->zeropad_dma);
4685 mxge_dma_free(&sc->cmd_dma);
4686 bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4687 pci_disable_busmaster(dev);
4688 bus_dma_tag_destroy(sc->parent_dmat);
4693 mxge_shutdown(device_t dev)
4699 This file uses Myri10GE driver indentation.
4702 c-file-style:"linux"