1 /******************************************************************************
3 Copyright (c) 2006-2009, Myricom Inc.
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
9 1. Redistributions of source code must retain the above copyright notice,
10 this list of conditions and the following disclaimer.
12 2. Neither the name of the Myricom Inc, nor the names of its
13 contributors may be used to endorse or promote products derived from
14 this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
28 ***************************************************************************/
30 #include <sys/cdefs.h>
31 /*__FBSDID("$FreeBSD: src/sys/dev/mxge/if_mxge.c,v 1.63 2009/06/26 11:45:06 rwatson Exp $");*/
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/linker.h>
36 #include <sys/firmware.h>
37 #include <sys/endian.h>
38 #include <sys/in_cksum.h>
39 #include <sys/sockio.h>
41 #include <sys/malloc.h>
42 #include <sys/kernel.h>
43 #include <sys/module.h>
44 #include <sys/serialize.h>
45 #include <sys/socket.h>
46 #include <sys/sysctl.h>
48 /* count xmits ourselves, rather than via drbr */
51 #include <net/if_arp.h>
52 #include <net/ifq_var.h>
53 #include <net/ethernet.h>
54 #include <net/if_dl.h>
55 #include <net/if_media.h>
59 #include <net/if_types.h>
60 #include <net/vlan/if_vlan_var.h>
63 #include <netinet/in_systm.h>
64 #include <netinet/in.h>
65 #include <netinet/ip.h>
66 #include <netinet/tcp.h>
71 #include <bus/pci/pcireg.h>
72 #include <bus/pci/pcivar.h>
73 #include <bus/pci/pci_private.h> /* XXX for pci_cfg_restore */
75 #include <vm/vm.h> /* for pmap_mapdev() */
78 #if defined(__i386) || defined(__amd64)
79 #include <machine/specialreg.h>
82 #include <dev/netif/mxge/mxge_mcp.h>
83 #include <dev/netif/mxge/mcp_gen_header.h>
84 /*#define MXGE_FAKE_IFP*/
85 #include <dev/netif/mxge/if_mxge_var.h>
87 #include <sys/buf_ring.h>
/*
 * Driver-wide tunable defaults.
 * NOTE(review): this listing has elided lines (embedded original line
 * numbers jump); the TUNABLE_* / sysctl hookups for these knobs are
 * presumably among the missing lines — verify against the full source.
 */
93 static int mxge_nvidia_ecrc_enable = 1;
94 static int mxge_force_firmware = 0;
95 static int mxge_intr_coal_delay = 30;
96 static int mxge_deassert_wait = 1;
97 static int mxge_flow_control = 1;
98 static int mxge_verbose = 0;
99 static int mxge_lro_cnt = 8;
100 static int mxge_ticks;
101 static int mxge_max_slices = 1;
102 static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
103 static int mxge_always_promisc = 0;
104 static int mxge_initial_mtu = ETHERMTU_JUMBO;
105 static char *mxge_fw_unaligned = "mxge_ethp_z8e";
106 static char *mxge_fw_aligned = "mxge_eth_z8e";
107 static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
108 static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
/* Forward declarations for the newbus device-interface entry points. */
110 static int mxge_probe(device_t dev);
111 static int mxge_attach(device_t dev);
112 static int mxge_detach(device_t dev);
113 static int mxge_shutdown(device_t dev);
114 static void mxge_intr(void *arg);
/*
 * newbus glue: method table, driver descriptor, devclass, and module
 * registration/dependencies.
 * NOTE(review): braces, the DEVMETHOD_END sentinel, and the driver_t
 * name/methods fields appear to be among the lines elided from this
 * listing — confirm against the full source before editing.
 */
116 static device_method_t mxge_methods[] =
118 /* Device interface */
119 DEVMETHOD(device_probe, mxge_probe),
120 DEVMETHOD(device_attach, mxge_attach),
121 DEVMETHOD(device_detach, mxge_detach),
122 DEVMETHOD(device_shutdown, mxge_shutdown),
126 static driver_t mxge_driver =
130 sizeof(mxge_softc_t),
133 static devclass_t mxge_devclass;
135 /* Declare ourselves to be a child of the PCI bus.*/
136 DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
137 MODULE_DEPEND(mxge, firmware, 1, 1, 1);
138 MODULE_DEPEND(mxge, zlib, 1, 1, 1);
/* Forward declarations for internal helpers referenced before definition. */
140 static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
141 static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
142 static int mxge_close(mxge_softc_t *sc);
143 static int mxge_open(mxge_softc_t *sc);
144 static void mxge_tick(void *arg);
/*
 * LRO placeholders. Per the XXX note below, LRO is not implemented;
 * only the signatures are visible here (bodies elided from this listing).
 */
146 /* XXX: we don't have Large Receive Offload support yet */
148 mxge_lro_rx(struct mxge_slice_state *ss, struct mbuf *m_head, uint32_t csum)
157 mxge_lro_flush(struct mxge_slice_state *ss, struct lro_entry *lro)
/*
 * Device probe: match Myricom Z8E/Z8E_9 PCI IDs and set a description
 * keyed off the PCI revision. NOTE(review): the enclosing switch
 * statement, break/return lines, and the failure return are elided
 * from this listing.
 */
164 mxge_probe(device_t dev)
169 if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
170 ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
171 (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
172 rev = pci_get_revid(dev);
174 case MXGE_PCI_REV_Z8E:
175 device_set_desc(dev, "Myri10G-PCIE-8A");
177 case MXGE_PCI_REV_Z8ES:
178 device_set_desc(dev, "Myri10G-PCIE-8B");
181 device_set_desc(dev, "Myri10G-PCIE-8??");
182 device_printf(dev, "Unrecognized rev %d NIC\n",
/*
 * Attempt to enable write-combining PIO to the NIC SRAM window on
 * x86/amd64 via pmap_change_attr(); on failure (or other arches) the
 * sc->wc flag is cleared. Lines elided from this listing.
 */
192 mxge_enable_wc(mxge_softc_t *sc)
195 #if defined(__i386) || defined(__amd64)
200 len = rman_get_size(sc->mem_res);
201 err = pmap_change_attr((vm_offset_t) sc->sram,
202 len, PAT_WRITE_COMBINING);
204 device_printf(sc->dev, "pmap_change_attr failed, %d\n",
210 sc->wc = 0; /* TBD: PAT support */
/*
 * bus_dmamap_load() callback: stores the first segment's bus address
 * into the caller-supplied bus_addr_t (arg). Used by mxge_dma_alloc().
 */
215 /* callback to get our DMA address */
217 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
221 *(bus_addr_t *) arg = segs->ds_addr;
/*
 * Allocate a coherent DMA-able buffer of `bytes` with the given
 * alignment: create a tag, allocate+zero the memory, and load the map
 * to obtain dma->bus_addr. On failure, unwinds via the abort_ labels.
 * NOTE(review): the boundary/maxsegsize setup, error checks, and label
 * lines are partially elided from this listing.
 */
226 mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
227 bus_size_t alignment)
230 device_t dev = sc->dev;
231 bus_size_t boundary, maxsegsize;
233 if (bytes > 4096 && alignment == 4096) {
241 /* allocate DMAable memory tags */
242 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
243 alignment, /* alignment */
244 boundary, /* boundary */
245 BUS_SPACE_MAXADDR, /* low */
246 BUS_SPACE_MAXADDR, /* high */
247 NULL, NULL, /* filter */
250 maxsegsize, /* maxsegsize */
251 BUS_DMA_COHERENT, /* flags */
252 &dma->dmat); /* tag */
254 device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
258 /* allocate DMAable memory & map */
259 err = bus_dmamem_alloc(dma->dmat, &dma->addr,
260 (BUS_DMA_WAITOK | BUS_DMA_COHERENT
261 | BUS_DMA_ZERO), &dma->map);
263 device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
264 goto abort_with_dmat;
267 /* load the memory */
268 err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
269 mxge_dmamap_callback,
270 (void *)&dma->bus_addr, 0);
272 device_printf(dev, "couldn't load map (err = %d)\n", err);
278 bus_dmamem_free(dma->dmat, dma->addr, dma->map);
280 (void)bus_dma_tag_destroy(dma->dmat);
/*
 * Release a buffer allocated by mxge_dma_alloc(): unload the map,
 * free the memory, and destroy the tag (reverse order of creation).
 */
286 mxge_dma_free(mxge_dma_t *dma)
288 bus_dmamap_unload(dma->dmat, dma->map);
289 bus_dmamem_free(dma->dmat, dma->addr, dma->map);
290 (void)bus_dma_tag_destroy(dma->dmat);
/*
 * Parse the EEPROM ID strings ("MAC=", "PC=", "SN=") into the softc.
 * MAC bytes are parsed as hex via strtoul; product code and serial
 * number are copied with strncpy. NOTE(review): strncpy here relies on
 * the destination buffers being pre-zeroed for NUL termination —
 * confirm in the full source; several lines are elided.
 */
294 * The eeprom strings on the lanaiX have the format
301 mxge_parse_strings(mxge_softc_t *sc)
303 #define MXGE_NEXT_STRING(p) while(ptr < limit && *ptr++)
308 ptr = sc->eeprom_strings;
309 limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
311 while (ptr < limit && *ptr != '\0') {
312 if (memcmp(ptr, "MAC=", 4) == 0) {
314 sc->mac_addr_string = ptr;
315 for (i = 0; i < 6; i++) {
317 if ((ptr + 2) > limit)
319 sc->mac_addr[i] = strtoul(ptr, NULL, 16);
322 } else if (memcmp(ptr, "PC=", 3) == 0) {
324 strncpy(sc->product_code_string, ptr,
325 sizeof (sc->product_code_string) - 1);
326 } else if (memcmp(ptr, "SN=", 3) == 0) {
328 strncpy(sc->serial_number_string, ptr,
329 sizeof (sc->serial_number_string) - 1);
331 MXGE_NEXT_STRING(ptr);
338 device_printf(sc->dev, "failed to parse eeprom_strings\n");
/*
 * x86/amd64 only: enable ECRC generation on an upstream Nvidia
 * (ck804/mcp55) PCIe bridge so completions arrive aligned. Because
 * extended config space beyond 0xff cannot be reached through normal
 * config cycles here, the bridge's config window is located via
 * BUS_READ_IVAR (bus/slot/func) and mapped directly with pmap_mapdev();
 * the mapping is validated against the expected vendor/device IDs
 * before touching register 0x178. NOTE(review): many lines (returns,
 * the magic ck804 base constant, the 0x040 ECRC-enable write) are
 * elided from this listing.
 */
343 #if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
345 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
348 unsigned long base, off;
350 device_t pdev, mcp55;
351 uint16_t vendor_id, device_id, word;
352 uintptr_t bus, slot, func, ivend, idev;
356 if (!mxge_nvidia_ecrc_enable)
359 pdev = device_get_parent(device_get_parent(sc->dev));
361 device_printf(sc->dev, "could not find parent?\n");
364 vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
365 device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
367 if (vendor_id != 0x10de)
372 if (device_id == 0x005d) {
373 /* ck804, base address is magic */
375 } else if (device_id >= 0x0374 && device_id <= 0x378) {
376 /* mcp55, base address stored in chipset */
377 mcp55 = pci_find_bsf(0, 0, 0);
379 0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
380 0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
381 word = pci_read_config(mcp55, 0x90, 2);
382 base = ((unsigned long)word & 0x7ffeU) << 25;
389 Test below is commented because it is believed that doing
390 config read/write beyond 0xff will access the config space
391 for the next larger function. Uncomment this and remove
392 the hacky pmap_mapdev() way of accessing config space when
393 FreeBSD grows support for extended pcie config space access
396 /* See if we can, by some miracle, access the extended
398 val = pci_read_config(pdev, 0x178, 4);
399 if (val != 0xffffffff) {
401 pci_write_config(pdev, 0x178, val, 4);
405 /* Rather than using normal pci config space writes, we must
406 * map the Nvidia config space ourselves. This is because on
407 * opteron/nvidia class machine the 0xe000000 mapping is
408 * handled by the nvidia chipset, that means the internal PCI
409 * device (the on-chip northbridge), or the amd-8131 bridge
410 * and things behind them are not visible by this method.
413 BUS_READ_IVAR(device_get_parent(pdev), pdev,
415 BUS_READ_IVAR(device_get_parent(pdev), pdev,
416 PCI_IVAR_SLOT, &slot);
417 BUS_READ_IVAR(device_get_parent(pdev), pdev,
418 PCI_IVAR_FUNCTION, &func);
419 BUS_READ_IVAR(device_get_parent(pdev), pdev,
420 PCI_IVAR_VENDOR, &ivend);
421 BUS_READ_IVAR(device_get_parent(pdev), pdev,
422 PCI_IVAR_DEVICE, &idev);
425 + 0x00100000UL * (unsigned long)bus
426 + 0x00001000UL * (unsigned long)(func
429 /* map it into the kernel */
430 va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
434 device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
437 /* get a pointer to the config space mapped into the kernel */
438 cfgptr = va + (off & PAGE_MASK);
440 /* make sure that we can really access it */
441 vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
442 device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
443 if (! (vendor_id == ivend && device_id == idev)) {
444 device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
445 vendor_id, device_id);
446 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
450 ptr32 = (uint32_t*)(cfgptr + 0x178);
453 if (val == 0xffffffff) {
454 device_printf(sc->dev, "extended mapping failed\n");
455 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
459 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
461 device_printf(sc->dev,
462 "Enabled ECRC on upstream Nvidia bridge "
464 (int)bus, (int)slot, (int)func);
/* Non-x86 fallback: ECRC enabling is impossible here; just complain. */
469 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
471 device_printf(sc->dev,
472 "Nforce 4 chipset on non-x86/amd64!?!?!\n");
/*
 * Run the firmware DMA benchmark against the dmabench buffer. The
 * cmd.data2 multiplier selects read (0x10000), write (0x1), or
 * read+write (0x10001); cmd.data0 returns transfers<<16 | ticks
 * (0.5us each), from which MB/s figures are derived into sc->read_dma,
 * sc->write_dma, and sc->read_write_dma. NOTE(review): the early
 * aborts between sub-tests are elided from this listing.
 */
479 mxge_dma_test(mxge_softc_t *sc, int test_type)
482 bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
488 /* Run a small DMA test.
489 * The magic multipliers to the length tell the firmware
490 * to do DMA read, write, or read+write tests. The
491 * results are returned in cmd.data0. The upper 16
492 * bits of the return is the number of transfers completed.
493 * The lower 16 bits is the time in 0.5us ticks that the
494 * transfers took to complete.
497 len = sc->tx_boundary;
499 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
500 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
501 cmd.data2 = len * 0x10000;
502 status = mxge_send_cmd(sc, test_type, &cmd);
507 sc->read_dma = ((cmd.data0>>16) * len * 2) /
508 (cmd.data0 & 0xffff);
509 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
510 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
511 cmd.data2 = len * 0x1;
512 status = mxge_send_cmd(sc, test_type, &cmd);
517 sc->write_dma = ((cmd.data0>>16) * len * 2) /
518 (cmd.data0 & 0xffff);
520 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
521 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
522 cmd.data2 = len * 0x10001;
523 status = mxge_send_cmd(sc, test_type, &cmd);
528 sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
529 (cmd.data0 & 0xffff);
532 if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
533 device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
/*
 * Probe whether this host delivers aligned PCIe completions (see the
 * long rationale comment below): assume a 4KB tx_boundary, verify the
 * Max Read Request size, load the aligned firmware, try to enable
 * ECRC on Nvidia bridges, then run the firmware's unaligned-completion
 * detector. Returns 0 to keep the aligned firmware. NOTE(review):
 * lines are elided from this listing.
 */
540 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
541 * when the PCI-E Completion packets are aligned on an 8-byte
542 * boundary. Some PCI-E chip sets always align Completion packets; on
543 * the ones that do not, the alignment can be enforced by enabling
544 * ECRC generation (if supported).
546 * When PCI-E Completion packets are not aligned, it is actually more
547 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
549 * If the driver can neither enable ECRC nor verify that it has
550 * already been enabled, then it must use a firmware image which works
551 * around unaligned completion packets (ethp_z8e.dat), and it should
552 * also ensure that it never gives the device a Read-DMA which is
553 * larger than 2KB by setting the tx_boundary to 2KB. If ECRC is
554 * enabled, then the driver should use the aligned (eth_z8e.dat)
555 * firmware image, and set tx_boundary to 4KB.
559 mxge_firmware_probe(mxge_softc_t *sc)
561 device_t dev = sc->dev;
565 sc->tx_boundary = 4096;
567 * Verify the max read request size was set to 4KB
568 * before trying the test with 4KB.
570 if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
571 pectl = pci_read_config(dev, reg + 0x8, 2);
572 if ((pectl & (5 << 12)) != (5 << 12)) {
573 device_printf(dev, "Max Read Req. size != 4k (0x%x\n",
575 sc->tx_boundary = 2048;
580 * load the optimized firmware (which assumes aligned PCIe
581 * completions) in order to see if it works on this host.
583 sc->fw_name = mxge_fw_aligned;
584 status = mxge_load_firmware(sc, 1);
590 * Enable ECRC if possible
592 mxge_enable_nvidia_ecrc(sc);
595 * Run a DMA test which watches for unaligned completions and
596 * aborts on the first one seen.
599 status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
601 return 0; /* keep the aligned firmware */
604 device_printf(dev, "DMA test failed: %d\n", status);
605 if (status == ENOSYS)
606 device_printf(dev, "Falling back to ethp! "
607 "Please install up to date fw\n");
/*
 * Choose aligned vs. unaligned firmware: honor the mxge_force_firmware
 * tunable, use aligned firmware on narrow (<= x4) links, otherwise
 * fall back on mxge_firmware_probe(); then load the chosen image.
 * NOTE(review): the branches setting `aligned` are elided here.
 */
612 mxge_select_firmware(mxge_softc_t *sc)
617 if (mxge_force_firmware != 0) {
618 if (mxge_force_firmware == 1)
623 device_printf(sc->dev,
624 "Assuming %s completions (forced)\n",
625 aligned ? "aligned" : "unaligned");
629 /* if the PCIe link width is 4 or less, we can use the aligned
630 firmware and skip any checks */
631 if (sc->link_width != 0 && sc->link_width <= 4) {
632 device_printf(sc->dev,
633 "PCIe x%d Link, expect reduced performance\n",
639 if (0 == mxge_firmware_probe(sc))
644 sc->fw_name = mxge_fw_aligned;
645 sc->tx_boundary = 4096;
647 sc->fw_name = mxge_fw_unaligned;
648 sc->tx_boundary = 2048;
650 return (mxge_load_firmware(sc, 0));
/*
 * Sanity-check a firmware image header: verify the big-endian mcp_type
 * is MCP_TYPE_ETH, record the version string for sysctl, and check the
 * major/minor version against what the driver was built for.
 * NOTE(review): return statements are elided from this listing.
 */
660 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
664 if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
665 device_printf(sc->dev, "Bad firmware type: 0x%x\n",
666 be32toh(hdr->mcp_type));
670 /* save firmware version for sysctl */
671 strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
673 device_printf(sc->dev, "firmware id: %s\n", hdr->version);
675 ksscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
676 &sc->fw_ver_minor, &sc->fw_ver_tiny);
678 if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
679 && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
680 device_printf(sc->dev, "Found firmware version %s\n",
682 device_printf(sc->dev, "Driver needs %d.%d\n",
683 MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
/* zlib allocator hook: kmalloc-backed, non-blocking. */
692 z_alloc(void *nil, u_int items, u_int size)
696 ptr = kmalloc(items * size, M_TEMP, M_NOWAIT);
/* zlib deallocator hook: frees memory obtained by z_alloc(). */
701 z_free(void *nil, void *ptr)
/*
 * Locate the zlib-compressed firmware image by name, inflate it (the
 * uncompressed size is smuggled in fw->version), validate its embedded
 * mcp_gen_header, and copy it into NIC SRAM at MXGE_FW_OFFSET in
 * 256-byte PIO chunks. Cleans up via the abort_with_* labels.
 * NOTE(review): zalloc/zfree hookup, several error returns, and the
 * final label lines are elided from this listing.
 */
708 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
711 const mcp_gen_header_t *hdr;
718 fw = firmware_image_load(sc->fw_name, NULL);
720 device_printf(sc->dev, "Could not find firmware image %s\n",
725 /* setup zlib and decompress f/w */
726 bzero(&zs, sizeof (zs));
729 status = inflateInit(&zs);
730 if (status != Z_OK) {
735 /* the uncompressed size is stored as the firmware version,
736 which would otherwise go unused */
737 fw_len = (size_t) fw->version;
738 inflate_buffer = kmalloc(fw_len, M_TEMP, M_NOWAIT);
739 if (inflate_buffer == NULL)
741 zs.avail_in = fw->datasize;
742 zs.next_in = __DECONST(char *, fw->data);
743 zs.avail_out = fw_len;
744 zs.next_out = inflate_buffer;
745 status = inflate(&zs, Z_FINISH);
746 if (status != Z_STREAM_END) {
747 device_printf(sc->dev, "zlib %d\n", status);
749 goto abort_with_buffer;
752 fw_len = fw->fw_imglen;
754 hdr_offset = htobe32(*(const uint32_t *)
755 (fw->fw_image + MCP_HEADER_PTR_OFFSET));
756 if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
757 device_printf(sc->dev, "Bad firmware file");
761 hdr = (const void*)(fw->fw_image + hdr_offset);
763 status = mxge_validate_firmware(sc, hdr);
767 /* Copy the inflated firmware to NIC SRAM. */
768 for (i = 0; i < fw_len; i += 256) {
769 mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
771 min(256U, (unsigned)(fw_len - i)));
781 kfree(inflate_buffer, M_TEMP);
786 firmware_image_unload(fw);
/*
 * Issue the BOOT_DUMMY_RDMA command: build an 8-byte-aligned command
 * buffer (confirm addr, confirm data 0xffffffff, dummy addr, enable
 * flag), PIO it to the firmware mailbox, then poll the confirmation
 * word up to ~20 iterations for the firmware's -1 acknowledgement.
 * NOTE(review): buf_bytes declaration, *confirm clear, and DELAY lines
 * are elided from this listing.
 */
791 * Enable or disable periodic RDMAs from the host to make certain
792 * chipsets resend dropped PCIe messages
796 mxge_dummy_rdma(mxge_softc_t *sc, int enable)
799 volatile uint32_t *confirm;
800 volatile char *submit;
801 uint32_t *buf, dma_low, dma_high;
804 buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
806 /* clear confirmation addr */
807 confirm = (volatile uint32_t *)sc->cmd;
811 /* send an rdma command to the PCIe engine, and wait for the
812 response in the confirmation address. The firmware should
813 write a -1 there to indicate it is alive and well
816 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
817 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
818 buf[0] = htobe32(dma_high); /* confirm addr MSW */
819 buf[1] = htobe32(dma_low); /* confirm addr LSW */
820 buf[2] = htobe32(0xffffffff); /* confirm data */
821 dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
822 dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
823 buf[3] = htobe32(dma_high); /* dummy addr MSW */
824 buf[4] = htobe32(dma_low); /* dummy addr LSW */
825 buf[5] = htobe32(enable); /* enable? */
828 submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
830 mxge_pio_copy(submit, buf, 64);
835 while (*confirm != 0xffffffff && i < 20) {
839 if (*confirm != 0xffffffff) {
840 device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
841 (enable ? "enable" : "disable"), confirm,
/*
 * Send a command to the running firmware through the MXGEFW_ETH_CMD
 * mailbox and poll the DMA'd response for up to ~20ms. The command
 * buffer is aligned to 8 bytes on the stack; fields are byte-swapped
 * to big-endian for the NIC. Holds the ifnet serializer across the
 * exchange. Result codes are decoded from response->result.
 * NOTE(review): the per-case err assignments/breaks and the serializer
 * exit on success are elided from this listing.
 */
848 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
851 char buf_bytes[sizeof(*buf) + 8];
852 volatile mcp_cmd_response_t *response = sc->cmd;
853 volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
854 uint32_t dma_low, dma_high;
855 int err, sleep_total = 0;
857 /* ensure buf is aligned to 8 bytes */
858 buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
860 buf->data0 = htobe32(data->data0);
861 buf->data1 = htobe32(data->data1);
862 buf->data2 = htobe32(data->data2);
863 buf->cmd = htobe32(cmd);
864 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
865 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
867 buf->response_addr.low = htobe32(dma_low);
868 buf->response_addr.high = htobe32(dma_high);
870 lwkt_serialize_enter(sc->ifp->if_serializer);
872 response->result = 0xffffffff;
874 mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
876 /* wait up to 20ms */
878 for (sleep_total = 0; sleep_total < 20; sleep_total++) {
879 bus_dmamap_sync(sc->cmd_dma.dmat,
880 sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
882 switch (be32toh(response->result)) {
884 data->data0 = be32toh(response->data);
890 case MXGEFW_CMD_UNKNOWN:
893 case MXGEFW_CMD_ERROR_UNALIGNED:
896 case MXGEFW_CMD_ERROR_BUSY:
900 device_printf(sc->dev,
902 "failed, result = %d\n",
903 cmd, be32toh(response->result));
911 device_printf(sc->dev, "mxge: command %d timed out"
913 cmd, be32toh(response->result));
914 lwkt_serialize_exit(sc->ifp->if_serializer);
/*
 * Adopt the firmware already running on the NIC: read its header
 * offset from SRAM, bounds-check it, copy the header into host memory
 * via bus_space_read_region_1(), and validate it. Also flags the
 * 1.4.4–1.4.11 adopted-firmware rx-filter bug so the NIC is kept in
 * ALLMULTI. NOTE(review): return paths are elided from this listing.
 */
919 mxge_adopt_running_firmware(mxge_softc_t *sc)
921 struct mcp_gen_header *hdr;
922 const size_t bytes = sizeof (struct mcp_gen_header);
926 /* find running firmware header */
927 hdr_offset = htobe32(*(volatile uint32_t *)
928 (sc->sram + MCP_HEADER_PTR_OFFSET));
930 if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
931 device_printf(sc->dev,
932 "Running firmware has bad header offset (%d)\n",
937 /* copy header of running firmware from SRAM to host memory to
938 * validate firmware */
939 hdr = kmalloc(bytes, M_DEVBUF, M_NOWAIT);
941 device_printf(sc->dev, "could not kmalloc firmware hdr\n");
944 bus_space_read_region_1(rman_get_bustag(sc->mem_res),
945 rman_get_bushandle(sc->mem_res),
946 hdr_offset, (char *)hdr, bytes);
947 status = mxge_validate_firmware(sc, hdr);
948 kfree(hdr, M_DEVBUF);
951 * check to see if adopted firmware has bug where adopting
952 * it will cause broadcasts to be filtered unless the NIC
953 * is kept in ALLMULTI mode
955 if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
956 sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
957 sc->adopted_rx_filter_bug = 1;
958 device_printf(sc->dev, "Adopting fw %d.%d.%d: "
959 "working around rx filter bug\n",
960 sc->fw_ver_major, sc->fw_ver_minor,
/*
 * Load (or adopt) firmware. First tries mxge_load_firmware_helper();
 * if that fails and `adopt` is set, falls back to the firmware already
 * running on the NIC (dropping to the unaligned 2KB tx_boundary).
 * On a fresh load, hands off to the bootstrap MCP: the handoff block
 * skips the first 8 protected bytes of SRAM, then the confirmation
 * word is polled for the firmware's -1 ack, as in mxge_dummy_rdma().
 * NOTE(review): buf_bytes declaration, *confirm clear, DELAY calls and
 * return statements are elided from this listing.
 */
969 mxge_load_firmware(mxge_softc_t *sc, int adopt)
971 volatile uint32_t *confirm;
972 volatile char *submit;
974 uint32_t *buf, size, dma_low, dma_high;
977 buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
979 size = sc->sram_size;
980 status = mxge_load_firmware_helper(sc, &size);
984 /* Try to use the currently running firmware, if
986 status = mxge_adopt_running_firmware(sc);
988 device_printf(sc->dev,
989 "failed to adopt running firmware\n");
992 device_printf(sc->dev,
993 "Successfully adopted running firmware\n");
994 if (sc->tx_boundary == 4096) {
995 device_printf(sc->dev,
996 "Using firmware currently running on NIC"
998 device_printf(sc->dev,
999 "performance consider loading optimized "
1002 sc->fw_name = mxge_fw_unaligned;
1003 sc->tx_boundary = 2048;
1006 /* clear confirmation addr */
1007 confirm = (volatile uint32_t *)sc->cmd;
1010 /* send a reload command to the bootstrap MCP, and wait for the
1011 response in the confirmation address. The firmware should
1012 write a -1 there to indicate it is alive and well
1015 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
1016 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
1018 buf[0] = htobe32(dma_high); /* confirm addr MSW */
1019 buf[1] = htobe32(dma_low); /* confirm addr LSW */
1020 buf[2] = htobe32(0xffffffff); /* confirm data */
1022 /* FIX: All newest firmware should un-protect the bottom of
1023 the sram before handoff. However, the very first interfaces
1024 do not. Therefore the handoff copy must skip the first 8 bytes
1026 /* where the code starts*/
1027 buf[3] = htobe32(MXGE_FW_OFFSET + 8);
1028 buf[4] = htobe32(size - 8); /* length of code */
1029 buf[5] = htobe32(8); /* where to copy to */
1030 buf[6] = htobe32(0); /* where to jump to */
1032 submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
1033 mxge_pio_copy(submit, buf, 64);
1038 while (*confirm != 0xffffffff && i < 20) {
1041 bus_dmamap_sync(sc->cmd_dma.dmat,
1042 sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
1044 if (*confirm != 0xffffffff) {
1045 device_printf(sc->dev,"handoff failed (%p = 0x%x)",
/*
 * Push sc->mac_addr to the firmware: bytes 0-3 packed big-endian into
 * data0, bytes 4-5 into data1, via MXGEFW_SET_MAC_ADDRESS.
 */
1054 mxge_update_mac_address(mxge_softc_t *sc)
1057 uint8_t *addr = sc->mac_addr;
1061 cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1062 | (addr[2] << 8) | addr[3]);
1064 cmd.data1 = ((addr[4] << 8) | (addr[5]));
1066 status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
/*
 * Enable/disable link-level flow control in firmware.
 * NOTE(review): the if/else and sc->pause update lines are elided.
 */
1071 mxge_change_pause(mxge_softc_t *sc, int pause)
1077 status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1080 status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1084 device_printf(sc->dev, "Failed to set flow control mode\n");
/*
 * Enable/disable promiscuous mode in firmware; the mxge_always_promisc
 * tunable forces it on regardless of the requested state.
 */
1092 mxge_change_promisc(mxge_softc_t *sc, int promisc)
1097 if (mxge_always_promisc)
1101 status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1104 status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1108 device_printf(sc->dev, "Failed to set promisc mode\n");
/*
 * Program the firmware multicast filter: temporarily go ALLMULTI,
 * bail early when the firmware lacks multicast support, when the
 * adopted-firmware rx-filter bug forces ALLMULTI, or when IFF_ALLMULTI
 * is requested; otherwise flush all groups, walk if_multiaddrs joining
 * each AF_LINK address (under the ifnet serializer), then re-enable
 * filtering with DISABLE_ALLMULTI. On join failure, exits leaving
 * filtering off. NOTE(review): returns/continue lines are elided.
 */
1113 mxge_set_multicast_list(mxge_softc_t *sc)
1116 struct ifmultiaddr *ifma;
1117 struct ifnet *ifp = sc->ifp;
1120 /* This firmware is known to not support multicast */
1121 if (!sc->fw_multicast_support)
1124 /* Disable multicast filtering while we play with the lists*/
1125 err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1127 device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
1128 " error status: %d\n", err);
1132 if (sc->adopted_rx_filter_bug)
1135 if (ifp->if_flags & IFF_ALLMULTI)
1136 /* request to disable multicast filtering, so quit here */
1139 /* Flush all the filters */
1141 err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1143 device_printf(sc->dev,
1144 "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
1145 ", error status: %d\n", err);
1149 /* Walk the multicast list, and add each address */
1151 lwkt_serialize_enter(ifp->if_serializer);
1152 LIST_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1153 if (ifma->ifma_addr->sa_family != AF_LINK)
1155 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1157 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
1159 cmd.data0 = htonl(cmd.data0);
1160 cmd.data1 = htonl(cmd.data1);
1161 err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1163 device_printf(sc->dev, "Failed "
1164 "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
1166 /* abort, leaving multicast filtering off */
1167 lwkt_serialize_exit(ifp->if_serializer);
1171 lwkt_serialize_exit(ifp->if_serializer);
1172 /* Enable multicast filtering */
1173 err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1175 device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
1176 ", error status: %d\n", err);
/*
 * Determine the largest usable MTU: MXGEFW_MAX_MTU when page-size
 * jumbo clusters cover it, or when the firmware accepts the
 * ALWAYS_USE_N_BIG_BUFFERS (virtually contiguous jumbo) mode;
 * otherwise limited to MJUMPAGESIZE. MXGEFW_PAD is subtracted in all
 * cases.
 */
1181 mxge_max_mtu(mxge_softc_t *sc)
1186 if (MJUMPAGESIZE - MXGEFW_PAD > MXGEFW_MAX_MTU)
1187 return MXGEFW_MAX_MTU - MXGEFW_PAD;
1189 /* try to set nbufs to see if it we can
1190 use virtually contiguous jumbos */
1192 status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1195 return MXGEFW_MAX_MTU - MXGEFW_PAD;
1197 /* otherwise, we're limited to MJUMPAGESIZE */
1198 return MJUMPAGESIZE - MXGEFW_PAD;
/*
 * Full firmware reset and reinitialization. Sequence (order matters,
 * per the comments below): MXGEFW_CMD_RESET, re-enable dummy RDMAs,
 * set intrq size, then (multi-slice only) GET_MAX_RSS_QUEUES — which
 * has required side effects — and ENABLE_RSS_QUEUES before any intrq
 * DMA setup; optionally exchange per-slice interrupt queue addresses;
 * fetch the coalescing/ack/deassert register offsets; run the DMA
 * benchmark; and finally zero all per-slice shared state and restore
 * MAC, promisc, pause, and multicast settings.
 * NOTE(review): many lines (returns, ss->tx/rx field clears, the
 * ticks-based irq_claim writes) are elided from this listing.
 */
1202 mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1204 struct mxge_slice_state *ss;
1205 mxge_rx_done_t *rx_done;
1206 volatile uint32_t *irq_claim;
1210 /* try to send a reset command to the card to see if it
1212 memset(&cmd, 0, sizeof (cmd));
1213 status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1215 device_printf(sc->dev, "failed reset\n");
1219 mxge_dummy_rdma(sc, 1);
1222 /* set the intrq size */
1223 cmd.data0 = sc->rx_ring_size;
1224 status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1227 * Even though we already know how many slices are supported
1228 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
1229 * has magic side effects, and must be called after a reset.
1230 * It must be called prior to calling any RSS related cmds,
1231 * including assigning an interrupt queue for anything but
1232 * slice 0. It must also be called *after*
1233 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1234 * the firmware to compute offsets.
1237 if (sc->num_slices > 1) {
1238 /* ask the maximum number of slices it supports */
1239 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
1242 device_printf(sc->dev,
1243 "failed to get number of slices\n");
1247 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1248 * to setting up the interrupt queue DMA
1250 cmd.data0 = sc->num_slices;
1251 cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
1252 #ifdef IFNET_BUF_RING
1253 cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1255 status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
1258 device_printf(sc->dev,
1259 "failed to set number of slices\n");
1265 if (interrupts_setup) {
1266 /* Now exchange information about interrupts */
1267 for (slice = 0; slice < sc->num_slices; slice++) {
1268 rx_done = &sc->ss[slice].rx_done;
1269 memset(rx_done->entry, 0, sc->rx_ring_size);
1270 cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
1271 cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
1273 status |= mxge_send_cmd(sc,
1274 MXGEFW_CMD_SET_INTRQ_DMA,
1279 status |= mxge_send_cmd(sc,
1280 MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
1283 sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1285 status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1286 irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1289 status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
1291 sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1293 device_printf(sc->dev, "failed set interrupt parameters\n");
1298 *sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1301 /* run a DMA benchmark */
1302 (void) mxge_dma_test(sc, MXGEFW_DMA_TEST);
1304 for (slice = 0; slice < sc->num_slices; slice++) {
1305 ss = &sc->ss[slice];
1307 ss->irq_claim = irq_claim + (2 * slice);
1308 /* reset mcp/driver shared state back to 0 */
1309 ss->rx_done.idx = 0;
1310 ss->rx_done.cnt = 0;
1313 ss->tx.pkt_done = 0;
1314 ss->tx.queue_active = 0;
1315 ss->tx.activate = 0;
1316 ss->tx.deactivate = 0;
1321 ss->rx_small.cnt = 0;
1322 ss->lro_bad_csum = 0;
1324 ss->lro_flushed = 0;
1325 if (ss->fw_stats != NULL) {
1326 ss->fw_stats->valid = 0;
1327 ss->fw_stats->send_done_count = 0;
1330 sc->rdma_tags_available = 15;
1331 status = mxge_update_mac_address(sc);
1332 mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
1333 mxge_change_pause(sc, sc->pause);
1334 mxge_set_multicast_list(sc);
/*
 * sysctl handler for the interrupt coalescing delay: validates the new
 * value (nonzero, <= 1s in usecs) and writes it to the firmware's
 * coalescing register under the ifnet serializer.
 * NOTE(review): the sc assignment from arg1 and returns are elided.
 */
1339 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1342 unsigned int intr_coal_delay;
1346 intr_coal_delay = sc->intr_coal_delay;
1347 err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1351 if (intr_coal_delay == sc->intr_coal_delay)
1354 if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1357 lwkt_serialize_enter(sc->ifp->if_serializer);
1358 *sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1359 sc->intr_coal_delay = intr_coal_delay;
1361 lwkt_serialize_exit(sc->ifp->if_serializer);
/*
 * sysctl handler for flow-control (pause) enable: no-op when unchanged,
 * otherwise applies via mxge_change_pause() under the serializer.
 */
1366 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1369 unsigned int enabled;
1373 enabled = sc->pause;
1374 err = sysctl_handle_int(oidp, &enabled, arg2, req);
1378 if (enabled == sc->pause)
1381 lwkt_serialize_enter(sc->ifp->if_serializer);
1382 err = mxge_change_pause(sc, enabled);
1383 lwkt_serialize_exit(sc->ifp->if_serializer);
/*
 * Apply a new LRO segment count (serializer held by caller): toggles
 * IFCAP_LRO in if_capenable and, if the interface is running, restarts
 * it so the change takes effect. NOTE(review): the mxge_close() call
 * before reopen appears to be among the elided lines.
 */
1388 mxge_change_lro_locked(mxge_softc_t *sc, int lro_cnt)
1395 ifp->if_capenable &= ~IFCAP_LRO;
1397 ifp->if_capenable |= IFCAP_LRO;
1398 sc->lro_cnt = lro_cnt;
1399 if (ifp->if_flags & IFF_RUNNING) {
1401 err = mxge_open(sc);
/*
 * sysctl handler for the LRO count: no-op when unchanged, otherwise
 * delegates to mxge_change_lro_locked() under the serializer.
 */
1407 mxge_change_lro(SYSCTL_HANDLER_ARGS)
1410 unsigned int lro_cnt;
1414 lro_cnt = sc->lro_cnt;
1415 err = sysctl_handle_int(oidp, &lro_cnt, arg2, req);
1419 if (lro_cnt == sc->lro_cnt)
1425 lwkt_serialize_enter(sc->ifp->if_serializer);
1426 err = mxge_change_lro_locked(sc, lro_cnt);
1427 lwkt_serialize_exit(sc->ifp->if_serializer);
/*
 * sysctl handler exposing a big-endian 32-bit firmware counter
 * (arg1) as a host-order int to userland.
 */
1432 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1438 arg2 = be32toh(*(int *)arg1);
1440 err = sysctl_handle_int(oidp, arg1, arg2, req);
/*
 * Tear down all sysctl state: the main device tree, then each
 * per-slice context, then the slice parent context. Safe to call when
 * trees were never created (NULL checks throughout).
 */
1446 mxge_rem_sysctls(mxge_softc_t *sc)
1448 struct mxge_slice_state *ss;
1451 if (sc->sysctl_tree != NULL) {
1452 sysctl_ctx_free(&sc->sysctl_ctx);
1453 sc->sysctl_tree = NULL;
1455 if (sc->slice_sysctl_tree == NULL)
1458 for (slice = 0; slice < sc->num_slices; slice++) {
1459 ss = &sc->ss[slice];
1460 if (ss == NULL || ss->sysctl_tree == NULL)
1462 sysctl_ctx_free(&ss->sysctl_ctx);
1463 ss->sysctl_tree = NULL;
1465 sysctl_ctx_free(&sc->slice_sysctl_ctx);
1466 sc->slice_sysctl_tree = NULL;
/*
 * Create the device's sysctl tree: identification/info nodes,
 * performance tunables, big-endian firmware drop counters (exported
 * through mxge_handle_be32), and a per-slice subtree of debug
 * counters.  Undone by mxge_rem_sysctls().
 * NOTE(review): listing is elided; some original lines are missing.
 */
1470 mxge_add_sysctls(mxge_softc_t *sc)
1472 struct sysctl_ctx_list *ctx;
1473 struct sysctl_oid_list *children;
1475 struct mxge_slice_state *ss;
1479 ctx = &sc->sysctl_ctx;
1480 sysctl_ctx_init(ctx);
1481 sc->sysctl_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_STATIC_CHILDREN(_hw),
1483 device_get_nameunit(sc->dev),
1485 if (sc->sysctl_tree == NULL) {
1486 device_printf(sc->dev, "can't add sysctl node\n");
1490 children = SYSCTL_CHILDREN(sc->sysctl_tree);
/* Firmware stats block lives in slice 0. */
1491 fw = sc->ss[0].fw_stats;
1493 /* random information */
1494 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1496 CTLFLAG_RD, &sc->fw_version,
1497 0, "firmware version");
1498 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1500 CTLFLAG_RD, &sc->serial_number_string,
1501 0, "serial number");
1502 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1504 CTLFLAG_RD, &sc->product_code_string,
1506 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1508 CTLFLAG_RD, &sc->link_width,
1510 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1512 CTLFLAG_RD, &sc->tx_boundary,
1514 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1516 CTLFLAG_RD, &sc->wc,
1517 0, "write combining PIO?");
1518 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1520 CTLFLAG_RD, &sc->read_dma,
1521 0, "DMA Read speed in MB/s");
1522 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1524 CTLFLAG_RD, &sc->write_dma,
1525 0, "DMA Write speed in MB/s");
1526 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1527 "read_write_dma_MBs",
1528 CTLFLAG_RD, &sc->read_write_dma,
1529 0, "DMA concurrent Read/Write speed in MB/s");
1532 /* performance related tunables */
1533 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1535 CTLTYPE_INT|CTLFLAG_RW, sc,
1536 0, mxge_change_intr_coal,
1537 "I", "interrupt coalescing delay in usecs");
1539 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1540 "flow_control_enabled",
1541 CTLTYPE_INT|CTLFLAG_RW, sc,
1542 0, mxge_change_flow_control,
/* XXX(review): description below looks copy/pasted from the
 * intr_coal sysctl; should probably read "flow control enabled". */
1543 "I", "interrupt coalescing delay in usecs");
1545 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1547 CTLFLAG_RW, &mxge_deassert_wait,
1548 0, "Wait for IRQ line to go low in ihandler");
1550 /* stats block from firmware is in network byte order.
1552 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1554 CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1555 0, mxge_handle_be32,
1557 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1558 "rdma_tags_available",
1559 CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1560 0, mxge_handle_be32,
1561 "I", "rdma_tags_available");
1562 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1563 "dropped_bad_crc32",
1564 CTLTYPE_INT|CTLFLAG_RD,
1565 &fw->dropped_bad_crc32,
1566 0, mxge_handle_be32,
1567 "I", "dropped_bad_crc32");
1568 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1570 CTLTYPE_INT|CTLFLAG_RD,
1571 &fw->dropped_bad_phy,
1572 0, mxge_handle_be32,
1573 "I", "dropped_bad_phy");
1574 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1575 "dropped_link_error_or_filtered",
1576 CTLTYPE_INT|CTLFLAG_RD,
1577 &fw->dropped_link_error_or_filtered,
1578 0, mxge_handle_be32,
1579 "I", "dropped_link_error_or_filtered");
1580 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1581 "dropped_link_overflow",
1582 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1583 0, mxge_handle_be32,
1584 "I", "dropped_link_overflow");
1585 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1586 "dropped_multicast_filtered",
1587 CTLTYPE_INT|CTLFLAG_RD,
1588 &fw->dropped_multicast_filtered,
1589 0, mxge_handle_be32,
1590 "I", "dropped_multicast_filtered");
1591 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1592 "dropped_no_big_buffer",
1593 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1594 0, mxge_handle_be32,
1595 "I", "dropped_no_big_buffer");
1596 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1597 "dropped_no_small_buffer",
1598 CTLTYPE_INT|CTLFLAG_RD,
1599 &fw->dropped_no_small_buffer,
1600 0, mxge_handle_be32,
1601 "I", "dropped_no_small_buffer");
1602 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1604 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1605 0, mxge_handle_be32,
1606 "I", "dropped_overrun");
1607 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1609 CTLTYPE_INT|CTLFLAG_RD,
1611 0, mxge_handle_be32,
1612 "I", "dropped_pause");
1613 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1615 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1616 0, mxge_handle_be32,
1617 "I", "dropped_runt");
1619 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1620 "dropped_unicast_filtered",
1621 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
1622 0, mxge_handle_be32,
1623 "I", "dropped_unicast_filtered");
1625 /* verbose printing? */
1626 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1628 CTLFLAG_RW, &mxge_verbose,
1629 0, "verbose printing");
1632 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1634 CTLTYPE_INT|CTLFLAG_RW, sc,
1636 "I", "number of lro merge queues");
1639 /* add counters exported for debugging from all slices */
1640 sysctl_ctx_init(&sc->slice_sysctl_ctx);
1641 sc->slice_sysctl_tree =
1642 SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1643 "slice", CTLFLAG_RD, 0, "");
/* One numbered subtree ("0", "1", ...) per slice. */
1645 for (slice = 0; slice < sc->num_slices; slice++) {
1646 ss = &sc->ss[slice];
1647 sysctl_ctx_init(&ss->sysctl_ctx);
1648 ctx = &ss->sysctl_ctx;
1649 children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1650 ksprintf(slice_num, "%d", slice);
1652 SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1654 children = SYSCTL_CHILDREN(ss->sysctl_tree);
1655 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1657 CTLFLAG_RD, &ss->rx_small.cnt,
1659 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1661 CTLFLAG_RD, &ss->rx_big.cnt,
1663 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1664 "lro_flushed", CTLFLAG_RD, &ss->lro_flushed,
1665 0, "number of lro merge queues flushed");
1667 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1668 "lro_queued", CTLFLAG_RD, &ss->lro_queued,
1669 0, "number of frames appended to lro merge"
/* Without per-slice tx rings, tx counters exist on slice 0 only. */
1672 #ifndef IFNET_BUF_RING
1673 /* only transmit from slice 0 for now */
1677 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1679 CTLFLAG_RD, &ss->tx.req,
1682 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1684 CTLFLAG_RD, &ss->tx.done,
1686 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1688 CTLFLAG_RD, &ss->tx.pkt_done,
1690 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1692 CTLFLAG_RD, &ss->tx.stall,
1694 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1696 CTLFLAG_RD, &ss->tx.wake,
1698 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1700 CTLFLAG_RD, &ss->tx.defrag,
1702 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1704 CTLFLAG_RD, &ss->tx.queue_active,
1705 0, "tx_queue_active");
1706 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1708 CTLFLAG_RD, &ss->tx.activate,
1710 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1712 CTLFLAG_RD, &ss->tx.deactivate,
1713 0, "tx_deactivate");
1717 /* copy an array of mcp_kreq_ether_send_t's to the mcp. Copy
1718 backwards one at a time and handle ring wraps */
/*
 * Fallback submit path used when the request span wraps the ring:
 * copy requests to the NIC one at a time, from the last toward the
 * first, masking each slot index against tx->mask.
 * NOTE(review): listing is elided; some original lines are missing.
 */
1721 mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1722 mcp_kreq_ether_send_t *src, int cnt)
1724 int idx, starting_slot;
1725 starting_slot = tx->req;
1728 idx = (starting_slot + cnt) & tx->mask;
1729 mxge_pio_copy(&tx->lanai[idx],
1730 &src[cnt], sizeof(*src));
1736 * copy an array of mcp_kreq_ether_send_t's to the mcp. Copy
1737 * at most 32 bytes at a time, so as to avoid involving the software
1738 * pio handler in the nic. We re-write the first segment's flags
1739 * to mark them valid only after writing the entire chain
/*
 * Fast-path submit: PIO-copy the request chain in 32-byte chunks,
 * deferring the first request's valid flags until everything else
 * has hit the NIC; falls back to mxge_submit_req_backwards() when
 * the chain would wrap the ring.
 * NOTE(review): listing is elided; some original lines are missing.
 */
1743 mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
1748 volatile uint32_t *dst_ints;
1749 mcp_kreq_ether_send_t *srcp;
1750 volatile mcp_kreq_ether_send_t *dstp, *dst;
1753 idx = tx->req & tx->mask;
/* Remember the valid flags; they are restored/written last. */
1755 last_flags = src->flags;
1758 dst = dstp = &tx->lanai[idx];
/* Chain fits without wrapping: copy two requests (32B) per wmb(). */
1761 if ((idx + cnt) < tx->mask) {
1762 for (i = 0; i < (cnt - 1); i += 2) {
1763 mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1764 wmb(); /* force write every 32 bytes */
1769 /* submit all but the first request, and ensure
1770 that it is submitted below */
1771 mxge_submit_req_backwards(tx, src, cnt);
1775 /* submit the first request */
1776 mxge_pio_copy(dstp, srcp, sizeof(*src));
1777 wmb(); /* barrier before setting valid flag */
1780 /* re-write the last 32-bits with the valid flags */
1781 src->flags = last_flags;
1782 src_ints = (uint32_t *)src;
1784 dst_ints = (volatile uint32_t *)dst;
1786 *dst_ints = *src_ints;
/*
 * Build and submit the NIC send-request chain for a TSO packet.
 * Parses the IP/TCP headers (copying them to a scratch buffer when
 * they are not contiguous in the first mbuf), then walks the busdma
 * segment list, chopping segments at MSS boundaries and patching
 * rdma_count retroactively as segmentation cuts are discovered.
 * NOTE(review): listing is elided; some original lines are missing.
 */
1794 mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1795 int busdma_seg_cnt, int ip_off)
1798 mcp_kreq_ether_send_t *req;
1799 bus_dma_segment_t *seg;
1802 uint32_t low, high_swapped;
1803 int len, seglen, cum_len, cum_len_next;
1804 int next_is_first, chop, cnt, rdma_count, small;
1805 uint16_t pseudo_hdr_offset, cksum_offset, mss;
1806 uint8_t flags, flags_next;
1809 mss = m->m_pkthdr.tso_segsz;
1811 /* negative cum_len signifies to the
1812 * send loop that we are still in the
1813 * header portion of the TSO packet.
1816 /* ensure we have the ethernet, IP and TCP
1817 header together in the first mbuf, copy
1818 it to a scratch buffer if not */
1819 if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
1820 m_copydata(m, 0, ip_off + sizeof (*ip),
1822 ip = (struct ip *)(ss->scratch + ip_off);
1824 ip = (struct ip *)(mtod(m, char *) + ip_off);
1826 if (__predict_false(m->m_len < ip_off + (ip->ip_hl << 2)
1828 m_copydata(m, 0, ip_off + (ip->ip_hl << 2)
1829 + sizeof (*tcp), ss->scratch);
1830 ip = (struct ip *)(mtod(m, char *) + ip_off);
1833 tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
/* cum_len starts negative: counts down through the headers. */
1834 cum_len = -(ip_off + ((ip->ip_hl + tcp->th_off) << 2));
1836 /* TSO implies checksum offload on this hardware */
1837 cksum_offset = ip_off + (ip->ip_hl << 2);
1838 flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1841 /* for TSO, pseudo_hdr_offset holds mss.
1842 * The firmware figures out where to put
1843 * the checksum by parsing the header. */
1844 pseudo_hdr_offset = htobe16(mss);
1851 /* "rdma_count" is the number of RDMAs belonging to the
1852 * current packet BEFORE the current send request. For
1853 * non-TSO packets, this is equal to "count".
1854 * For TSO packets, rdma_count needs to be reset
1855 * to 0 after a segment cut.
1857 * The rdma_count field of the send request is
1858 * the number of RDMAs of the packet starting at
1859 * that request. For TSO send requests with one ore more cuts
1860 * in the middle, this is the number of RDMAs starting
1861 * after the last cut in the request. All previous
1862 * segments before the last cut implicitly have 1 RDMA.
1864 * Since the number of RDMAs is not known beforehand,
1865 * it must be filled-in retroactively - after each
1866 * segmentation cut or at the end of the entire packet.
1869 while (busdma_seg_cnt) {
1870 /* Break the busdma segment up into pieces*/
1871 low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1872 high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1876 flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1878 cum_len_next = cum_len + seglen;
/* Retroactively patch the rdma_count of the previous cut. */
1879 (req-rdma_count)->rdma_count = rdma_count + 1;
1880 if (__predict_true(cum_len >= 0)) {
1882 chop = (cum_len_next > mss);
1883 cum_len_next = cum_len_next % mss;
1884 next_is_first = (cum_len_next == 0);
/* Branch-free flag math: chop/next_is_first are 0 or 1. */
1885 flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1886 flags_next |= next_is_first *
1888 rdma_count |= -(chop | next_is_first);
1889 rdma_count += chop & !next_is_first;
1890 } else if (cum_len_next >= 0) {
1895 small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1896 flags_next = MXGEFW_FLAGS_TSO_PLD |
1897 MXGEFW_FLAGS_FIRST |
1898 (small * MXGEFW_FLAGS_SMALL);
1901 req->addr_high = high_swapped;
1902 req->addr_low = htobe32(low);
1903 req->pseudo_hdr_offset = pseudo_hdr_offset;
1905 req->rdma_count = 1;
1906 req->length = htobe16(seglen);
1907 req->cksum_offset = cksum_offset;
1908 req->flags = flags | ((cum_len & 1) *
1909 MXGEFW_FLAGS_ALIGN_ODD);
1912 cum_len = cum_len_next;
1917 if (__predict_false(cksum_offset > seglen))
1918 cksum_offset -= seglen;
/* Chain grew past the ring's per-packet descriptor limit. */
1921 if (__predict_false(cnt > tx->max_desc))
1927 (req-rdma_count)->rdma_count = rdma_count;
1931 req->flags |= MXGEFW_FLAGS_TSO_LAST;
1932 } while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1934 tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1935 mxge_submit_req(tx, tx->req_list, cnt);
1936 #ifdef IFNET_BUF_RING
1937 if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1938 /* tell the NIC to start polling this slice */
1940 tx->queue_active = 1;
/* Error path: drop the mapping and report the oversized chain. */
1948 bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1952 kprintf("tx->max_desc exceeded via TSO!\n");
1953 kprintf("mss = %d, %ld, %d!\n", mss,
1954 (long)seg - (long)tx->seg_list, tx->max_desc);
1961 #endif /* IFCAP_TSO4 */
1963 #ifdef MXGE_NEW_VLAN_API
1965 * We reproduce the software vlan tag insertion from
1966 * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
1967 * vlan tag insertion. We need to advertise this in order to have the
1968 * vlan interface respect our csum offload flags.
/*
 * Prepend an 802.1Q header carrying m_pkthdr.ether_vlantag to the
 * mbuf and clear M_VLANTAG.  Returns NULL (mbuf consumed) if the
 * prepend or pullup fails.
 * NOTE(review): listing is elided; some original lines are missing.
 */
1970 static struct mbuf *
1971 mxge_vlan_tag_insert(struct mbuf *m)
1973 struct ether_vlan_header *evl;
1975 M_PREPEND(m, EVL_ENCAPLEN, MB_DONTWAIT);
1976 if (__predict_false(m == NULL))
1978 if (m->m_len < sizeof(*evl)) {
1979 m = m_pullup(m, sizeof(*evl));
1980 if (__predict_false(m == NULL))
1984 * Transform the Ethernet header into an Ethernet header
1985 * with 802.1Q encapsulation.
1987 evl = mtod(m, struct ether_vlan_header *);
1988 bcopy((char *)evl + EVL_ENCAPLEN,
1989 (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
1990 evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
1991 evl->evl_tag = htons(m->m_pkthdr.ether_vlantag);
1992 m->m_flags &= ~M_VLANTAG;
1995 #endif /* MXGE_NEW_VLAN_API */
/*
 * Map a non-TSO outgoing mbuf for DMA (defragmenting once on EFBIG),
 * build the NIC send-request list with optional checksum offload,
 * pad runts to 60 bytes using the shared zero page, and submit.
 * TSO packets are diverted to mxge_encap_tso().
 * NOTE(review): listing is elided; some original lines are missing.
 */
1998 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
2001 mcp_kreq_ether_send_t *req;
2002 bus_dma_segment_t *seg;
2007 int cnt, cum_len, err, i, idx, odd_flag, ip_off;
2008 uint16_t pseudo_hdr_offset;
2009 uint8_t flags, cksum_offset;
2016 ip_off = sizeof (struct ether_header);
2017 #ifdef MXGE_NEW_VLAN_API
/* Software-insert the VLAN header so csum offsets stay correct. */
2018 if (m->m_flags & M_VLANTAG) {
2019 m = mxge_vlan_tag_insert(m);
2020 if (__predict_false(m == NULL))
2022 ip_off += EVL_ENCAPLEN;
2025 /* (try to) map the frame for DMA */
2026 idx = tx->req & tx->mask;
2027 err = bus_dmamap_load_mbuf_segment(tx->dmat, tx->info[idx].map,
2028 m, tx->seg_list, 1, &cnt,
/* EFBIG: too many segments — defragment once and retry the load. */
2030 if (__predict_false(err == EFBIG)) {
2031 /* Too many segments in the chain. Try
2033 m_tmp = m_defrag(m, M_NOWAIT);
2034 if (m_tmp == NULL) {
2039 err = bus_dmamap_load_mbuf_segment(tx->dmat,
2041 m, tx->seg_list, 1, &cnt,
2044 if (__predict_false(err != 0)) {
2045 device_printf(sc->dev, "bus_dmamap_load_mbuf_segment returned %d"
2046 " packet len = %d\n", err, m->m_pkthdr.len);
2049 bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2050 BUS_DMASYNC_PREWRITE);
2051 tx->info[idx].m = m;
2054 /* TSO is different enough, we handle it in another routine */
2055 if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2056 mxge_encap_tso(ss, m, cnt, ip_off);
2063 pseudo_hdr_offset = 0;
2064 flags = MXGEFW_FLAGS_NO_TSO;
2066 /* checksum offloading? */
2067 if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
2068 /* ensure ip header is in first mbuf, copy
2069 it to a scratch buffer if not */
2070 if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
2071 m_copydata(m, 0, ip_off + sizeof (*ip),
2073 ip = (struct ip *)(ss->scratch + ip_off);
2075 ip = (struct ip *)(mtod(m, char *) + ip_off);
2077 cksum_offset = ip_off + (ip->ip_hl << 2);
2078 pseudo_hdr_offset = cksum_offset + m->m_pkthdr.csum_data;
2079 pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2080 req->cksum_offset = cksum_offset;
2081 flags |= MXGEFW_FLAGS_CKSUM;
2082 odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2086 if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2087 flags |= MXGEFW_FLAGS_SMALL;
2089 /* convert segments into a request list */
2092 req->flags = MXGEFW_FLAGS_FIRST;
2093 for (i = 0; i < cnt; i++) {
2095 htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2097 htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2098 req->length = htobe16(seg->ds_len);
2099 req->cksum_offset = cksum_offset;
2100 if (cksum_offset > seg->ds_len)
2101 cksum_offset -= seg->ds_len;
2104 req->pseudo_hdr_offset = pseudo_hdr_offset;
2105 req->pad = 0; /* complete solid 16-byte block */
2106 req->rdma_count = 1;
2107 req->flags |= flags | ((cum_len & 1) * odd_flag);
2108 cum_len += seg->ds_len;
2114 /* pad runts to 60 bytes */
/* Extra descriptor pointing at the zero page pads short frames. */
2118 htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2120 htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2121 req->length = htobe16(60 - cum_len);
2122 req->cksum_offset = 0;
2123 req->pseudo_hdr_offset = pseudo_hdr_offset;
2124 req->pad = 0; /* complete solid 16-byte block */
2125 req->rdma_count = 1;
2126 req->flags |= flags | ((cum_len & 1) * odd_flag);
2130 tx->req_list[0].rdma_count = cnt;
2132 /* print what the firmware will see */
2133 for (i = 0; i < cnt; i++) {
2134 kprintf("%d: addr: 0x%x 0x%x len:%d pso%d,"
2135 "cso:%d, flags:0x%x, rdma:%d\n",
2136 i, (int)ntohl(tx->req_list[i].addr_high),
2137 (int)ntohl(tx->req_list[i].addr_low),
2138 (int)ntohs(tx->req_list[i].length),
2139 (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2140 tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2141 tx->req_list[i].rdma_count);
2143 kprintf("--------------\n");
2145 tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2146 mxge_submit_req(tx, tx->req_list, cnt);
2147 #ifdef IFNET_BUF_RING
2148 if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2149 /* tell the NIC to start polling this slice */
2151 tx->queue_active = 1;
2164 #ifdef IFNET_BUF_RING
/*
 * if_qflush: drain and free every queued mbuf from each slice's
 * buf_ring, holding the ifnet serializer per slice.
 * NOTE(review): listing is elided; some original lines are missing.
 */
2166 mxge_qflush(struct ifnet *ifp)
2168 mxge_softc_t *sc = ifp->if_softc;
2173 for (slice = 0; slice < sc->num_slices; slice++) {
2174 tx = &sc->ss[slice].tx;
2175 lwkt_serialize_enter(sc->ifp->if_serializer);
2176 while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2178 lwkt_serialize_exit(sc->ifp->if_serializer);
/*
 * Buf-ring transmit pump (serializer held): dequeue from the slice's
 * drbr and encap while descriptor space remains; sets IFF_OACTIVE in
 * the per-slice flags when the ring fills with work still pending.
 * NOTE(review): listing is elided; some original lines are missing.
 */
2184 mxge_start_locked(struct mxge_slice_state *ss)
2195 while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2196 m = drbr_dequeue(ifp, tx->br);
2200 /* let BPF see it */
2203 /* give it to the nic */
2206 /* ran out of transmit slots */
2207 if (((ss->if_flags & IFF_OACTIVE) == 0)
2208 && (!drbr_empty(ifp, tx->br))) {
2209 ss->if_flags |= IFF_OACTIVE;
/*
 * Enqueue-or-send for the buf-ring path (serializer held).  If the
 * interface is not up/active the mbuf is queued; if the ring is empty
 * and descriptors are free it is sent directly; otherwise it is
 * enqueued and the pump is kicked.
 * NOTE(review): listing is elided; some original lines are missing.
 */
2215 mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2226 if ((ss->if_flags & (IFF_RUNNING|IFF_OACTIVE)) !=
2228 err = drbr_enqueue(ifp, tx->br, m);
/* Fast path: nothing queued and room on the ring — send now. */
2232 if (drbr_empty(ifp, tx->br) &&
2233 ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2234 /* let BPF see it */
2236 /* give it to the nic */
2238 } else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2241 if (!drbr_empty(ifp, tx->br))
2242 mxge_start_locked(ss);
/*
 * if_transmit entry: pick a tx slice from the packet's flowid
 * (num_slices is a power of two, so masking suffices), then either
 * transmit under the serializer or, if it is contended, just
 * enqueue onto the slice's buf_ring.
 * NOTE(review): listing is elided; some original lines are missing.
 */
2247 mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2249 mxge_softc_t *sc = ifp->if_softc;
2250 struct mxge_slice_state *ss;
2256 slice = m->m_pkthdr.flowid;
2258 slice &= (sc->num_slices - 1); /* num_slices always power of 2 */
2260 ss = &sc->ss[slice];
/* Avoid blocking: fall back to enqueue if the try-lock fails. */
2263 if(lwkt_serialize_try(ifp->if_serializer)) {
2264 err = mxge_transmit_locked(ss, m);
2265 lwkt_serialize_exit(ifp->if_serializer);
2267 err = drbr_enqueue(ifp, tx->br, m);
/*
 * Classic ifq transmit pump (serializer held): dequeue from
 * ifp->if_snd and encap while descriptor space remains; sets
 * IFF_OACTIVE on the ifnet when the ring is full.
 * NOTE(review): listing is elided; some original lines are missing.
 */
2276 mxge_start_locked(struct mxge_slice_state *ss)
2286 while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2287 m = ifq_dequeue(&ifp->if_snd, NULL);
2291 /* let BPF see it */
2294 /* give it to the nic */
2297 /* ran out of transmit slots */
2298 if ((sc->ifp->if_flags & IFF_OACTIVE) == 0) {
2299 sc->ifp->if_flags |= IFF_OACTIVE;
/*
 * if_start entry for the non-buf-ring path: grab the serializer and
 * run the transmit pump on slice 0 only.
 * NOTE(review): listing is elided; some original lines are missing.
 */
2305 mxge_start(struct ifnet *ifp)
2307 mxge_softc_t *sc = ifp->if_softc;
2308 struct mxge_slice_state *ss;
2310 /* only use the first slice for now */
2312 lwkt_serialize_enter(ifp->if_serializer);
2313 mxge_start_locked(ss);
2314 lwkt_serialize_exit(ifp->if_serializer);
2318 * copy an array of mcp_kreq_ether_recv_t's to the mcp. Copy
2319 * at most 32 bytes at a time, so as to avoid involving the software
2320 * pio handler in the nic. We re-write the first segment's low
2321 * DMA address to mark it valid only after we write the entire chunk
/*
 * PIO-copy 8 receive descriptors to the NIC, temporarily poisoning
 * the first descriptor's low address (0xffffffff) so the NIC only
 * sees the batch as valid once the real address is written last.
 * NOTE(review): listing is elided; some original lines are missing.
 */
2325 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2326 mcp_kreq_ether_recv_t *src)
2330 low = src->addr_low;
2331 src->addr_low = 0xffffffff;
2332 mxge_pio_copy(dst, src, 4 * sizeof (*src));
2334 mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2336 src->addr_low = low;
/* Final write validates the whole 8-descriptor chunk. */
2337 dst->addr_low = low;
/*
 * Allocate and DMA-map a small (MHLEN) receive mbuf for ring slot
 * idx, record its bus address in the shadow ring, and hand batches
 * of 8 descriptors to the NIC via mxge_submit_8rx().
 * NOTE(review): listing is elided; some original lines are missing.
 */
2342 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2344 bus_dma_segment_t seg;
2346 mxge_rx_ring_t *rx = &ss->rx_small;
2349 m = m_gethdr(MB_DONTWAIT, MT_DATA);
2356 err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2357 &seg, 1, &cnt, BUS_DMA_NOWAIT);
2362 rx->info[idx].m = m;
2363 rx->shadow[idx].addr_low =
2364 htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2365 rx->shadow[idx].addr_high =
2366 htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
/* Submit to the NIC once a full group of 8 slots is ready. */
2370 mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
/*
 * Allocate and DMA-map a big receive buffer (cluster or jumbo
 * cluster, per rx->cl_size) for ring slot idx; fills the shadow
 * descriptors (one per virtual buffer with MXGE_VIRT_JUMBOS) and
 * submits complete groups of 8 to the NIC.
 * NOTE(review): listing is elided; some original lines are missing.
 */
2376 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2378 bus_dma_segment_t seg[3];
2380 mxge_rx_ring_t *rx = &ss->rx_big;
2383 if (rx->cl_size == MCLBYTES)
2384 m = m_getcl(MB_DONTWAIT, MT_DATA, M_PKTHDR);
2387 m = m_getjcl(MB_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2390 * XXX: allocate normal sized buffers for big buffers.
2391 * We should be fine as long as we don't get any jumbo frames
2393 m = m_getcl(MB_DONTWAIT, MT_DATA, M_PKTHDR);
2401 m->m_len = rx->mlen;
2402 err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2403 seg, 1, &cnt, BUS_DMA_NOWAIT);
2408 rx->info[idx].m = m;
2409 rx->shadow[idx].addr_low =
2410 htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2411 rx->shadow[idx].addr_high =
2412 htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2414 #if MXGE_VIRT_JUMBOS
/* Additional shadow slots for the remaining DMA segments. */
2415 for (i = 1; i < cnt; i++) {
2416 rx->shadow[idx + i].addr_low =
2417 htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2418 rx->shadow[idx + i].addr_high =
2419 htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2424 for (i = 0; i < rx->nbufs; i++) {
2425 if ((idx & 7) == 7) {
2426 mxge_submit_8rx(&rx->lanai[idx - 7],
2427 &rx->shadow[idx - 7]);
2435 * Myri10GE hardware checksums are not valid if the sender
2436 * padded the frame with non-zero padding. This is because
2437 * the firmware just does a simple 16-bit 1s complement
2438 * checksum across the entire frame, excluding the first 14
2439 * bytes. It is best to simply to check the checksum and
2440 * tell the stack about it only if the checksum is good
/*
 * Verify the firmware's raw frame checksum against the IPv4
 * pseudo-header for TCP/UDP.  Returns 0 when the checksum is good
 * (non-zero / non-IPv4 / non-TCP-UDP frames are not validated here).
 * NOTE(review): listing is elided; some original lines are missing.
 */
2443 static inline uint16_t
2444 mxge_rx_csum(struct mbuf *m, int csum)
2446 struct ether_header *eh;
2450 eh = mtod(m, struct ether_header *);
2452 /* only deal with IPv4 TCP & UDP for now */
2453 if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2455 ip = (struct ip *)(eh + 1);
2456 if (__predict_false(ip->ip_p != IPPROTO_TCP &&
2457 ip->ip_p != IPPROTO_UDP))
/* Fold the pseudo-header into the firmware's raw sum. */
2460 c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2461 htonl(ntohs(csum) + ntohs(ip->ip_len) +
2462 - (ip->ip_hl << 2) + ip->ip_p));
/*
 * Strip an 802.1Q header from a received frame: adjust the partial
 * checksum (the firmware summed the EVL_ENCAPLEN bytes we are about
 * to remove), record the tag in the mbuf header, and slide the
 * Ethernet addresses over the encapsulation.
 * NOTE(review): listing is elided; some original lines are missing.
 */
2471 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2473 struct ether_vlan_header *evl;
2474 struct ether_header *eh;
2477 evl = mtod(m, struct ether_vlan_header *);
2478 eh = mtod(m, struct ether_header *);
2481 * fix checksum by subtracting EVL_ENCAPLEN bytes
2482 * after what the firmware thought was the end of the ethernet
2486 /* put checksum into host byte order */
2487 *csum = ntohs(*csum);
2488 partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
/* One's-complement subtraction of the vlan header bytes,
 * with end-around carry folds. */
2489 (*csum) += ~partial;
2490 (*csum) += ((*csum) < ~partial);
2491 (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2492 (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2494 /* restore checksum to network byte order;
2495 later consumers expect this */
2496 *csum = htons(*csum);
2499 #ifdef MXGE_NEW_VLAN_API
2500 m->m_pkthdr.ether_vlantag = ntohs(evl->evl_tag);
/* Legacy API: carry the tag in an mbuf tag instead. */
2504 mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2508 VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2509 m_tag_prepend(m, mtag);
2513 m->m_flags |= M_VLANTAG;
2516 * Remove the 802.1q header by copying the Ethernet
2517 * addresses over it and adjusting the beginning of
2518 * the data in the mbuf. The encapsulated Ethernet
2519 * type field is already in place.
2521 bcopy((char *)evl, (char *)evl + EVL_ENCAPLEN,
2522 ETHER_HDR_LEN - ETHER_TYPE_LEN);
2523 m_adj(m, EVL_ENCAPLEN);
/*
 * Process one completed "big" receive: replace the ring buffer,
 * unmap the old mbuf, strip any VLAN header, validate the checksum
 * (optionally feeding LRO), tag the flowid, and pass the frame up.
 * NOTE(review): listing is elided; some original lines are missing.
 */
2528 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2533 struct ether_header *eh;
2535 bus_dmamap_t old_map;
2537 uint16_t tcpudp_csum;
2542 idx = rx->cnt & rx->mask;
2543 rx->cnt += rx->nbufs;
2544 /* save a pointer to the received mbuf */
2545 m = rx->info[idx].m;
2546 /* try to replace the received mbuf */
2547 if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2548 /* drop the frame -- the old mbuf is re-cycled */
2553 /* unmap the received buffer */
2554 old_map = rx->info[idx].map;
2555 bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2556 bus_dmamap_unload(rx->dmat, old_map);
2558 /* swap the bus_dmamap_t's */
2559 rx->info[idx].map = rx->extra_map;
2560 rx->extra_map = old_map;
2562 /* mcp implicitly skips 1st 2 bytes so that packet is properly
2564 m->m_data += MXGEFW_PAD;
2566 m->m_pkthdr.rcvif = ifp;
2567 m->m_len = m->m_pkthdr.len = len;
2569 eh = mtod(m, struct ether_header *);
2570 if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2571 mxge_vlan_tag_remove(m, &csum);
2573 /* if the checksum is valid, mark it in the mbuf header */
2574 if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
/* Good checksum: try LRO first; on success the frame is consumed. */
2575 if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2577 /* otherwise, it was a UDP frame, or a TCP frame which
2578 we could not do LRO on. Tell the stack that the
2580 m->m_pkthdr.csum_data = 0xffff;
2581 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2584 /* flowid only valid if RSS hashing is enabled */
2585 if (sc->num_slices > 1) {
2586 m->m_pkthdr.flowid = (ss - sc->ss);
2587 m->m_flags |= M_FLOWID;
2590 /* pass the frame up the stack */
2591 (*ifp->if_input)(ifp, m);
/*
 * Process one completed "small" receive; mirrors mxge_rx_done_big()
 * but refills via mxge_get_buf_small() and advances rx->cnt by one
 * buffer.
 * NOTE(review): listing is elided; some original lines are missing.
 */
2595 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2599 struct ether_header *eh;
2602 bus_dmamap_t old_map;
2604 uint16_t tcpudp_csum;
2609 idx = rx->cnt & rx->mask;
2611 /* save a pointer to the received mbuf */
2612 m = rx->info[idx].m;
2613 /* try to replace the received mbuf */
2614 if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2615 /* drop the frame -- the old mbuf is re-cycled */
2620 /* unmap the received buffer */
2621 old_map = rx->info[idx].map;
2622 bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2623 bus_dmamap_unload(rx->dmat, old_map);
2625 /* swap the bus_dmamap_t's */
2626 rx->info[idx].map = rx->extra_map;
2627 rx->extra_map = old_map;
2629 /* mcp implicitly skips 1st 2 bytes so that packet is properly
2631 m->m_data += MXGEFW_PAD;
2633 m->m_pkthdr.rcvif = ifp;
2634 m->m_len = m->m_pkthdr.len = len;
2636 eh = mtod(m, struct ether_header *);
2637 if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2638 mxge_vlan_tag_remove(m, &csum);
2640 /* if the checksum is valid, mark it in the mbuf header */
2641 if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
/* Good checksum: try LRO first; on success the frame is consumed. */
2642 if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2644 /* otherwise, it was a UDP frame, or a TCP frame which
2645 we could not do LRO on. Tell the stack that the
2647 m->m_pkthdr.csum_data = 0xffff;
2648 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2651 /* flowid only valid if RSS hashing is enabled */
2652 if (sc->num_slices > 1) {
2653 m->m_pkthdr.flowid = (ss - sc->ss);
2654 m->m_flags |= M_FLOWID;
2657 /* pass the frame up the stack */
2658 (*ifp->if_input)(ifp, m);
/*
 * Drain the slice's rx completion ring: dispatch each completed
 * entry to the small/big handler based on length, bounded to half
 * the ring per call to avoid livelock, then flush any active LRO
 * merge queues.
 * NOTE(review): listing is elided; some original lines are missing.
 */
2662 mxge_clean_rx_done(struct mxge_slice_state *ss)
2664 mxge_rx_done_t *rx_done = &ss->rx_done;
2670 while (rx_done->entry[rx_done->idx].length != 0) {
2671 length = ntohs(rx_done->entry[rx_done->idx].length);
/* Zeroing length marks this completion slot consumed. */
2672 rx_done->entry[rx_done->idx].length = 0;
2673 checksum = rx_done->entry[rx_done->idx].checksum;
2674 if (length <= (MHLEN - MXGEFW_PAD))
2675 mxge_rx_done_small(ss, length, checksum);
2677 mxge_rx_done_big(ss, length, checksum);
2679 rx_done->idx = rx_done->cnt & rx_done->mask;
2681 /* limit potential for livelock */
2682 if (__predict_false(++limit > rx_done->mask / 2))
/* Flush partially-merged LRO entries to the stack. */
2686 while (!SLIST_EMPTY(&ss->lro_active)) {
2687 struct lro_entry *lro = SLIST_FIRST(&ss->lro_active);
2688 SLIST_REMOVE_HEAD(&ss->lro_active, next);
2689 mxge_lro_flush(ss, lro);
/*
 * Reap completed transmits up to the firmware's packet index
 * mcp_idx: free mbufs and unload DMA maps, update byte/multicast
 * stats, then clear IFF_OACTIVE and restart the pump if the ring has
 * drained below 1/4 occupancy.  With IFNET_BUF_RING, also tells the
 * NIC to stop polling an empty multi-slice queue.
 * NOTE(review): listing is elided; some original lines are missing.
 */
2696 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2707 while (tx->pkt_done != mcp_idx) {
2708 idx = tx->done & tx->mask;
2710 m = tx->info[idx].m;
2711 /* mbuf and DMA map only attached to the first
2714 ss->obytes += m->m_pkthdr.len;
2715 if (m->m_flags & M_MCAST)
2718 tx->info[idx].m = NULL;
2719 map = tx->info[idx].map;
2720 bus_dmamap_unload(tx->dmat, map);
/* flag marks the last descriptor of a packet. */
2723 if (tx->info[idx].flag) {
2724 tx->info[idx].flag = 0;
2729 /* If we have space, clear IFF_OACTIVE to tell the stack that
2730 its OK to send packets */
2731 #ifdef IFNET_BUF_RING
2732 flags = &ss->if_flags;
2734 flags = &ifp->if_flags;
2736 lwkt_serialize_enter(ifp->if_serializer);
2737 if ((*flags) & IFF_OACTIVE &&
2738 tx->req - tx->done < (tx->mask + 1)/4) {
2739 *(flags) &= ~IFF_OACTIVE;
2741 mxge_start_locked(ss);
2743 #ifdef IFNET_BUF_RING
2744 if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2745 /* let the NIC stop polling this queue, since there
2746 * are no more transmits pending */
2747 if (tx->req == tx->done) {
2749 tx->queue_active = 0;
2755 lwkt_serialize_exit(ifp->if_serializer);
/*
 * Media-type lookup tables, indexed by bits of the module's 10GbE
 * compliance byte (XFP byte and SFP+ byte respectively).  A zero
 * ifmedia code means the type has no ifmedia equivalent.
 * NOTE(review): listing is elided; some original lines are missing.
 */
2759 static struct mxge_media_type mxge_xfp_media_types[] =
2761 {IFM_10G_CX4, 0x7f, "10GBASE-CX4 (module)"},
2762 {IFM_10G_SR, (1 << 7), "10GBASE-SR"},
2763 {IFM_10G_LR, (1 << 6), "10GBASE-LR"},
2764 {0, (1 << 5), "10GBASE-ER"},
2765 {IFM_10G_LRM, (1 << 4), "10GBASE-LRM"},
2766 {0, (1 << 3), "10GBASE-SW"},
2767 {0, (1 << 2), "10GBASE-LW"},
2768 {0, (1 << 1), "10GBASE-EW"},
2769 {0, (1 << 0), "Reserved"}
2771 static struct mxge_media_type mxge_sfp_media_types[] =
2773 {0, (1 << 7), "Reserved"},
2774 {IFM_10G_LRM, (1 << 6), "10GBASE-LRM"},
2775 {IFM_10G_LR, (1 << 5), "10GBASE-LR"},
2776 {IFM_10G_SR, (1 << 4), "10GBASE-SR"}
/*
 * Record the detected media type in sc->media_flags and register it
 * as the active ifmedia entry.
 * NOTE(review): listing is elided; some original lines are missing.
 */
2780 mxge_set_media(mxge_softc_t *sc, int type)
2782 sc->media_flags |= type;
2783 ifmedia_add(&sc->media, sc->media_flags, 0, NULL);
2784 ifmedia_set(&sc->media, sc->media_flags);
2789 * Determine the media type for a NIC. Some XFPs will identify
2790 * themselves only when their link is up, so this is initiated via a
2791 * link up interrupt. However, this can potentially take up to
2792 * several milliseconds, so it is run via the watchdog routine, rather
2793 * than in the interrupt handler itself. This need only be done
2794 * once, not each time the link is up.
/*
 * Probe the physical media: parse the EEPROM product-code string
 * (character after the third dash selects CX4 / XFP / SFP+ / Quad
 * Ribbon), then use the firmware's I2C commands to read the module's
 * 10GbE compliance byte and match it against the media tables above.
 * NOTE(review): listing is elided; some original lines are missing.
 */
2797 mxge_media_probe(mxge_softc_t *sc)
2802 struct mxge_media_type *mxge_media_types = NULL;
2803 int i, err, ms, mxge_media_type_entries;
2806 sc->need_media_probe = 0;
2808 /* if we've already set a media type, we're done */
2809 if (sc->media_flags != (IFM_ETHER | IFM_AUTO))
2813 * parse the product code to deterimine the interface type
2814 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2815 * after the 3rd dash in the driver's cached copy of the
2816 * EEPROM's product code string.
2818 ptr = sc->product_code_string;
2820 device_printf(sc->dev, "Missing product code\n");
2823 for (i = 0; i < 3; i++, ptr++) {
2824 ptr = index(ptr, '-');
2826 device_printf(sc->dev,
2827 "only %d dashes in PC?!?\n", i);
2833 mxge_set_media(sc, IFM_10G_CX4);
2836 else if (*ptr == 'Q') {
2837 /* -Q is Quad Ribbon Fiber */
2838 device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2839 /* FreeBSD has no media type for Quad ribbon fiber */
2845 mxge_media_types = mxge_xfp_media_types;
2846 mxge_media_type_entries =
2847 sizeof (mxge_xfp_media_types) /
2848 sizeof (mxge_xfp_media_types[0]);
2849 byte = MXGE_XFP_COMPLIANCE_BYTE;
2853 if (*ptr == 'S' || *(ptr +1) == 'S') {
2854 /* -S or -2S is SFP+ */
2855 mxge_media_types = mxge_sfp_media_types;
2856 mxge_media_type_entries =
2857 sizeof (mxge_sfp_media_types) /
2858 sizeof (mxge_sfp_media_types[0]);
2863 if (mxge_media_types == NULL) {
2864 device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2869 * At this point we know the NIC has an XFP cage, so now we
2870 * try to determine what is in the cage by using the
2871 * firmware's XFP I2C commands to read the XFP 10GbE compilance
2872 * register. We read just one byte, which may take over
2876 cmd.data0 = 0; /* just fetch 1 byte, not all 256 */
2878 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2879 if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
2880 device_printf(sc->dev, "failed to read XFP\n");
2882 if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
2883 device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
2885 if (err != MXGEFW_CMD_OK) {
2889 /* now we wait for the data to be cached */
2891 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
/* Poll up to ~50ms for the firmware to cache the I2C byte. */
2892 for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
2895 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2897 if (err != MXGEFW_CMD_OK) {
2898 device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
2899 cage_type, err, ms);
/* Entry 0 is an exact-match mask (CX4 module); the rest are
 * single-bit tests against the compliance byte. */
2903 if (cmd.data0 == mxge_media_types[0].bitmask) {
2905 device_printf(sc->dev, "%s:%s\n", cage_type,
2906 mxge_media_types[0].name);
2907 mxge_set_media(sc, IFM_10G_CX4);
2910 for (i = 1; i < mxge_media_type_entries; i++) {
2911 if (cmd.data0 & mxge_media_types[i].bitmask) {
2913 device_printf(sc->dev, "%s:%s\n",
2915 mxge_media_types[i].name);
2917 mxge_set_media(sc, mxge_media_types[i].flag);
2921 device_printf(sc->dev, "%s media 0x%x unknown\n", cage_type,
/*
 * Per-slice interrupt handler.  Drains tx completions and rx packets,
 * handles legacy-IRQ deassertion, and (on the first slice only) processes
 * firmware link-state / RDMA-timeout statistics before returning the
 * interrupt claim tokens to the NIC.
 */
2928 mxge_intr(void *arg)
2930 struct mxge_slice_state *ss = arg;
2931 mxge_softc_t *sc = ss->sc;
2932 mcp_irq_data_t *stats = ss->fw_stats;
2933 mxge_tx_ring_t *tx = &ss->tx;
2934 mxge_rx_done_t *rx_done = &ss->rx_done;
2935 uint32_t send_done_count;
2939 #ifndef IFNET_BUF_RING
2940 /* an interrupt on a non-zero slice is implicitly valid
2941 since MSI-X irqs are not shared */
2943 mxge_clean_rx_done(ss);
2944 *ss->irq_claim = be32toh(3);
2949 /* make sure the DMA has finished */
2950 if (!stats->valid) {
2953 valid = stats->valid;
2955 if (sc->legacy_irq) {
2956 /* lower legacy IRQ */
2957 *sc->irq_deassert = 0;
2958 if (!mxge_deassert_wait)
2959 /* don't wait for conf. that irq is low */
2965 /* loop while waiting for legacy irq deassertion */
2967 /* check for transmit completes and receives */
2968 send_done_count = be32toh(stats->send_done_count);
2969 while ((send_done_count != tx->pkt_done) ||
2970 (rx_done->entry[rx_done->idx].length != 0)) {
2971 if (send_done_count != tx->pkt_done)
2972 mxge_tx_done(ss, (int)send_done_count);
2973 mxge_clean_rx_done(ss);
2974 send_done_count = be32toh(stats->send_done_count);
2976 if (sc->legacy_irq && mxge_deassert_wait)
2978 } while (*((volatile uint8_t *) &stats->valid));
2980 /* fw link & error stats meaningful only on the first slice */
2981 if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
2982 if (sc->link_state != stats->link_up) {
2983 sc->link_state = stats->link_up;
2984 if (sc->link_state) {
2985 sc->ifp->if_link_state = LINK_STATE_UP;
2986 if_link_state_change(sc->ifp);
2988 device_printf(sc->dev, "link up\n");
2990 sc->ifp->if_link_state = LINK_STATE_DOWN;
2991 if_link_state_change(sc->ifp);
2993 device_printf(sc->dev, "link down\n");
2995 sc->need_media_probe = 1;
2997 if (sc->rdma_tags_available !=
2998 be32toh(stats->rdma_tags_available)) {
2999 sc->rdma_tags_available =
3000 be32toh(stats->rdma_tags_available);
3001 device_printf(sc->dev, "RDMA timed out! %d tags "
3002 "left\n", sc->rdma_tags_available);
3005 if (stats->link_down) {
3006 sc->down_cnt += stats->link_down;
3008 sc->ifp->if_link_state = LINK_STATE_DOWN;
3009 if_link_state_change(sc->ifp);
3013 /* check to see if we have rx token to pass back */
3015 *ss->irq_claim = be32toh(3);
3016 *(ss->irq_claim + 1) = be32toh(3);
/* ifnet init entry point; body not visible in this excerpt. */
3020 mxge_init(void *arg)
/*
 * Release all mbufs held by one slice: the LRO free list, both receive
 * rings (big and small), and — on the first slice only — the transmit
 * ring.  DMA maps are unloaded before each mbuf is freed.
 */
3027 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
3029 struct lro_entry *lro_entry;
3032 while (!SLIST_EMPTY(&ss->lro_free)) {
3033 lro_entry = SLIST_FIRST(&ss->lro_free);
3034 SLIST_REMOVE_HEAD(&ss->lro_free, next);
3035 kfree(lro_entry, M_DEVBUF);
3038 for (i = 0; i <= ss->rx_big.mask; i++) {
3039 if (ss->rx_big.info[i].m == NULL)
3041 bus_dmamap_unload(ss->rx_big.dmat,
3042 ss->rx_big.info[i].map);
3043 m_freem(ss->rx_big.info[i].m);
3044 ss->rx_big.info[i].m = NULL;
3047 for (i = 0; i <= ss->rx_small.mask; i++) {
3048 if (ss->rx_small.info[i].m == NULL)
3050 bus_dmamap_unload(ss->rx_small.dmat,
3051 ss->rx_small.info[i].map);
3052 m_freem(ss->rx_small.info[i].m);
3053 ss->rx_small.info[i].m = NULL;
3056 /* transmit ring used only on the first slice */
3057 if (ss->tx.info == NULL)
3060 for (i = 0; i <= ss->tx.mask; i++) {
3061 ss->tx.info[i].flag = 0;
3062 if (ss->tx.info[i].m == NULL)
3064 bus_dmamap_unload(ss->tx.dmat,
3065 ss->tx.info[i].map);
3066 m_freem(ss->tx.info[i].m);
3067 ss->tx.info[i].m = NULL;
/* Free the mbufs of every slice. */
3072 mxge_free_mbufs(mxge_softc_t *sc)
3076 for (slice = 0; slice < sc->num_slices; slice++)
3077 mxge_free_slice_mbufs(&sc->ss[slice]);
3081 mxge_free_slice_rings(struct mxge_slice_state *ss)
3086 if (ss->rx_done.entry != NULL)
3087 mxge_dma_free(&ss->rx_done.dma);
3088 ss->rx_done.entry = NULL;
3090 if (ss->tx.req_bytes != NULL)
3091 kfree(ss->tx.req_bytes, M_DEVBUF);
3092 ss->tx.req_bytes = NULL;
3094 if (ss->tx.seg_list != NULL)
3095 kfree(ss->tx.seg_list, M_DEVBUF);
3096 ss->tx.seg_list = NULL;
3098 if (ss->rx_small.shadow != NULL)
3099 kfree(ss->rx_small.shadow, M_DEVBUF);
3100 ss->rx_small.shadow = NULL;
3102 if (ss->rx_big.shadow != NULL)
3103 kfree(ss->rx_big.shadow, M_DEVBUF);
3104 ss->rx_big.shadow = NULL;
3106 if (ss->tx.info != NULL) {
3107 if (ss->tx.dmat != NULL) {
3108 for (i = 0; i <= ss->tx.mask; i++) {
3109 bus_dmamap_destroy(ss->tx.dmat,
3110 ss->tx.info[i].map);
3112 bus_dma_tag_destroy(ss->tx.dmat);
3114 kfree(ss->tx.info, M_DEVBUF);
3118 if (ss->rx_small.info != NULL) {
3119 if (ss->rx_small.dmat != NULL) {
3120 for (i = 0; i <= ss->rx_small.mask; i++) {
3121 bus_dmamap_destroy(ss->rx_small.dmat,
3122 ss->rx_small.info[i].map);
3124 bus_dmamap_destroy(ss->rx_small.dmat,
3125 ss->rx_small.extra_map);
3126 bus_dma_tag_destroy(ss->rx_small.dmat);
3128 kfree(ss->rx_small.info, M_DEVBUF);
3130 ss->rx_small.info = NULL;
3132 if (ss->rx_big.info != NULL) {
3133 if (ss->rx_big.dmat != NULL) {
3134 for (i = 0; i <= ss->rx_big.mask; i++) {
3135 bus_dmamap_destroy(ss->rx_big.dmat,
3136 ss->rx_big.info[i].map);
3138 bus_dmamap_destroy(ss->rx_big.dmat,
3139 ss->rx_big.extra_map);
3140 bus_dma_tag_destroy(ss->rx_big.dmat);
3142 kfree(ss->rx_big.info, M_DEVBUF);
3144 ss->rx_big.info = NULL;
/* Free the ring resources of every slice. */
3148 mxge_free_rings(mxge_softc_t *sc)
3152 for (slice = 0; slice < sc->num_slices; slice++)
3153 mxge_free_slice_rings(&sc->ss[slice]);
/*
 * Allocate one slice's ring resources: rx shadow rings and host info
 * arrays, rx busdma tags/maps (small + big, plus one spare map each),
 * and — on the first slice when IFNET_BUF_RING is not used — the tx
 * request block, segment list, info ring, and tx busdma tag/maps.
 * Ring sizes are powers of two; *.mask = entries - 1.
 */
3157 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3158 int tx_ring_entries)
3160 mxge_softc_t *sc = ss->sc;
3166 /* allocate per-slice receive resources */
3168 ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
3169 ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3171 /* allocate the rx shadow rings */
3172 bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3173 ss->rx_small.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3174 if (ss->rx_small.shadow == NULL)
3177 bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3178 ss->rx_big.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3179 if (ss->rx_big.shadow == NULL)
3182 /* allocate the rx host info rings */
3183 bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3184 ss->rx_small.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3185 if (ss->rx_small.info == NULL)
3188 bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3189 ss->rx_big.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3190 if (ss->rx_big.info == NULL)
3193 /* allocate the rx busdma resources */
3194 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3196 4096, /* boundary */
3197 BUS_SPACE_MAXADDR, /* low */
3198 BUS_SPACE_MAXADDR, /* high */
3199 NULL, NULL, /* filter */
3200 MHLEN, /* maxsize */
3202 MHLEN, /* maxsegsize */
3203 BUS_DMA_ALLOCNOW, /* flags */
3204 &ss->rx_small.dmat); /* tag */
3206 device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3211 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3213 #if MXGE_VIRT_JUMBOS
3214 4096, /* boundary */
3218 BUS_SPACE_MAXADDR, /* low */
3219 BUS_SPACE_MAXADDR, /* high */
3220 NULL, NULL, /* filter */
3221 3*4096, /* maxsize */
3222 #if MXGE_VIRT_JUMBOS
3224 4096, /* maxsegsize*/
3227 MJUM9BYTES, /* maxsegsize*/
3229 BUS_DMA_ALLOCNOW, /* flags */
3230 &ss->rx_big.dmat); /* tag */
3232 device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3236 for (i = 0; i <= ss->rx_small.mask; i++) {
3237 err = bus_dmamap_create(ss->rx_small.dmat, 0,
3238 &ss->rx_small.info[i].map);
3240 device_printf(sc->dev, "Err %d rx_small dmamap\n",
3245 err = bus_dmamap_create(ss->rx_small.dmat, 0,
3246 &ss->rx_small.extra_map);
3248 device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3253 for (i = 0; i <= ss->rx_big.mask; i++) {
3254 err = bus_dmamap_create(ss->rx_big.dmat, 0,
3255 &ss->rx_big.info[i].map);
3257 device_printf(sc->dev, "Err %d rx_big dmamap\n",
3262 err = bus_dmamap_create(ss->rx_big.dmat, 0,
3263 &ss->rx_big.extra_map);
3265 device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3270 /* now allocate TX resources */
3272 #ifndef IFNET_BUF_RING
3273 /* only use a single TX ring for now */
3274 if (ss != ss->sc->ss)
3278 ss->tx.mask = tx_ring_entries - 1;
3279 ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3282 /* allocate the tx request copy block */
3284 sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3285 ss->tx.req_bytes = kmalloc(bytes, M_DEVBUF, M_WAITOK);
3286 if (ss->tx.req_bytes == NULL)
3288 /* ensure req_list entries are aligned to 8 bytes */
3289 ss->tx.req_list = (mcp_kreq_ether_send_t *)
3290 ((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
3292 /* allocate the tx busdma segment list */
3293 bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3294 ss->tx.seg_list = (bus_dma_segment_t *)
3295 kmalloc(bytes, M_DEVBUF, M_WAITOK);
3296 if (ss->tx.seg_list == NULL)
3299 /* allocate the tx host info ring */
3300 bytes = tx_ring_entries * sizeof (*ss->tx.info);
3301 ss->tx.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3302 if (ss->tx.info == NULL)
3305 /* allocate the tx busdma resources */
3306 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3308 sc->tx_boundary, /* boundary */
3309 BUS_SPACE_MAXADDR, /* low */
3310 BUS_SPACE_MAXADDR, /* high */
3311 NULL, NULL, /* filter */
3312 65536 + 256, /* maxsize */
3313 ss->tx.max_desc - 2, /* num segs */
3314 sc->tx_boundary, /* maxsegsz */
3315 BUS_DMA_ALLOCNOW, /* flags */
3316 &ss->tx.dmat); /* tag */
3319 device_printf(sc->dev, "Err %d allocating tx dmat\n",
3324 /* now use these tags to setup dmamaps for each slot
3326 for (i = 0; i <= ss->tx.mask; i++) {
3327 err = bus_dmamap_create(ss->tx.dmat, 0,
3328 &ss->tx.info[i].map);
3330 device_printf(sc->dev, "Err %d tx dmamap\n",
/*
 * Query the firmware for the send ring size, derive the tx/rx entry
 * counts, size the ifnet send queue, and allocate rings for each slice.
 * On any per-slice failure, all rings are freed again.
 */
3340 mxge_alloc_rings(mxge_softc_t *sc)
3344 int tx_ring_entries, rx_ring_entries;
3347 /* get ring sizes */
3348 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3349 tx_ring_size = cmd.data0;
3351 device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3355 tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3356 rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3357 ifq_set_maxlen(&sc->ifp->if_snd, tx_ring_entries - 1);
3358 ifq_set_ready(&sc->ifp->if_snd);
3360 for (slice = 0; slice < sc->num_slices; slice++) {
3361 err = mxge_alloc_slice_rings(&sc->ss[slice],
3370 mxge_free_rings(sc);
/*
 * Given the MTU, pick the receive buffer parameters: the firmware-facing
 * big buffer size, the mbuf cluster size, and how many buffers a single
 * frame spans.  Frame size includes Ethernet header, VLAN encap, and the
 * firmware pad.
 */
3377 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3379 int bufsize = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN + MXGEFW_PAD;
3381 if (bufsize < MCLBYTES) {
3382 /* easy, everything fits in a single buffer */
3383 *big_buf_size = MCLBYTES;
3384 *cl_size = MCLBYTES;
3389 if (bufsize < MJUMPAGESIZE) {
3390 /* still easy, everything still fits in a single buffer */
3391 *big_buf_size = MJUMPAGESIZE;
3392 *cl_size = MJUMPAGESIZE;
3396 #if MXGE_VIRT_JUMBOS
3397 /* now we need to use virtually contiguous buffers */
3398 *cl_size = MJUM9BYTES;
3399 *big_buf_size = 4096;
3400 *nbufs = mtu / 4096 + 1;
3401 /* needs to be a power of two, so round up */
3405 *cl_size = MJUM9BYTES;
3406 *big_buf_size = MJUM9BYTES;
/*
 * Bring one slice online: populate its LRO free list, fetch the lanai
 * (NIC SRAM) pointers for the send and receive rings from firmware, and
 * stock both receive rings with freshly-allocated buffers.
 */
3412 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3417 struct lro_entry *lro_entry;
3422 slice = ss - sc->ss;
3424 SLIST_INIT(&ss->lro_free);
3425 SLIST_INIT(&ss->lro_active);
3427 for (i = 0; i < sc->lro_cnt; i++) {
3428 lro_entry = (struct lro_entry *)
3429 kmalloc(sizeof (*lro_entry), M_DEVBUF,
3431 if (lro_entry == NULL) {
3435 SLIST_INSERT_HEAD(&ss->lro_free, lro_entry, next);
3437 /* get the lanai pointers to the send and receive rings */
3440 #ifndef IFNET_BUF_RING
3441 /* We currently only send from the first slice */
3445 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3447 (volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3448 ss->tx.send_go = (volatile uint32_t *)
3449 (sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3450 ss->tx.send_stop = (volatile uint32_t *)
3451 (sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3452 #ifndef IFNET_BUF_RING
3456 err |= mxge_send_cmd(sc,
3457 MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3458 ss->rx_small.lanai =
3459 (volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3461 err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3463 (volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3466 device_printf(sc->dev,
3467 "failed to get ring sizes or locations\n");
3471 /* stock receive rings */
3472 for (i = 0; i <= ss->rx_small.mask; i++) {
3473 map = ss->rx_small.info[i].map;
3474 err = mxge_get_buf_small(ss, map, i);
3476 device_printf(sc->dev, "alloced %d/%d smalls\n",
3477 i, ss->rx_small.mask + 1);
3481 for (i = 0; i <= ss->rx_big.mask; i++) {
3482 ss->rx_big.shadow[i].addr_low = 0xffffffff;
3483 ss->rx_big.shadow[i].addr_high = 0xffffffff;
3485 ss->rx_big.nbufs = nbufs;
3486 ss->rx_big.cl_size = cl_size;
3487 ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3488 EVL_ENCAPLEN + MXGEFW_PAD;
3489 for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3490 map = ss->rx_big.info[i].map;
3491 err = mxge_get_buf_big(ss, map, i);
3493 device_printf(sc->dev, "alloced %d/%d bigs\n",
3494 i, ss->rx_big.mask + 1);
/*
 * Bring the interface up: reset the NIC, program the RSS indirection
 * table (multi-slice only), tell firmware the MTU and buffer sizes,
 * register per-slice stats DMA blocks, open every slice, and finally
 * issue ETHERNET_UP and start the tick callout.
 */
3502 mxge_open(mxge_softc_t *sc)
3505 int err, big_bytes, nbufs, slice, cl_size, i;
3507 volatile uint8_t *itable;
3508 struct mxge_slice_state *ss;
3510 /* Copy the MAC address in case it was overridden */
3511 bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3513 err = mxge_reset(sc, 1);
3515 device_printf(sc->dev, "failed to reset\n");
3519 if (sc->num_slices > 1) {
3520 /* setup the indirection table */
3521 cmd.data0 = sc->num_slices;
3522 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3525 err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3528 device_printf(sc->dev,
3529 "failed to setup rss tables\n");
3533 /* just enable an identity mapping */
3534 itable = sc->sram + cmd.data0;
3535 for (i = 0; i < sc->num_slices; i++)
3536 itable[i] = (uint8_t)i;
3539 cmd.data1 = mxge_rss_hash_type;
3540 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3542 device_printf(sc->dev, "failed to enable slices\n");
3548 mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3551 err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3553 /* error is only meaningful if we're trying to set
3554 MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3555 if (err && nbufs > 1) {
3556 device_printf(sc->dev,
3557 "Failed to set alway-use-n to %d\n",
3561 /* Give the firmware the mtu and the big and small buffer
3562 sizes. The firmware wants the big buf size to be a power
3563 of two. Luckily, FreeBSD's clusters are powers of two */
3564 cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
3565 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3566 cmd.data0 = MHLEN - MXGEFW_PAD;
3567 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3569 cmd.data0 = big_bytes;
3570 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3573 device_printf(sc->dev, "failed to setup params\n");
3577 /* Now give him the pointer to the stats block */
3579 #ifdef IFNET_BUF_RING
3580 slice < sc->num_slices;
3585 ss = &sc->ss[slice];
3587 MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3589 MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3590 cmd.data2 = sizeof(struct mcp_irq_data);
3591 cmd.data2 |= (slice << 16);
3592 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3596 bus = sc->ss->fw_stats_dma.bus_addr;
3597 bus += offsetof(struct mcp_irq_data, send_done_count);
3598 cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3599 cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3600 err = mxge_send_cmd(sc,
3601 MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3603 /* Firmware cannot support multicast without STATS_DMA_V2 */
3604 sc->fw_multicast_support = 0;
3606 sc->fw_multicast_support = 1;
3610 device_printf(sc->dev, "failed to setup params\n");
3614 for (slice = 0; slice < sc->num_slices; slice++) {
3615 err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3617 device_printf(sc->dev, "couldn't open slice %d\n",
3623 /* Finally, start the firmware running */
3624 err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3626 device_printf(sc->dev, "Couldn't bring up link\n");
3629 #ifdef IFNET_BUF_RING
3630 for (slice = 0; slice < sc->num_slices; slice++) {
3631 ss = &sc->ss[slice];
3632 ss->if_flags |= IFF_RUNNING;
3633 ss->if_flags &= ~IFF_OACTIVE;
3636 sc->ifp->if_flags |= IFF_RUNNING;
3637 sc->ifp->if_flags &= ~IFF_OACTIVE;
3638 callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3644 mxge_free_mbufs(sc);
/*
 * Bring the interface down: stop the tick callout, clear RUNNING flags,
 * send ETHERNET_DOWN, wait (via intr_coal_delay) for the "link down"
 * interrupt to bump down_cnt, then release all mbufs.
 */
3650 mxge_close(mxge_softc_t *sc)
3653 int err, old_down_cnt;
3654 #ifdef IFNET_BUF_RING
3655 struct mxge_slice_state *ss;
3659 callout_stop(&sc->co_hdl);
3660 #ifdef IFNET_BUF_RING
3661 for (slice = 0; slice < sc->num_slices; slice++) {
3662 ss = &sc->ss[slice];
3663 ss->if_flags &= ~IFF_RUNNING;
3666 sc->ifp->if_flags &= ~IFF_RUNNING;
3667 old_down_cnt = sc->down_cnt;
3669 err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3671 device_printf(sc->dev, "Couldn't bring down link\n");
3673 if (old_down_cnt == sc->down_cnt) {
3674 /* wait for down irq */
3675 DELAY(10 * sc->intr_coal_delay);
3678 if (old_down_cnt == sc->down_cnt) {
3679 device_printf(sc->dev, "never got down irq\n");
3682 mxge_free_mbufs(sc);
/*
 * Program the NIC's PCI configuration space.  Locates the PCIe
 * capability to record the negotiated link width and raise the max
 * read request size to 4KB, then enables bus mastering and
 * memory-space decoding.
 *
 * Fix: the third argument to pci_find_extcap() had been corrupted to
 * the mojibake character "®" (an HTML-entity mangling of "&reg"); the
 * subsequent pci_read_config()/pci_write_config() calls all use "reg",
 * so the intended argument is clearly &reg.
 */
3688 mxge_setup_cfg_space(mxge_softc_t *sc)
3690 device_t dev = sc->dev;
3692 uint16_t cmd, lnk, pectl;
3694 /* find the PCIe link width and set max read request to 4KB*/
3695 if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
3696 lnk = pci_read_config(dev, reg + 0x12, 2);
3697 sc->link_width = (lnk >> 4) & 0x3f;
3699 pectl = pci_read_config(dev, reg + 0x8, 2);
3700 pectl = (pectl & ~0x7000) | (5 << 12);
3701 pci_write_config(dev, reg + 0x8, pectl, 2);
3704 /* Enable DMA and Memory space access */
3705 pci_enable_busmaster(dev);
3706 cmd = pci_read_config(dev, PCIR_COMMAND, 2);
3707 cmd |= PCIM_CMD_MEMEN;
3708 pci_write_config(dev, PCIR_COMMAND, cmd, 2);
/*
 * Read the NIC's reboot status register through the vendor-specific PCI
 * capability's read32 window.  Returns (uint32_t)-1 if the capability
 * cannot be found.
 */
3712 mxge_read_reboot(mxge_softc_t *sc)
3714 device_t dev = sc->dev;
3717 /* find the vendor specific offset */
3718 if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
3719 device_printf(sc->dev,
3720 "could not find vendor specific offset\n");
3721 return (uint32_t)-1;
3723 /* enable read32 mode */
3724 pci_write_config(dev, vs + 0x10, 0x3, 1);
3725 /* tell NIC which register to read */
3726 pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3727 return (pci_read_config(dev, vs + 0x14, 4));
/*
 * Recover from a watchdog-detected hang.  If PCI config space reads
 * back as 0xffff the NIC may be mid-reboot, so it is re-polled; if the
 * busmaster bit was lost, the NIC rebooted and config space plus the
 * driver state are restored via mxge_open().  Otherwise the tx ring
 * state is dumped and no reset is performed.
 */
3731 mxge_watchdog_reset(mxge_softc_t *sc, int slice)
3733 struct pci_devinfo *dinfo;
3741 device_printf(sc->dev, "Watchdog reset!\n");
3744 * check to see if the NIC rebooted. If it did, then all of
3745 * PCI config space has been reset, and things like the
3746 * busmaster bit will be zero. If this is the case, then we
3747 * must restore PCI config space before the NIC can be used
3750 cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3751 if (cmd == 0xffff) {
3753 * maybe the watchdog caught the NIC rebooting; wait
3754 * up to 100ms for it to finish. If it does not come
3755 * back, then give up
3758 cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3759 if (cmd == 0xffff) {
3760 device_printf(sc->dev, "NIC disappeared!\n");
3764 if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3765 /* print the reboot status */
3766 reboot = mxge_read_reboot(sc);
3767 device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3769 /* restore PCI configuration space */
3770 dinfo = device_get_ivars(sc->dev);
3771 pci_cfg_restore(sc->dev, dinfo);
3773 /* and redo any changes we made to our config space */
3774 mxge_setup_cfg_space(sc);
3776 if (sc->ifp->if_flags & IFF_RUNNING) {
3778 err = mxge_open(sc);
3781 tx = &sc->ss[slice].tx;
3782 device_printf(sc->dev,
3783 "NIC did not reboot, slice %d ring state:\n",
3785 device_printf(sc->dev,
3786 "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3787 tx->req, tx->done, tx->queue_active);
3788 device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3789 tx->activate, tx->deactivate);
3790 device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3792 be32toh(sc->ss->fw_stats->send_done_count));
3793 device_printf(sc->dev, "not resetting\n");
/*
 * Periodic watchdog.  For each tx ring, if transmits have been pending
 * unchanged for a full watchdog interval and the stall is not explained
 * by received pause frames, trigger a watchdog reset.  Also re-probes
 * media when requested by the interrupt handler.
 */
3799 mxge_watchdog(mxge_softc_t *sc)
3802 uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3805 /* see if we have outstanding transmits, which
3806 have been pending for more than mxge_ticks */
3808 #ifdef IFNET_BUF_RING
3809 (i < sc->num_slices) && (err == 0);
3811 (i < 1) && (err == 0);
3815 if (tx->req != tx->done &&
3816 tx->watchdog_req != tx->watchdog_done &&
3817 tx->done == tx->watchdog_done) {
3818 /* check for pause blocking before resetting */
3819 if (tx->watchdog_rx_pause == rx_pause)
3820 err = mxge_watchdog_reset(sc, i);
3822 device_printf(sc->dev, "Flow control blocking "
3823 "xmits, check link partner\n");
3826 tx->watchdog_req = tx->req;
3827 tx->watchdog_done = tx->done;
3828 tx->watchdog_rx_pause = rx_pause;
3831 if (sc->need_media_probe)
3832 mxge_media_probe(sc);
3837 mxge_update_stats(mxge_softc_t *sc)
3839 struct mxge_slice_state *ss;
3840 u_long ipackets = 0;
3841 u_long opackets = 0;
3842 #ifdef IFNET_BUF_RING
3850 for (slice = 0; slice < sc->num_slices; slice++) {
3851 ss = &sc->ss[slice];
3852 ipackets += ss->ipackets;
3853 opackets += ss->opackets;
3854 #ifdef IFNET_BUF_RING
3855 obytes += ss->obytes;
3856 omcasts += ss->omcasts;
3857 odrops += ss->tx.br->br_drops;
3859 oerrors += ss->oerrors;
3861 sc->ifp->if_ipackets = ipackets;
3862 sc->ifp->if_opackets = opackets;
3863 #ifdef IFNET_BUF_RING
3864 sc->ifp->if_obytes = obytes;
3865 sc->ifp->if_omcasts = omcasts;
3866 sc->ifp->if_snd.ifq_drops = odrops;
3868 sc->ifp->if_oerrors = oerrors;
/*
 * Periodic callout: aggregate stats every tick and run the watchdog
 * every 5th tick (countdown from 4), then re-arm itself.  Runs under
 * the ifnet serializer.
 */
3872 mxge_tick(void *arg)
3874 mxge_softc_t *sc = arg;
3877 lwkt_serialize_enter(sc->ifp->if_serializer);
3878 /* aggregate stats from different slices */
3879 mxge_update_stats(sc);
3880 if (!sc->watchdog_countdown) {
3881 err = mxge_watchdog(sc);
3882 sc->watchdog_countdown = 4;
3884 sc->watchdog_countdown--;
3886 callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3887 lwkt_serialize_exit(sc->ifp->if_serializer);
/* ifmedia change callback; body not visible in this excerpt. */
3891 mxge_media_change(struct ifnet *ifp)
/*
 * Change the interface MTU.  Validates the resulting frame size against
 * the NIC's max_mtu, then (when running) restarts the interface; on
 * reopen failure the old MTU is restored.  Serialized by if_serializer.
 */
3897 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3899 struct ifnet *ifp = sc->ifp;
3900 int real_mtu, old_mtu;
3904 real_mtu = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
3905 if ((real_mtu > sc->max_mtu) || real_mtu < 60)
3907 lwkt_serialize_enter(ifp->if_serializer);
3908 old_mtu = ifp->if_mtu;
3910 if (ifp->if_flags & IFF_RUNNING) {
3912 err = mxge_open(sc);
3914 ifp->if_mtu = old_mtu;
3916 (void) mxge_open(sc);
3919 lwkt_serialize_exit(ifp->if_serializer);
/*
 * ifmedia status callback: report link validity/activity and
 * full-duplex based on the cached link_state.
 */
3924 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3926 mxge_softc_t *sc = ifp->if_softc;
3931 ifmr->ifm_status = IFM_AVALID;
3932 ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
3933 ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
3934 ifmr->ifm_active |= sc->link_state ? IFM_FDX : 0;
/*
 * ifnet ioctl handler: MTU changes, up/down transitions (with promisc
 * and multicast refresh), multicast list updates, capability toggles
 * (TXCSUM/RXCSUM/TSO4/LRO/VLAN tagging), and media queries.  Entries
 * that touch driver state take the ifnet serializer.
 */
3938 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data, struct ucred *cr)
3940 mxge_softc_t *sc = ifp->if_softc;
3941 struct ifreq *ifr = (struct ifreq *)data;
3949 err = ether_ioctl(ifp, command, data);
3953 err = mxge_change_mtu(sc, ifr->ifr_mtu);
3957 lwkt_serialize_enter(sc->ifp->if_serializer);
3959 lwkt_serialize_exit(ifp->if_serializer);
3962 if (ifp->if_flags & IFF_UP) {
3963 if (!(ifp->if_flags & IFF_RUNNING)) {
3964 err = mxge_open(sc);
3966 /* take care of promisc and allmulti
3968 mxge_change_promisc(sc,
3969 ifp->if_flags & IFF_PROMISC);
3970 mxge_set_multicast_list(sc);
3973 if (ifp->if_flags & IFF_RUNNING) {
3977 lwkt_serialize_exit(ifp->if_serializer);
3982 lwkt_serialize_enter(sc->ifp->if_serializer);
3983 mxge_set_multicast_list(sc);
3984 lwkt_serialize_exit(sc->ifp->if_serializer);
3988 lwkt_serialize_enter(sc->ifp->if_serializer);
3989 mask = ifr->ifr_reqcap ^ ifp->if_capenable;
3990 if (mask & IFCAP_TXCSUM) {
3991 if (IFCAP_TXCSUM & ifp->if_capenable) {
3992 ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
3993 ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
3996 ifp->if_capenable |= IFCAP_TXCSUM;
3997 ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
3999 } else if (mask & IFCAP_RXCSUM) {
4000 if (IFCAP_RXCSUM & ifp->if_capenable) {
4001 ifp->if_capenable &= ~IFCAP_RXCSUM;
4004 ifp->if_capenable |= IFCAP_RXCSUM;
4008 if (mask & IFCAP_TSO4) {
4009 if (IFCAP_TSO4 & ifp->if_capenable) {
4010 ifp->if_capenable &= ~IFCAP_TSO4;
4011 ifp->if_hwassist &= ~CSUM_TSO;
4012 } else if (IFCAP_TXCSUM & ifp->if_capenable) {
4013 ifp->if_capenable |= IFCAP_TSO4;
4014 ifp->if_hwassist |= CSUM_TSO;
4016 kprintf("mxge requires tx checksum offload"
4017 " be enabled to use TSO\n");
4021 if (mask & IFCAP_LRO) {
4022 if (IFCAP_LRO & ifp->if_capenable)
4023 err = mxge_change_lro_locked(sc, 0);
4025 err = mxge_change_lro_locked(sc, mxge_lro_cnt);
4027 if (mask & IFCAP_VLAN_HWTAGGING)
4028 ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
4029 lwkt_serialize_exit(sc->ifp->if_serializer);
4030 VLAN_CAPABILITIES(ifp);
4035 err = ifmedia_ioctl(ifp, (struct ifreq *)data,
4036 &sc->media, command);
/*
 * Fetch hw.mxge.* tunables and clamp them to sane ranges (intr
 * coalescing delay, tick period, RSS hash type, initial MTU, etc.).
 */
4046 mxge_fetch_tunables(mxge_softc_t *sc)
4049 TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4050 TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4051 &mxge_flow_control);
4052 TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4053 &mxge_intr_coal_delay);
4054 TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4055 &mxge_nvidia_ecrc_enable);
4056 TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4057 &mxge_force_firmware);
4058 TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4059 &mxge_deassert_wait);
4060 TUNABLE_INT_FETCH("hw.mxge.verbose",
4062 TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4063 TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
4064 TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
4065 TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4066 TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4067 if (sc->lro_cnt != 0)
4068 mxge_lro_cnt = sc->lro_cnt;
4072 if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4073 mxge_intr_coal_delay = 30;
4074 if (mxge_ticks == 0)
4075 mxge_ticks = hz / 2;
4076 sc->pause = mxge_flow_control;
4077 if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4078 || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4079 mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
4081 if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4082 mxge_initial_mtu < ETHER_MIN_LEN)
4083 mxge_initial_mtu = ETHERMTU_JUMBO;
/*
 * Free per-slice state: firmware stats DMA blocks, buf_rings (when
 * IFNET_BUF_RING is compiled in), rx_done DMA areas, and finally the
 * slice array itself.
 */
4088 mxge_free_slices(mxge_softc_t *sc)
4090 struct mxge_slice_state *ss;
4097 for (i = 0; i < sc->num_slices; i++) {
4099 if (ss->fw_stats != NULL) {
4100 mxge_dma_free(&ss->fw_stats_dma);
4101 ss->fw_stats = NULL;
4102 #ifdef IFNET_BUF_RING
4103 if (ss->tx.br != NULL) {
4104 drbr_free(ss->tx.br, M_DEVBUF);
4109 if (ss->rx_done.entry != NULL) {
4110 mxge_dma_free(&ss->rx_done.dma);
4111 ss->rx_done.entry = NULL;
4114 kfree(sc->ss, M_DEVBUF);
/*
 * Allocate the slice array and, for each slice, its rx interrupt queue
 * (rx_done) DMA area and firmware stats DMA block (plus a buf_ring when
 * IFNET_BUF_RING is compiled in).  On failure everything allocated so
 * far is released via mxge_free_slices().
 */
4119 mxge_alloc_slices(mxge_softc_t *sc)
4122 struct mxge_slice_state *ss;
4124 int err, i, max_intr_slots;
4126 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4128 device_printf(sc->dev, "Cannot determine rx ring size\n");
4131 sc->rx_ring_size = cmd.data0;
4132 max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4134 bytes = sizeof (*sc->ss) * sc->num_slices;
4135 sc->ss = kmalloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4138 for (i = 0; i < sc->num_slices; i++) {
4143 /* allocate per-slice rx interrupt queues */
4145 bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4146 err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4149 ss->rx_done.entry = ss->rx_done.dma.addr;
4150 bzero(ss->rx_done.entry, bytes);
4153 * allocate the per-slice firmware stats; stats
4154 * (including tx) are used only on the first
4157 #ifndef IFNET_BUF_RING
4162 bytes = sizeof (*ss->fw_stats);
4163 err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4164 sizeof (*ss->fw_stats), 64);
4167 ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4168 #ifdef IFNET_BUF_RING
4169 ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4177 mxge_free_slices(sc);
/*
 * Decide how many slices (RSS queues) to use.  Requires multi-slice to
 * be enabled, an SMP system, and enough MSI-X vectors; loads the RSS
 * firmware and queries its limits, then caps num_slices by MSI-X count,
 * CPU count / tunable, and rounds down to a power of two.  Falls back
 * to the original single-slice firmware on any failure.
 */
4182 mxge_slice_probe(mxge_softc_t *sc)
4186 int msix_cnt, status, max_intr_slots;
4190 * don't enable multiple slices if they are not enabled,
4191 * or if this is not an SMP system
4194 if (mxge_max_slices == 0 || mxge_max_slices == 1 || ncpus < 2)
4197 /* see how many MSI-X interrupts are available */
4198 msix_cnt = pci_msix_count(sc->dev);
4202 /* now load the slice aware firmware see what it supports */
4203 old_fw = sc->fw_name;
4204 if (old_fw == mxge_fw_aligned)
4205 sc->fw_name = mxge_fw_rss_aligned;
4207 sc->fw_name = mxge_fw_rss_unaligned;
4208 status = mxge_load_firmware(sc, 0);
4210 device_printf(sc->dev, "Falling back to a single slice\n");
4214 /* try to send a reset command to the card to see if it
4216 memset(&cmd, 0, sizeof (cmd));
4217 status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4219 device_printf(sc->dev, "failed reset\n");
4223 /* get rx ring size */
4224 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4226 device_printf(sc->dev, "Cannot determine rx ring size\n");
4229 max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4231 /* tell it the size of the interrupt queues */
4232 cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4233 status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4235 device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4239 /* ask the maximum number of slices it supports */
4240 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4242 device_printf(sc->dev,
4243 "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4246 sc->num_slices = cmd.data0;
4247 if (sc->num_slices > msix_cnt)
4248 sc->num_slices = msix_cnt;
4250 if (mxge_max_slices == -1) {
4251 /* cap to number of CPUs in system */
4252 if (sc->num_slices > ncpus)
4253 sc->num_slices = ncpus;
4255 if (sc->num_slices > mxge_max_slices)
4256 sc->num_slices = mxge_max_slices;
4258 /* make sure it is a power of two */
4259 while (sc->num_slices & (sc->num_slices - 1))
4263 device_printf(sc->dev, "using %d slices\n",
4269 sc->fw_name = old_fw;
4270 (void) mxge_load_firmware(sc, 0);
/*
 * Allocate and wire up one MSI-X vector per slice: map the MSI-X table
 * BAR, allocate the vectors, the IRQ resources, and the interrupt
 * handlers (all pointing at mxge_intr with the slice as argument).
 * Unwinds everything in reverse order on failure.
 */
4274 mxge_add_msix_irqs(mxge_softc_t *sc)
4277 int count, err, i, rid;
4280 sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4283 if (sc->msix_table_res == NULL) {
4284 device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4288 count = sc->num_slices;
4289 err = pci_alloc_msix(sc->dev, &count);
4291 device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d"
4292 "err = %d \n", sc->num_slices, err);
4293 goto abort_with_msix_table;
4295 if (count < sc->num_slices) {
4296 device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4297 count, sc->num_slices);
4298 device_printf(sc->dev,
4299 "Try setting hw.mxge.max_slices to %d\n",
4302 goto abort_with_msix;
4304 bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4305 sc->msix_irq_res = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4306 if (sc->msix_irq_res == NULL) {
4308 goto abort_with_msix;
4311 for (i = 0; i < sc->num_slices; i++) {
4313 sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4316 if (sc->msix_irq_res[i] == NULL) {
4317 device_printf(sc->dev, "couldn't allocate IRQ res"
4318 " for message %d\n", i);
4320 goto abort_with_res;
4324 bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4325 sc->msix_ih = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4327 for (i = 0; i < sc->num_slices; i++) {
4328 err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4330 mxge_intr, &sc->ss[i], &sc->msix_ih[i],
4331 sc->ifp->if_serializer);
4333 device_printf(sc->dev, "couldn't setup intr for "
4335 goto abort_with_intr;
4340 device_printf(sc->dev, "using %d msix IRQs:",
4342 for (i = 0; i < sc->num_slices; i++)
4343 kprintf(" %ld", rman_get_start(sc->msix_irq_res[i]));
4349 for (i = 0; i < sc->num_slices; i++) {
4350 if (sc->msix_ih[i] != NULL) {
4351 bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4353 sc->msix_ih[i] = NULL;
4356 kfree(sc->msix_ih, M_DEVBUF);
4360 for (i = 0; i < sc->num_slices; i++) {
4362 if (sc->msix_irq_res[i] != NULL)
4363 bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4364 sc->msix_irq_res[i]);
4365 sc->msix_irq_res[i] = NULL;
4367 kfree(sc->msix_irq_res, M_DEVBUF);
4371 pci_release_msi(sc->dev);
4373 abort_with_msix_table:
4374 bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4375 sc->msix_table_res);
/*
 * mxge_add_single_irq: set up a single interrupt -- MSI if the device
 * advertises exactly one message and allocation succeeds, otherwise
 * legacy INTx (sc->legacy_irq distinguishes the two for teardown).
 *
 * NOTE(review): excerpt is incomplete (embedded line numbers jump);
 * the rid selection and sc->legacy_irq assignment fall in elided lines.
 */
4381 mxge_add_single_irq(mxge_softc_t *sc)
4383 int count, err, rid;
/* Prefer MSI when the device exposes exactly one message. */
4385 count = pci_msi_count(sc->dev);
4386 if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4392 sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0,
4393 1, RF_SHAREABLE | RF_ACTIVE);
4394 if (sc->irq_res == NULL) {
4395 device_printf(sc->dev, "could not alloc interrupt\n");
4399 device_printf(sc->dev, "using %s irq %ld\n",
4400 sc->legacy_irq ? "INTx" : "MSI",
4401 rman_get_start(sc->irq_res))
4402 err = bus_setup_intr(sc->dev, sc->irq_res,
4404 mxge_intr, &sc->ss[0], &sc->ih,
4405 sc->ifp->if_serializer);
/* On setup failure: release the IRQ resource (rid 0 = INTx, 1 = MSI)
 * and give back the MSI allocation if one was made. */
4407 bus_release_resource(sc->dev, SYS_RES_IRQ,
4408 sc->legacy_irq ? 0 : 1, sc->irq_res);
4409 if (!sc->legacy_irq)
4410 pci_release_msi(sc->dev);
/*
 * mxge_rem_msix_irqs: full MSI-X teardown, mirroring the unwind ladder
 * in mxge_add_msix_irqs -- handlers, IRQ resources, the MSI-X
 * allocation, then the table BAR mapping.
 *
 * NOTE(review): excerpt is incomplete (embedded line numbers jump);
 * loop braces and the rid computation are in elided lines.
 */
4416 mxge_rem_msix_irqs(mxge_softc_t *sc)
/* Detach each slice's handler and clear its cookie. */
4420 for (i = 0; i < sc->num_slices; i++) {
4421 if (sc->msix_ih[i] != NULL) {
4422 bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4424 sc->msix_ih[i] = NULL;
4427 kfree(sc->msix_ih, M_DEVBUF);
/* Release each slice's IRQ resource and free the table. */
4429 for (i = 0; i < sc->num_slices; i++) {
4431 if (sc->msix_irq_res[i] != NULL)
4432 bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4433 sc->msix_irq_res[i]);
4434 sc->msix_irq_res[i] = NULL;
4436 kfree(sc->msix_irq_res, M_DEVBUF);
/* Unmap the MSI-X table BAR and return the vectors to the PCI layer. */
4438 bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4439 sc->msix_table_res);
4441 pci_release_msi(sc->dev);
/*
 * mxge_rem_single_irq: undo mxge_add_single_irq -- detach the handler,
 * release the IRQ resource (rid 0 for legacy INTx, 1 for MSI), and
 * give back the MSI allocation when one was made.
 */
4446 mxge_rem_single_irq(mxge_softc_t *sc)
4448 bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4449 bus_release_resource(sc->dev, SYS_RES_IRQ,
4450 sc->legacy_irq ? 0 : 1, sc->irq_res);
4451 if (!sc->legacy_irq)
4452 pci_release_msi(sc->dev);
/*
 * mxge_rem_irq: dispatch interrupt teardown -- MSI-X when running
 * multi-slice, otherwise the single MSI/INTx path.
 * NOTE(review): the `else` keyword falls in an elided line (4460)
 * of this excerpt; the two calls are mutually exclusive branches.
 */
4456 mxge_rem_irq(mxge_softc_t *sc)
4458 if (sc->num_slices > 1)
4459 mxge_rem_msix_irqs(sc);
4461 mxge_rem_single_irq(sc);
/*
 * mxge_add_irq: dispatch interrupt setup -- MSI-X when multi-slice,
 * otherwise a single MSI/INTx vector.
 */
4465 mxge_add_irq(mxge_softc_t *sc)
4469 if (sc->num_slices > 1)
4470 err = mxge_add_msix_irqs(sc);
4472 err = mxge_add_single_irq(sc);
/* NOTE(review): `if (0 && ...)` makes this retry branch dead code --
 * it looks like a deliberately disabled debug/rebind path; consider
 * removing it or guarding it with a tunable instead. */
4474 if (0 && err == 0 && sc->num_slices > 1) {
4475 mxge_rem_msix_irqs(sc);
4476 err = mxge_add_msix_irqs(sc);
/*
 * mxge_attach: device attach entry point.  Sequence (as visible here):
 * fetch tunables, create the parent DMA tag, map the board's BAR,
 * parse the EEPROM strings, allocate command/zeropad/dmabench DMA
 * areas, select and load firmware, probe and allocate slices, reset,
 * allocate rings, wire up interrupts, then configure and attach the
 * ifnet.  Failures unwind through the goto ladder at the bottom.
 *
 * NOTE(review): this excerpt is incomplete -- embedded original line
 * numbers jump, so several `if (err != 0)` guards, braces and labels
 * are elided.  Comments describe only what the visible lines show.
 */
4483 mxge_attach(device_t dev)
4485 mxge_softc_t *sc = device_get_softc(dev);
4486 struct ifnet *ifp = &sc->arpcom.ac_if;
4490 * avoid rewriting half the lines in this file to use
4491 * &sc->arpcom.ac_if instead
4495 mxge_fetch_tunables(sc);
/* Parent DMA tag: 64KB+256 max transfer, bounded by MXGE_MAX_SEND_DESC
 * segments -- all per-ring tags derive from this one. */
4497 err = bus_dma_tag_create(NULL, /* parent */
4500 BUS_SPACE_MAXADDR, /* low */
4501 BUS_SPACE_MAXADDR, /* high */
4502 NULL, NULL, /* filter */
4503 65536 + 256, /* maxsize */
4504 MXGE_MAX_SEND_DESC, /* num segs */
4505 65536, /* maxsegsize */
4507 &sc->parent_dmat); /* tag */
4510 device_printf(sc->dev, "Err %d allocating parent dmat\n",
4512 goto abort_with_nothing;
4516 if_initname(ifp, device_get_name(dev), device_get_unit(dev));
/* MP-safe callout for the watchdog/tick handler. */
4518 callout_init_mp(&sc->co_hdl);
4520 mxge_setup_cfg_space(sc);
4522 /* Map the board into the kernel */
4524 sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
4526 if (sc->mem_res == NULL) {
4527 device_printf(dev, "could not map memory\n");
4529 goto abort_with_nothing;
/* SRAM size: 2MB minus firmware-reserved regions minus 0x100 pad. */
4531 sc->sram = rman_get_virtual(sc->mem_res);
4532 sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4533 if (sc->sram_size > rman_get_size(sc->mem_res)) {
4534 device_printf(dev, "impossible memory region size %ld\n",
4535 rman_get_size(sc->mem_res));
4537 goto abort_with_mem_res;
4540 /* make NULL terminated copy of the EEPROM strings section of
4542 bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4543 bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4544 rman_get_bushandle(sc->mem_res),
4545 sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4547 MXGE_EEPROM_STRINGS_SIZE - 2);
4548 err = mxge_parse_strings(sc);
4550 goto abort_with_mem_res;
4552 /* Enable write combining for efficient use of PCIe bus */
4555 /* Allocate the out of band dma memory */
4556 err = mxge_dma_alloc(sc, &sc->cmd_dma,
4557 sizeof (mxge_cmd_t), 64);
4559 goto abort_with_mem_res;
4560 sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4561 err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4563 goto abort_with_cmd_dma;
4565 err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4567 goto abort_with_zeropad_dma;
4569 /* select & load the firmware */
4570 err = mxge_select_firmware(sc);
4572 goto abort_with_dmabench;
4573 sc->intr_coal_delay = mxge_intr_coal_delay;
4575 mxge_slice_probe(sc);
4576 err = mxge_alloc_slices(sc);
4578 goto abort_with_dmabench;
4580 err = mxge_reset(sc, 0);
4582 goto abort_with_slices;
4584 err = mxge_alloc_rings(sc);
4586 device_printf(sc->dev, "failed to allocate rings\n");
/* NOTE(review): unwinding to abort_with_dmabench here skips the
 * abort_with_slices label, apparently leaking the slices allocated
 * at 4576 -- confirm against the full source; abort_with_slices
 * looks like the intended target. */
4587 goto abort_with_dmabench;
4590 err = mxge_add_irq(sc);
4592 device_printf(sc->dev, "failed to add irq\n");
4593 goto abort_with_rings;
/* ifnet capabilities: checksum offload, TSO, LRO, VLAN offload,
 * jumbo frames when the firmware supports >= 9000-byte MTU. */
4596 ifp->if_baudrate = IF_Gbps(10UL);
4597 ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4600 ifp->if_capabilities |= IFCAP_LRO;
4603 #ifdef MXGE_NEW_VLAN_API
4604 ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4607 sc->max_mtu = mxge_max_mtu(sc);
4608 if (sc->max_mtu >= 9000)
4609 ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4611 device_printf(dev, "MTU limited to %d. Install "
4612 "latest firmware for 9000 byte jumbo support\n",
4613 sc->max_mtu - ETHER_HDR_LEN);
4614 ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4615 ifp->if_capenable = ifp->if_capabilities;
4616 if (sc->lro_cnt == 0)
4617 ifp->if_capenable &= ~IFCAP_LRO;
4619 ifp->if_init = mxge_init;
4621 ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4622 ifp->if_ioctl = mxge_ioctl;
4623 ifp->if_start = mxge_start;
4624 /* Initialise the ifmedia structure */
4625 ifmedia_init(&sc->media, 0, mxge_media_change,
4627 mxge_set_media(sc, IFM_ETHER | IFM_AUTO);
4628 mxge_media_probe(sc);
4630 ether_ifattach(ifp, sc->mac_addr, NULL);
4631 /* ether_ifattach sets mtu to ETHERMTU */
4632 if (mxge_initial_mtu != ETHERMTU)
4633 mxge_change_mtu(sc, mxge_initial_mtu);
4635 mxge_add_sysctls(sc);
4636 #ifdef IFNET_BUF_RING
4637 ifp->if_transmit = mxge_transmit;
4638 ifp->if_qflush = mxge_qflush;
/* Error unwind: release in reverse order of acquisition. */
4643 mxge_free_rings(sc);
4645 mxge_free_slices(sc);
4646 abort_with_dmabench:
4647 mxge_dma_free(&sc->dmabench_dma);
4648 abort_with_zeropad_dma:
4649 mxge_dma_free(&sc->zeropad_dma);
4651 mxge_dma_free(&sc->cmd_dma);
4653 bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4654 pci_disable_busmaster(dev);
4655 bus_dma_tag_destroy(sc->parent_dmat);
/*
 * mxge_detach: device detach entry point.  Stops the interface under
 * the ifnet serializer, then tears down everything mxge_attach built,
 * in reverse order of acquisition.
 *
 * NOTE(review): excerpt is incomplete (embedded line numbers jump);
 * the close/stop call guarded by IFF_RUNNING is in elided lines.
 */
4661 mxge_detach(device_t dev)
4663 mxge_softc_t *sc = device_get_softc(dev);
/* Quiesce the interface while holding the serializer so no rx/tx
 * path races the teardown. */
4665 lwkt_serialize_enter(sc->ifp->if_serializer);
4667 if (sc->ifp->if_flags & IFF_RUNNING)
4670 * XXX: race: the callout callback could be spinning on
4671 * the serializer and run anyway
4673 callout_stop(&sc->co_hdl);
4674 lwkt_serialize_exit(sc->ifp->if_serializer);
/* Detach from the network stack, quiesce firmware DMA, then free
 * driver resources in reverse order of mxge_attach. */
4676 ether_ifdetach(sc->ifp);
4677 ifmedia_removeall(&sc->media);
4678 mxge_dummy_rdma(sc, 0);
4679 mxge_rem_sysctls(sc);
4681 mxge_free_rings(sc);
4682 mxge_free_slices(sc);
4683 mxge_dma_free(&sc->dmabench_dma);
4684 mxge_dma_free(&sc->zeropad_dma);
4685 mxge_dma_free(&sc->cmd_dma);
4686 bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4687 pci_disable_busmaster(dev);
4688 bus_dma_tag_destroy(sc->parent_dmat);
4693 mxge_shutdown(device_t dev)
4699 This file uses Myri10GE driver indentation.
4702 c-file-style:"linux"