1 /******************************************************************************
3 Copyright (c) 2006-2009, Myricom Inc.
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
9 1. Redistributions of source code must retain the above copyright notice,
10 this list of conditions and the following disclaimer.
12 2. Neither the name of the Myricom Inc, nor the names of its
13 contributors may be used to endorse or promote products derived from
14 this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
28 ***************************************************************************/
30 #include <sys/cdefs.h>
31 /*__FBSDID("$FreeBSD: src/sys/dev/mxge/if_mxge.c,v 1.63 2009/06/26 11:45:06 rwatson Exp $");*/
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/linker.h>
36 #include <sys/firmware.h>
37 #include <sys/endian.h>
38 #include <sys/in_cksum.h>
39 #include <sys/sockio.h>
41 #include <sys/malloc.h>
42 #include <sys/kernel.h>
43 #include <sys/module.h>
44 #include <sys/serialize.h>
45 #include <sys/socket.h>
46 #include <sys/sysctl.h>
48 /* count xmits ourselves, rather than via drbr */
51 #include <net/if_arp.h>
52 #include <net/ifq_var.h>
53 #include <net/ethernet.h>
54 #include <net/if_dl.h>
55 #include <net/if_media.h>
59 #include <net/if_types.h>
60 #include <net/vlan/if_vlan_var.h>
63 #include <netinet/in_systm.h>
64 #include <netinet/in.h>
65 #include <netinet/ip.h>
66 #include <netinet/tcp.h>
71 #include <bus/pci/pcireg.h>
72 #include <bus/pci/pcivar.h>
73 #include <bus/pci/pci_private.h> /* XXX for pci_cfg_restore */
75 #include <vm/vm.h> /* for pmap_mapdev() */
78 #if defined(__i386) || defined(__amd64)
79 #include <machine/specialreg.h>
82 #include <dev/netif/mxge/mxge_mcp.h>
83 #include <dev/netif/mxge/mcp_gen_header.h>
84 /*#define MXGE_FAKE_IFP*/
85 #include <dev/netif/mxge/if_mxge_var.h>
87 #include <sys/buf_ring.h>
93 static int mxge_nvidia_ecrc_enable = 1;
94 static int mxge_force_firmware = 0;
95 static int mxge_intr_coal_delay = 30;
96 static int mxge_deassert_wait = 1;
97 static int mxge_flow_control = 1;
98 static int mxge_verbose = 0;
99 static int mxge_lro_cnt = 8;
100 static int mxge_ticks;
101 static int mxge_max_slices = 1;
102 static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
103 static int mxge_always_promisc = 0;
104 static int mxge_initial_mtu = ETHERMTU_JUMBO;
105 static char *mxge_fw_unaligned = "mxge_ethp_z8e";
106 static char *mxge_fw_aligned = "mxge_eth_z8e";
107 static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
108 static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
110 static int mxge_probe(device_t dev);
111 static int mxge_attach(device_t dev);
112 static int mxge_detach(device_t dev);
113 static int mxge_shutdown(device_t dev);
114 static void mxge_intr(void *arg);
116 static device_method_t mxge_methods[] =
118 /* Device interface */
119 DEVMETHOD(device_probe, mxge_probe),
120 DEVMETHOD(device_attach, mxge_attach),
121 DEVMETHOD(device_detach, mxge_detach),
122 DEVMETHOD(device_shutdown, mxge_shutdown),
126 static driver_t mxge_driver =
130 sizeof(mxge_softc_t),
133 static devclass_t mxge_devclass;
135 /* Declare ourselves to be a child of the PCI bus.*/
136 DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
137 MODULE_DEPEND(mxge, firmware, 1, 1, 1);
138 MODULE_DEPEND(mxge, zlib, 1, 1, 1);
140 static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
141 static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
142 static int mxge_close(mxge_softc_t *sc);
143 static int mxge_open(mxge_softc_t *sc);
144 static void mxge_tick(void *arg);
146 /* XXX: we don't have Large Receive Offload support yet */
148 mxge_lro_rx(struct mxge_slice_state *ss, struct mbuf *m_head, uint32_t csum)
157 mxge_lro_flush(struct mxge_slice_state *ss, struct lro_entry *lro)
164 mxge_probe(device_t dev)
169 if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
170 ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
171 (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
172 rev = pci_get_revid(dev);
174 case MXGE_PCI_REV_Z8E:
175 device_set_desc(dev, "Myri10G-PCIE-8A");
177 case MXGE_PCI_REV_Z8ES:
178 device_set_desc(dev, "Myri10G-PCIE-8B");
181 device_set_desc(dev, "Myri10G-PCIE-8??");
182 device_printf(dev, "Unrecognized rev %d NIC\n",
192 mxge_enable_wc(mxge_softc_t *sc)
195 #if defined(__i386) || defined(__amd64)
200 len = rman_get_size(sc->mem_res);
201 err = pmap_change_attr((vm_offset_t) sc->sram,
202 len, PAT_WRITE_COMBINING);
204 device_printf(sc->dev, "pmap_change_attr failed, %d\n",
210 sc->wc = 0; /* TBD: PAT support */
215 /* callback to get our DMA address */
217 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
221 *(bus_addr_t *) arg = segs->ds_addr;
226 mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
227 bus_size_t alignment)
230 device_t dev = sc->dev;
231 bus_size_t boundary, maxsegsize;
233 if (bytes > 4096 && alignment == 4096) {
241 /* allocate DMAable memory tags */
242 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
243 alignment, /* alignment */
244 boundary, /* boundary */
245 BUS_SPACE_MAXADDR, /* low */
246 BUS_SPACE_MAXADDR, /* high */
247 NULL, NULL, /* filter */
250 maxsegsize, /* maxsegsize */
251 BUS_DMA_COHERENT, /* flags */
252 &dma->dmat); /* tag */
254 device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
258 /* allocate DMAable memory & map */
259 err = bus_dmamem_alloc(dma->dmat, &dma->addr,
260 (BUS_DMA_WAITOK | BUS_DMA_COHERENT
261 | BUS_DMA_ZERO), &dma->map);
263 device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
264 goto abort_with_dmat;
267 /* load the memory */
268 err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
269 mxge_dmamap_callback,
270 (void *)&dma->bus_addr, 0);
272 device_printf(dev, "couldn't load map (err = %d)\n", err);
278 bus_dmamem_free(dma->dmat, dma->addr, dma->map);
280 (void)bus_dma_tag_destroy(dma->dmat);
286 mxge_dma_free(mxge_dma_t *dma)
288 bus_dmamap_unload(dma->dmat, dma->map);
289 bus_dmamem_free(dma->dmat, dma->addr, dma->map);
290 (void)bus_dma_tag_destroy(dma->dmat);
294 * The eeprom strings on the lanaiX have the format
301 mxge_parse_strings(mxge_softc_t *sc)
303 #define MXGE_NEXT_STRING(p) while(ptr < limit && *ptr++)
308 ptr = sc->eeprom_strings;
309 limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
311 while (ptr < limit && *ptr != '\0') {
312 if (memcmp(ptr, "MAC=", 4) == 0) {
314 sc->mac_addr_string = ptr;
315 for (i = 0; i < 6; i++) {
317 if ((ptr + 2) > limit)
319 sc->mac_addr[i] = strtoul(ptr, NULL, 16);
322 } else if (memcmp(ptr, "PC=", 3) == 0) {
324 strncpy(sc->product_code_string, ptr,
325 sizeof (sc->product_code_string) - 1);
326 } else if (memcmp(ptr, "SN=", 3) == 0) {
328 strncpy(sc->serial_number_string, ptr,
329 sizeof (sc->serial_number_string) - 1);
331 MXGE_NEXT_STRING(ptr);
338 device_printf(sc->dev, "failed to parse eeprom_strings\n");
343 #if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
345 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
348 unsigned long base, off;
350 device_t pdev, mcp55;
351 uint16_t vendor_id, device_id, word;
352 uintptr_t bus, slot, func, ivend, idev;
356 if (!mxge_nvidia_ecrc_enable)
359 pdev = device_get_parent(device_get_parent(sc->dev));
361 device_printf(sc->dev, "could not find parent?\n");
364 vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
365 device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
367 if (vendor_id != 0x10de)
372 if (device_id == 0x005d) {
373 /* ck804, base address is magic */
375 } else if (device_id >= 0x0374 && device_id <= 0x378) {
376 /* mcp55, base address stored in chipset */
377 mcp55 = pci_find_bsf(0, 0, 0);
379 0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
380 0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
381 word = pci_read_config(mcp55, 0x90, 2);
382 base = ((unsigned long)word & 0x7ffeU) << 25;
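/* Illustrative only: the base is derived by masking bits 14:1 of
config word 0x90 and shifting left by 25; e.g. a (hypothetical)
word of 0x0070 would give base = 0x70 << 25 = 0xe0000000. */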
389 Test below is commented because it is believed that doing
390 config read/write beyond 0xff will access the config space
391 for the next larger function. Uncomment this and remove
392 the hacky pmap_mapdev() way of accessing config space when
393 FreeBSD grows support for extended pcie config space access
396 /* See if we can, by some miracle, access the extended
398 val = pci_read_config(pdev, 0x178, 4);
399 if (val != 0xffffffff) {
401 pci_write_config(pdev, 0x178, val, 4);
405 /* Rather than using normal pci config space writes, we must
406 * map the Nvidia config space ourselves. This is because on
407 * opteron/nvidia class machine the 0xe000000 mapping is
408 * handled by the nvidia chipset, that means the internal PCI
409 * device (the on-chip northbridge), or the amd-8131 bridge
410 * and things behind them are not visible by this method.
413 BUS_READ_IVAR(device_get_parent(pdev), pdev,
415 BUS_READ_IVAR(device_get_parent(pdev), pdev,
416 PCI_IVAR_SLOT, &slot);
417 BUS_READ_IVAR(device_get_parent(pdev), pdev,
418 PCI_IVAR_FUNCTION, &func);
419 BUS_READ_IVAR(device_get_parent(pdev), pdev,
420 PCI_IVAR_VENDOR, &ivend);
421 BUS_READ_IVAR(device_get_parent(pdev), pdev,
422 PCI_IVAR_DEVICE, &idev);
425 + 0x00100000UL * (unsigned long)bus
426 + 0x00001000UL * (unsigned long)(func
429 /* map it into the kernel */
430 va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
434 device_printf(sc->dev, "pmap_mapdev() failed\n");
437 /* get a pointer to the config space mapped into the kernel */
438 cfgptr = va + (off & PAGE_MASK);
440 /* make sure that we can really access it */
441 vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
442 device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
443 if (! (vendor_id == ivend && device_id == idev)) {
444 device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
445 vendor_id, device_id);
446 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
450 ptr32 = (uint32_t*)(cfgptr + 0x178);
453 if (val == 0xffffffff) {
454 device_printf(sc->dev, "extended mapping failed\n");
455 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
459 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
461 device_printf(sc->dev,
462 "Enabled ECRC on upstream Nvidia bridge "
464 (int)bus, (int)slot, (int)func);
469 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
471 device_printf(sc->dev,
472 "Nforce 4 chipset on non-x86/amd64!?!?!\n");
479 mxge_dma_test(mxge_softc_t *sc, int test_type)
482 bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
488 /* Run a small DMA test.
489 * The magic multipliers to the length tell the firmware
490 * to do DMA read, write, or read+write tests. The
491 * results are returned in cmd.data0. The upper 16
492 * bits of the return is the number of transfers completed.
493 * The lower 16 bits is the time in 0.5us ticks that the
494 * transfers took to complete.
497 len = sc->tx_boundary;
499 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
500 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
501 cmd.data2 = len * 0x10000;
502 status = mxge_send_cmd(sc, test_type, &cmd);
507 sc->read_dma = ((cmd.data0>>16) * len * 2) /
508 (cmd.data0 & 0xffff);
509 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
510 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
511 cmd.data2 = len * 0x1;
512 status = mxge_send_cmd(sc, test_type, &cmd);
517 sc->write_dma = ((cmd.data0>>16) * len * 2) /
518 (cmd.data0 & 0xffff);
520 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
521 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
522 cmd.data2 = len * 0x10001;
523 status = mxge_send_cmd(sc, test_type, &cmd);
528 sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
529 (cmd.data0 & 0xffff);
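/* Worked example of the arithmetic above (numbers hypothetical):
if the firmware returns cmd.data0 = 0x10004000 for the read test,
it completed 0x1000 (4096) transfers of len bytes in 0x4000 (16384)
half-microsecond ticks; with len = 4096 that gives
(4096 * 4096 * 2) / 16384 = 2048 MB/s. */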
532 if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
533 device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
540 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
541 * when the PCI-E Completion packets are aligned on an 8-byte
542 * boundary. Some PCI-E chip sets always align Completion packets; on
543 * the ones that do not, the alignment can be enforced by enabling
544 * ECRC generation (if supported).
546 * When PCI-E Completion packets are not aligned, it is actually more
547 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
549 * If the driver can neither enable ECRC nor verify that it has
550 * already been enabled, then it must use a firmware image which works
551 * around unaligned completion packets (ethp_z8e.dat), and it should
552 * also ensure that it never gives the device a Read-DMA which is
553 * larger than 2KB by setting the tx_boundary to 2KB. If ECRC is
554 * enabled, then the driver should use the aligned (eth_z8e.dat)
555 * firmware image, and set tx_boundary to 4KB.
559 mxge_firmware_probe(mxge_softc_t *sc)
561 device_t dev = sc->dev;
565 sc->tx_boundary = 4096;
567 * Verify the max read request size was set to 4KB
568 * before trying the test with 4KB.
570 if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
571 pectl = pci_read_config(dev, reg + 0x8, 2);
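/* Offset 0x8 past the PCIe capability is the Device Control register;
its Max Read Request Size field (bits 14:12) encodes the size as
128 << value, so the value 5 checked below means 4096 bytes. */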
572 if ((pectl & (5 << 12)) != (5 << 12)) {
573 device_printf(dev, "Max Read Req. size != 4k (0x%x)\n",
575 sc->tx_boundary = 2048;
580 * load the optimized firmware (which assumes aligned PCIe
581 * completions) in order to see if it works on this host.
583 sc->fw_name = mxge_fw_aligned;
584 status = mxge_load_firmware(sc, 1);
590 * Enable ECRC if possible
592 mxge_enable_nvidia_ecrc(sc);
595 * Run a DMA test which watches for unaligned completions and
596 * aborts on the first one seen.
599 status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
601 return 0; /* keep the aligned firmware */
604 device_printf(dev, "DMA test failed: %d\n", status);
605 if (status == ENOSYS)
606 device_printf(dev, "Falling back to ethp! "
607 "Please install up to date fw\n");
612 mxge_select_firmware(mxge_softc_t *sc)
617 if (mxge_force_firmware != 0) {
618 if (mxge_force_firmware == 1)
623 device_printf(sc->dev,
624 "Assuming %s completions (forced)\n",
625 aligned ? "aligned" : "unaligned");
629 /* if the PCIe link width is 4 or less, we can use the aligned
630 firmware and skip any checks */
631 if (sc->link_width != 0 && sc->link_width <= 4) {
632 device_printf(sc->dev,
633 "PCIe x%d Link, expect reduced performance\n",
639 if (0 == mxge_firmware_probe(sc))
644 sc->fw_name = mxge_fw_aligned;
645 sc->tx_boundary = 4096;
647 sc->fw_name = mxge_fw_unaligned;
648 sc->tx_boundary = 2048;
650 return (mxge_load_firmware(sc, 0));
660 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
664 if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
665 device_printf(sc->dev, "Bad firmware type: 0x%x\n",
666 be32toh(hdr->mcp_type));
670 /* save firmware version for sysctl */
671 strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
673 device_printf(sc->dev, "firmware id: %s\n", hdr->version);
675 ksscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
676 &sc->fw_ver_minor, &sc->fw_ver_tiny);
678 if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
679 && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
680 device_printf(sc->dev, "Found firmware version %s\n",
682 device_printf(sc->dev, "Driver needs %d.%d\n",
683 MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
692 z_alloc(void *nil, u_int items, u_int size)
696 ptr = kmalloc(items * size, M_TEMP, M_NOWAIT);
701 z_free(void *nil, void *ptr)
708 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
711 const mcp_gen_header_t *hdr;
718 fw = firmware_image_load(sc->fw_name, NULL);
720 device_printf(sc->dev, "Could not find firmware image %s\n",
725 /* setup zlib and decompress f/w */
726 bzero(&zs, sizeof (zs));
729 status = inflateInit(&zs);
730 if (status != Z_OK) {
735 /* the uncompressed size is stored as the firmware version,
736 which would otherwise go unused */
737 fw_len = (size_t) fw->version;
738 inflate_buffer = kmalloc(fw_len, M_TEMP, M_NOWAIT);
739 if (inflate_buffer == NULL)
741 zs.avail_in = fw->datasize;
742 zs.next_in = __DECONST(char *, fw->data);
743 zs.avail_out = fw_len;
744 zs.next_out = inflate_buffer;
745 status = inflate(&zs, Z_FINISH);
746 if (status != Z_STREAM_END) {
747 device_printf(sc->dev, "zlib %d\n", status);
749 goto abort_with_buffer;
752 fw_len = fw->fw_imglen;
754 hdr_offset = htobe32(*(const uint32_t *)
755 (fw->fw_image + MCP_HEADER_PTR_OFFSET));
756 if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
757 device_printf(sc->dev, "Bad firmware file");
761 hdr = (const void*)(fw->fw_image + hdr_offset);
763 status = mxge_validate_firmware(sc, hdr);
767 /* Copy the inflated firmware to NIC SRAM. */
768 for (i = 0; i < fw_len; i += 256) {
769 mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
771 min(256U, (unsigned)(fw_len - i)));
781 kfree(inflate_buffer, M_TEMP);
786 firmware_image_unload(fw);
791 * Enable or disable periodic RDMAs from the host to make certain
792 * chipsets resend dropped PCIe messages
796 mxge_dummy_rdma(mxge_softc_t *sc, int enable)
799 volatile uint32_t *confirm;
800 volatile char *submit;
801 uint32_t *buf, dma_low, dma_high;
804 buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
806 /* clear confirmation addr */
807 confirm = (volatile uint32_t *)sc->cmd;
811 /* send an rdma command to the PCIe engine, and wait for the
812 response in the confirmation address. The firmware should
813 write a -1 there to indicate it is alive and well
816 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
817 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
818 buf[0] = htobe32(dma_high); /* confirm addr MSW */
819 buf[1] = htobe32(dma_low); /* confirm addr LSW */
820 buf[2] = htobe32(0xffffffff); /* confirm data */
821 dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
822 dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
823 buf[3] = htobe32(dma_high); /* dummy addr MSW */
824 buf[4] = htobe32(dma_low); /* dummy addr LSW */
825 buf[5] = htobe32(enable); /* enable? */
828 submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
830 mxge_pio_copy(submit, buf, 64);
835 while (*confirm != 0xffffffff && i < 20) {
839 if (*confirm != 0xffffffff) {
840 device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
841 (enable ? "enable" : "disable"), confirm,
848 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
851 char buf_bytes[sizeof(*buf) + 8];
852 volatile mcp_cmd_response_t *response = sc->cmd;
853 volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
854 uint32_t dma_low, dma_high;
855 int err, sleep_total = 0;
858 * We may be called during attach, before if_serializer is available.
859 * This is not a fast path, just check for NULL
862 if (sc->ifp->if_serializer)
863 ASSERT_SERIALIZED(sc->ifp->if_serializer);
865 /* ensure buf is aligned to 8 bytes */
866 buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
868 buf->data0 = htobe32(data->data0);
869 buf->data1 = htobe32(data->data1);
870 buf->data2 = htobe32(data->data2);
871 buf->cmd = htobe32(cmd);
872 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
873 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
875 buf->response_addr.low = htobe32(dma_low);
876 buf->response_addr.high = htobe32(dma_high);
879 response->result = 0xffffffff;
881 mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
883 /* wait up to 20ms */
885 for (sleep_total = 0; sleep_total < 20; sleep_total++) {
886 bus_dmamap_sync(sc->cmd_dma.dmat,
887 sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
889 switch (be32toh(response->result)) {
891 data->data0 = be32toh(response->data);
897 case MXGEFW_CMD_UNKNOWN:
900 case MXGEFW_CMD_ERROR_UNALIGNED:
903 case MXGEFW_CMD_ERROR_BUSY:
907 device_printf(sc->dev,
909 "failed, result = %d\n",
910 cmd, be32toh(response->result));
918 device_printf(sc->dev, "mxge: command %d timed out"
920 cmd, be32toh(response->result));
925 mxge_adopt_running_firmware(mxge_softc_t *sc)
927 struct mcp_gen_header *hdr;
928 const size_t bytes = sizeof (struct mcp_gen_header);
932 /* find running firmware header */
933 hdr_offset = htobe32(*(volatile uint32_t *)
934 (sc->sram + MCP_HEADER_PTR_OFFSET));
936 if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
937 device_printf(sc->dev,
938 "Running firmware has bad header offset (%d)\n",
943 /* copy header of running firmware from SRAM to host memory to
944 * validate firmware */
945 hdr = kmalloc(bytes, M_DEVBUF, M_NOWAIT);
947 device_printf(sc->dev, "could not kmalloc firmware hdr\n");
950 bus_space_read_region_1(rman_get_bustag(sc->mem_res),
951 rman_get_bushandle(sc->mem_res),
952 hdr_offset, (char *)hdr, bytes);
953 status = mxge_validate_firmware(sc, hdr);
954 kfree(hdr, M_DEVBUF);
957 * check to see if adopted firmware has bug where adopting
958 * it will cause broadcasts to be filtered unless the NIC
959 * is kept in ALLMULTI mode
961 if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
962 sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
963 sc->adopted_rx_filter_bug = 1;
964 device_printf(sc->dev, "Adopting fw %d.%d.%d: "
965 "working around rx filter bug\n",
966 sc->fw_ver_major, sc->fw_ver_minor,
975 mxge_load_firmware(mxge_softc_t *sc, int adopt)
977 volatile uint32_t *confirm;
978 volatile char *submit;
980 uint32_t *buf, size, dma_low, dma_high;
983 buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
985 size = sc->sram_size;
986 status = mxge_load_firmware_helper(sc, &size);
990 /* Try to use the currently running firmware, if
992 status = mxge_adopt_running_firmware(sc);
994 device_printf(sc->dev,
995 "failed to adopt running firmware\n");
998 device_printf(sc->dev,
999 "Successfully adopted running firmware\n");
1000 if (sc->tx_boundary == 4096) {
1001 device_printf(sc->dev,
1002 "Using firmware currently running on NIC"
1004 device_printf(sc->dev,
1005 "performance consider loading optimized "
1008 sc->fw_name = mxge_fw_unaligned;
1009 sc->tx_boundary = 2048;
1012 /* clear confirmation addr */
1013 confirm = (volatile uint32_t *)sc->cmd;
1016 /* send a reload command to the bootstrap MCP, and wait for the
1017 response in the confirmation address. The firmware should
1018 write a -1 there to indicate it is alive and well
1021 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
1022 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
1024 buf[0] = htobe32(dma_high); /* confirm addr MSW */
1025 buf[1] = htobe32(dma_low); /* confirm addr LSW */
1026 buf[2] = htobe32(0xffffffff); /* confirm data */
1028 /* FIX: All newest firmware should un-protect the bottom of
1029 the sram before handoff. However, the very first interfaces
1030 do not. Therefore the handoff copy must skip the first 8 bytes
1032 /* where the code starts*/
1033 buf[3] = htobe32(MXGE_FW_OFFSET + 8);
1034 buf[4] = htobe32(size - 8); /* length of code */
1035 buf[5] = htobe32(8); /* where to copy to */
1036 buf[6] = htobe32(0); /* where to jump to */
1038 submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
1039 mxge_pio_copy(submit, buf, 64);
1044 while (*confirm != 0xffffffff && i < 20) {
1047 bus_dmamap_sync(sc->cmd_dma.dmat,
1048 sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
1050 if (*confirm != 0xffffffff) {
1051 device_printf(sc->dev,"handoff failed (%p = 0x%x)",
1060 mxge_update_mac_address(mxge_softc_t *sc)
1063 uint8_t *addr = sc->mac_addr;
1067 cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1068 | (addr[2] << 8) | addr[3]);
1070 cmd.data1 = ((addr[4] << 8) | (addr[5]));
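/* The first four octets of the MAC land in data0 and the last two in
data1; e.g. a (hypothetical) address 00:60:dd:47:aa:bb packs as
data0 = 0x0060dd47 and data1 = 0x0000aabb. */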
1072 status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
1077 mxge_change_pause(mxge_softc_t *sc, int pause)
1083 status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1086 status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1090 device_printf(sc->dev, "Failed to set flow control mode\n");
1098 mxge_change_promisc(mxge_softc_t *sc, int promisc)
1103 if( sc->ifp->if_serializer)
1104 ASSERT_SERIALIZED(sc->ifp->if_serializer);
1105 if (mxge_always_promisc)
1109 status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1112 status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1116 device_printf(sc->dev, "Failed to set promisc mode\n");
1121 mxge_set_multicast_list(mxge_softc_t *sc)
1124 struct ifmultiaddr *ifma;
1125 struct ifnet *ifp = sc->ifp;
1128 if (ifp->if_serializer)
1129 ASSERT_SERIALIZED(ifp->if_serializer);
1131 /* This firmware is known to not support multicast */
1132 if (!sc->fw_multicast_support)
1135 /* Disable multicast filtering while we play with the lists*/
1136 err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1138 device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
1139 " error status: %d\n", err);
1143 if (sc->adopted_rx_filter_bug)
1146 if (ifp->if_flags & IFF_ALLMULTI)
1147 /* request to disable multicast filtering, so quit here */
1150 /* Flush all the filters */
1152 err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1154 device_printf(sc->dev,
1155 "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
1156 ", error status: %d\n", err);
1160 /* Walk the multicast list, and add each address */
1162 LIST_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1163 if (ifma->ifma_addr->sa_family != AF_LINK)
1165 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1167 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
1169 cmd.data0 = htonl(cmd.data0);
1170 cmd.data1 = htonl(cmd.data1);
1171 err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1173 device_printf(sc->dev, "Failed "
1174 "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
1176 /* abort, leaving multicast filtering off */
1180 /* Enable multicast filtering */
1181 err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1183 device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
1184 ", error status: %d\n", err);
1189 mxge_max_mtu(mxge_softc_t *sc)
1194 if (MJUMPAGESIZE - MXGEFW_PAD > MXGEFW_MAX_MTU)
1195 return MXGEFW_MAX_MTU - MXGEFW_PAD;
1197 /* try to set nbufs to see if we can
1198 use virtually contiguous jumbos */
1200 status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1203 return MXGEFW_MAX_MTU - MXGEFW_PAD;
1205 /* otherwise, we're limited to MJUMPAGESIZE */
1206 return MJUMPAGESIZE - MXGEFW_PAD;
1210 mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1212 struct mxge_slice_state *ss;
1213 mxge_rx_done_t *rx_done;
1214 volatile uint32_t *irq_claim;
1218 /* try to send a reset command to the card to see if it
1220 memset(&cmd, 0, sizeof (cmd));
1221 status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1223 device_printf(sc->dev, "failed reset\n");
1227 mxge_dummy_rdma(sc, 1);
1230 /* set the intrq size */
1231 cmd.data0 = sc->rx_ring_size;
1232 status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1235 * Even though we already know how many slices are supported
1236 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
1237 * has magic side effects, and must be called after a reset.
1238 * It must be called prior to calling any RSS related cmds,
1239 * including assigning an interrupt queue for anything but
1240 * slice 0. It must also be called *after*
1241 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1242 * the firmware to compute offsets.
1245 if (sc->num_slices > 1) {
1246 /* ask the maximum number of slices it supports */
1247 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
1250 device_printf(sc->dev,
1251 "failed to get number of slices\n");
1255 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1256 * to setting up the interrupt queue DMA
1258 cmd.data0 = sc->num_slices;
1259 cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
1260 #ifdef IFNET_BUF_RING
1261 cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1263 status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
1266 device_printf(sc->dev,
1267 "failed to set number of slices\n");
1273 if (interrupts_setup) {
1274 /* Now exchange information about interrupts */
1275 for (slice = 0; slice < sc->num_slices; slice++) {
1276 rx_done = &sc->ss[slice].rx_done;
1277 memset(rx_done->entry, 0, sc->rx_ring_size);
1278 cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
1279 cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
1281 status |= mxge_send_cmd(sc,
1282 MXGEFW_CMD_SET_INTRQ_DMA,
1287 status |= mxge_send_cmd(sc,
1288 MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
1291 sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1293 status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1294 irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1297 status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
1299 sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1301 device_printf(sc->dev, "failed set interrupt parameters\n");
1306 *sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1309 /* run a DMA benchmark */
1310 (void) mxge_dma_test(sc, MXGEFW_DMA_TEST);
1312 for (slice = 0; slice < sc->num_slices; slice++) {
1313 ss = &sc->ss[slice];
1315 ss->irq_claim = irq_claim + (2 * slice);
1316 /* reset mcp/driver shared state back to 0 */
1317 ss->rx_done.idx = 0;
1318 ss->rx_done.cnt = 0;
1321 ss->tx.pkt_done = 0;
1322 ss->tx.queue_active = 0;
1323 ss->tx.activate = 0;
1324 ss->tx.deactivate = 0;
1329 ss->rx_small.cnt = 0;
1330 ss->lro_bad_csum = 0;
1332 ss->lro_flushed = 0;
1333 if (ss->fw_stats != NULL) {
1334 ss->fw_stats->valid = 0;
1335 ss->fw_stats->send_done_count = 0;
1338 sc->rdma_tags_available = 15;
1339 status = mxge_update_mac_address(sc);
1340 mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
1341 mxge_change_pause(sc, sc->pause);
1342 mxge_set_multicast_list(sc);
1347 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1350 unsigned int intr_coal_delay;
1354 intr_coal_delay = sc->intr_coal_delay;
1355 err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1359 if (intr_coal_delay == sc->intr_coal_delay)
1362 if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1365 lwkt_serialize_enter(sc->ifp->if_serializer);
1366 *sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1367 sc->intr_coal_delay = intr_coal_delay;
1369 lwkt_serialize_exit(sc->ifp->if_serializer);
1374 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1377 unsigned int enabled;
1381 enabled = sc->pause;
1382 err = sysctl_handle_int(oidp, &enabled, arg2, req);
1386 if (enabled == sc->pause)
1389 lwkt_serialize_enter(sc->ifp->if_serializer);
1390 err = mxge_change_pause(sc, enabled);
1391 lwkt_serialize_exit(sc->ifp->if_serializer);
1396 mxge_change_lro_locked(mxge_softc_t *sc, int lro_cnt)
1403 ifp->if_capenable &= ~IFCAP_LRO;
1405 ifp->if_capenable |= IFCAP_LRO;
1406 sc->lro_cnt = lro_cnt;
1407 if (ifp->if_flags & IFF_RUNNING) {
1409 err = mxge_open(sc);
1415 mxge_change_lro(SYSCTL_HANDLER_ARGS)
1418 unsigned int lro_cnt;
1422 lro_cnt = sc->lro_cnt;
1423 err = sysctl_handle_int(oidp, &lro_cnt, arg2, req);
1427 if (lro_cnt == sc->lro_cnt)
1433 lwkt_serialize_enter(sc->ifp->if_serializer);
1434 err = mxge_change_lro_locked(sc, lro_cnt);
1435 lwkt_serialize_exit(sc->ifp->if_serializer);
1440 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1446 arg2 = be32toh(*(int *)arg1);
1448 err = sysctl_handle_int(oidp, arg1, arg2, req);
1454 mxge_rem_sysctls(mxge_softc_t *sc)
1456 struct mxge_slice_state *ss;
1459 if (sc->slice_sysctl_tree == NULL)
1462 for (slice = 0; slice < sc->num_slices; slice++) {
1463 ss = &sc->ss[slice];
1464 if (ss == NULL || ss->sysctl_tree == NULL)
1466 sysctl_ctx_free(&ss->sysctl_ctx);
1467 ss->sysctl_tree = NULL;
1469 sysctl_ctx_free(&sc->slice_sysctl_ctx);
1470 sc->slice_sysctl_tree = NULL;
1471 sysctl_ctx_free(&sc->sysctl_ctx);
1472 sc->sysctl_tree = NULL;
1477 mxge_add_sysctls(mxge_softc_t *sc)
1479 struct sysctl_ctx_list *ctx;
1480 struct sysctl_oid_list *children;
1482 struct mxge_slice_state *ss;
1486 ctx = &sc->sysctl_ctx;
1487 sysctl_ctx_init(ctx);
1488 sc->sysctl_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_STATIC_CHILDREN(_hw),
1490 device_get_nameunit(sc->dev),
1492 if (sc->sysctl_tree == NULL) {
1493 device_printf(sc->dev, "can't add sysctl node\n");
1497 children = SYSCTL_CHILDREN(sc->sysctl_tree);
1498 fw = sc->ss[0].fw_stats;
1500 /* random information */
1501 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1503 CTLFLAG_RD, &sc->fw_version,
1504 0, "firmware version");
1505 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1507 CTLFLAG_RD, &sc->serial_number_string,
1508 0, "serial number");
1509 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1511 CTLFLAG_RD, &sc->product_code_string,
1513 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1515 CTLFLAG_RD, &sc->link_width,
1517 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1519 CTLFLAG_RD, &sc->tx_boundary,
1521 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1523 CTLFLAG_RD, &sc->wc,
1524 0, "write combining PIO?");
1525 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1527 CTLFLAG_RD, &sc->read_dma,
1528 0, "DMA Read speed in MB/s");
1529 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1531 CTLFLAG_RD, &sc->write_dma,
1532 0, "DMA Write speed in MB/s");
1533 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1534 "read_write_dma_MBs",
1535 CTLFLAG_RD, &sc->read_write_dma,
1536 0, "DMA concurrent Read/Write speed in MB/s");
1539 /* performance related tunables */
1540 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1542 CTLTYPE_INT|CTLFLAG_RW, sc,
1543 0, mxge_change_intr_coal,
1544 "I", "interrupt coalescing delay in usecs");
1546 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1547 "flow_control_enabled",
1548 CTLTYPE_INT|CTLFLAG_RW, sc,
1549 0, mxge_change_flow_control,
1550 "I", "interrupt coalescing delay in usecs");
1552 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1554 CTLFLAG_RW, &mxge_deassert_wait,
1555 0, "Wait for IRQ line to go low in ihandler");
1557 /* stats block from firmware is in network byte order.
1559 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1561 CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1562 0, mxge_handle_be32,
1564 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1565 "rdma_tags_available",
1566 CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1567 0, mxge_handle_be32,
1568 "I", "rdma_tags_available");
1569 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1570 "dropped_bad_crc32",
1571 CTLTYPE_INT|CTLFLAG_RD,
1572 &fw->dropped_bad_crc32,
1573 0, mxge_handle_be32,
1574 "I", "dropped_bad_crc32");
1575 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1577 CTLTYPE_INT|CTLFLAG_RD,
1578 &fw->dropped_bad_phy,
1579 0, mxge_handle_be32,
1580 "I", "dropped_bad_phy");
1581 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1582 "dropped_link_error_or_filtered",
1583 CTLTYPE_INT|CTLFLAG_RD,
1584 &fw->dropped_link_error_or_filtered,
1585 0, mxge_handle_be32,
1586 "I", "dropped_link_error_or_filtered");
1587 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1588 "dropped_link_overflow",
1589 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1590 0, mxge_handle_be32,
1591 "I", "dropped_link_overflow");
1592 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1593 "dropped_multicast_filtered",
1594 CTLTYPE_INT|CTLFLAG_RD,
1595 &fw->dropped_multicast_filtered,
1596 0, mxge_handle_be32,
1597 "I", "dropped_multicast_filtered");
1598 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1599 "dropped_no_big_buffer",
1600 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1601 0, mxge_handle_be32,
1602 "I", "dropped_no_big_buffer");
1603 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1604 "dropped_no_small_buffer",
1605 CTLTYPE_INT|CTLFLAG_RD,
1606 &fw->dropped_no_small_buffer,
1607 0, mxge_handle_be32,
1608 "I", "dropped_no_small_buffer");
1609 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1611 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1612 0, mxge_handle_be32,
1613 "I", "dropped_overrun");
1614 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1616 CTLTYPE_INT|CTLFLAG_RD,
1618 0, mxge_handle_be32,
1619 "I", "dropped_pause");
1620 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1622 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1623 0, mxge_handle_be32,
1624 "I", "dropped_runt");
1626 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1627 "dropped_unicast_filtered",
1628 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
1629 0, mxge_handle_be32,
1630 "I", "dropped_unicast_filtered");
1632 /* verbose printing? */
1633 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1635 CTLFLAG_RW, &mxge_verbose,
1636 0, "verbose printing");
1639 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1641 CTLTYPE_INT|CTLFLAG_RW, sc,
1643 "I", "number of lro merge queues");
1646 /* add counters exported for debugging from all slices */
1647 sysctl_ctx_init(&sc->slice_sysctl_ctx);
1648 sc->slice_sysctl_tree =
1649 SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1650 "slice", CTLFLAG_RD, 0, "");
1652 for (slice = 0; slice < sc->num_slices; slice++) {
1653 ss = &sc->ss[slice];
1654 sysctl_ctx_init(&ss->sysctl_ctx);
1655 ctx = &ss->sysctl_ctx;
1656 children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1657 ksprintf(slice_num, "%d", slice);
1659 SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1661 children = SYSCTL_CHILDREN(ss->sysctl_tree);
1662 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1664 CTLFLAG_RD, &ss->rx_small.cnt,
1666 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1668 CTLFLAG_RD, &ss->rx_big.cnt,
1670 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1671 "lro_flushed", CTLFLAG_RD, &ss->lro_flushed,
1672 0, "number of lro merge queues flushed");
1674 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1675 "lro_queued", CTLFLAG_RD, &ss->lro_queued,
1676 0, "number of frames appended to lro merge"
1679 #ifndef IFNET_BUF_RING
1680 /* only transmit from slice 0 for now */
1684 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1686 CTLFLAG_RD, &ss->tx.req,
1689 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1691 CTLFLAG_RD, &ss->tx.done,
1693 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1695 CTLFLAG_RD, &ss->tx.pkt_done,
1697 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1699 CTLFLAG_RD, &ss->tx.stall,
1701 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1703 CTLFLAG_RD, &ss->tx.wake,
1705 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1707 CTLFLAG_RD, &ss->tx.defrag,
1709 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1711 CTLFLAG_RD, &ss->tx.queue_active,
1712 0, "tx_queue_active");
1713 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1715 CTLFLAG_RD, &ss->tx.activate,
1717 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1719 CTLFLAG_RD, &ss->tx.deactivate,
1720 0, "tx_deactivate");
1724 /* copy an array of mcp_kreq_ether_send_t's to the mcp. Copy
1725 backwards one at a time and handle ring wraps */
1728 mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1729 mcp_kreq_ether_send_t *src, int cnt)
1731 int idx, starting_slot;
1732 starting_slot = tx->req;
1735 idx = (starting_slot + cnt) & tx->mask;
1736 mxge_pio_copy(&tx->lanai[idx],
1737 &src[cnt], sizeof(*src));
1743 * copy an array of mcp_kreq_ether_send_t's to the mcp. Copy
1744 * at most 32 bytes at a time, so as to avoid involving the software
1745 * pio handler in the nic. We re-write the first segment's flags
1746 * to mark them valid only after writing the entire chain
1750 mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
1755 volatile uint32_t *dst_ints;
1756 mcp_kreq_ether_send_t *srcp;
1757 volatile mcp_kreq_ether_send_t *dstp, *dst;
1760 idx = tx->req & tx->mask;
1762 last_flags = src->flags;
1765 dst = dstp = &tx->lanai[idx];
1768 if ((idx + cnt) < tx->mask) {
1769 for (i = 0; i < (cnt - 1); i += 2) {
1770 mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1771 wmb(); /* force write every 32 bytes */
1776 /* submit all but the first request, and ensure
1777 that it is submitted below */
1778 mxge_submit_req_backwards(tx, src, cnt);
1782 /* submit the first request */
1783 mxge_pio_copy(dstp, srcp, sizeof(*src));
1784 wmb(); /* barrier before setting valid flag */
1787 /* re-write the last 32-bits with the valid flags */
1788 src->flags = last_flags;
1789 src_ints = (uint32_t *)src;
1791 dst_ints = (volatile uint32_t *)dst;
1793 *dst_ints = *src_ints;
1801 mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1802 int busdma_seg_cnt, int ip_off)
1805 mcp_kreq_ether_send_t *req;
1806 bus_dma_segment_t *seg;
1809 uint32_t low, high_swapped;
1810 int len, seglen, cum_len, cum_len_next;
1811 int next_is_first, chop, cnt, rdma_count, small;
1812 uint16_t pseudo_hdr_offset, cksum_offset, mss;
1813 uint8_t flags, flags_next;
1816 mss = m->m_pkthdr.tso_segsz;
1818 /* negative cum_len signifies to the
1819 * send loop that we are still in the
1820 * header portion of the TSO packet.
1823 /* ensure we have the ethernet, IP and TCP
1824 header together in the first mbuf, copy
1825 it to a scratch buffer if not */
1826 if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
1827 m_copydata(m, 0, ip_off + sizeof (*ip),
1829 ip = (struct ip *)(ss->scratch + ip_off);
1831 ip = (struct ip *)(mtod(m, char *) + ip_off);
1833 if (__predict_false(m->m_len < ip_off + (ip->ip_hl << 2)
1835 m_copydata(m, 0, ip_off + (ip->ip_hl << 2)
1836 + sizeof (*tcp), ss->scratch);
1837 ip = (struct ip *)(mtod(m, char *) + ip_off);
1840 tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
1841 cum_len = -(ip_off + ((ip->ip_hl + tcp->th_off) << 2));
1843 /* TSO implies checksum offload on this hardware */
1844 cksum_offset = ip_off + (ip->ip_hl << 2);
1845 flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1848 /* for TSO, pseudo_hdr_offset holds mss.
1849 * The firmware figures out where to put
1850 * the checksum by parsing the header. */
1851 pseudo_hdr_offset = htobe16(mss);
1858 /* "rdma_count" is the number of RDMAs belonging to the
1859 * current packet BEFORE the current send request. For
1860 * non-TSO packets, this is equal to "count".
1861 * For TSO packets, rdma_count needs to be reset
1862 * to 0 after a segment cut.
1864 * The rdma_count field of the send request is
1865 * the number of RDMAs of the packet starting at
1866 * that request. For TSO send requests with one or more cuts
1867 * in the middle, this is the number of RDMAs starting
1868 * after the last cut in the request. All previous
1869 * segments before the last cut implicitly have 1 RDMA.
1871 * Since the number of RDMAs is not known beforehand,
1872 * it must be filled-in retroactively - after each
1873 * segmentation cut or at the end of the entire packet.
1876 while (busdma_seg_cnt) {
1877 /* Break the busdma segment up into pieces*/
1878 low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1879 high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1883 flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1885 cum_len_next = cum_len + seglen;
1886 (req-rdma_count)->rdma_count = rdma_count + 1;
1887 if (__predict_true(cum_len >= 0)) {
1889 chop = (cum_len_next > mss);
1890 cum_len_next = cum_len_next % mss;
1891 next_is_first = (cum_len_next == 0);
1892 flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1893 flags_next |= next_is_first *
1895 rdma_count |= -(chop | next_is_first);
1896 rdma_count += chop & !next_is_first;
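/* Branch-free bookkeeping: crossing an mss boundary (chop) or ending
exactly on one (next_is_first) forces rdma_count to -1 so counting
restarts after the cut (bumped back to 0 when the cut falls inside
this descriptor); otherwise rdma_count keeps accumulating and is
written back retroactively via (req - rdma_count)->rdma_count. */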
1897 } else if (cum_len_next >= 0) {
1902 small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1903 flags_next = MXGEFW_FLAGS_TSO_PLD |
1904 MXGEFW_FLAGS_FIRST |
1905 (small * MXGEFW_FLAGS_SMALL);
1908 req->addr_high = high_swapped;
1909 req->addr_low = htobe32(low);
1910 req->pseudo_hdr_offset = pseudo_hdr_offset;
1912 req->rdma_count = 1;
1913 req->length = htobe16(seglen);
1914 req->cksum_offset = cksum_offset;
1915 req->flags = flags | ((cum_len & 1) *
1916 MXGEFW_FLAGS_ALIGN_ODD);
1919 cum_len = cum_len_next;
1924 if (__predict_false(cksum_offset > seglen))
1925 cksum_offset -= seglen;
1928 if (__predict_false(cnt > tx->max_desc))
1934 (req-rdma_count)->rdma_count = rdma_count;
1938 req->flags |= MXGEFW_FLAGS_TSO_LAST;
1939 } while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1941 tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1942 mxge_submit_req(tx, tx->req_list, cnt);
1943 #ifdef IFNET_BUF_RING
1944 if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1945 /* tell the NIC to start polling this slice */
1947 tx->queue_active = 1;
1955 bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1959 kprintf("tx->max_desc exceeded via TSO!\n");
1960 kprintf("mss = %d, %ld, %d!\n", mss,
1961 (long)seg - (long)tx->seg_list, tx->max_desc);
1968 #endif /* IFCAP_TSO4 */
1970 #ifdef MXGE_NEW_VLAN_API
1972 * We reproduce the software vlan tag insertion from
1973 * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
1974 * vlan tag insertion. We need to advertise this in order to have the
1975 * vlan interface respect our csum offload flags.
1977 static struct mbuf *
1978 mxge_vlan_tag_insert(struct mbuf *m)
1980 struct ether_vlan_header *evl;
1982 M_PREPEND(m, EVL_ENCAPLEN, MB_DONTWAIT);
1983 if (__predict_false(m == NULL))
1985 if (m->m_len < sizeof(*evl)) {
1986 m = m_pullup(m, sizeof(*evl));
1987 if (__predict_false(m == NULL))
1991 * Transform the Ethernet header into an Ethernet header
1992 * with 802.1Q encapsulation.
1994 evl = mtod(m, struct ether_vlan_header *);
1995 bcopy((char *)evl + EVL_ENCAPLEN,
1996 (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
1997 evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
1998 evl->evl_tag = htons(m->m_pkthdr.ether_vlantag);
1999 m->m_flags &= ~M_VLANTAG;
2002 #endif /* MXGE_NEW_VLAN_API */
2005 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
2008 mcp_kreq_ether_send_t *req;
2009 bus_dma_segment_t *seg;
2014 int cnt, cum_len, err, i, idx, odd_flag, ip_off;
2015 uint16_t pseudo_hdr_offset;
2016 uint8_t flags, cksum_offset;
2023 ip_off = sizeof (struct ether_header);
2024 #ifdef MXGE_NEW_VLAN_API
2025 if (m->m_flags & M_VLANTAG) {
2026 m = mxge_vlan_tag_insert(m);
2027 if (__predict_false(m == NULL))
2029 ip_off += EVL_ENCAPLEN;
2032 /* (try to) map the frame for DMA */
2033 idx = tx->req & tx->mask;
2034 err = bus_dmamap_load_mbuf_segment(tx->dmat, tx->info[idx].map,
2035 m, tx->seg_list, 1, &cnt,
2037 if (__predict_false(err == EFBIG)) {
2038 /* Too many segments in the chain. Try
2040 m_tmp = m_defrag(m, M_NOWAIT);
2041 if (m_tmp == NULL) {
2046 err = bus_dmamap_load_mbuf_segment(tx->dmat,
2048 m, tx->seg_list, 1, &cnt,
2051 if (__predict_false(err != 0)) {
2052 device_printf(sc->dev, "bus_dmamap_load_mbuf_segment returned %d,"
2053 " packet len = %d\n", err, m->m_pkthdr.len);
2056 bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2057 BUS_DMASYNC_PREWRITE);
2058 tx->info[idx].m = m;
2061 /* TSO is different enough, we handle it in another routine */
2062 if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2063 mxge_encap_tso(ss, m, cnt, ip_off);
2070 pseudo_hdr_offset = 0;
2071 flags = MXGEFW_FLAGS_NO_TSO;
2073 /* checksum offloading? */
2074 if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
2075 /* ensure ip header is in first mbuf, copy
2076 it to a scratch buffer if not */
2077 if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
2078 m_copydata(m, 0, ip_off + sizeof (*ip),
2080 ip = (struct ip *)(ss->scratch + ip_off);
2082 ip = (struct ip *)(mtod(m, char *) + ip_off);
2084 cksum_offset = ip_off + (ip->ip_hl << 2);
2085 pseudo_hdr_offset = cksum_offset + m->m_pkthdr.csum_data;
2086 pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2087 req->cksum_offset = cksum_offset;
2088 flags |= MXGEFW_FLAGS_CKSUM;
2089 odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2093 if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2094 flags |= MXGEFW_FLAGS_SMALL;
2096 /* convert segments into a request list */
2099 req->flags = MXGEFW_FLAGS_FIRST;
2100 for (i = 0; i < cnt; i++) {
2102 htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2104 htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2105 req->length = htobe16(seg->ds_len);
2106 req->cksum_offset = cksum_offset;
2107 if (cksum_offset > seg->ds_len)
2108 cksum_offset -= seg->ds_len;
2111 req->pseudo_hdr_offset = pseudo_hdr_offset;
2112 req->pad = 0; /* complete solid 16-byte block */
2113 req->rdma_count = 1;
2114 req->flags |= flags | ((cum_len & 1) * odd_flag);
2115 cum_len += seg->ds_len;
2121 /* pad runts to 60 bytes */
2125 htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2127 htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2128 req->length = htobe16(60 - cum_len);
2129 req->cksum_offset = 0;
2130 req->pseudo_hdr_offset = pseudo_hdr_offset;
2131 req->pad = 0; /* complete solid 16-byte block */
2132 req->rdma_count = 1;
2133 req->flags |= flags | ((cum_len & 1) * odd_flag);
2137 tx->req_list[0].rdma_count = cnt;
2139 /* print what the firmware will see */
2140 for (i = 0; i < cnt; i++) {
2141 kprintf("%d: addr: 0x%x 0x%x len:%d pso%d,"
2142 "cso:%d, flags:0x%x, rdma:%d\n",
2143 i, (int)ntohl(tx->req_list[i].addr_high),
2144 (int)ntohl(tx->req_list[i].addr_low),
2145 (int)ntohs(tx->req_list[i].length),
2146 (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2147 tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2148 tx->req_list[i].rdma_count);
2150 kprintf("--------------\n");
2152 tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2153 mxge_submit_req(tx, tx->req_list, cnt);
2154 #ifdef IFNET_BUF_RING
2155 if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2156 /* tell the NIC to start polling this slice */
2158 tx->queue_active = 1;
2171 #ifdef IFNET_BUF_RING
2173 mxge_qflush(struct ifnet *ifp)
2175 mxge_softc_t *sc = ifp->if_softc;
2180 for (slice = 0; slice < sc->num_slices; slice++) {
2181 tx = &sc->ss[slice].tx;
2182 lwkt_serialize_enter(sc->ifp->if_serializer);
2183 while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2185 lwkt_serialize_exit(sc->ifp->if_serializer);
2191 mxge_start_locked(struct mxge_slice_state *ss)
2202 while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2203 m = drbr_dequeue(ifp, tx->br);
2207 /* let BPF see it */
2210 /* give it to the nic */
2213 /* ran out of transmit slots */
2214 if (((ss->if_flags & IFF_OACTIVE) == 0)
2215 && (!drbr_empty(ifp, tx->br))) {
2216 ss->if_flags |= IFF_OACTIVE;
2222 mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2233 if ((ss->if_flags & (IFF_RUNNING|IFF_OACTIVE)) !=
2235 err = drbr_enqueue(ifp, tx->br, m);
2239 if (drbr_empty(ifp, tx->br) &&
2240 ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2241 /* let BPF see it */
2243 /* give it to the nic */
2245 } else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2248 if (!drbr_empty(ifp, tx->br))
2249 mxge_start_locked(ss);
2254 mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2256 mxge_softc_t *sc = ifp->if_softc;
2257 struct mxge_slice_state *ss;
2263 slice = m->m_pkthdr.flowid;
2265 slice &= (sc->num_slices - 1); /* num_slices always power of 2 */
2267 ss = &sc->ss[slice];
2270 if(lwkt_serialize_try(ifp->if_serializer)) {
2271 err = mxge_transmit_locked(ss, m);
2272 lwkt_serialize_exit(ifp->if_serializer);
2274 err = drbr_enqueue(ifp, tx->br, m);
2283 mxge_start_locked(struct mxge_slice_state *ss)
2293 while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2294 m = ifq_dequeue(&ifp->if_snd, NULL);
2298 /* let BPF see it */
2301 /* give it to the nic */
2304 /* ran out of transmit slots */
2305 if ((sc->ifp->if_flags & IFF_OACTIVE) == 0) {
2306 sc->ifp->if_flags |= IFF_OACTIVE;
2312 mxge_start(struct ifnet *ifp)
2314 mxge_softc_t *sc = ifp->if_softc;
2315 struct mxge_slice_state *ss;
2317 ASSERT_SERIALIZED(sc->ifp->if_serializer);
2318 /* only use the first slice for now */
2320 mxge_start_locked(ss);
2324 * copy an array of mcp_kreq_ether_recv_t's to the mcp. Copy
2325 * at most 32 bytes at a time, so as to avoid involving the software
2326 * pio handler in the nic. We re-write the first segment's low
2327 * DMA address to mark it valid only after we write the entire chunk
2331 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2332 mcp_kreq_ether_recv_t *src)
2336 low = src->addr_low;
2337 src->addr_low = 0xffffffff;
2338 mxge_pio_copy(dst, src, 4 * sizeof (*src));
2340 mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2342 src->addr_low = low;
2343 dst->addr_low = low;
2348 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2350 bus_dma_segment_t seg;
2352 mxge_rx_ring_t *rx = &ss->rx_small;
2355 m = m_gethdr(MB_DONTWAIT, MT_DATA);
2362 err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2363 &seg, 1, &cnt, BUS_DMA_NOWAIT);
2368 rx->info[idx].m = m;
2369 rx->shadow[idx].addr_low =
2370 htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2371 rx->shadow[idx].addr_high =
2372 htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2376 mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2382 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2384 bus_dma_segment_t seg[3];
2386 mxge_rx_ring_t *rx = &ss->rx_big;
2389 if (rx->cl_size == MCLBYTES)
2390 m = m_getcl(MB_DONTWAIT, MT_DATA, M_PKTHDR);
2393 m = m_getjcl(MB_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2396 * XXX: allocate normal sized buffers for big buffers.
2397 * We should be fine as long as we don't get any jumbo frames
2399 m = m_getcl(MB_DONTWAIT, MT_DATA, M_PKTHDR);
2407 m->m_len = rx->mlen;
2408 err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2409 seg, 1, &cnt, BUS_DMA_NOWAIT);
2414 rx->info[idx].m = m;
2415 rx->shadow[idx].addr_low =
2416 htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2417 rx->shadow[idx].addr_high =
2418 htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2420 #if MXGE_VIRT_JUMBOS
2421 for (i = 1; i < cnt; i++) {
2422 rx->shadow[idx + i].addr_low =
2423 htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2424 rx->shadow[idx + i].addr_high =
2425 htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2430 for (i = 0; i < rx->nbufs; i++) {
2431 if ((idx & 7) == 7) {
2432 mxge_submit_8rx(&rx->lanai[idx - 7],
2433 &rx->shadow[idx - 7]);
2441 * Myri10GE hardware checksums are not valid if the sender
2442 * padded the frame with non-zero padding. This is because
2443 * the firmware just does a simple 16-bit 1s complement
2444 * checksum across the entire frame, excluding the first 14
2445 * bytes. It is best to simply check the checksum and
2446 * tell the stack about it only if the checksum is good
2449 static inline uint16_t
2450 mxge_rx_csum(struct mbuf *m, int csum)
2452 struct ether_header *eh;
2456 eh = mtod(m, struct ether_header *);
2458 /* only deal with IPv4 TCP & UDP for now */
2459 if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2461 ip = (struct ip *)(eh + 1);
2462 if (__predict_false(ip->ip_p != IPPROTO_TCP &&
2463 ip->ip_p != IPPROTO_UDP))
2466 c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2467 htonl(ntohs(csum) + ntohs(ip->ip_len) +
2468 - (ip->ip_hl << 2) + ip->ip_p));
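/* The hardware csum is a plain ones-complement sum of everything past
the Ethernet header; folding in the IPv4 pseudo-header with in_pseudo()
is what lets the receive paths treat a zero return from this routine
as a verified TCP/UDP checksum. */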
2477 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2479 struct ether_vlan_header *evl;
2480 struct ether_header *eh;
2483 evl = mtod(m, struct ether_vlan_header *);
2484 eh = mtod(m, struct ether_header *);
2487 * fix checksum by subtracting EVL_ENCAPLEN bytes
2488 * after what the firmware thought was the end of the ethernet
2492 /* put checksum into host byte order */
2493 *csum = ntohs(*csum);
2494 partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2495 (*csum) += ~partial;
2496 (*csum) += ((*csum) < ~partial);
2497 (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2498 (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
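/* Ones-complement adjustment: the firmware's sum included the 4-byte
802.1Q tag, so it is backed out by adding ~partial, propagating the
carry, and folding the 32-bit accumulator back into 16 bits twice. */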
2500 /* restore checksum to network byte order;
2501 later consumers expect this */
2502 *csum = htons(*csum);
2505 #ifdef MXGE_NEW_VLAN_API
2506 m->m_pkthdr.ether_vlantag = ntohs(evl->evl_tag);
2510 mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2514 VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2515 m_tag_prepend(m, mtag);
2519 m->m_flags |= M_VLANTAG;
2522 * Remove the 802.1q header by copying the Ethernet
2523 * addresses over it and adjusting the beginning of
2524 * the data in the mbuf. The encapsulated Ethernet
2525 * type field is already in place.
2527 bcopy((char *)evl, (char *)evl + EVL_ENCAPLEN,
2528 ETHER_HDR_LEN - ETHER_TYPE_LEN);
2529 m_adj(m, EVL_ENCAPLEN);
2534 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2539 struct ether_header *eh;
2541 bus_dmamap_t old_map;
2543 uint16_t tcpudp_csum;
2548 idx = rx->cnt & rx->mask;
2549 rx->cnt += rx->nbufs;
2550 /* save a pointer to the received mbuf */
2551 m = rx->info[idx].m;
2552 /* try to replace the received mbuf */
2553 if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2554 /* drop the frame -- the old mbuf is re-cycled */
2559 /* unmap the received buffer */
2560 old_map = rx->info[idx].map;
2561 bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2562 bus_dmamap_unload(rx->dmat, old_map);
2564 /* swap the bus_dmamap_t's */
2565 rx->info[idx].map = rx->extra_map;
2566 rx->extra_map = old_map;
2568 /* mcp implicitly skips 1st 2 bytes so that packet is properly
2570 m->m_data += MXGEFW_PAD;
2572 m->m_pkthdr.rcvif = ifp;
2573 m->m_len = m->m_pkthdr.len = len;
2575 eh = mtod(m, struct ether_header *);
2576 if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2577 mxge_vlan_tag_remove(m, &csum);
2579 /* if the checksum is valid, mark it in the mbuf header */
2580 if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2581 if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2583 /* otherwise, it was a UDP frame, or a TCP frame which
2584 we could not do LRO on. Tell the stack that the checksum is good */
2586 m->m_pkthdr.csum_data = 0xffff;
2587 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2590 /* flowid only valid if RSS hashing is enabled */
2591 if (sc->num_slices > 1) {
2592 m->m_pkthdr.flowid = (ss - sc->ss);
2593 m->m_flags |= M_FLOWID;
2596 /* pass the frame up the stack */
2597 (*ifp->if_input)(ifp, m);
2601 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2605 struct ether_header *eh;
2608 bus_dmamap_t old_map;
2610 uint16_t tcpudp_csum;
2615 idx = rx->cnt & rx->mask;
2617 /* save a pointer to the received mbuf */
2618 m = rx->info[idx].m;
2619 /* try to replace the received mbuf */
2620 if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2621 /* drop the frame -- the old mbuf is re-cycled */
2626 /* unmap the received buffer */
2627 old_map = rx->info[idx].map;
2628 bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2629 bus_dmamap_unload(rx->dmat, old_map);
2631 /* swap the bus_dmamap_t's */
2632 rx->info[idx].map = rx->extra_map;
2633 rx->extra_map = old_map;
2635 /* mcp implicitly skips 1st 2 bytes so that packet is properly aligned */
2637 m->m_data += MXGEFW_PAD;
2639 m->m_pkthdr.rcvif = ifp;
2640 m->m_len = m->m_pkthdr.len = len;
2642 eh = mtod(m, struct ether_header *);
2643 if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2644 mxge_vlan_tag_remove(m, &csum);
2646 /* if the checksum is valid, mark it in the mbuf header */
2647 if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2648 if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2650 /* otherwise, it was a UDP frame, or a TCP frame which
2651 we could not do LRO on. Tell the stack that the checksum is good */
2653 m->m_pkthdr.csum_data = 0xffff;
2654 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2657 /* flowid only valid if RSS hashing is enabled */
2658 if (sc->num_slices > 1) {
2659 m->m_pkthdr.flowid = (ss - sc->ss);
2660 m->m_flags |= M_FLOWID;
2663 /* pass the frame up the stack */
2664 (*ifp->if_input)(ifp, m);
2668 mxge_clean_rx_done(struct mxge_slice_state *ss)
2670 mxge_rx_done_t *rx_done = &ss->rx_done;
2676 while (rx_done->entry[rx_done->idx].length != 0) {
2677 length = ntohs(rx_done->entry[rx_done->idx].length);
2678 rx_done->entry[rx_done->idx].length = 0;
2679 checksum = rx_done->entry[rx_done->idx].checksum;
2680 if (length <= (MHLEN - MXGEFW_PAD))
2681 mxge_rx_done_small(ss, length, checksum);
2683 mxge_rx_done_big(ss, length, checksum);
2685 rx_done->idx = rx_done->cnt & rx_done->mask;
2687 /* limit potential for livelock */
2688 if (__predict_false(++limit > rx_done->mask / 2))
2692 while (!SLIST_EMPTY(&ss->lro_active)) {
2693 struct lro_entry *lro = SLIST_FIRST(&ss->lro_active);
2694 SLIST_REMOVE_HEAD(&ss->lro_active, next);
2695 mxge_lro_flush(ss, lro);
2702 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2713 ASSERT_SERIALIZED(ifp->if_serializer);
2714 while (tx->pkt_done != mcp_idx) {
2715 idx = tx->done & tx->mask;
2717 m = tx->info[idx].m;
2718 /* mbuf and DMA map only attached to the first
2721 ss->obytes += m->m_pkthdr.len;
2722 if (m->m_flags & M_MCAST)
2725 tx->info[idx].m = NULL;
2726 map = tx->info[idx].map;
2727 bus_dmamap_unload(tx->dmat, map);
2730 if (tx->info[idx].flag) {
2731 tx->info[idx].flag = 0;
2736 /* If we have space, clear IFF_OACTIVE to tell the stack that
2737 it's OK to send packets */
2738 #ifdef IFNET_BUF_RING
2739 flags = &ss->if_flags;
2741 flags = &ifp->if_flags;
2743 if ((*flags) & IFF_OACTIVE &&
2744 tx->req - tx->done < (tx->mask + 1)/4) {
2745 *(flags) &= ~IFF_OACTIVE;
2747 mxge_start_locked(ss);
2749 #ifdef IFNET_BUF_RING
2750 if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2751 /* let the NIC stop polling this queue, since there
2752 * are no more transmits pending */
2753 if (tx->req == tx->done) {
2755 tx->queue_active = 0;
2764 static struct mxge_media_type mxge_xfp_media_types[] =
2766 {IFM_10G_CX4, 0x7f, "10GBASE-CX4 (module)"},
2767 {IFM_10G_SR, (1 << 7), "10GBASE-SR"},
2768 {IFM_10G_LR, (1 << 6), "10GBASE-LR"},
2769 {0, (1 << 5), "10GBASE-ER"},
2770 {IFM_10G_LRM, (1 << 4), "10GBASE-LRM"},
2771 {0, (1 << 3), "10GBASE-SW"},
2772 {0, (1 << 2), "10GBASE-LW"},
2773 {0, (1 << 1), "10GBASE-EW"},
2774 {0, (1 << 0), "Reserved"}
2776 static struct mxge_media_type mxge_sfp_media_types[] =
2778 {0, (1 << 7), "Reserved"},
2779 {IFM_10G_LRM, (1 << 6), "10GBASE-LRM"},
2780 {IFM_10G_LR, (1 << 5), "10GBASE-LR"},
2781 {IFM_10G_SR, (1 << 4), "10GBASE-SR"}
2785 mxge_set_media(mxge_softc_t *sc, int type)
2787 sc->media_flags |= type;
2788 ifmedia_add(&sc->media, sc->media_flags, 0, NULL);
2789 ifmedia_set(&sc->media, sc->media_flags);
2794 * Determine the media type for a NIC. Some XFPs will identify
2795 * themselves only when their link is up, so this is initiated via a
2796 * link up interrupt. However, this can potentially take up to
2797 * several milliseconds, so it is run via the watchdog routine, rather
2798 * than in the interrupt handler itself. This need only be done
2799 * once, not each time the link is up.
2802 mxge_media_probe(mxge_softc_t *sc)
2807 struct mxge_media_type *mxge_media_types = NULL;
2808 int i, err, ms, mxge_media_type_entries;
2811 sc->need_media_probe = 0;
2813 /* if we've already set a media type, we're done */
2814 if (sc->media_flags != (IFM_ETHER | IFM_AUTO))
2818  * parse the product code to determine the interface type
2819 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2820 * after the 3rd dash in the driver's cached copy of the
2821 * EEPROM's product code string.
2823 ptr = sc->product_code_string;
2825 device_printf(sc->dev, "Missing product code\n");
2828 for (i = 0; i < 3; i++, ptr++) {
2829 ptr = index(ptr, '-');
2831 device_printf(sc->dev,
2832 "only %d dashes in PC?!?\n", i);
2838 mxge_set_media(sc, IFM_10G_CX4);
2841 else if (*ptr == 'Q') {
2842 /* -Q is Quad Ribbon Fiber */
2843 device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2844 /* FreeBSD has no media type for Quad ribbon fiber */
2850 mxge_media_types = mxge_xfp_media_types;
2851 mxge_media_type_entries =
2852 sizeof (mxge_xfp_media_types) /
2853 sizeof (mxge_xfp_media_types[0]);
2854 byte = MXGE_XFP_COMPLIANCE_BYTE;
2858 if (*ptr == 'S' || *(ptr +1) == 'S') {
2859 /* -S or -2S is SFP+ */
2860 mxge_media_types = mxge_sfp_media_types;
2861 mxge_media_type_entries =
2862 sizeof (mxge_sfp_media_types) /
2863 sizeof (mxge_sfp_media_types[0]);
2868 if (mxge_media_types == NULL) {
2869 device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2874 * At this point we know the NIC has an XFP cage, so now we
2875 * try to determine what is in the cage by using the
2876  * firmware's XFP I2C commands to read the XFP 10GbE compliance
2877 * register. We read just one byte, which may take over
2881 cmd.data0 = 0; /* just fetch 1 byte, not all 256 */
2883 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2884 if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
2885 device_printf(sc->dev, "failed to read XFP\n");
2887 if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
2888 device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
2890 if (err != MXGEFW_CMD_OK) {
2894 /* now we wait for the data to be cached */
2896 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2897 for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
2900 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2902 if (err != MXGEFW_CMD_OK) {
2903 device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
2904 cage_type, err, ms);
2908 if (cmd.data0 == mxge_media_types[0].bitmask) {
2910 device_printf(sc->dev, "%s:%s\n", cage_type,
2911 mxge_media_types[0].name);
2912 mxge_set_media(sc, IFM_10G_CX4);
2915 for (i = 1; i < mxge_media_type_entries; i++) {
2916 if (cmd.data0 & mxge_media_types[i].bitmask) {
2918 device_printf(sc->dev, "%s:%s\n",
2920 mxge_media_types[i].name);
2922 mxge_set_media(sc, mxge_media_types[i].flag);
2926 device_printf(sc->dev, "%s media 0x%x unknown\n", cage_type,
2933 mxge_intr(void *arg)
2935 struct mxge_slice_state *ss = arg;
2936 mxge_softc_t *sc = ss->sc;
2937 mcp_irq_data_t *stats = ss->fw_stats;
2938 mxge_tx_ring_t *tx = &ss->tx;
2939 mxge_rx_done_t *rx_done = &ss->rx_done;
2940 uint32_t send_done_count;
2944 #ifndef IFNET_BUF_RING
2945 /* an interrupt on a non-zero slice is implicitly valid
2946 since MSI-X irqs are not shared */
2948 mxge_clean_rx_done(ss);
2949 *ss->irq_claim = be32toh(3);
2954 /* make sure the DMA has finished */
2955 if (!stats->valid) {
2958 valid = stats->valid;
2960 if (sc->legacy_irq) {
2961 /* lower legacy IRQ */
2962 *sc->irq_deassert = 0;
2963 if (!mxge_deassert_wait)
2964 /* don't wait for confirmation that irq is low */
2970 /* loop while waiting for legacy irq deassertion */
2972 /* check for transmit completes and receives */
2973 send_done_count = be32toh(stats->send_done_count);
2974 while ((send_done_count != tx->pkt_done) ||
2975 (rx_done->entry[rx_done->idx].length != 0)) {
2976 if (send_done_count != tx->pkt_done)
2977 mxge_tx_done(ss, (int)send_done_count);
2978 mxge_clean_rx_done(ss);
2979 send_done_count = be32toh(stats->send_done_count);
2981 if (sc->legacy_irq && mxge_deassert_wait)
2983 } while (*((volatile uint8_t *) &stats->valid));
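/*
 * stats->valid is written by the NIC via DMA; reading it through a
 * volatile pointer keeps the compiler from caching the value while we
 * spin waiting for the legacy interrupt line to be deasserted.
 */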
2985 /* fw link & error stats meaningful only on the first slice */
2986 if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
2987 if (sc->link_state != stats->link_up) {
2988 sc->link_state = stats->link_up;
2989 if (sc->link_state) {
2990 sc->ifp->if_link_state = LINK_STATE_UP;
2991 if_link_state_change(sc->ifp);
2993 device_printf(sc->dev, "link up\n");
2995 sc->ifp->if_link_state = LINK_STATE_DOWN;
2996 if_link_state_change(sc->ifp);
2998 device_printf(sc->dev, "link down\n");
3000 sc->need_media_probe = 1;
3002 if (sc->rdma_tags_available !=
3003 be32toh(stats->rdma_tags_available)) {
3004 sc->rdma_tags_available =
3005 be32toh(stats->rdma_tags_available);
3006 device_printf(sc->dev, "RDMA timed out! %d tags "
3007 "left\n", sc->rdma_tags_available);
3010 if (stats->link_down) {
3011 sc->down_cnt += stats->link_down;
3013 sc->ifp->if_link_state = LINK_STATE_DOWN;
3014 if_link_state_change(sc->ifp);
3018 /* check to see if we have an rx token to pass back */
3020 *ss->irq_claim = be32toh(3);
3021 *(ss->irq_claim + 1) = be32toh(3);
3025 mxge_init(void *arg)
3032 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
3034 struct lro_entry *lro_entry;
3037 while (!SLIST_EMPTY(&ss->lro_free)) {
3038 lro_entry = SLIST_FIRST(&ss->lro_free);
3039 SLIST_REMOVE_HEAD(&ss->lro_free, next);
3040 kfree(lro_entry, M_DEVBUF);
3043 for (i = 0; i <= ss->rx_big.mask; i++) {
3044 if (ss->rx_big.info[i].m == NULL)
3046 bus_dmamap_unload(ss->rx_big.dmat,
3047 ss->rx_big.info[i].map);
3048 m_freem(ss->rx_big.info[i].m);
3049 ss->rx_big.info[i].m = NULL;
3052 for (i = 0; i <= ss->rx_small.mask; i++) {
3053 if (ss->rx_small.info[i].m == NULL)
3055 bus_dmamap_unload(ss->rx_small.dmat,
3056 ss->rx_small.info[i].map);
3057 m_freem(ss->rx_small.info[i].m);
3058 ss->rx_small.info[i].m = NULL;
3061 /* transmit ring used only on the first slice */
3062 if (ss->tx.info == NULL)
3065 for (i = 0; i <= ss->tx.mask; i++) {
3066 ss->tx.info[i].flag = 0;
3067 if (ss->tx.info[i].m == NULL)
3069 bus_dmamap_unload(ss->tx.dmat,
3070 ss->tx.info[i].map);
3071 m_freem(ss->tx.info[i].m);
3072 ss->tx.info[i].m = NULL;
3077 mxge_free_mbufs(mxge_softc_t *sc)
3081 for (slice = 0; slice < sc->num_slices; slice++)
3082 mxge_free_slice_mbufs(&sc->ss[slice]);
3086 mxge_free_slice_rings(struct mxge_slice_state *ss)
3091 if (ss->rx_done.entry != NULL)
3092 mxge_dma_free(&ss->rx_done.dma);
3093 ss->rx_done.entry = NULL;
3095 if (ss->tx.req_bytes != NULL)
3096 kfree(ss->tx.req_bytes, M_DEVBUF);
3097 ss->tx.req_bytes = NULL;
3099 if (ss->tx.seg_list != NULL)
3100 kfree(ss->tx.seg_list, M_DEVBUF);
3101 ss->tx.seg_list = NULL;
3103 if (ss->rx_small.shadow != NULL)
3104 kfree(ss->rx_small.shadow, M_DEVBUF);
3105 ss->rx_small.shadow = NULL;
3107 if (ss->rx_big.shadow != NULL)
3108 kfree(ss->rx_big.shadow, M_DEVBUF);
3109 ss->rx_big.shadow = NULL;
3111 if (ss->tx.info != NULL) {
3112 if (ss->tx.dmat != NULL) {
3113 for (i = 0; i <= ss->tx.mask; i++) {
3114 bus_dmamap_destroy(ss->tx.dmat,
3115 ss->tx.info[i].map);
3117 bus_dma_tag_destroy(ss->tx.dmat);
3119 kfree(ss->tx.info, M_DEVBUF);
3123 if (ss->rx_small.info != NULL) {
3124 if (ss->rx_small.dmat != NULL) {
3125 for (i = 0; i <= ss->rx_small.mask; i++) {
3126 bus_dmamap_destroy(ss->rx_small.dmat,
3127 ss->rx_small.info[i].map);
3129 bus_dmamap_destroy(ss->rx_small.dmat,
3130 ss->rx_small.extra_map);
3131 bus_dma_tag_destroy(ss->rx_small.dmat);
3133 kfree(ss->rx_small.info, M_DEVBUF);
3135 ss->rx_small.info = NULL;
3137 if (ss->rx_big.info != NULL) {
3138 if (ss->rx_big.dmat != NULL) {
3139 for (i = 0; i <= ss->rx_big.mask; i++) {
3140 bus_dmamap_destroy(ss->rx_big.dmat,
3141 ss->rx_big.info[i].map);
3143 bus_dmamap_destroy(ss->rx_big.dmat,
3144 ss->rx_big.extra_map);
3145 bus_dma_tag_destroy(ss->rx_big.dmat);
3147 kfree(ss->rx_big.info, M_DEVBUF);
3149 ss->rx_big.info = NULL;
3153 mxge_free_rings(mxge_softc_t *sc)
3157 for (slice = 0; slice < sc->num_slices; slice++)
3158 mxge_free_slice_rings(&sc->ss[slice]);
3162 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3163 int tx_ring_entries)
3165 mxge_softc_t *sc = ss->sc;
3171 /* allocate per-slice receive resources */
3173 ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
3174 ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3176 /* allocate the rx shadow rings */
3177 bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3178 ss->rx_small.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3179 if (ss->rx_small.shadow == NULL)
3182 bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3183 ss->rx_big.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3184 if (ss->rx_big.shadow == NULL)
3187 /* allocate the rx host info rings */
3188 bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3189 ss->rx_small.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3190 if (ss->rx_small.info == NULL)
3193 bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3194 ss->rx_big.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3195 if (ss->rx_big.info == NULL)
3198 /* allocate the rx busdma resources */
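/*
 * The small ring tag below uses MHLEN-sized segments (plain mbufs);
 * frames of at most MHLEN - MXGEFW_PAD bytes are delivered on the
 * small ring (see mxge_clean_rx_done()), everything larger goes
 * through the big ring's cluster-sized buffers.
 */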
3199 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3201 4096, /* boundary */
3202 BUS_SPACE_MAXADDR, /* low */
3203 BUS_SPACE_MAXADDR, /* high */
3204 NULL, NULL, /* filter */
3205 MHLEN, /* maxsize */
3207 MHLEN, /* maxsegsize */
3208 BUS_DMA_ALLOCNOW, /* flags */
3209 &ss->rx_small.dmat); /* tag */
3211 device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3216 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3218 #if MXGE_VIRT_JUMBOS
3219 4096, /* boundary */
3223 BUS_SPACE_MAXADDR, /* low */
3224 BUS_SPACE_MAXADDR, /* high */
3225 NULL, NULL, /* filter */
3226 3*4096, /* maxsize */
3227 #if MXGE_VIRT_JUMBOS
3229 4096, /* maxsegsize*/
3232 MJUM9BYTES, /* maxsegsize*/
3234 BUS_DMA_ALLOCNOW, /* flags */
3235 &ss->rx_big.dmat); /* tag */
3237 device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3241 for (i = 0; i <= ss->rx_small.mask; i++) {
3242 err = bus_dmamap_create(ss->rx_small.dmat, 0,
3243 &ss->rx_small.info[i].map);
3245 device_printf(sc->dev, "Err %d rx_small dmamap\n",
3250 err = bus_dmamap_create(ss->rx_small.dmat, 0,
3251 &ss->rx_small.extra_map);
3253 device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3258 for (i = 0; i <= ss->rx_big.mask; i++) {
3259 err = bus_dmamap_create(ss->rx_big.dmat, 0,
3260 &ss->rx_big.info[i].map);
3262 device_printf(sc->dev, "Err %d rx_big dmamap\n",
3267 err = bus_dmamap_create(ss->rx_big.dmat, 0,
3268 &ss->rx_big.extra_map);
3270 device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3275 /* now allocate TX resources */
3277 #ifndef IFNET_BUF_RING
3278 /* only use a single TX ring for now */
3279 if (ss != ss->sc->ss)
3283 ss->tx.mask = tx_ring_entries - 1;
3284 ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3287 /* allocate the tx request copy block */
3289 sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3290 ss->tx.req_bytes = kmalloc(bytes, M_DEVBUF, M_WAITOK);
3291 if (ss->tx.req_bytes == NULL)
3293 /* ensure req_list entries are aligned to 8 bytes */
3294 ss->tx.req_list = (mcp_kreq_ether_send_t *)
3295 ((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
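/*
 * (addr + 7) & ~7 rounds the pointer up to the next 8-byte boundary;
 * req_bytes was allocated with a few entries of slack above, so the
 * aligned req_list still fits inside the allocation.
 */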
3297 /* allocate the tx busdma segment list */
3298 bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3299 ss->tx.seg_list = (bus_dma_segment_t *)
3300 kmalloc(bytes, M_DEVBUF, M_WAITOK);
3301 if (ss->tx.seg_list == NULL)
3304 /* allocate the tx host info ring */
3305 bytes = tx_ring_entries * sizeof (*ss->tx.info);
3306 ss->tx.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3307 if (ss->tx.info == NULL)
3310 /* allocate the tx busdma resources */
3311 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3313 sc->tx_boundary, /* boundary */
3314 BUS_SPACE_MAXADDR, /* low */
3315 BUS_SPACE_MAXADDR, /* high */
3316 NULL, NULL, /* filter */
3317 65536 + 256, /* maxsize */
3318 ss->tx.max_desc - 2, /* num segs */
3319 sc->tx_boundary, /* maxsegsz */
3320 BUS_DMA_ALLOCNOW, /* flags */
3321 &ss->tx.dmat); /* tag */
3324 device_printf(sc->dev, "Err %d allocating tx dmat\n",
3329 /* now use these tags to setup dmamaps for each slot in the ring */
3331 for (i = 0; i <= ss->tx.mask; i++) {
3332 err = bus_dmamap_create(ss->tx.dmat, 0,
3333 &ss->tx.info[i].map);
3335 device_printf(sc->dev, "Err %d tx dmamap\n",
3345 mxge_alloc_rings(mxge_softc_t *sc)
3349 int tx_ring_entries, rx_ring_entries;
3352 /* get ring sizes */
3353 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3354 tx_ring_size = cmd.data0;
3356 device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3360 tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3361 rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3362 ifq_set_maxlen(&sc->ifp->if_snd, tx_ring_entries - 1);
3363 ifq_set_ready(&sc->ifp->if_snd);
3365 for (slice = 0; slice < sc->num_slices; slice++) {
3366 err = mxge_alloc_slice_rings(&sc->ss[slice],
3375 mxge_free_rings(sc);
3382 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3384 int bufsize = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN + MXGEFW_PAD;
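/*
 * A received frame can be as large as the MTU plus the Ethernet
 * header, an optional 802.1q tag and the 2-byte firmware pad, so that
 * total decides which cluster size the big receive buffers need.
 */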
3386 if (bufsize < MCLBYTES) {
3387 /* easy, everything fits in a single buffer */
3388 *big_buf_size = MCLBYTES;
3389 *cl_size = MCLBYTES;
3394 if (bufsize < MJUMPAGESIZE) {
3395 /* still easy, everything still fits in a single buffer */
3396 *big_buf_size = MJUMPAGESIZE;
3397 *cl_size = MJUMPAGESIZE;
3401 #if MXGE_VIRT_JUMBOS
3402 /* now we need to use virtually contiguous buffers */
3403 *cl_size = MJUM9BYTES;
3404 *big_buf_size = 4096;
3405 *nbufs = mtu / 4096 + 1;
3406 /* needs to be a power of two, so round up */
3410 *cl_size = MJUM9BYTES;
3411 *big_buf_size = MJUM9BYTES;
3417 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3422 struct lro_entry *lro_entry;
3427 slice = ss - sc->ss;
3429 SLIST_INIT(&ss->lro_free);
3430 SLIST_INIT(&ss->lro_active);
3432 for (i = 0; i < sc->lro_cnt; i++) {
3433 lro_entry = (struct lro_entry *)
3434 kmalloc(sizeof (*lro_entry), M_DEVBUF,
3436 if (lro_entry == NULL) {
3440 SLIST_INSERT_HEAD(&ss->lro_free, lro_entry, next);
3442 /* get the lanai pointers to the send and receive rings */
3445 #ifndef IFNET_BUF_RING
3446 /* We currently only send from the first slice */
3450 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3452 (volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3453 ss->tx.send_go = (volatile uint32_t *)
3454 (sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3455 ss->tx.send_stop = (volatile uint32_t *)
3456 (sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3457 #ifndef IFNET_BUF_RING
3461 err |= mxge_send_cmd(sc,
3462 MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3463 ss->rx_small.lanai =
3464 (volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3466 err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3468 (volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3471 device_printf(sc->dev,
3472 "failed to get ring sizes or locations\n");
3476 /* stock receive rings */
3477 for (i = 0; i <= ss->rx_small.mask; i++) {
3478 map = ss->rx_small.info[i].map;
3479 err = mxge_get_buf_small(ss, map, i);
3481 device_printf(sc->dev, "alloced %d/%d smalls\n",
3482 i, ss->rx_small.mask + 1);
3486 for (i = 0; i <= ss->rx_big.mask; i++) {
3487 ss->rx_big.shadow[i].addr_low = 0xffffffff;
3488 ss->rx_big.shadow[i].addr_high = 0xffffffff;
3490 ss->rx_big.nbufs = nbufs;
3491 ss->rx_big.cl_size = cl_size;
3492 ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3493 EVL_ENCAPLEN + MXGEFW_PAD;
3494 for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3495 map = ss->rx_big.info[i].map;
3496 err = mxge_get_buf_big(ss, map, i);
3498 device_printf(sc->dev, "alloced %d/%d bigs\n",
3499 i, ss->rx_big.mask + 1);
3507 mxge_open(mxge_softc_t *sc)
3510 int err, big_bytes, nbufs, slice, cl_size, i;
3512 volatile uint8_t *itable;
3513 struct mxge_slice_state *ss;
3515 ASSERT_SERIALIZED(sc->ifp->if_serializer);
3516 /* Copy the MAC address in case it was overridden */
3517 bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3519 err = mxge_reset(sc, 1);
3521 device_printf(sc->dev, "failed to reset\n");
3525 if (sc->num_slices > 1) {
3526 /* setup the indirection table */
3527 cmd.data0 = sc->num_slices;
3528 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3531 err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3534 device_printf(sc->dev,
3535 "failed to setup rss tables\n");
3539 /* just enable an identity mapping */
3540 itable = sc->sram + cmd.data0;
3541 for (i = 0; i < sc->num_slices; i++)
3542 itable[i] = (uint8_t)i;
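/*
 * The indirection table maps RSS hash buckets to slice numbers; with
 * an identity mapping, bucket i simply lands on slice i, spreading the
 * hash space evenly across the receive queues.
 */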
3545 cmd.data1 = mxge_rss_hash_type;
3546 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3548 device_printf(sc->dev, "failed to enable slices\n");
3554 mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3557 err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3559 /* error is only meaningful if we're trying to set
3560 MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3561 if (err && nbufs > 1) {
3562 device_printf(sc->dev,
3563 "Failed to set alway-use-n to %d\n",
3567 /* Give the firmware the mtu and the big and small buffer
3568 sizes. The firmware wants the big buf size to be a power
3569 of two. Luckily, FreeBSD's clusters are powers of two */
3570 cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
3571 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3572 cmd.data0 = MHLEN - MXGEFW_PAD;
3573 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3575 cmd.data0 = big_bytes;
3576 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3579 device_printf(sc->dev, "failed to setup params\n");
3583 /* Now give the firmware the pointer to the stats block */
3585 #ifdef IFNET_BUF_RING
3586 slice < sc->num_slices;
3591 ss = &sc->ss[slice];
3593 MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3595 MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3596 cmd.data2 = sizeof(struct mcp_irq_data);
3597 cmd.data2 |= (slice << 16);
3598 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3602 bus = sc->ss->fw_stats_dma.bus_addr;
3603 bus += offsetof(struct mcp_irq_data, send_done_count);
3604 cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3605 cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3606 err = mxge_send_cmd(sc,
3607 MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3609 /* Firmware cannot support multicast without STATS_DMA_V2 */
3610 sc->fw_multicast_support = 0;
3612 sc->fw_multicast_support = 1;
3616 device_printf(sc->dev, "failed to setup params\n");
3620 for (slice = 0; slice < sc->num_slices; slice++) {
3621 err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3623 device_printf(sc->dev, "couldn't open slice %d\n",
3629 /* Finally, start the firmware running */
3630 err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3632 device_printf(sc->dev, "Couldn't bring up link\n");
3635 #ifdef IFNET_BUF_RING
3636 for (slice = 0; slice < sc->num_slices; slice++) {
3637 ss = &sc->ss[slice];
3638 ss->if_flags |= IFF_RUNNING;
3639 ss->if_flags &= ~IFF_OACTIVE;
3642 sc->ifp->if_flags |= IFF_RUNNING;
3643 sc->ifp->if_flags &= ~IFF_OACTIVE;
3644 callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3650 mxge_free_mbufs(sc);
3656 mxge_close(mxge_softc_t *sc)
3659 int err, old_down_cnt;
3660 #ifdef IFNET_BUF_RING
3661 struct mxge_slice_state *ss;
3665 ASSERT_SERIALIZED(sc->ifp->if_serializer);
3666 callout_stop(&sc->co_hdl);
3667 #ifdef IFNET_BUF_RING
3668 for (slice = 0; slice < sc->num_slices; slice++) {
3669 ss = &sc->ss[slice];
3670 ss->if_flags &= ~IFF_RUNNING;
3673 sc->ifp->if_flags &= ~IFF_RUNNING;
3674 old_down_cnt = sc->down_cnt;
3676 err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3678 device_printf(sc->dev, "Couldn't bring down link\n");
3680 if (old_down_cnt == sc->down_cnt) {
3681 /* wait for down irq */
3682 DELAY(10 * sc->intr_coal_delay);
3685 if (old_down_cnt == sc->down_cnt) {
3686 device_printf(sc->dev, "never got down irq\n");
3689 mxge_free_mbufs(sc);
3695 mxge_setup_cfg_space(mxge_softc_t *sc)
3697 device_t dev = sc->dev;
3699 uint16_t cmd, lnk, pectl;
3701 /* find the PCIe link width and set max read request to 4KB */
3702 if (pci_find_extcap(dev, PCIY_EXPRESS, ®) == 0) {
3703 lnk = pci_read_config(dev, reg + 0x12, 2);
3704 sc->link_width = (lnk >> 4) & 0x3f;
3706 pectl = pci_read_config(dev, reg + 0x8, 2);
3707 pectl = (pectl & ~0x7000) | (5 << 12);
3708 pci_write_config(dev, reg + 0x8, pectl, 2);
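/*
 * Per the PCIe spec: offset 0x12 into the capability is the Link
 * Status register (negotiated link width in bits 9:4), and offset 0x8
 * is Device Control, whose Max_Read_Request_Size field (bits 14:12)
 * is set to 5, i.e. 4096-byte reads.
 */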
3711 /* Enable DMA and Memory space access */
3712 pci_enable_busmaster(dev);
3713 cmd = pci_read_config(dev, PCIR_COMMAND, 2);
3714 cmd |= PCIM_CMD_MEMEN;
3715 pci_write_config(dev, PCIR_COMMAND, cmd, 2);
3719 mxge_read_reboot(mxge_softc_t *sc)
3721 device_t dev = sc->dev;
3724 /* find the vendor specific offset */
3725 if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
3726 device_printf(sc->dev,
3727 "could not find vendor specific offset\n");
3728 return (uint32_t)-1;
3730 /* enable read32 mode */
3731 pci_write_config(dev, vs + 0x10, 0x3, 1);
3732 /* tell NIC which register to read */
3733 pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3734 return (pci_read_config(dev, vs + 0x14, 4));
3738 mxge_watchdog_reset(mxge_softc_t *sc, int slice)
3740 struct pci_devinfo *dinfo;
3748 device_printf(sc->dev, "Watchdog reset!\n");
3751 * check to see if the NIC rebooted. If it did, then all of
3752 * PCI config space has been reset, and things like the
3753 * busmaster bit will be zero. If this is the case, then we
3754 * must restore PCI config space before the NIC can be used
3757 cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3758 if (cmd == 0xffff) {
3760 * maybe the watchdog caught the NIC rebooting; wait
3761 * up to 100ms for it to finish. If it does not come
3762 * back, then give up
3765 cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3766 if (cmd == 0xffff) {
3767 device_printf(sc->dev, "NIC disappeared!\n");
3771 if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3772 /* print the reboot status */
3773 reboot = mxge_read_reboot(sc);
3774 device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3776 /* restore PCI configuration space */
3777 dinfo = device_get_ivars(sc->dev);
3778 pci_cfg_restore(sc->dev, dinfo);
3780 /* and redo any changes we made to our config space */
3781 mxge_setup_cfg_space(sc);
3783 if (sc->ifp->if_flags & IFF_RUNNING) {
3785 err = mxge_open(sc);
3788 tx = &sc->ss[slice].tx;
3789 device_printf(sc->dev,
3790 "NIC did not reboot, slice %d ring state:\n",
3792 device_printf(sc->dev,
3793 "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3794 tx->req, tx->done, tx->queue_active);
3795 device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3796 tx->activate, tx->deactivate);
3797 device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3799 be32toh(sc->ss->fw_stats->send_done_count));
3800 device_printf(sc->dev, "not resetting\n");
3806 mxge_watchdog(mxge_softc_t *sc)
3809 uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3812 /* see if we have outstanding transmits, which
3813 have been pending for more than mxge_ticks */
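/*
 * A slice is considered hung when it still has requests outstanding
 * (tx->req != tx->done) but has made no progress since the previous
 * watchdog tick; pause frames from the link partner are checked first
 * so that flow-control blocking does not trigger a needless reset.
 */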
3815 #ifdef IFNET_BUF_RING
3816 (i < sc->num_slices) && (err == 0);
3818 (i < 1) && (err == 0);
3822 if (tx->req != tx->done &&
3823 tx->watchdog_req != tx->watchdog_done &&
3824 tx->done == tx->watchdog_done) {
3825 /* check for pause blocking before resetting */
3826 if (tx->watchdog_rx_pause == rx_pause)
3827 err = mxge_watchdog_reset(sc, i);
3829 device_printf(sc->dev, "Flow control blocking "
3830 "xmits, check link partner\n");
3833 tx->watchdog_req = tx->req;
3834 tx->watchdog_done = tx->done;
3835 tx->watchdog_rx_pause = rx_pause;
3838 if (sc->need_media_probe)
3839 mxge_media_probe(sc);
3844 mxge_update_stats(mxge_softc_t *sc)
3846 struct mxge_slice_state *ss;
3847 u_long ipackets = 0;
3848 u_long opackets = 0;
3849 #ifdef IFNET_BUF_RING
3857 for (slice = 0; slice < sc->num_slices; slice++) {
3858 ss = &sc->ss[slice];
3859 ipackets += ss->ipackets;
3860 opackets += ss->opackets;
3861 #ifdef IFNET_BUF_RING
3862 obytes += ss->obytes;
3863 omcasts += ss->omcasts;
3864 odrops += ss->tx.br->br_drops;
3866 oerrors += ss->oerrors;
3868 sc->ifp->if_ipackets = ipackets;
3869 sc->ifp->if_opackets = opackets;
3870 #ifdef IFNET_BUF_RING
3871 sc->ifp->if_obytes = obytes;
3872 sc->ifp->if_omcasts = omcasts;
3873 sc->ifp->if_snd.ifq_drops = odrops;
3875 sc->ifp->if_oerrors = oerrors;
3879 mxge_tick(void *arg)
3881 mxge_softc_t *sc = arg;
3884 lwkt_serialize_enter(sc->ifp->if_serializer);
3885 /* aggregate stats from different slices */
3886 mxge_update_stats(sc);
3887 if (!sc->watchdog_countdown) {
3888 err = mxge_watchdog(sc);
3889 sc->watchdog_countdown = 4;
3891 sc->watchdog_countdown--;
3893 callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3894 lwkt_serialize_exit(sc->ifp->if_serializer);
3898 mxge_media_change(struct ifnet *ifp)
3904 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3906 struct ifnet *ifp = sc->ifp;
3907 int real_mtu, old_mtu;
3910 if (ifp->if_serializer)
3911 ASSERT_SERIALIZED(ifp->if_serializer);
3913 real_mtu = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
3914 if ((real_mtu > sc->max_mtu) || real_mtu < 60)
3916 old_mtu = ifp->if_mtu;
3918 if (ifp->if_flags & IFF_RUNNING) {
3920 err = mxge_open(sc);
3922 ifp->if_mtu = old_mtu;
3924 (void) mxge_open(sc);
3931 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3933 mxge_softc_t *sc = ifp->if_softc;
3938 ifmr->ifm_status = IFM_AVALID;
3939 ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
3940 ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
3941 ifmr->ifm_active |= sc->link_state ? IFM_FDX : 0;
3945 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data, struct ucred *cr)
3947 mxge_softc_t *sc = ifp->if_softc;
3948 struct ifreq *ifr = (struct ifreq *)data;
3953 ASSERT_SERIALIZED(ifp->if_serializer);
3957 err = ether_ioctl(ifp, command, data);
3961 err = mxge_change_mtu(sc, ifr->ifr_mtu);
3968 if (ifp->if_flags & IFF_UP) {
3969 if (!(ifp->if_flags & IFF_RUNNING)) {
3970 err = mxge_open(sc);
3972 /* take care of promisc and allmulti flag changes */
3974 mxge_change_promisc(sc,
3975 ifp->if_flags & IFF_PROMISC);
3976 mxge_set_multicast_list(sc);
3979 if (ifp->if_flags & IFF_RUNNING) {
3987 mxge_set_multicast_list(sc);
3991 mask = ifr->ifr_reqcap ^ ifp->if_capenable;
3992 if (mask & IFCAP_TXCSUM) {
3993 if (IFCAP_TXCSUM & ifp->if_capenable) {
3994 ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
3995 ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
3998 ifp->if_capenable |= IFCAP_TXCSUM;
3999 ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
4001 } else if (mask & IFCAP_RXCSUM) {
4002 if (IFCAP_RXCSUM & ifp->if_capenable) {
4003 ifp->if_capenable &= ~IFCAP_RXCSUM;
4006 ifp->if_capenable |= IFCAP_RXCSUM;
4010 if (mask & IFCAP_TSO4) {
4011 if (IFCAP_TSO4 & ifp->if_capenable) {
4012 ifp->if_capenable &= ~IFCAP_TSO4;
4013 ifp->if_hwassist &= ~CSUM_TSO;
4014 } else if (IFCAP_TXCSUM & ifp->if_capenable) {
4015 ifp->if_capenable |= IFCAP_TSO4;
4016 ifp->if_hwassist |= CSUM_TSO;
4018 kprintf("mxge requires tx checksum offload"
4019 " be enabled to use TSO\n");
4023 if (mask & IFCAP_LRO) {
4024 if (IFCAP_LRO & ifp->if_capenable)
4025 err = mxge_change_lro_locked(sc, 0);
4027 err = mxge_change_lro_locked(sc, mxge_lro_cnt);
4029 if (mask & IFCAP_VLAN_HWTAGGING)
4030 ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
4031 VLAN_CAPABILITIES(ifp);
4036 err = ifmedia_ioctl(ifp, (struct ifreq *)data,
4037 &sc->media, command);
4047 mxge_fetch_tunables(mxge_softc_t *sc)
4050 TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4051 TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4052 &mxge_flow_control);
4053 TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4054 &mxge_intr_coal_delay);
4055 TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4056 &mxge_nvidia_ecrc_enable);
4057 TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4058 &mxge_force_firmware);
4059 TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4060 &mxge_deassert_wait);
4061 TUNABLE_INT_FETCH("hw.mxge.verbose",
4063 TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4064 TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
4065 TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
4066 TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4067 TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4068 if (sc->lro_cnt != 0)
4069 mxge_lro_cnt = sc->lro_cnt;
4073 if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4074 mxge_intr_coal_delay = 30;
4075 if (mxge_ticks == 0)
4076 mxge_ticks = hz / 2;
4077 sc->pause = mxge_flow_control;
4078 if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4079 || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4080 mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
4082 if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4083 mxge_initial_mtu < ETHER_MIN_LEN)
4084 mxge_initial_mtu = ETHERMTU_JUMBO;
4089 mxge_free_slices(mxge_softc_t *sc)
4091 struct mxge_slice_state *ss;
4098 for (i = 0; i < sc->num_slices; i++) {
4100 if (ss->fw_stats != NULL) {
4101 mxge_dma_free(&ss->fw_stats_dma);
4102 ss->fw_stats = NULL;
4103 #ifdef IFNET_BUF_RING
4104 if (ss->tx.br != NULL) {
4105 drbr_free(ss->tx.br, M_DEVBUF);
4110 if (ss->rx_done.entry != NULL) {
4111 mxge_dma_free(&ss->rx_done.dma);
4112 ss->rx_done.entry = NULL;
4115 kfree(sc->ss, M_DEVBUF);
4120 mxge_alloc_slices(mxge_softc_t *sc)
4123 struct mxge_slice_state *ss;
4125 int err, i, max_intr_slots;
4127 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4129 device_printf(sc->dev, "Cannot determine rx ring size\n");
4132 sc->rx_ring_size = cmd.data0;
4133 max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
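/*
 * Each slice has two receive rings (small and big) and every posted
 * buffer can produce one completion, so the interrupt queue is sized
 * at twice the receive ring length.
 */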
4135 bytes = sizeof (*sc->ss) * sc->num_slices;
4136 sc->ss = kmalloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4139 for (i = 0; i < sc->num_slices; i++) {
4144 /* allocate per-slice rx interrupt queues */
4146 bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4147 err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4150 ss->rx_done.entry = ss->rx_done.dma.addr;
4151 bzero(ss->rx_done.entry, bytes);
4154 * allocate the per-slice firmware stats; stats
4155  * (including tx) are used only on the first
4158 #ifndef IFNET_BUF_RING
4163 bytes = sizeof (*ss->fw_stats);
4164 err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4165 sizeof (*ss->fw_stats), 64);
4168 ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4169 #ifdef IFNET_BUF_RING
4170 ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4178 mxge_free_slices(sc);
4183 mxge_slice_probe(mxge_softc_t *sc)
4187 int msix_cnt, status, max_intr_slots;
4191  * don't enable multiple slices if they have been disabled by the
4192  * tunable, or if this is not an SMP system
4195 if (mxge_max_slices == 0 || mxge_max_slices == 1 || ncpus < 2)
4198 /* see how many MSI-X interrupts are available */
4199 msix_cnt = pci_msix_count(sc->dev);
4203 /* now load the slice-aware firmware and see what it supports */
4204 old_fw = sc->fw_name;
4205 if (old_fw == mxge_fw_aligned)
4206 sc->fw_name = mxge_fw_rss_aligned;
4208 sc->fw_name = mxge_fw_rss_unaligned;
4209 status = mxge_load_firmware(sc, 0);
4211 device_printf(sc->dev, "Falling back to a single slice\n");
4215 /* try to send a reset command to the card to see if it is alive */
4217 memset(&cmd, 0, sizeof (cmd));
4218 status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4220 device_printf(sc->dev, "failed reset\n");
4224 /* get rx ring size */
4225 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4227 device_printf(sc->dev, "Cannot determine rx ring size\n");
4230 max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4232 /* tell it the size of the interrupt queues */
4233 cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4234 status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4236 device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4240 /* ask for the maximum number of slices it supports */
4241 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4243 device_printf(sc->dev,
4244 "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4247 sc->num_slices = cmd.data0;
4248 if (sc->num_slices > msix_cnt)
4249 sc->num_slices = msix_cnt;
4251 if (mxge_max_slices == -1) {
4252 /* cap to number of CPUs in system */
4253 if (sc->num_slices > ncpus)
4254 sc->num_slices = ncpus;
4256 if (sc->num_slices > mxge_max_slices)
4257 sc->num_slices = mxge_max_slices;
4259 /* make sure it is a power of two */
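/*
 * num_slices & (num_slices - 1) clears the lowest set bit, so the
 * expression is zero exactly when num_slices is a power of two; the
 * loop below runs until num_slices satisfies that.
 */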
4260 while (sc->num_slices & (sc->num_slices - 1))
4264 device_printf(sc->dev, "using %d slices\n",
4270 sc->fw_name = old_fw;
4271 (void) mxge_load_firmware(sc, 0);
4275 mxge_add_msix_irqs(mxge_softc_t *sc)
4278 int count, err, i, rid;
4281 sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4284 if (sc->msix_table_res == NULL) {
4285 device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4289 count = sc->num_slices;
4290 err = pci_alloc_msix(sc->dev, &count);
4292 device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d"
4293 "err = %d \n", sc->num_slices, err);
4294 goto abort_with_msix_table;
4296 if (count < sc->num_slices) {
4297 device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4298 count, sc->num_slices);
4299 device_printf(sc->dev,
4300 "Try setting hw.mxge.max_slices to %d\n",
4303 goto abort_with_msix;
4305 bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4306 sc->msix_irq_res = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4307 if (sc->msix_irq_res == NULL) {
4309 goto abort_with_msix;
4312 for (i = 0; i < sc->num_slices; i++) {
4314 sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4317 if (sc->msix_irq_res[i] == NULL) {
4318 device_printf(sc->dev, "couldn't allocate IRQ res"
4319 " for message %d\n", i);
4321 goto abort_with_res;
4325 bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4326 sc->msix_ih = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4328 for (i = 0; i < sc->num_slices; i++) {
4329 err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4331 mxge_intr, &sc->ss[i], &sc->msix_ih[i],
4332 sc->ifp->if_serializer);
4334 device_printf(sc->dev, "couldn't setup intr for "
4336 goto abort_with_intr;
4341 device_printf(sc->dev, "using %d msix IRQs:",
4343 for (i = 0; i < sc->num_slices; i++)
4344 kprintf(" %ld", rman_get_start(sc->msix_irq_res[i]));
4350 for (i = 0; i < sc->num_slices; i++) {
4351 if (sc->msix_ih[i] != NULL) {
4352 bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4354 sc->msix_ih[i] = NULL;
4357 kfree(sc->msix_ih, M_DEVBUF);
4361 for (i = 0; i < sc->num_slices; i++) {
4363 if (sc->msix_irq_res[i] != NULL)
4364 bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4365 sc->msix_irq_res[i]);
4366 sc->msix_irq_res[i] = NULL;
4368 kfree(sc->msix_irq_res, M_DEVBUF);
4372 pci_release_msi(sc->dev);
4374 abort_with_msix_table:
4375 bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4376 sc->msix_table_res);
4382 mxge_add_single_irq(mxge_softc_t *sc)
4384 int count, err, rid;
4386 count = pci_msi_count(sc->dev);
4387 if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4393 sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0,
4394 1, RF_SHAREABLE | RF_ACTIVE);
4395 if (sc->irq_res == NULL) {
4396 device_printf(sc->dev, "could not alloc interrupt\n");
4400 device_printf(sc->dev, "using %s irq %ld\n",
4401 sc->legacy_irq ? "INTx" : "MSI",
4402 rman_get_start(sc->irq_res));
4403 err = bus_setup_intr(sc->dev, sc->irq_res,
4405 mxge_intr, &sc->ss[0], &sc->ih,
4406 sc->ifp->if_serializer);
4408 bus_release_resource(sc->dev, SYS_RES_IRQ,
4409 sc->legacy_irq ? 0 : 1, sc->irq_res);
4410 if (!sc->legacy_irq)
4411 pci_release_msi(sc->dev);
4417 mxge_rem_msix_irqs(mxge_softc_t *sc)
4421 for (i = 0; i < sc->num_slices; i++) {
4422 if (sc->msix_ih[i] != NULL) {
4423 bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4425 sc->msix_ih[i] = NULL;
4428 kfree(sc->msix_ih, M_DEVBUF);
4430 for (i = 0; i < sc->num_slices; i++) {
4432 if (sc->msix_irq_res[i] != NULL)
4433 bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4434 sc->msix_irq_res[i]);
4435 sc->msix_irq_res[i] = NULL;
4437 kfree(sc->msix_irq_res, M_DEVBUF);
4439 bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4440 sc->msix_table_res);
4442 pci_release_msi(sc->dev);
4447 mxge_rem_single_irq(mxge_softc_t *sc)
4449 bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4450 bus_release_resource(sc->dev, SYS_RES_IRQ,
4451 sc->legacy_irq ? 0 : 1, sc->irq_res);
4452 if (!sc->legacy_irq)
4453 pci_release_msi(sc->dev);
4457 mxge_rem_irq(mxge_softc_t *sc)
4459 if (sc->num_slices > 1)
4460 mxge_rem_msix_irqs(sc);
4462 mxge_rem_single_irq(sc);
4466 mxge_add_irq(mxge_softc_t *sc)
4470 if (sc->num_slices > 1)
4471 err = mxge_add_msix_irqs(sc);
4473 err = mxge_add_single_irq(sc);
4475 if (0 && err == 0 && sc->num_slices > 1) {
4476 mxge_rem_msix_irqs(sc);
4477 err = mxge_add_msix_irqs(sc);
4484 mxge_attach(device_t dev)
4486 mxge_softc_t *sc = device_get_softc(dev);
4487 struct ifnet *ifp = &sc->arpcom.ac_if;
4491 * avoid rewriting half the lines in this file to use
4492 * &sc->arpcom.ac_if instead
4496 mxge_fetch_tunables(sc);
4498 err = bus_dma_tag_create(NULL, /* parent */
4501 BUS_SPACE_MAXADDR, /* low */
4502 BUS_SPACE_MAXADDR, /* high */
4503 NULL, NULL, /* filter */
4504 65536 + 256, /* maxsize */
4505 MXGE_MAX_SEND_DESC, /* num segs */
4506 65536, /* maxsegsize */
4508 &sc->parent_dmat); /* tag */
4511 device_printf(sc->dev, "Err %d allocating parent dmat\n",
4513 goto abort_with_nothing;
4517 if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4519 callout_init_mp(&sc->co_hdl);
4521 mxge_setup_cfg_space(sc);
4523 /* Map the board into the kernel */
4525 sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
4527 if (sc->mem_res == NULL) {
4528 device_printf(dev, "could not map memory\n");
4530 goto abort_with_nothing;
4532 sc->sram = rman_get_virtual(sc->mem_res);
4533 sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4534 if (sc->sram_size > rman_get_size(sc->mem_res)) {
4535 device_printf(dev, "impossible memory region size %ld\n",
4536 rman_get_size(sc->mem_res));
4538 goto abort_with_mem_res;
4541 /* make NULL terminated copy of the EEPROM strings section of the NIC's SRAM */
4543 bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4544 bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4545 rman_get_bushandle(sc->mem_res),
4546 sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4548 MXGE_EEPROM_STRINGS_SIZE - 2);
4549 err = mxge_parse_strings(sc);
4551 goto abort_with_mem_res;
4553 /* Enable write combining for efficient use of PCIe bus */
4556 /* Allocate the out of band dma memory */
4557 err = mxge_dma_alloc(sc, &sc->cmd_dma,
4558 sizeof (mxge_cmd_t), 64);
4560 goto abort_with_mem_res;
4561 sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4562 err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4564 goto abort_with_cmd_dma;
4566 err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4568 goto abort_with_zeropad_dma;
4570 /* select & load the firmware */
4571 err = mxge_select_firmware(sc);
4573 goto abort_with_dmabench;
4574 sc->intr_coal_delay = mxge_intr_coal_delay;
4576 mxge_slice_probe(sc);
4577 err = mxge_alloc_slices(sc);
4579 goto abort_with_dmabench;
4581 err = mxge_reset(sc, 0);
4583 goto abort_with_slices;
4585 err = mxge_alloc_rings(sc);
4587 device_printf(sc->dev, "failed to allocate rings\n");
4588 goto abort_with_dmabench;
4591 err = mxge_add_irq(sc);
4593 device_printf(sc->dev, "failed to add irq\n");
4594 goto abort_with_rings;
4597 ifp->if_baudrate = IF_Gbps(10UL);
4598 ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4601 ifp->if_capabilities |= IFCAP_LRO;
4604 #ifdef MXGE_NEW_VLAN_API
4605 ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4608 sc->max_mtu = mxge_max_mtu(sc);
4609 if (sc->max_mtu >= 9000)
4610 ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4612 device_printf(dev, "MTU limited to %d. Install "
4613 "latest firmware for 9000 byte jumbo support\n",
4614 sc->max_mtu - ETHER_HDR_LEN);
4615 ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4616 ifp->if_capenable = ifp->if_capabilities;
4617 if (sc->lro_cnt == 0)
4618 ifp->if_capenable &= ~IFCAP_LRO;
4620 ifp->if_init = mxge_init;
4622 ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4623 ifp->if_ioctl = mxge_ioctl;
4624 ifp->if_start = mxge_start;
4625 /* Initialise the ifmedia structure */
4626 ifmedia_init(&sc->media, 0, mxge_media_change,
4628 mxge_set_media(sc, IFM_ETHER | IFM_AUTO);
4629 mxge_media_probe(sc);
4631 ether_ifattach(ifp, sc->mac_addr, NULL);
4632 /* ether_ifattach sets mtu to ETHERMTU */
4633 if (mxge_initial_mtu != ETHERMTU) {
4634 lwkt_serialize_enter(ifp->if_serializer);
4635 mxge_change_mtu(sc, mxge_initial_mtu);
4636 lwkt_serialize_exit(ifp->if_serializer);
4639 mxge_add_sysctls(sc);
4640 #ifdef IFNET_BUF_RING
4641 ifp->if_transmit = mxge_transmit;
4642 ifp->if_qflush = mxge_qflush;
4647 mxge_free_rings(sc);
4649 mxge_free_slices(sc);
4650 abort_with_dmabench:
4651 mxge_dma_free(&sc->dmabench_dma);
4652 abort_with_zeropad_dma:
4653 mxge_dma_free(&sc->zeropad_dma);
4655 mxge_dma_free(&sc->cmd_dma);
4657 bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4658 pci_disable_busmaster(dev);
4659 bus_dma_tag_destroy(sc->parent_dmat);
4665 mxge_detach(device_t dev)
4667 mxge_softc_t *sc = device_get_softc(dev);
4669 lwkt_serialize_enter(sc->ifp->if_serializer);
4671 if (sc->ifp->if_flags & IFF_RUNNING)
4674 * XXX: race: the callout callback could be spinning on
4675 * the serializer and run anyway
4677 callout_stop(&sc->co_hdl);
4678 lwkt_serialize_exit(sc->ifp->if_serializer);
4680 ether_ifdetach(sc->ifp);
4681 ifmedia_removeall(&sc->media);
4682 mxge_dummy_rdma(sc, 0);
4683 mxge_rem_sysctls(sc);
4685 mxge_free_rings(sc);
4686 mxge_free_slices(sc);
4687 mxge_dma_free(&sc->dmabench_dma);
4688 mxge_dma_free(&sc->zeropad_dma);
4689 mxge_dma_free(&sc->cmd_dma);
4690 bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4691 pci_disable_busmaster(dev);
4692 bus_dma_tag_destroy(sc->parent_dmat);
4697 mxge_shutdown(device_t dev)
4703 This file uses Myri10GE driver indentation.
4706 c-file-style:"linux"