1 /******************************************************************************
3 Copyright (c) 2006-2009, Myricom Inc.
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
9 1. Redistributions of source code must retain the above copyright notice,
10 this list of conditions and the following disclaimer.
12 2. Neither the name of the Myricom Inc, nor the names of its
13 contributors may be used to endorse or promote products derived from
14 this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
28 $FreeBSD: src/sys/dev/mxge/if_mxge.c,v 1.63 2009/06/26 11:45:06 rwatson Exp $
30 ***************************************************************************/
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/linker.h>
35 #include <sys/firmware.h>
36 #include <sys/endian.h>
37 #include <sys/in_cksum.h>
38 #include <sys/sockio.h>
40 #include <sys/malloc.h>
41 #include <sys/kernel.h>
42 #include <sys/module.h>
43 #include <sys/serialize.h>
44 #include <sys/socket.h>
45 #include <sys/sysctl.h>
47 /* count xmits ourselves, rather than via drbr */
50 #include <net/if_arp.h>
51 #include <net/ifq_var.h>
52 #include <net/ethernet.h>
53 #include <net/if_dl.h>
54 #include <net/if_media.h>
58 #include <net/if_types.h>
59 #include <net/vlan/if_vlan_var.h>
62 #include <netinet/in_systm.h>
63 #include <netinet/in.h>
64 #include <netinet/ip.h>
65 #include <netinet/tcp.h>
70 #include <bus/pci/pcireg.h>
71 #include <bus/pci/pcivar.h>
72 #include <bus/pci/pci_private.h> /* XXX for pci_cfg_restore */
74 #include <vm/vm.h> /* for pmap_mapdev() */
77 #if defined(__i386) || defined(__x86_64)
78 #include <machine/specialreg.h>
81 #include <dev/netif/mxge/mxge_mcp.h>
82 #include <dev/netif/mxge/mcp_gen_header.h>
83 /*#define MXGE_FAKE_IFP*/
84 #include <dev/netif/mxge/if_mxge_var.h>
86 #include <sys/buf_ring.h>
/*
 * Module-wide tunables.  These are plain file-scope ints/strings; the
 * sampled chunk does not show the TUNABLE_* hooks, but the names follow
 * the usual loader-tunable convention for this driver.
 */
92 static int mxge_nvidia_ecrc_enable = 1;
93 static int mxge_force_firmware = 0;
/* default interrupt coalescing delay, in microseconds (see sysctl handler) */
94 static int mxge_intr_coal_delay = 30;
95 static int mxge_deassert_wait = 1;
/* pause-frame flow control on by default */
96 static int mxge_flow_control = 1;
97 static int mxge_verbose = 0;
98 static int mxge_lro_cnt = 8;
99 static int mxge_ticks;
100 static int mxge_max_slices = 1;
101 static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
102 static int mxge_always_promisc = 0;
104 /* static int mxge_initial_mtu = ETHERMTU_JUMBO; */
105 static int mxge_initial_mtu = ETHERMTU;
/*
 * Firmware image names: "aligned" images assume 8-byte-aligned PCIe
 * completions; the "p" (ethp) variants work around unaligned completions.
 * The rss variants support multiple slices.
 */
106 static char *mxge_fw_unaligned = "mxge_ethp_z8e";
107 static char *mxge_fw_aligned = "mxge_eth_z8e";
108 static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
109 static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
/* Forward declarations for the newbus device interface entry points. */
111 static int mxge_probe(device_t dev);
112 static int mxge_attach(device_t dev);
113 static int mxge_detach(device_t dev);
114 static int mxge_shutdown(device_t dev);
115 static void mxge_intr(void *arg);
/* Device method table wiring the probe/attach/detach/shutdown handlers. */
117 static device_method_t mxge_methods[] =
119 /* Device interface */
120 DEVMETHOD(device_probe, mxge_probe),
121 DEVMETHOD(device_attach, mxge_attach),
122 DEVMETHOD(device_detach, mxge_detach),
123 DEVMETHOD(device_shutdown, mxge_shutdown),
127 static driver_t mxge_driver =
131 sizeof(mxge_softc_t),
134 static devclass_t mxge_devclass;
136 /* Declare ourselves to be a child of the PCI bus.*/
137 DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
/* Firmware images are gzip-compressed, hence the firmware(9) and zlib deps. */
138 MODULE_DEPEND(mxge, firmware, 1, 1, 1);
139 MODULE_DEPEND(mxge, zlib, 1, 1, 1);
/* Forward declarations for internal helpers referenced before definition. */
141 static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
142 static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
143 static int mxge_close(mxge_softc_t *sc);
144 static int mxge_open(mxge_softc_t *sc);
145 static void mxge_tick(void *arg);
147 /* XXX: we don't have Large Receive Offload support yet */
149 mxge_lro_rx(struct mxge_slice_state *ss, struct mbuf *m_head, uint32_t csum)
158 mxge_lro_flush(struct mxge_slice_state *ss, struct lro_entry *lro)
/*
 * Newbus probe: match Myricom Z8E / Z8E_9 PCI IDs and set a device
 * description based on the PCI revision ID.  The fall-through case
 * labels a NIC with an unrecognized revision but still describes it.
 */
165 mxge_probe(device_t dev)
170 if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
171 ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
172 (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
173 rev = pci_get_revid(dev);
/* 8A: original Z8E silicon */
175 case MXGE_PCI_REV_Z8E:
176 device_set_desc(dev, "Myri10G-PCIE-8A");
/* 8B: Z8ES silicon */
178 case MXGE_PCI_REV_Z8ES:
179 device_set_desc(dev, "Myri10G-PCIE-8B");
/* default: unknown revision — still attach, but warn */
182 device_set_desc(dev, "Myri10G-PCIE-8??");
183 device_printf(dev, "Unrecognized rev %d NIC\n",
/*
 * Attempt to map the NIC SRAM aperture write-combining (PAT) on x86 for
 * faster PIO.  On failure (or on non-x86), sc->wc is left/cleared to 0
 * so the driver falls back to uncached PIO.
 */
193 mxge_enable_wc(mxge_softc_t *sc)
196 #if defined(__i386) || defined(__x86_64)
201 len = rman_get_size(sc->mem_res);
202 err = pmap_change_attr((vm_offset_t) sc->sram,
203 len, PAT_WRITE_COMBINING);
205 device_printf(sc->dev, "pmap_change_attr failed, %d\n",
211 sc->wc = 0; /* TBD: PAT support */
216 /* callback to get our DMA address */
/*
 * bus_dmamap_load() callback: stash the (single-segment) bus address
 * into the bus_addr_t the caller passed as arg.
 */
218 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
222 *(bus_addr_t *) arg = segs->ds_addr;
/*
 * Allocate a coherent DMA region of `bytes` with the given alignment:
 * create a tag, allocate+zero the memory, and load the map to obtain
 * dma->bus_addr (via mxge_dmamap_callback).  On error, previously
 * acquired resources are released via the abort_with_* labels.
 */
227 mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
228 bus_size_t alignment)
231 device_t dev = sc->dev;
232 bus_size_t boundary, maxsegsize;
/* special-case page-aligned multi-page allocations — boundary/maxsegsize
 * setup is in lines not shown in this chunk */
234 if (bytes > 4096 && alignment == 4096) {
242 /* allocate DMAable memory tags */
243 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
244 alignment, /* alignment */
245 boundary, /* boundary */
246 BUS_SPACE_MAXADDR, /* low */
247 BUS_SPACE_MAXADDR, /* high */
248 NULL, NULL, /* filter */
251 maxsegsize, /* maxsegsize */
252 BUS_DMA_COHERENT, /* flags */
253 &dma->dmat); /* tag */
255 device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
259 /* allocate DMAable memory & map */
260 err = bus_dmamem_alloc(dma->dmat, &dma->addr,
261 (BUS_DMA_WAITOK | BUS_DMA_COHERENT
262 | BUS_DMA_ZERO), &dma->map);
264 device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
265 goto abort_with_dmat;
268 /* load the memory */
269 err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
270 mxge_dmamap_callback,
271 (void *)&dma->bus_addr, 0);
273 device_printf(dev, "couldn't load map (err = %d)\n", err);
/* error unwind: free memory, then destroy the tag */
279 bus_dmamem_free(dma->dmat, dma->addr, dma->map);
281 (void)bus_dma_tag_destroy(dma->dmat);
/*
 * Release a region allocated by mxge_dma_alloc(): unload the map,
 * free the memory, and destroy the tag — the reverse of allocation.
 */
287 mxge_dma_free(mxge_dma_t *dma)
289 bus_dmamap_unload(dma->dmat, dma->map);
290 bus_dmamem_free(dma->dmat, dma->addr, dma->map);
291 (void)bus_dma_tag_destroy(dma->dmat);
295 * The eeprom strings on the lanaiX have the format
/*
 * Parse the NUL-separated EEPROM strings ("MAC=", "PC=", "SN=") into
 * the softc.  MXGE_NEXT_STRING advances ptr past the current string's
 * terminating NUL, bounded by `limit`.
 */
302 mxge_parse_strings(mxge_softc_t *sc)
304 #define MXGE_NEXT_STRING(p) while(ptr < limit && *ptr++)
309 ptr = sc->eeprom_strings;
310 limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
312 while (ptr < limit && *ptr != '\0') {
313 if (memcmp(ptr, "MAC=", 4) == 0) {
315 sc->mac_addr_string = ptr;
/* MAC is six hex octets; each parsed with strtoul base 16 */
316 for (i = 0; i < 6; i++) {
/* bounds check: need at least two chars for a hex octet */
318 if ((ptr + 2) > limit)
320 sc->mac_addr[i] = strtoul(ptr, NULL, 16);
323 } else if (memcmp(ptr, "PC=", 3) == 0) {
/* NOTE(review): strncpy with size-1 relies on the destination
 * being pre-zeroed for NUL termination — presumably the softc
 * is zeroed at attach; confirm. */
325 strncpy(sc->product_code_string, ptr,
326 sizeof (sc->product_code_string) - 1);
327 } else if (memcmp(ptr, "SN=", 3) == 0) {
329 strncpy(sc->serial_number_string, ptr,
330 sizeof (sc->serial_number_string) - 1);
332 MXGE_NEXT_STRING(ptr);
/* reached only when parsing failed (exact condition not in view) */
339 device_printf(sc->dev, "failed to parse eeprom_strings\n");
344 #if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
/*
 * Enable ECRC generation on an upstream Nvidia (ck804/mcp55) PCIe
 * bridge so that completions arrive 8-byte aligned.  Because normal
 * config accesses cannot reach the extended (>0xff) config space on
 * these chipsets, the bridge's config space is located by its magic
 * physical base address and mapped directly with pmap_mapdev().
 */
346 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
349 unsigned long base, off;
351 device_t pdev, mcp55;
352 uint16_t vendor_id, device_id, word;
353 uintptr_t bus, slot, func, ivend, idev;
/* honor the tunable: do nothing if disabled */
357 if (!mxge_nvidia_ecrc_enable)
360 pdev = device_get_parent(device_get_parent(sc->dev));
362 device_printf(sc->dev, "could not find parent?\n");
365 vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
366 device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
/* only Nvidia bridges (vendor 0x10de) are handled */
368 if (vendor_id != 0x10de)
373 if (device_id == 0x005d) {
374 /* ck804, base address is magic */
376 } else if (device_id >= 0x0374 && device_id <= 0x378) {
377 /* mcp55, base address stored in chipset */
378 mcp55 = pci_find_bsf(0, 0, 0);
380 0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
381 0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
/* extended-config base lives in config reg 0x90 of device 0:0:0 */
382 word = pci_read_config(mcp55, 0x90, 2);
383 base = ((unsigned long)word & 0x7ffeU) << 25;
390 Test below is commented because it is believed that doing
391 config read/write beyond 0xff will access the config space
392 for the next larger function. Uncomment this and remove
393 the hacky pmap_mapdev() way of accessing config space when
394 FreeBSD grows support for extended pcie config space access
397 /* See if we can, by some miracle, access the extended
399 val = pci_read_config(pdev, 0x178, 4);
400 if (val != 0xffffffff) {
402 pci_write_config(pdev, 0x178, val, 4);
406 /* Rather than using normal pci config space writes, we must
407 * map the Nvidia config space ourselves. This is because on
408 * opteron/nvidia class machine the 0xe000000 mapping is
409 * handled by the nvidia chipset, that means the internal PCI
410 * device (the on-chip northbridge), or the amd-8131 bridge
411 * and things behind them are not visible by this method.
/* query bus/slot/function and IDs of the bridge via bus ivars */
414 BUS_READ_IVAR(device_get_parent(pdev), pdev,
416 BUS_READ_IVAR(device_get_parent(pdev), pdev,
417 PCI_IVAR_SLOT, &slot);
418 BUS_READ_IVAR(device_get_parent(pdev), pdev,
419 PCI_IVAR_FUNCTION, &func);
420 BUS_READ_IVAR(device_get_parent(pdev), pdev,
421 PCI_IVAR_VENDOR, &ivend);
422 BUS_READ_IVAR(device_get_parent(pdev), pdev,
423 PCI_IVAR_DEVICE, &idev);
/* compute the config-space offset from bus/slot/function */
426 + 0x00100000UL * (unsigned long)bus
427 + 0x00001000UL * (unsigned long)(func
430 /* map it into the kernel */
431 va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
435 device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
438 /* get a pointer to the config space mapped into the kernel */
439 cfgptr = va + (off & PAGE_MASK);
441 /* make sure that we can really access it */
442 vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
443 device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
444 if (! (vendor_id == ivend && device_id == idev)) {
445 device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
446 vendor_id, device_id);
447 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
/* 0x178 is the Nvidia extended-config ECRC control register */
451 ptr32 = (uint32_t*)(cfgptr + 0x178);
454 if (val == 0xffffffff) {
455 device_printf(sc->dev, "extended mapping failed\n");
456 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
460 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
462 device_printf(sc->dev,
463 "Enabled ECRC on upstream Nvidia bridge "
465 (int)bus, (int)slot, (int)func);
/* Non-x86 stub: the Nforce4 ECRC workaround is x86-only. */
470 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
472 device_printf(sc->dev,
473 "Nforce 4 chipset on non-x86/x86_64!?!?!\n");
/*
 * Ask the firmware to run read / write / read+write DMA benchmarks
 * against the dmabench buffer, recording the measured bandwidths in
 * sc->read_dma, sc->write_dma and sc->read_write_dma (MB/s).
 */
480 mxge_dma_test(mxge_softc_t *sc, int test_type)
483 bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
489 /* Run a small DMA test.
490 * The magic multipliers to the length tell the firmware
491 * to do DMA read, write, or read+write tests. The
492 * results are returned in cmd.data0. The upper 16
493 * bits of the return is the number of transfers completed.
494 * The lower 16 bits is the time in 0.5us ticks that the
495 * transfers took to complete.
498 len = sc->tx_boundary;
/* multiplier 0x10000: read test */
500 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
501 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
502 cmd.data2 = len * 0x10000;
503 status = mxge_send_cmd(sc, test_type, &cmd);
/* bandwidth = transfers * len * 2 (per 0.5us tick) / ticks */
508 sc->read_dma = ((cmd.data0>>16) * len * 2) /
509 (cmd.data0 & 0xffff);
/* multiplier 0x1: write test */
510 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
511 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
512 cmd.data2 = len * 0x1;
513 status = mxge_send_cmd(sc, test_type, &cmd);
518 sc->write_dma = ((cmd.data0>>16) * len * 2) /
519 (cmd.data0 & 0xffff);
/* multiplier 0x10001: concurrent read+write test (hence the extra *2) */
521 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
522 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
523 cmd.data2 = len * 0x10001;
524 status = mxge_send_cmd(sc, test_type, &cmd);
529 sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
530 (cmd.data0 & 0xffff);
/* the unaligned-completion probe is expected to fail; stay quiet then */
533 if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
534 device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
541 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
542 * when the PCI-E Completion packets are aligned on an 8-byte
543 * boundary. Some PCI-E chip sets always align Completion packets; on
544 * the ones that do not, the alignment can be enforced by enabling
545 * ECRC generation (if supported).
547 * When PCI-E Completion packets are not aligned, it is actually more
548 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
550 * If the driver can neither enable ECRC nor verify that it has
551 * already been enabled, then it must use a firmware image which works
552 * around unaligned completion packets (ethp_z8e.dat), and it should
553 * also ensure that it never gives the device a Read-DMA which is
554 * larger than 2KB by setting the tx_boundary to 2KB. If ECRC is
555 * enabled, then the driver should use the aligned (eth_z8e.dat)
556 * firmware image, and set tx_boundary to 4KB.
/*
 * Probe whether the aligned firmware works on this host: assume a 4KB
 * tx boundary (downgrading to 2KB if Max Read Request != 4KB), load the
 * aligned image, enable ECRC if possible, then run the firmware's
 * unaligned-completion DMA test.  Returns 0 if the aligned firmware
 * can be kept; non-zero means fall back to the ethp (unaligned) image.
 */
560 mxge_firmware_probe(mxge_softc_t *sc)
562 device_t dev = sc->dev;
566 sc->tx_boundary = 4096;
568 * Verify the max read request size was set to 4KB
569 * before trying the test with 4KB.
/* read PCIe Device Control (capability offset + 0x8) */
571 if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
572 pectl = pci_read_config(dev, reg + 0x8, 2);
/* bits 14:12 == 5 encodes a 4KB max read request */
573 if ((pectl & (5 << 12)) != (5 << 12)) {
574 device_printf(dev, "Max Read Req. size != 4k (0x%x\n",
576 sc->tx_boundary = 2048;
581 * load the optimized firmware (which assumes aligned PCIe
582 * completions) in order to see if it works on this host.
584 sc->fw_name = mxge_fw_aligned;
585 status = mxge_load_firmware(sc, 1);
591 * Enable ECRC if possible
593 mxge_enable_nvidia_ecrc(sc);
596 * Run a DMA test which watches for unaligned completions and
597 * aborts on the first one seen.
600 status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
602 return 0; /* keep the aligned firmware */
605 device_printf(dev, "DMA test failed: %d\n", status);
606 if (status == ENOSYS)
607 device_printf(dev, "Falling back to ethp! "
608 "Please install up to date fw\n");
/*
 * Choose the firmware image and tx boundary: honor the force tunable,
 * skip probing on narrow (<= x4) links, otherwise use the result of
 * mxge_firmware_probe().  Ends by loading the selected image.
 */
613 mxge_select_firmware(mxge_softc_t *sc)
/* tunable override: 1 forces aligned, other non-zero forces unaligned */
618 if (mxge_force_firmware != 0) {
619 if (mxge_force_firmware == 1)
624 device_printf(sc->dev,
625 "Assuming %s completions (forced)\n",
626 aligned ? "aligned" : "unaligned");
630 /* if the PCIe link width is 4 or less, we can use the aligned
631 firmware and skip any checks */
632 if (sc->link_width != 0 && sc->link_width <= 4) {
633 device_printf(sc->dev,
634 "PCIe x%d Link, expect reduced performance\n",
/* probe succeeded: aligned firmware verified working */
640 if (0 == mxge_firmware_probe(sc))
645 sc->fw_name = mxge_fw_aligned;
646 sc->tx_boundary = 4096;
/* fallback: unaligned image limits Read-DMA to 2KB */
648 sc->fw_name = mxge_fw_unaligned;
649 sc->tx_boundary = 2048;
651 return (mxge_load_firmware(sc, 0));
/*
 * Sanity-check a firmware header: verify the MCP type is ETH, record
 * the version string for sysctl, parse it into major/minor/tiny, and
 * reject images whose major.minor does not match the driver's headers.
 */
661 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
665 if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
666 device_printf(sc->dev, "Bad firmware type: 0x%x\n",
667 be32toh(hdr->mcp_type));
671 /* save firmware version for sysctl */
672 strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
674 device_printf(sc->dev, "firmware id: %s\n", hdr->version);
676 ksscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
677 &sc->fw_ver_minor, &sc->fw_ver_tiny);
/* tiny version differences are tolerated; major/minor must match */
679 if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
680 && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
681 device_printf(sc->dev, "Found firmware version %s\n",
683 device_printf(sc->dev, "Driver needs %d.%d\n",
684 MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
/* zlib allocator callback: plain kmalloc; opaque arg (nil) is unused. */
692 z_alloc(void *nil, u_int items, u_int size)
696 ptr = kmalloc(items * size, M_TEMP, M_NOWAIT);
/* zlib free callback, paired with z_alloc above */
701 z_free(void *nil, void *ptr)
/*
 * Fetch the gzip-compressed firmware image via firmware(9), inflate it
 * with zlib, validate its embedded MCP header, and PIO-copy the result
 * into NIC SRAM at MXGE_FW_OFFSET in 256-byte chunks.  *limit receives
 * the firmware size (assignment not visible in this chunk).
 */
708 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
711 char *inflate_buffer;
712 const struct firmware *fw;
713 const mcp_gen_header_t *hdr;
720 fw = firmware_get(sc->fw_name);
722 device_printf(sc->dev, "Could not find firmware image %s\n",
729 /* setup zlib and decompress f/w */
730 bzero(&zs, sizeof (zs));
733 status = inflateInit(&zs);
734 if (status != Z_OK) {
739 /* the uncompressed size is stored as the firmware version,
740 which would otherwise go unused */
741 fw_len = (size_t) fw->version;
742 inflate_buffer = kmalloc(fw_len, M_TEMP, M_NOWAIT);
743 if (inflate_buffer == NULL)
745 zs.avail_in = fw->datasize;
746 zs.next_in = __DECONST(char *, fw->data);
747 zs.avail_out = fw_len;
748 zs.next_out = inflate_buffer;
/* Z_FINISH: the whole image must inflate in one call */
749 status = inflate(&zs, Z_FINISH);
750 if (status != Z_STREAM_END) {
751 device_printf(sc->dev, "zlib %d\n", status);
753 goto abort_with_buffer;
/* header pointer is stored big-endian at a fixed offset in the image */
757 hdr_offset = htobe32(*(const uint32_t *)
758 (inflate_buffer + MCP_HEADER_PTR_OFFSET));
759 if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
760 device_printf(sc->dev, "Bad firmware file");
762 goto abort_with_buffer;
764 hdr = (const void*)(inflate_buffer + hdr_offset);
766 status = mxge_validate_firmware(sc, hdr);
768 goto abort_with_buffer;
770 /* Copy the inflated firmware to NIC SRAM. */
771 for (i = 0; i < fw_len; i += 256) {
772 mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
774 min(256U, (unsigned)(fw_len - i)));
/* common cleanup: free inflate buffer, release firmware reference */
783 kfree(inflate_buffer, M_TEMP);
787 firmware_put(fw, FIRMWARE_UNLOAD);
792 * Enable or disable periodic RDMAs from the host to make certain
793 * chipsets resend dropped PCIe messages
/*
 * Enable or disable the firmware's periodic dummy RDMA, used to make
 * certain chipsets resend dropped PCIe messages.  Builds an 8-byte-
 * aligned command block on the stack, PIO-copies it to the boot dummy
 * RDMA mailbox, and polls the confirmation word for the 0xffffffff ack.
 */
797 mxge_dummy_rdma(mxge_softc_t *sc, int enable)
800 volatile uint32_t *confirm;
801 volatile char *submit;
802 uint32_t *buf, dma_low, dma_high;
/* align buf to 8 bytes within the stack buffer */
805 buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
807 /* clear confirmation addr */
808 confirm = (volatile uint32_t *)sc->cmd;
812 /* send an rdma command to the PCIe engine, and wait for the
813 response in the confirmation address. The firmware should
814 write a -1 there to indicate it is alive and well
817 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
818 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
819 buf[0] = htobe32(dma_high); /* confirm addr MSW */
820 buf[1] = htobe32(dma_low); /* confirm addr LSW */
821 buf[2] = htobe32(0xffffffff); /* confirm data */
822 dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
823 dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
824 buf[3] = htobe32(dma_high); /* dummy addr MSW */
825 buf[4] = htobe32(dma_low); /* dummy addr LSW */
826 buf[5] = htobe32(enable); /* enable? */
829 submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
831 mxge_pio_copy(submit, buf, 64);
/* poll up to 20 iterations for the firmware's -1 acknowledgement */
836 while (*confirm != 0xffffffff && i < 20) {
840 if (*confirm != 0xffffffff) {
841 device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
842 (enable ? "enable" : "disable"), confirm,
849 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
852 char buf_bytes[sizeof(*buf) + 8];
853 volatile mcp_cmd_response_t *response = sc->cmd;
854 volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
855 uint32_t dma_low, dma_high;
856 int err, sleep_total = 0;
859 * We may be called during attach, before if_serializer is available.
860 * This is not a fast path, just check for NULL
863 if (sc->ifp->if_serializer)
864 ASSERT_SERIALIZED(sc->ifp->if_serializer);
866 /* ensure buf is aligned to 8 bytes */
867 buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
869 buf->data0 = htobe32(data->data0);
870 buf->data1 = htobe32(data->data1);
871 buf->data2 = htobe32(data->data2);
872 buf->cmd = htobe32(cmd);
873 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
874 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
876 buf->response_addr.low = htobe32(dma_low);
877 buf->response_addr.high = htobe32(dma_high);
880 response->result = 0xffffffff;
882 mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
884 /* wait up to 20ms */
886 for (sleep_total = 0; sleep_total < 20; sleep_total++) {
887 bus_dmamap_sync(sc->cmd_dma.dmat,
888 sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
890 switch (be32toh(response->result)) {
892 data->data0 = be32toh(response->data);
898 case MXGEFW_CMD_UNKNOWN:
901 case MXGEFW_CMD_ERROR_UNALIGNED:
904 case MXGEFW_CMD_ERROR_BUSY:
908 device_printf(sc->dev,
910 "failed, result = %d\n",
911 cmd, be32toh(response->result));
919 device_printf(sc->dev, "mxge: command %d timed out"
921 cmd, be32toh(response->result));
/*
 * Try to keep using the firmware already running on the NIC (e.g. one
 * loaded by the eeprom or a previous driver): locate its header in
 * SRAM, copy it to host memory, validate it, and work around the known
 * 1.4.4–1.4.11 rx-filter bug by flagging ALLMULTI mode.
 */
926 mxge_adopt_running_firmware(mxge_softc_t *sc)
928 struct mcp_gen_header *hdr;
929 const size_t bytes = sizeof (struct mcp_gen_header);
933 /* find running firmware header */
934 hdr_offset = htobe32(*(volatile uint32_t *)
935 (sc->sram + MCP_HEADER_PTR_OFFSET));
937 if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
938 device_printf(sc->dev,
939 "Running firmware has bad header offset (%d)\n",
944 /* copy header of running firmware from SRAM to host memory to
945 * validate firmware */
946 hdr = kmalloc(bytes, M_DEVBUF, M_NOWAIT);
948 device_printf(sc->dev, "could not kmalloc firmware hdr\n");
/* byte-wise bus-space read: SRAM must not be accessed via wide loads here */
951 bus_space_read_region_1(rman_get_bustag(sc->mem_res),
952 rman_get_bushandle(sc->mem_res),
953 hdr_offset, (char *)hdr, bytes);
954 status = mxge_validate_firmware(sc, hdr);
955 kfree(hdr, M_DEVBUF);
958 * check to see if adopted firmware has bug where adopting
959 * it will cause broadcasts to be filtered unless the NIC
960 * is kept in ALLMULTI mode
962 if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
963 sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
964 sc->adopted_rx_filter_bug = 1;
965 device_printf(sc->dev, "Adopting fw %d.%d.%d: "
966 "working around rx filter bug\n",
967 sc->fw_ver_major, sc->fw_ver_minor,
/*
 * Load (or, if `adopt` is set and loading fails, adopt) firmware onto
 * the NIC, then hand off execution to it via the boot handoff mailbox
 * and poll the confirmation word for the firmware's -1 acknowledgement.
 */
976 mxge_load_firmware(mxge_softc_t *sc, int adopt)
978 volatile uint32_t *confirm;
979 volatile char *submit;
981 uint32_t *buf, size, dma_low, dma_high;
/* 8-byte-align the stack command buffer */
984 buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
986 size = sc->sram_size;
987 status = mxge_load_firmware_helper(sc, &size);
991 /* Try to use the currently running firmware, if
993 status = mxge_adopt_running_firmware(sc);
995 device_printf(sc->dev,
996 "failed to adopt running firmware\n");
999 device_printf(sc->dev,
1000 "Successfully adopted running firmware\n");
/* adopted firmware forces the conservative 2KB tx boundary */
1001 if (sc->tx_boundary == 4096) {
1002 device_printf(sc->dev,
1003 "Using firmware currently running on NIC"
1005 device_printf(sc->dev,
1006 "performance consider loading optimized "
1009 sc->fw_name = mxge_fw_unaligned;
1010 sc->tx_boundary = 2048;
1013 /* clear confirmation addr */
1014 confirm = (volatile uint32_t *)sc->cmd;
1017 /* send a reload command to the bootstrap MCP, and wait for the
1018 response in the confirmation address. The firmware should
1019 write a -1 there to indicate it is alive and well
1022 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
1023 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
1025 buf[0] = htobe32(dma_high); /* confirm addr MSW */
1026 buf[1] = htobe32(dma_low); /* confirm addr LSW */
1027 buf[2] = htobe32(0xffffffff); /* confirm data */
1029 /* FIX: All newest firmware should un-protect the bottom of
1030 the sram before handoff. However, the very first interfaces
1031 do not. Therefore the handoff copy must skip the first 8 bytes
1033 /* where the code starts*/
1034 buf[3] = htobe32(MXGE_FW_OFFSET + 8);
1035 buf[4] = htobe32(size - 8); /* length of code */
1036 buf[5] = htobe32(8); /* where to copy to */
1037 buf[6] = htobe32(0); /* where to jump to */
1039 submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
1040 mxge_pio_copy(submit, buf, 64);
/* poll up to 20 iterations for the -1 acknowledgement */
1045 while (*confirm != 0xffffffff && i < 20) {
1048 bus_dmamap_sync(sc->cmd_dma.dmat,
1049 sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
1051 if (*confirm != 0xffffffff) {
1052 device_printf(sc->dev,"handoff failed (%p = 0x%x)",
/*
 * Push sc->mac_addr to the firmware: pack the six octets into two
 * 32-bit command words (first four bytes in data0, last two in data1).
 */
1061 mxge_update_mac_address(mxge_softc_t *sc)
1064 uint8_t *addr = sc->mac_addr;
1068 cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1069 | (addr[2] << 8) | addr[3]);
1071 cmd.data1 = ((addr[4] << 8) | (addr[5]));
1073 status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
/*
 * Enable or disable pause-frame flow control in the firmware,
 * logging a warning if the command fails.
 */
1078 mxge_change_pause(mxge_softc_t *sc, int pause)
1084 status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1087 status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1091 device_printf(sc->dev, "Failed to set flow control mode\n");
/*
 * Enable or disable promiscuous mode in the firmware.  The
 * mxge_always_promisc tunable forces promiscuous mode on regardless
 * of the requested setting.
 */
1099 mxge_change_promisc(mxge_softc_t *sc, int promisc)
/* serializer may be NULL during attach; only assert when present */
1104 if( sc->ifp->if_serializer)
1105 ASSERT_SERIALIZED(sc->ifp->if_serializer);
1106 if (mxge_always_promisc)
1110 status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1113 status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1117 device_printf(sc->dev, "Failed to set promisc mode\n");
/*
 * Sync the interface's multicast list to the firmware filter:
 * temporarily ALLMULTI, flush all groups, join each AF_LINK address,
 * then re-enable filtering.  Stays in ALLMULTI when requested, when
 * the firmware lacks multicast support, or on the adopted-fw rx bug.
 */
1122 mxge_set_multicast_list(mxge_softc_t *sc)
1125 struct ifmultiaddr *ifma;
1126 struct ifnet *ifp = sc->ifp;
1129 if (ifp->if_serializer)
1130 ASSERT_SERIALIZED(ifp->if_serializer);
1132 /* This firmware is known to not support multicast */
1133 if (!sc->fw_multicast_support)
1136 /* Disable multicast filtering while we play with the lists*/
1137 err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1139 device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
1140 " error status: %d\n", err);
/* adopted 1.4.4-1.4.11 firmware must stay in ALLMULTI (rx filter bug) */
1144 if (sc->adopted_rx_filter_bug)
1147 if (ifp->if_flags & IFF_ALLMULTI)
1148 /* request to disable multicast filtering, so quit here */
1151 /* Flush all the filters */
1153 err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1155 device_printf(sc->dev,
1156 "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
1157 ", error status: %d\n", err);
1161 /* Walk the multicast list, and add each address */
1163 TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1164 if (ifma->ifma_addr->sa_family != AF_LINK)
/* split the 6-byte link-layer address across data0/data1 */
1166 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1168 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
1170 cmd.data0 = htonl(cmd.data0);
1171 cmd.data1 = htonl(cmd.data1);
1172 err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1174 device_printf(sc->dev, "Failed "
1175 "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
1177 /* abort, leaving multicast filtering off */
1181 /* Enable multicast filtering */
1182 err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1184 device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
1185 ", error status: %d\n", err);
/*
 * Determine the largest MTU this NIC/firmware combination supports.
 * If page-sized jumbo clusters already cover the firmware max, use it;
 * otherwise ask the firmware whether it can use virtually contiguous
 * buffers, falling back to MJUMPAGESIZE-limited MTU if not.
 */
1190 mxge_max_mtu(mxge_softc_t *sc)
1195 if (MJUMPAGESIZE - MXGEFW_PAD > MXGEFW_MAX_MTU)
1196 return MXGEFW_MAX_MTU - MXGEFW_PAD;
1198 /* try to set nbufs to see if it we can
1199 use virtually contiguous jumbos */
1201 status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1204 return MXGEFW_MAX_MTU - MXGEFW_PAD;
1206 /* otherwise, we're limited to MJUMPAGESIZE */
1207 return MJUMPAGESIZE - MXGEFW_PAD;
/*
 * Reset the NIC and re-establish all firmware state: issue CMD_RESET,
 * restart the dummy RDMA, size the interrupt queue, configure RSS
 * slices, (optionally) re-exchange interrupt DMA addresses, fetch the
 * coalescing/ack/deassert register offsets, run the DMA benchmark,
 * zero all per-slice shared counters, and finally restore the MAC
 * address, promiscuity, pause, and multicast settings.
 */
1211 mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1213 struct mxge_slice_state *ss;
1214 mxge_rx_done_t *rx_done;
1215 volatile uint32_t *irq_claim;
1219 /* try to send a reset command to the card to see if it
1221 memset(&cmd, 0, sizeof (cmd));
1222 status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1224 device_printf(sc->dev, "failed reset\n");
/* re-enable the periodic dummy RDMA after reset */
1228 mxge_dummy_rdma(sc, 1);
1231 /* set the intrq size */
1232 cmd.data0 = sc->rx_ring_size;
1233 status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1236 * Even though we already know how many slices are supported
1237 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
1238 * has magic side effects, and must be called after a reset.
1239 * It must be called prior to calling any RSS related cmds,
1240 * including assigning an interrupt queue for anything but
1241 * slice 0. It must also be called *after*
1242 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1243 * the firmware to compute offsets.
1246 if (sc->num_slices > 1) {
1247 /* ask the maximum number of slices it supports */
1248 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
1251 device_printf(sc->dev,
1252 "failed to get number of slices\n");
1256 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1257 * to setting up the interrupt queue DMA
1259 cmd.data0 = sc->num_slices;
1260 cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
1261 #ifdef IFNET_BUF_RING
1262 cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1264 status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
1267 device_printf(sc->dev,
1268 "failed to set number of slices\n");
1274 if (interrupts_setup) {
1275 /* Now exchange information about interrupts */
1276 for (slice = 0; slice < sc->num_slices; slice++) {
1277 rx_done = &sc->ss[slice].rx_done;
1278 memset(rx_done->entry, 0, sc->rx_ring_size);
1279 cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
1280 cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
1282 status |= mxge_send_cmd(sc,
1283 MXGEFW_CMD_SET_INTRQ_DMA,
/* look up the SRAM offsets of the interrupt control registers */
1288 status |= mxge_send_cmd(sc,
1289 MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
1292 sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1294 status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1295 irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1298 status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
1300 sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1302 device_printf(sc->dev, "failed set interrupt parameters\n");
/* program the current coalescing delay into the NIC */
1307 *sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1310 /* run a DMA benchmark */
1311 (void) mxge_dma_test(sc, MXGEFW_DMA_TEST);
1313 for (slice = 0; slice < sc->num_slices; slice++) {
1314 ss = &sc->ss[slice];
/* each slice's irq claim register is 2 words apart */
1316 ss->irq_claim = irq_claim + (2 * slice);
1317 /* reset mcp/driver shared state back to 0 */
1318 ss->rx_done.idx = 0;
1319 ss->rx_done.cnt = 0;
1322 ss->tx.pkt_done = 0;
1323 ss->tx.queue_active = 0;
1324 ss->tx.activate = 0;
1325 ss->tx.deactivate = 0;
1330 ss->rx_small.cnt = 0;
1331 ss->lro_bad_csum = 0;
1333 ss->lro_flushed = 0;
1334 if (ss->fw_stats != NULL) {
1335 ss->fw_stats->valid = 0;
1336 ss->fw_stats->send_done_count = 0;
1339 sc->rdma_tags_available = 15;
/* restore addressing/filtering state lost across the reset */
1340 status = mxge_update_mac_address(sc);
1341 mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
1342 mxge_change_pause(sc, sc->pause);
1343 mxge_set_multicast_list(sc);
/*
 * Sysctl handler for the interrupt coalescing delay: validate the new
 * value (1..1,000,000 us), then write it to the NIC register under the
 * interface serializer.
 */
1348 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1351 unsigned int intr_coal_delay;
1355 intr_coal_delay = sc->intr_coal_delay;
1356 err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
/* no-op if unchanged */
1360 if (intr_coal_delay == sc->intr_coal_delay)
/* reject zero and anything over one second */
1363 if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1366 lwkt_serialize_enter(sc->ifp->if_serializer);
1367 *sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1368 sc->intr_coal_delay = intr_coal_delay;
1370 lwkt_serialize_exit(sc->ifp->if_serializer);
/*
 * Sysctl handler for flow control: forward the new on/off setting to
 * the firmware via mxge_change_pause(), under the serializer.
 */
1375 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1378 unsigned int enabled;
1382 enabled = sc->pause;
1383 err = sysctl_handle_int(oidp, &enabled, arg2, req);
/* no-op if unchanged */
1387 if (enabled == sc->pause)
1390 lwkt_serialize_enter(sc->ifp->if_serializer);
1391 err = mxge_change_pause(sc, enabled);
1392 lwkt_serialize_exit(sc->ifp->if_serializer);
/*
 * Apply a new LRO count with the serializer already held: toggle
 * IFCAP_LRO accordingly and, if the interface is running, restart it
 * (close not visible in this chunk, then reopen) to take effect.
 */
1397 mxge_change_lro_locked(mxge_softc_t *sc, int lro_cnt)
1404 ifp->if_capenable &= ~IFCAP_LRO;
1406 ifp->if_capenable |= IFCAP_LRO;
1407 sc->lro_cnt = lro_cnt;
1408 if (ifp->if_flags & IFF_RUNNING) {
1410 err = mxge_open(sc);
/*
 * Sysctl handler for the LRO count: validate and hand off to
 * mxge_change_lro_locked() under the serializer.
 */
1416 mxge_change_lro(SYSCTL_HANDLER_ARGS)
1419 unsigned int lro_cnt;
1423 lro_cnt = sc->lro_cnt;
1424 err = sysctl_handle_int(oidp, &lro_cnt, arg2, req);
/* no-op if unchanged */
1428 if (lro_cnt == sc->lro_cnt)
1434 lwkt_serialize_enter(sc->ifp->if_serializer);
1435 err = mxge_change_lro_locked(sc, lro_cnt);
1436 lwkt_serialize_exit(sc->ifp->if_serializer);
/*
 * Sysctl helper for big-endian 32-bit firmware counters: convert to
 * host order before handing the value to sysctl_handle_int().
 */
1441 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1447 arg2 = be32toh(*(int *)arg1);
1449 err = sysctl_handle_int(oidp, arg1, arg2, req);
/*
 * Tear down all sysctl state: per-slice contexts first, then the
 * slice parent context, then the device-level context.  Safe to call
 * when the trees were never created (early NULL check).
 */
1455 mxge_rem_sysctls(mxge_softc_t *sc)
1457 struct mxge_slice_state *ss;
/* nothing to do if the slice tree was never set up */
1460 if (sc->slice_sysctl_tree == NULL)
1463 for (slice = 0; slice < sc->num_slices; slice++) {
1464 ss = &sc->ss[slice];
1465 if (ss == NULL || ss->sysctl_tree == NULL)
1467 sysctl_ctx_free(&ss->sysctl_ctx);
1468 ss->sysctl_tree = NULL;
1470 sysctl_ctx_free(&sc->slice_sysctl_ctx);
1471 sc->slice_sysctl_tree = NULL;
1472 sysctl_ctx_free(&sc->sysctl_ctx);
1473 sc->sysctl_tree = NULL;
/*
 * Create the driver's sysctl tree under hw.<nameunit>:
 *  - static device info (firmware version, serial number, link width,
 *    DMA benchmark results, ...)
 *  - tunables (interrupt coalescing, flow control, deassert wait, LRO)
 *  - firmware statistics counters (exported via mxge_handle_be32
 *    because the stats block is big-endian)
 *  - a per-slice subtree of rx/tx debugging counters.
 * NOTE(review): excerpt is non-contiguous; OID name strings, returns
 * and some argument lines are elided throughout.
 */
1478 mxge_add_sysctls(mxge_softc_t *sc)
1480 struct sysctl_ctx_list *ctx;
1481 struct sysctl_oid_list *children;
1483 struct mxge_slice_state *ss;
1487 ctx = &sc->sysctl_ctx;
1488 sysctl_ctx_init(ctx);
1489 sc->sysctl_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_STATIC_CHILDREN(_hw),
1491 device_get_nameunit(sc->dev),
1493 if (sc->sysctl_tree == NULL) {
1494 device_printf(sc->dev, "can't add sysctl node\n");
1498 children = SYSCTL_CHILDREN(sc->sysctl_tree);
/* firmware stats live in slice 0's DMA-visible stats block */
1499 fw = sc->ss[0].fw_stats;
1501 /* random information */
1502 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1504 CTLFLAG_RD, &sc->fw_version,
1505 0, "firmware version");
1506 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1508 CTLFLAG_RD, &sc->serial_number_string,
1509 0, "serial number");
1510 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1512 CTLFLAG_RD, &sc->product_code_string,
1514 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1516 CTLFLAG_RD, &sc->link_width,
1518 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1520 CTLFLAG_RD, &sc->tx_boundary,
1522 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1524 CTLFLAG_RD, &sc->wc,
1525 0, "write combining PIO?");
1526 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1528 CTLFLAG_RD, &sc->read_dma,
1529 0, "DMA Read speed in MB/s");
1530 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1532 CTLFLAG_RD, &sc->write_dma,
1533 0, "DMA Write speed in MB/s");
1534 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1535 "read_write_dma_MBs",
1536 CTLFLAG_RD, &sc->read_write_dma,
1537 0, "DMA concurrent Read/Write speed in MB/s");
1540 /* performance related tunables */
1541 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1543 CTLTYPE_INT|CTLFLAG_RW, sc,
1544 0, mxge_change_intr_coal,
1545 "I", "interrupt coalescing delay in usecs");
/*
 * NOTE(review): the description string below looks copy-pasted from
 * the intr-coal entry; "flow control enabled" would be accurate.
 * Left unchanged here (runtime string).
 */
1547 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1548 "flow_control_enabled",
1549 CTLTYPE_INT|CTLFLAG_RW, sc,
1550 0, mxge_change_flow_control,
1551 "I", "interrupt coalescing delay in usecs");
1553 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1555 CTLFLAG_RW, &mxge_deassert_wait,
1556 0, "Wait for IRQ line to go low in ihandler");
1558 /* stats block from firmware is in network byte order.
1560 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1562 CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1563 0, mxge_handle_be32,
1565 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1566 "rdma_tags_available",
1567 CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1568 0, mxge_handle_be32,
1569 "I", "rdma_tags_available");
1570 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1571 "dropped_bad_crc32",
1572 CTLTYPE_INT|CTLFLAG_RD,
1573 &fw->dropped_bad_crc32,
1574 0, mxge_handle_be32,
1575 "I", "dropped_bad_crc32");
1576 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1578 CTLTYPE_INT|CTLFLAG_RD,
1579 &fw->dropped_bad_phy,
1580 0, mxge_handle_be32,
1581 "I", "dropped_bad_phy");
1582 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1583 "dropped_link_error_or_filtered",
1584 CTLTYPE_INT|CTLFLAG_RD,
1585 &fw->dropped_link_error_or_filtered,
1586 0, mxge_handle_be32,
1587 "I", "dropped_link_error_or_filtered");
1588 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1589 "dropped_link_overflow",
1590 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1591 0, mxge_handle_be32,
1592 "I", "dropped_link_overflow");
1593 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1594 "dropped_multicast_filtered",
1595 CTLTYPE_INT|CTLFLAG_RD,
1596 &fw->dropped_multicast_filtered,
1597 0, mxge_handle_be32,
1598 "I", "dropped_multicast_filtered");
1599 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1600 "dropped_no_big_buffer",
1601 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1602 0, mxge_handle_be32,
1603 "I", "dropped_no_big_buffer");
1604 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1605 "dropped_no_small_buffer",
1606 CTLTYPE_INT|CTLFLAG_RD,
1607 &fw->dropped_no_small_buffer,
1608 0, mxge_handle_be32,
1609 "I", "dropped_no_small_buffer");
1610 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1612 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1613 0, mxge_handle_be32,
1614 "I", "dropped_overrun");
1615 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1617 CTLTYPE_INT|CTLFLAG_RD,
1619 0, mxge_handle_be32,
1620 "I", "dropped_pause");
1621 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1623 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1624 0, mxge_handle_be32,
1625 "I", "dropped_runt");
1627 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1628 "dropped_unicast_filtered",
1629 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
1630 0, mxge_handle_be32,
1631 "I", "dropped_unicast_filtered");
1633 /* verbose printing? */
1634 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1636 CTLFLAG_RW, &mxge_verbose,
1637 0, "verbose printing");
1640 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1642 CTLTYPE_INT|CTLFLAG_RW, sc,
1644 "I", "number of lro merge queues");
1647 /* add counters exported for debugging from all slices */
1648 sysctl_ctx_init(&sc->slice_sysctl_ctx);
1649 sc->slice_sysctl_tree =
1650 SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1651 "slice", CTLFLAG_RD, 0, "");
/* one numbered child node ("0", "1", ...) per slice */
1653 for (slice = 0; slice < sc->num_slices; slice++) {
1654 ss = &sc->ss[slice];
1655 sysctl_ctx_init(&ss->sysctl_ctx);
1656 ctx = &ss->sysctl_ctx;
1657 children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1658 ksprintf(slice_num, "%d", slice);
1660 SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1662 children = SYSCTL_CHILDREN(ss->sysctl_tree);
1663 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1665 CTLFLAG_RD, &ss->rx_small.cnt,
1667 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1669 CTLFLAG_RD, &ss->rx_big.cnt,
1671 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1672 "lro_flushed", CTLFLAG_RD, &ss->lro_flushed,
1673 0, "number of lro merge queues flushed");
1675 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1676 "lro_queued", CTLFLAG_RD, &ss->lro_queued,
1677 0, "number of frames appended to lro merge"
/* without IFNET_BUF_RING, tx counters exist only on slice 0 */
1680 #ifndef IFNET_BUF_RING
1681 /* only transmit from slice 0 for now */
1685 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1687 CTLFLAG_RD, &ss->tx.req,
1690 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1692 CTLFLAG_RD, &ss->tx.done,
1694 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1696 CTLFLAG_RD, &ss->tx.pkt_done,
1698 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1700 CTLFLAG_RD, &ss->tx.stall,
1702 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1704 CTLFLAG_RD, &ss->tx.wake,
1706 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1708 CTLFLAG_RD, &ss->tx.defrag,
1710 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1712 CTLFLAG_RD, &ss->tx.queue_active,
1713 0, "tx_queue_active");
1714 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1716 CTLFLAG_RD, &ss->tx.activate,
1718 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1720 CTLFLAG_RD, &ss->tx.deactivate,
1721 0, "tx_deactivate");
1725 /* copy an array of mcp_kreq_ether_send_t's to the mcp. Copy
1726 backwards one at a time and handle ring wraps */
/*
 * Fallback submit path: writes send requests to the NIC one entry at
 * a time, from the last entry back toward the first, masking the
 * index so ring wrap-around is handled naturally.
 * NOTE(review): excerpt is non-contiguous; the loop decrement of
 * cnt and the enclosing loop header are elided.
 */
1729 mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1730 mcp_kreq_ether_send_t *src, int cnt)
1732 int idx, starting_slot;
1733 starting_slot = tx->req;
/* mask keeps the slot inside the ring on wrap */
1736 idx = (starting_slot + cnt) & tx->mask;
1737 mxge_pio_copy(&tx->lanai[idx],
1738 &src[cnt], sizeof(*src));
1744 * copy an array of mcp_kreq_ether_send_t's to the mcp. Copy
1745 * at most 32 bytes at a time, so as to avoid involving the software
1746 * pio handler in the nic. We re-write the first segment's flags
1747 * to mark them valid only after writing the entire chain
/*
 * Fast-path submit: PIO-copies the request chain to the NIC in
 * 32-byte (2-request) chunks when it fits without wrapping,
 * otherwise falls back to mxge_submit_req_backwards().  The first
 * request's flags are withheld and rewritten last so the firmware
 * only sees a valid chain once it is complete.
 * NOTE(review): excerpt is non-contiguous; pointer advances and the
 * tx->req update are elided.
 */
1751 mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
1756 volatile uint32_t *dst_ints;
1757 mcp_kreq_ether_send_t *srcp;
1758 volatile mcp_kreq_ether_send_t *dstp, *dst;
1761 idx = tx->req & tx->mask;
/* remember the real flags; they are written last to "arm" the chain */
1763 last_flags = src->flags;
1766 dst = dstp = &tx->lanai[idx];
/* fast path: whole chain fits without wrapping the ring */
1769 if ((idx + cnt) < tx->mask) {
1770 for (i = 0; i < (cnt - 1); i += 2) {
1771 mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1772 wmb(); /* force write every 32 bytes */
1777 /* submit all but the first request, and ensure
1778 that it is submitted below */
1779 mxge_submit_req_backwards(tx, src, cnt);
1783 /* submit the first request */
1784 mxge_pio_copy(dstp, srcp, sizeof(*src));
1785 wmb(); /* barrier before setting valid flag */
1788 /* re-write the last 32-bits with the valid flags */
1789 src->flags = last_flags;
1790 src_ints = (uint32_t *)src;
1792 dst_ints = (volatile uint32_t *)dst;
1794 *dst_ints = *src_ints;
/*
 * Build and submit the NIC send-request chain for a TSO packet.
 * Walks the busdma segment list, breaking segments at MSS
 * boundaries ("cuts") and maintaining the per-request rdma_count
 * bookkeeping the firmware needs.  Negative cum_len means we are
 * still inside the protocol headers.
 * NOTE(review): excerpt is non-contiguous; loop headers, several
 * assignments and the error-unwind path are partially elided.
 */
1802 mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1803 int busdma_seg_cnt, int ip_off)
1806 mcp_kreq_ether_send_t *req;
1807 bus_dma_segment_t *seg;
1810 uint32_t low, high_swapped;
1811 int len, seglen, cum_len, cum_len_next;
1812 int next_is_first, chop, cnt, rdma_count, small;
1813 uint16_t pseudo_hdr_offset, cksum_offset, mss;
1814 uint8_t flags, flags_next;
1817 mss = m->m_pkthdr.tso_segsz;
1819 /* negative cum_len signifies to the
1820 * send loop that we are still in the
1821 * header portion of the TSO packet.
1824 /* ensure we have the ethernet, IP and TCP
1825 header together in the first mbuf, copy
1826 it to a scratch buffer if not */
1827 if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
1828 m_copydata(m, 0, ip_off + sizeof (*ip),
1830 ip = (struct ip *)(ss->scratch + ip_off);
1832 ip = (struct ip *)(mtod(m, char *) + ip_off);
1834 if (__predict_false(m->m_len < ip_off + (ip->ip_hl << 2)
1836 m_copydata(m, 0, ip_off + (ip->ip_hl << 2)
1837 + sizeof (*tcp), ss->scratch);
1838 ip = (struct ip *)(mtod(m, char *) + ip_off);
1841 tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
/* header length (eth + ip + tcp) as a negative starting offset */
1842 cum_len = -(ip_off + ((ip->ip_hl + tcp->th_off) << 2));
1844 /* TSO implies checksum offload on this hardware */
1845 cksum_offset = ip_off + (ip->ip_hl << 2);
1846 flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1849 /* for TSO, pseudo_hdr_offset holds mss.
1850 * The firmware figures out where to put
1851 * the checksum by parsing the header. */
1852 pseudo_hdr_offset = htobe16(mss);
1859 /* "rdma_count" is the number of RDMAs belonging to the
1860 * current packet BEFORE the current send request. For
1861 * non-TSO packets, this is equal to "count".
1862 * For TSO packets, rdma_count needs to be reset
1863 * to 0 after a segment cut.
1865 * The rdma_count field of the send request is
1866 * the number of RDMAs of the packet starting at
1867 * that request. For TSO send requests with one ore more cuts
1868 * in the middle, this is the number of RDMAs starting
1869 * after the last cut in the request. All previous
1870 * segments before the last cut implicitly have 1 RDMA.
1872 * Since the number of RDMAs is not known beforehand,
1873 * it must be filled-in retroactively - after each
1874 * segmentation cut or at the end of the entire packet.
1877 while (busdma_seg_cnt) {
1878 /* Break the busdma segment up into pieces*/
1879 low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1880 high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1884 flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1886 cum_len_next = cum_len + seglen;
/* retroactively patch rdma_count into the request before the cut */
1887 (req-rdma_count)->rdma_count = rdma_count + 1;
1888 if (__predict_true(cum_len >= 0)) {
/* payload portion: chop when this piece crosses an MSS boundary */
1890 chop = (cum_len_next > mss);
1891 cum_len_next = cum_len_next % mss;
1892 next_is_first = (cum_len_next == 0);
1893 flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1894 flags_next |= next_is_first *
1896 rdma_count |= -(chop | next_is_first);
1897 rdma_count += chop & !next_is_first;
1898 } else if (cum_len_next >= 0) {
/* header/payload boundary falls inside this segment */
1903 small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1904 flags_next = MXGEFW_FLAGS_TSO_PLD |
1905 MXGEFW_FLAGS_FIRST |
1906 (small * MXGEFW_FLAGS_SMALL);
/* fill in the current send request */
1909 req->addr_high = high_swapped;
1910 req->addr_low = htobe32(low);
1911 req->pseudo_hdr_offset = pseudo_hdr_offset;
1913 req->rdma_count = 1;
1914 req->length = htobe16(seglen);
1915 req->cksum_offset = cksum_offset;
1916 req->flags = flags | ((cum_len & 1) *
1917 MXGEFW_FLAGS_ALIGN_ODD);
1920 cum_len = cum_len_next;
1925 if (__predict_false(cksum_offset > seglen))
1926 cksum_offset -= seglen;
/* chain longer than the ring's per-packet limit -- bail to drop path */
1929 if (__predict_false(cnt > tx->max_desc))
/* final retroactive rdma_count fix-up for the tail of the chain */
1935 (req-rdma_count)->rdma_count = rdma_count;
/* mark trailing requests back to the last cut as TSO_LAST */
1939 req->flags |= MXGEFW_FLAGS_TSO_LAST;
1940 } while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1942 tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1943 mxge_submit_req(tx, tx->req_list, cnt);
1944 #ifdef IFNET_BUF_RING
1945 if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1946 /* tell the NIC to start polling this slice */
1948 tx->queue_active = 1;
/* error path: unload the DMA map and drop the packet */
1956 bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1960 kprintf("tx->max_desc exceeded via TSO!\n");
1961 kprintf("mss = %d, %ld, %d!\n", mss,
1962 (long)seg - (long)tx->seg_list, tx->max_desc);
1969 #endif /* IFCAP_TSO4 */
1971 #ifdef MXGE_NEW_VLAN_API
1973 * We reproduce the software vlan tag insertion from
1974 * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
1975 * vlan tag insertion. We need to advertise this in order to have the
1976 * vlan interface respect our csum offload flags.
/*
 * Prepend an 802.1Q header to *m using the tag stored in
 * m_pkthdr.ether_vlantag, clearing M_VLANTAG afterwards.  Returns
 * NULL (freeing implied by M_PREPEND/m_pullup semantics) on
 * allocation failure.
 * NOTE(review): excerpt is non-contiguous; return statements elided.
 */
1978 static struct mbuf *
1979 mxge_vlan_tag_insert(struct mbuf *m)
1981 struct ether_vlan_header *evl;
1983 M_PREPEND(m, EVL_ENCAPLEN, MB_DONTWAIT);
1984 if (__predict_false(m == NULL))
/* make sure the whole vlan header is contiguous */
1986 if (m->m_len < sizeof(*evl)) {
1987 m = m_pullup(m, sizeof(*evl));
1988 if (__predict_false(m == NULL))
1992 * Transform the Ethernet header into an Ethernet header
1993 * with 802.1Q encapsulation.
1995 evl = mtod(m, struct ether_vlan_header *);
/* slide the MAC addresses down over the newly prepended space */
1996 bcopy((char *)evl + EVL_ENCAPLEN,
1997 (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
1998 evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
1999 evl->evl_tag = htons(m->m_pkthdr.ether_vlantag);
2000 m->m_flags &= ~M_VLANTAG;
2003 #endif /* MXGE_NEW_VLAN_API */
/*
 * Map an outgoing mbuf for DMA and build its send-request chain:
 * software vlan insertion if tagged, defrag retry on EFBIG, TSO
 * hand-off, optional TCP/UDP checksum offload setup, one request per
 * DMA segment, runt padding to 60 bytes from the zeropad buffer,
 * then submission via mxge_submit_req().
 * NOTE(review): excerpt is non-contiguous; returns, counters and
 * some loop bookkeeping are elided.
 */
2006 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
2009 mcp_kreq_ether_send_t *req;
2010 bus_dma_segment_t *seg;
2015 int cnt, cum_len, err, i, idx, odd_flag, ip_off;
2016 uint16_t pseudo_hdr_offset;
2017 uint8_t flags, cksum_offset;
2024 ip_off = sizeof (struct ether_header);
2025 #ifdef MXGE_NEW_VLAN_API
/* software vlan tag insertion shifts the IP header by EVL_ENCAPLEN */
2026 if (m->m_flags & M_VLANTAG) {
2027 m = mxge_vlan_tag_insert(m);
2028 if (__predict_false(m == NULL))
2030 ip_off += EVL_ENCAPLEN;
2033 /* (try to) map the frame for DMA */
2034 idx = tx->req & tx->mask;
2035 err = bus_dmamap_load_mbuf_segment(tx->dmat, tx->info[idx].map,
2036 m, tx->seg_list, 1, &cnt,
2038 if (__predict_false(err == EFBIG)) {
2039 /* Too many segments in the chain. Try
2041 m_tmp = m_defrag(m, MB_DONTWAIT);
2042 if (m_tmp == NULL) {
/* retry the DMA load on the defragmented chain */
2047 err = bus_dmamap_load_mbuf_segment(tx->dmat,
2049 m, tx->seg_list, 1, &cnt,
2052 if (__predict_false(err != 0)) {
2053 device_printf(sc->dev, "bus_dmamap_load_mbuf_segment returned %d"
2054 " packet len = %d\n", err, m->m_pkthdr.len);
2057 bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2058 BUS_DMASYNC_PREWRITE);
2059 tx->info[idx].m = m;
2062 /* TSO is different enough, we handle it in another routine */
2063 if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2064 mxge_encap_tso(ss, m, cnt, ip_off);
2071 pseudo_hdr_offset = 0;
2072 flags = MXGEFW_FLAGS_NO_TSO;
2074 /* checksum offloading? */
2075 if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
2076 /* ensure ip header is in first mbuf, copy
2077 it to a scratch buffer if not */
2078 if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
2079 m_copydata(m, 0, ip_off + sizeof (*ip),
2081 ip = (struct ip *)(ss->scratch + ip_off);
2083 ip = (struct ip *)(mtod(m, char *) + ip_off);
2085 cksum_offset = ip_off + (ip->ip_hl << 2);
2086 pseudo_hdr_offset = cksum_offset + m->m_pkthdr.csum_data;
2087 pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2088 req->cksum_offset = cksum_offset;
2089 flags |= MXGEFW_FLAGS_CKSUM;
2090 odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2094 if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2095 flags |= MXGEFW_FLAGS_SMALL;
2097 /* convert segments into a request list */
2100 req->flags = MXGEFW_FLAGS_FIRST;
2101 for (i = 0; i < cnt; i++) {
2103 htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2105 htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2106 req->length = htobe16(seg->ds_len);
2107 req->cksum_offset = cksum_offset;
/* cksum_offset only applies to the segment containing the header */
2108 if (cksum_offset > seg->ds_len)
2109 cksum_offset -= seg->ds_len;
2112 req->pseudo_hdr_offset = pseudo_hdr_offset;
2113 req->pad = 0; /* complete solid 16-byte block */
2114 req->rdma_count = 1;
2115 req->flags |= flags | ((cum_len & 1) * odd_flag);
2116 cum_len += seg->ds_len;
2122 /* pad runts to 60 bytes */
/* extra descriptor pointing at the shared all-zero pad buffer */
2126 htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2128 htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2129 req->length = htobe16(60 - cum_len);
2130 req->cksum_offset = 0;
2131 req->pseudo_hdr_offset = pseudo_hdr_offset;
2132 req->pad = 0; /* complete solid 16-byte block */
2133 req->rdma_count = 1;
2134 req->flags |= flags | ((cum_len & 1) * odd_flag);
2138 tx->req_list[0].rdma_count = cnt;
2140 /* print what the firmware will see */
2141 for (i = 0; i < cnt; i++) {
2142 kprintf("%d: addr: 0x%x 0x%x len:%d pso%d,"
2143 "cso:%d, flags:0x%x, rdma:%d\n",
2144 i, (int)ntohl(tx->req_list[i].addr_high),
2145 (int)ntohl(tx->req_list[i].addr_low),
2146 (int)ntohs(tx->req_list[i].length),
2147 (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2148 tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2149 tx->req_list[i].rdma_count);
2151 kprintf("--------------\n");
2153 tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2154 mxge_submit_req(tx, tx->req_list, cnt);
2155 #ifdef IFNET_BUF_RING
2156 if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2157 /* tell the NIC to start polling this slice */
2159 tx->queue_active = 1;
2172 #ifdef IFNET_BUF_RING
/*
 * if_qflush implementation: drain and free every queued mbuf from
 * each slice's buf_ring, holding the ifnet serializer per slice.
 * NOTE(review): excerpt is non-contiguous; the m_freem() call for
 * each dequeued mbuf is elided.
 */
2174 mxge_qflush(struct ifnet *ifp)
2176 mxge_softc_t *sc = ifp->if_softc;
2181 for (slice = 0; slice < sc->num_slices; slice++) {
2182 tx = &sc->ss[slice].tx;
2183 lwkt_serialize_enter(sc->ifp->if_serializer);
2184 while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2186 lwkt_serialize_exit(sc->ifp->if_serializer);
/*
 * Buf-ring (IFNET_BUF_RING) transmit pump: dequeue packets from the
 * slice's drbr queue and hand them to the NIC while descriptor
 * space (tx->mask - outstanding > max_desc) remains; set the
 * per-slice OACTIVE flag if work is left but the ring filled up.
 * NOTE(review): excerpt is non-contiguous; BPF tap and mxge_encap
 * calls are elided behind their comments.
 */
2192 mxge_start_locked(struct mxge_slice_state *ss)
2203 while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2204 m = drbr_dequeue(ifp, tx->br);
2208 /* let BPF see it */
2211 /* give it to the nic */
2214 /* ran out of transmit slots */
2215 if (((ss->if_flags & IFF_OACTIVE) == 0)
2216 && (!drbr_empty(ifp, tx->br))) {
2217 ss->if_flags |= IFF_OACTIVE;
/*
 * Serialized per-slice transmit entry: if the slice is not
 * RUNNING-and-not-OACTIVE, just enqueue; if the drbr is empty and
 * descriptors are free, send the mbuf directly; otherwise enqueue
 * and kick mxge_start_locked() to drain.
 * NOTE(review): excerpt is non-contiguous; returns elided.
 */
2223 mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2234 if ((ss->if_flags & (IFF_RUNNING|IFF_OACTIVE)) !=
2236 err = drbr_enqueue(ifp, tx->br, m);
/* fast path: nothing queued and room in the ring -- send directly */
2240 if (drbr_empty(ifp, tx->br) &&
2241 ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2242 /* let BPF see it */
2244 /* give it to the nic */
2246 } else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2249 if (!drbr_empty(ifp, tx->br))
2250 mxge_start_locked(ss);
/*
 * Multi-queue if_transmit hook: pick the slice from the mbuf's
 * flowid (num_slices is a power of 2, so masking suffices), then
 * either transmit under the serializer if it can be acquired
 * without blocking, or fall back to enqueuing on the buf_ring.
 */
2255 mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2257 mxge_softc_t *sc = ifp->if_softc;
2258 struct mxge_slice_state *ss;
2264 slice = m->m_pkthdr.flowid;
2266 slice &= (sc->num_slices - 1); /* num_slices always power of 2 */
2268 ss = &sc->ss[slice];
/* non-blocking attempt; defer to the ring if someone holds it */
2271 if(lwkt_serialize_try(ifp->if_serializer)) {
2272 err = mxge_transmit_locked(ss, m);
2273 lwkt_serialize_exit(ifp->if_serializer);
2275 err = drbr_enqueue(ifp, tx->br, m);
/*
 * Classic ifq transmit pump (non-IFNET_BUF_RING build): dequeue
 * from ifp->if_snd while descriptor space remains; raise the
 * interface-wide IFF_OACTIVE when the ring fills.
 * NOTE(review): excerpt is non-contiguous; BPF tap and mxge_encap
 * calls are elided behind their comments.
 */
2284 mxge_start_locked(struct mxge_slice_state *ss)
2294 while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2295 m = ifq_dequeue(&ifp->if_snd, NULL);
2299 /* let BPF see it */
2302 /* give it to the nic */
2305 /* ran out of transmit slots */
2306 if ((sc->ifp->if_flags & IFF_OACTIVE) == 0) {
2307 sc->ifp->if_flags |= IFF_OACTIVE;
/*
 * if_start hook: asserts the serializer is held and pumps the
 * transmit queue.  Only slice 0 transmits in this configuration.
 */
2313 mxge_start(struct ifnet *ifp)
2315 mxge_softc_t *sc = ifp->if_softc;
2316 struct mxge_slice_state *ss;
2318 ASSERT_SERIALIZED(sc->ifp->if_serializer);
2319 /* only use the first slice for now */
2321 mxge_start_locked(ss);
2325 * copy an array of mcp_kreq_ether_recv_t's to the mcp. Copy
2326 * at most 32 bytes at a time, so as to avoid involving the software
2327 * pio handler in the nic. We re-write the first segment's low
2328 * DMA address to mark it valid only after we write the entire chunk
/*
 * Post 8 receive buffers to the NIC in two 32-byte PIO copies.
 * The first entry's addr_low is temporarily poisoned (0xffffffff)
 * so the firmware ignores the chunk until the real address is
 * written last, making the whole group visible atomically.
 * NOTE(review): excerpt is non-contiguous; the wmb() between the
 * copies and the final store is elided.
 */
2332 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2333 mcp_kreq_ether_recv_t *src)
2337 low = src->addr_low;
2338 src->addr_low = 0xffffffff;
2339 mxge_pio_copy(dst, src, 4 * sizeof (*src));
2341 mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
/* restore the real address and arm the chunk */
2343 src->addr_low = low;
2344 dst->addr_low = low;
/*
 * Allocate and DMA-map a small (MHLEN) receive mbuf for ring slot
 * idx, record its address in the shadow ring, and push the group to
 * the NIC via mxge_submit_8rx() every 8th slot.
 * NOTE(review): excerpt is non-contiguous; error returns elided.
 */
2349 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2351 bus_dma_segment_t seg;
2353 mxge_rx_ring_t *rx = &ss->rx_small;
2356 m = m_gethdr(MB_DONTWAIT, MT_DATA);
2362 m->m_len = m->m_pkthdr.len = MHLEN;
2363 err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2364 &seg, 1, &cnt, BUS_DMA_NOWAIT);
2366 kprintf("can't dmamap small (%d)\n", err);
2370 rx->info[idx].m = m;
/* shadow ring mirrors what will be PIO'd to the NIC, big-endian */
2371 rx->shadow[idx].addr_low =
2372 htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2373 rx->shadow[idx].addr_high =
2374 htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
/* post to the NIC in groups of 8 */
2378 mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
/*
 * Allocate and DMA-map a "big" receive buffer for ring slot idx:
 * a standard cluster, a jumbo cluster, or (MXGE_VIRT_JUMBOS) a
 * normal cluster split across multiple ring entries.  Fills the
 * shadow ring and posts each completed group of 8 to the NIC.
 * NOTE(review): excerpt is non-contiguous; error returns and the
 * idx increments inside the nbufs loop are elided.
 */
2384 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2386 bus_dma_segment_t seg[3];
2388 mxge_rx_ring_t *rx = &ss->rx_big;
2391 if (rx->cl_size == MCLBYTES)
2392 m = m_getcl(MB_DONTWAIT, MT_DATA, M_PKTHDR);
2395 m = m_getjcl(MB_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2398 * XXX: allocate normal sized buffers for big buffers.
2399 * We should be fine as long as we don't get any jumbo frames
2401 m = m_getcl(MB_DONTWAIT, MT_DATA, M_PKTHDR);
2409 m->m_pkthdr.len = 0;
2410 m->m_len = m->m_pkthdr.len = rx->mlen;
2411 err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2412 seg, 1, &cnt, BUS_DMA_NOWAIT);
2414 kprintf("can't dmamap big (%d)\n", err);
2418 rx->info[idx].m = m;
2419 rx->shadow[idx].addr_low =
2420 htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2421 rx->shadow[idx].addr_high =
2422 htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2424 #if MXGE_VIRT_JUMBOS
/* additional segments occupy consecutive shadow slots */
2425 for (i = 1; i < cnt; i++) {
2426 rx->shadow[idx + i].addr_low =
2427 htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2428 rx->shadow[idx + i].addr_high =
2429 htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
/* post to the NIC whenever a group of 8 slots completes */
2434 for (i = 0; i < rx->nbufs; i++) {
2435 if ((idx & 7) == 7) {
2436 mxge_submit_8rx(&rx->lanai[idx - 7],
2437 &rx->shadow[idx - 7]);
2445 * Myri10GE hardware checksums are not valid if the sender
2446 * padded the frame with non-zero padding. This is because
2447 * the firmware just does a simple 16-bit 1s complement
2448 * checksum across the entire frame, excluding the first 14
2449 * bytes. It is best to simply to check the checksum and
2450 * tell the stack about it only if the checksum is good
/*
 * Validate the firmware's partial checksum for an IPv4 TCP/UDP
 * frame by folding in the pseudo-header; returns 0 when the
 * checksum verifies.  Non-IPv4 / non-TCP/UDP frames are not handled
 * (the early-return value is in elided lines).
 */
2453 static inline uint16_t
2454 mxge_rx_csum(struct mbuf *m, int csum)
2456 struct ether_header *eh;
2460 eh = mtod(m, struct ether_header *);
2462 /* only deal with IPv4 TCP & UDP for now */
2463 if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2465 ip = (struct ip *)(eh + 1);
2466 if (__predict_false(ip->ip_p != IPPROTO_TCP &&
2467 ip->ip_p != IPPROTO_UDP))
/* add the pseudo-header to the firmware's raw payload checksum */
2470 c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2471 htonl(ntohs(csum) + ntohs(ip->ip_len) +
2472 - (ip->ip_hl << 2) + ip->ip_p));
/*
 * Strip an 802.1Q header from a received frame: adjust the
 * firmware-computed checksum to exclude the 4 encapsulation bytes
 * (one's-complement arithmetic on the 32-bit word at the tag
 * position), record the tag in the mbuf (new or old vlan API), then
 * remove the header bytes from the data.
 */
2481 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2483 struct ether_vlan_header *evl;
2484 struct ether_header *eh;
2487 evl = mtod(m, struct ether_vlan_header *);
2488 eh = mtod(m, struct ether_header *);
2491 * fix checksum by subtracting EVL_ENCAPLEN bytes
2492 * after what the firmware thought was the end of the ethernet
2496 /* put checksum into host byte order */
2497 *csum = ntohs(*csum);
2498 partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
/* one's-complement subtract of the vlan word, then fold carries */
2499 (*csum) += ~partial;
2500 (*csum) += ((*csum) < ~partial);
2501 (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2502 (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2504 /* restore checksum to network byte order;
2505 later consumers expect this */
2506 *csum = htons(*csum);
2509 #ifdef MXGE_NEW_VLAN_API
2510 m->m_pkthdr.ether_vlantag = ntohs(evl->evl_tag);
/* legacy vlan API path: attach the tag as an mbuf tag instead */
2514 mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2518 VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2519 m_tag_prepend(m, mtag);
2523 m->m_flags |= M_VLANTAG;
2526 * Remove the 802.1q header by copying the Ethernet
2527 * addresses over it and adjusting the beginning of
2528 * the data in the mbuf. The encapsulated Ethernet
2529 * type field is already in place.
2531 bcopy((char *)evl, (char *)evl + EVL_ENCAPLEN,
2532 ETHER_HDR_LEN - ETHER_TYPE_LEN);
2533 m_adj(m, EVL_ENCAPLEN);
/*
 * Handle one received frame from the "big" ring: recycle the ring
 * slot by allocating a replacement buffer first (dropping the frame
 * if allocation fails), unmap the received buffer, strip vlan
 * encapsulation, verify/record the hardware checksum, try LRO, and
 * finally hand the mbuf to the stack via ether_input_chain().
 * NOTE(review): excerpt is non-contiguous; drop-path statistics and
 * returns are elided.
 */
2538 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum,
2539 struct mbuf_chain *chain)
2544 struct ether_header *eh;
2546 bus_dmamap_t old_map;
2548 uint16_t tcpudp_csum;
2553 idx = rx->cnt & rx->mask;
2554 rx->cnt += rx->nbufs;
2555 /* save a pointer to the received mbuf */
2556 m = rx->info[idx].m;
2557 /* try to replace the received mbuf */
2558 if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2559 /* drop the frame -- the old mbuf is re-cycled */
2564 /* unmap the received buffer */
2565 old_map = rx->info[idx].map;
2566 bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2567 bus_dmamap_unload(rx->dmat, old_map);
2569 /* swap the bus_dmamap_t's */
2570 rx->info[idx].map = rx->extra_map;
2571 rx->extra_map = old_map;
2573 /* mcp implicitly skips 1st 2 bytes so that packet is properly
2575 m->m_data += MXGEFW_PAD;
2577 m->m_pkthdr.rcvif = ifp;
2578 m->m_len = m->m_pkthdr.len = len;
2580 eh = mtod(m, struct ether_header *);
2581 if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2582 mxge_vlan_tag_remove(m, &csum);
2584 /* if the checksum is valid, mark it in the mbuf header */
2585 if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
/* LRO consumed the mbuf; nothing more to do for this frame */
2586 if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2588 /* otherwise, it was a UDP frame, or a TCP frame which
2589 we could not do LRO on. Tell the stack that the
2591 m->m_pkthdr.csum_data = 0xffff;
2592 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2595 /* flowid only valid if RSS hashing is enabled */
2596 if (sc->num_slices > 1) {
2597 m->m_pkthdr.flowid = (ss - sc->ss);
2598 m->m_flags |= M_FLOWID;
2601 ether_input_chain(ifp, m, NULL, chain);
/*
 * Handle one received frame from the "small" ring.  Mirrors
 * mxge_rx_done_big(): replace the buffer, unmap, strip vlan,
 * validate checksum, try LRO, hand to the stack.
 * NOTE(review): excerpt is non-contiguous; drop-path statistics and
 * returns are elided.
 */
2605 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum,
2606 struct mbuf_chain *chain)
2610 struct ether_header *eh;
2613 bus_dmamap_t old_map;
2615 uint16_t tcpudp_csum;
2620 idx = rx->cnt & rx->mask;
2622 /* save a pointer to the received mbuf */
2623 m = rx->info[idx].m;
2624 /* try to replace the received mbuf */
2625 if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2626 /* drop the frame -- the old mbuf is re-cycled */
2631 /* unmap the received buffer */
2632 old_map = rx->info[idx].map;
2633 bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2634 bus_dmamap_unload(rx->dmat, old_map);
2636 /* swap the bus_dmamap_t's */
2637 rx->info[idx].map = rx->extra_map;
2638 rx->extra_map = old_map;
2640 /* mcp implicitly skips 1st 2 bytes so that packet is properly
2642 m->m_data += MXGEFW_PAD;
2644 m->m_pkthdr.rcvif = ifp;
2645 m->m_len = m->m_pkthdr.len = len;
2647 eh = mtod(m, struct ether_header *);
2648 if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2649 mxge_vlan_tag_remove(m, &csum);
2651 /* if the checksum is valid, mark it in the mbuf header */
2652 if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
/* LRO consumed the mbuf; nothing more to do for this frame */
2653 if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2655 /* otherwise, it was a UDP frame, or a TCP frame which
2656 we could not do LRO on. Tell the stack that the
2658 m->m_pkthdr.csum_data = 0xffff;
2659 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2662 /* flowid only valid if RSS hashing is enabled */
2663 if (sc->num_slices > 1) {
2664 m->m_pkthdr.flowid = (ss - sc->ss);
2665 m->m_flags |= M_FLOWID;
2668 ether_input_chain(ifp, m, NULL, chain);
2674 * Inlining the call to this function causes mxge_intr() to grow too large
2675 * for GCC's stack size limits (which shouldn't take into account inlining
2676 * of leaf functions at one call site anyway). Inlining is definitely a
2677 * good idea in this case though, so mark the function appropriately.
/*
 * Drain the slice's rx completion ring: dispatch each completed
 * frame to the small or big handler (by whether it fits a small
 * MHLEN buffer), with a livelock limit of half the ring, then flush
 * the collected input chain and any active LRO sessions.
 */
2679 static __always_inline void
2680 mxge_clean_rx_done(struct mxge_slice_state *ss)
2682 mxge_rx_done_t *rx_done = &ss->rx_done;
2686 struct mbuf_chain chain[MAXCPU];
2688 ether_input_chain_init(chain);
/* a zero length entry marks the end of completed frames */
2689 while (rx_done->entry[rx_done->idx].length != 0) {
2690 length = ntohs(rx_done->entry[rx_done->idx].length);
2691 rx_done->entry[rx_done->idx].length = 0;
2692 checksum = rx_done->entry[rx_done->idx].checksum;
2693 if (length <= (MHLEN - MXGEFW_PAD))
2694 mxge_rx_done_small(ss, length, checksum, chain);
2696 mxge_rx_done_big(ss, length, checksum, chain);
2698 rx_done->idx = rx_done->cnt & rx_done->mask;
2700 /* limit potential for livelock */
2701 if (__predict_false(++limit > rx_done->mask / 2))
2704 ether_input_dispatch(chain);
/* flush any LRO sessions still holding merged segments */
2706 while (!SLIST_EMPTY(&ss->lro_active)) {
2707 struct lro_entry *lro = SLIST_FIRST(&ss->lro_active);
2708 SLIST_REMOVE_HEAD(&ss->lro_active, next);
2709 mxge_lro_flush(ss, lro);
/*
 * Reclaim completed transmit descriptors up to the firmware's
 * completion index mcp_idx: free mbufs, unload DMA maps, update
 * byte/multicast counters, then clear OACTIVE (per-slice or
 * interface-wide depending on IFNET_BUF_RING) and restart the queue
 * once at least a quarter of the ring is free.  Must be called with
 * the ifnet serializer held (asserted).
 * NOTE(review): excerpt is non-contiguous; counter increments and
 * the tx->done advance are elided.
 */
2716 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2727 ASSERT_SERIALIZED(ifp->if_serializer);
2728 while (tx->pkt_done != mcp_idx) {
2729 idx = tx->done & tx->mask;
2731 m = tx->info[idx].m;
2732 /* mbuf and DMA map only attached to the first
2735 ss->obytes += m->m_pkthdr.len;
2736 if (m->m_flags & M_MCAST)
2739 tx->info[idx].m = NULL;
2740 map = tx->info[idx].map;
2741 bus_dmamap_unload(tx->dmat, map);
/* flag marks the last descriptor of a packet -- bump pkt_done */
2744 if (tx->info[idx].flag) {
2745 tx->info[idx].flag = 0;
2750 /* If we have space, clear IFF_OACTIVE to tell the stack that
2751 its OK to send packets */
2752 #ifdef IFNET_BUF_RING
2753 flags = &ss->if_flags;
2755 flags = &ifp->if_flags;
2757 if ((*flags) & IFF_OACTIVE &&
2758 tx->req - tx->done < (tx->mask + 1)/4) {
2759 *(flags) &= ~IFF_OACTIVE;
2761 mxge_start_locked(ss);
2763 #ifdef IFNET_BUF_RING
2764 if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2765 /* let the NIC stop polling this queue, since there
2766 * are no more transmits pending */
2767 if (tx->req == tx->done) {
2769 tx->queue_active = 0;
/*
 * Media-type lookup tables indexed by the transceiver's compliance
 * bitmask byte (read over I2C in mxge_media_probe).  Entries with a
 * zero ifmedia flag are recognized but have no ifmedia equivalent.
 */
2778 static struct mxge_media_type mxge_xfp_media_types[] =
2780 {IFM_10G_CX4, 0x7f, "10GBASE-CX4 (module)"},
2781 {IFM_10G_SR, (1 << 7), "10GBASE-SR"},
2782 {IFM_10G_LR, (1 << 6), "10GBASE-LR"},
2783 {0, (1 << 5), "10GBASE-ER"},
2784 {IFM_10G_LRM, (1 << 4), "10GBASE-LRM"},
2785 {0, (1 << 3), "10GBASE-SW"},
2786 {0, (1 << 2), "10GBASE-LW"},
2787 {0, (1 << 1), "10GBASE-EW"},
2788 {0, (1 << 0), "Reserved"}
/* SFP+ cages use a different compliance-bit layout */
2790 static struct mxge_media_type mxge_sfp_media_types[] =
2792 {0, (1 << 7), "Reserved"},
2793 {IFM_10G_LRM, (1 << 6), "10GBASE-LRM"},
2794 {IFM_10G_LR, (1 << 5), "10GBASE-LR"},
2795 {IFM_10G_SR, (1 << 4), "10GBASE-SR"}
/*
 * OR the given media type into the cached media flags and register
 * the result as the interface's current ifmedia entry.
 */
2799 mxge_set_media(mxge_softc_t *sc, int type)
2801 sc->media_flags |= type;
2802 ifmedia_add(&sc->media, sc->media_flags, 0, NULL);
2803 ifmedia_set(&sc->media, sc->media_flags);
2808 * Determine the media type for a NIC. Some XFPs will identify
2809 * themselves only when their link is up, so this is initiated via a
2810 * link up interrupt. However, this can potentially take up to
2811 * several milliseconds, so it is run via the watchdog routine, rather
2812 * than in the interrupt handler itself. This need only be done
2813 * once, not each time the link is up.
/*
 * Probe flow: parse the character after the 3rd dash of the EEPROM
 * product code to classify the cage (CX4 / Quad Ribbon / XFP / SFP+),
 * then for pluggable cages read one compliance byte over the
 * firmware's I2C commands (polling up to ~50ms for the cached read)
 * and translate it through the media-type tables above.
 * NOTE(review): excerpt is non-contiguous; several returns, the 'R'
 * (XFP) branch header and cage_type assignments are elided.
 */
2816 mxge_media_probe(mxge_softc_t *sc)
2821 struct mxge_media_type *mxge_media_types = NULL;
2822 int i, err, ms, mxge_media_type_entries;
2825 sc->need_media_probe = 0;
2827 /* if we've already set a media type, we're done */
2828 if (sc->media_flags != (IFM_ETHER | IFM_AUTO))
2832 * parse the product code to deterimine the interface type
2833 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2834 * after the 3rd dash in the driver's cached copy of the
2835 * EEPROM's product code string.
2837 ptr = sc->product_code_string;
2839 device_printf(sc->dev, "Missing product code\n");
/* advance past three '-' separators */
2842 for (i = 0; i < 3; i++, ptr++) {
2843 ptr = index(ptr, '-');
2845 device_printf(sc->dev,
2846 "only %d dashes in PC?!?\n", i);
2852 mxge_set_media(sc, IFM_10G_CX4);
2855 else if (*ptr == 'Q') {
2856 /* -Q is Quad Ribbon Fiber */
2857 device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2858 /* FreeBSD has no media type for Quad ribbon fiber */
2864 mxge_media_types = mxge_xfp_media_types;
2865 mxge_media_type_entries =
2866 sizeof (mxge_xfp_media_types) /
2867 sizeof (mxge_xfp_media_types[0]);
2868 byte = MXGE_XFP_COMPLIANCE_BYTE;
2872 if (*ptr == 'S' || *(ptr +1) == 'S') {
2873 /* -S or -2S is SFP+ */
2874 mxge_media_types = mxge_sfp_media_types;
2875 mxge_media_type_entries =
2876 sizeof (mxge_sfp_media_types) /
2877 sizeof (mxge_sfp_media_types[0]);
2882 if (mxge_media_types == NULL) {
2883 device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2888 * At this point we know the NIC has an XFP cage, so now we
2889 * try to determine what is in the cage by using the
2890 * firmware's XFP I2C commands to read the XFP 10GbE compilance
2891 * register. We read just one byte, which may take over
2895 cmd.data0 = 0; /* just fetch 1 byte, not all 256 */
2897 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2898 if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
2899 device_printf(sc->dev, "failed to read XFP\n");
2901 if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
2902 device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
2904 if (err != MXGEFW_CMD_OK) {
2908 /* now we wait for the data to be cached */
2910 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
/* poll while the firmware reports EBUSY, up to ~50 iterations */
2911 for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
2914 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2916 if (err != MXGEFW_CMD_OK) {
2917 device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
2918 cage_type, err, ms);
/* entry 0 is a full-mask special case (CX4 module) */
2922 if (cmd.data0 == mxge_media_types[0].bitmask) {
2924 device_printf(sc->dev, "%s:%s\n", cage_type,
2925 mxge_media_types[0].name);
2926 mxge_set_media(sc, IFM_10G_CX4);
2929 for (i = 1; i < mxge_media_type_entries; i++) {
2930 if (cmd.data0 & mxge_media_types[i].bitmask) {
2932 device_printf(sc->dev, "%s:%s\n",
2934 mxge_media_types[i].name);
2936 mxge_set_media(sc, mxge_media_types[i].flag);
2940 device_printf(sc->dev, "%s media 0x%x unknown\n", cage_type,
/*
 * Per-slice interrupt handler. Drains TX completions and the RX done
 * ring until the firmware's DMA'd stats block goes quiescent, updates
 * link state / RDMA-timeout counters (first slice only), then returns
 * the IRQ token(s) to the NIC by writing irq_claim.
 */
2947 mxge_intr(void *arg)
2949 struct mxge_slice_state *ss = arg;
2950 mxge_softc_t *sc = ss->sc;
2951 mcp_irq_data_t *stats = ss->fw_stats;
2952 mxge_tx_ring_t *tx = &ss->tx;
2953 mxge_rx_done_t *rx_done = &ss->rx_done;
2954 uint32_t send_done_count;
2958 #ifndef IFNET_BUF_RING
2959 /* an interrupt on a non-zero slice is implicitly valid
2960 since MSI-X irqs are not shared */
2962 mxge_clean_rx_done(ss);
2963 *ss->irq_claim = be32toh(3);
2968 /* make sure the DMA has finished */
2969 if (!stats->valid) {
2972 valid = stats->valid;
2974 if (sc->legacy_irq) {
2975 /* lower legacy IRQ */
2976 *sc->irq_deassert = 0;
2977 if (!mxge_deassert_wait)
2978 /* don't wait for conf. that irq is low */
2984 /* loop while waiting for legacy irq deassertion */
2986 /* check for transmit completes and receives */
2987 send_done_count = be32toh(stats->send_done_count);
2988 while ((send_done_count != tx->pkt_done) ||
2989 (rx_done->entry[rx_done->idx].length != 0)) {
2990 if (send_done_count != tx->pkt_done)
2991 mxge_tx_done(ss, (int)send_done_count);
2992 mxge_clean_rx_done(ss);
2993 send_done_count = be32toh(stats->send_done_count);
2995 if (sc->legacy_irq && mxge_deassert_wait)
/* volatile re-read: firmware clears 'valid' when the irq is serviced */
2997 } while (*((volatile uint8_t *) &stats->valid));
2999 /* fw link & error stats meaningful only on the first slice */
3000 if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
3001 if (sc->link_state != stats->link_up) {
3002 sc->link_state = stats->link_up;
3003 if (sc->link_state) {
3004 sc->ifp->if_link_state = LINK_STATE_UP;
3005 if_link_state_change(sc->ifp);
3007 device_printf(sc->dev, "link up\n");
3009 sc->ifp->if_link_state = LINK_STATE_DOWN;
3010 if_link_state_change(sc->ifp);
3012 device_printf(sc->dev, "link down\n");
/* defer XFP/SFP+ identification to the watchdog (see mxge_media_probe) */
3014 sc->need_media_probe = 1;
3016 if (sc->rdma_tags_available !=
3017 be32toh(stats->rdma_tags_available)) {
3018 sc->rdma_tags_available =
3019 be32toh(stats->rdma_tags_available);
3020 device_printf(sc->dev, "RDMA timed out! %d tags "
3021 "left\n", sc->rdma_tags_available);
3024 if (stats->link_down) {
3025 sc->down_cnt += stats->link_down;
3027 sc->ifp->if_link_state = LINK_STATE_DOWN;
3028 if_link_state_change(sc->ifp);
3032 /* check to see if we have rx token to pass back */
3034 *ss->irq_claim = be32toh(3);
3035 *(ss->irq_claim + 1) = be32toh(3);
3039 mxge_init(void *arg)
/*
 * Free all mbufs held by one slice: the LRO free list, every small and
 * big RX ring entry (unloading its DMA map first), and — on the first
 * slice only, which owns the TX ring — any pending TX mbufs.
 */
3046 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
3048 struct lro_entry *lro_entry;
3051 while (!SLIST_EMPTY(&ss->lro_free)) {
3052 lro_entry = SLIST_FIRST(&ss->lro_free);
3053 SLIST_REMOVE_HEAD(&ss->lro_free, next);
3054 kfree(lro_entry, M_DEVBUF);
3057 for (i = 0; i <= ss->rx_big.mask; i++) {
3058 if (ss->rx_big.info[i].m == NULL)
3060 bus_dmamap_unload(ss->rx_big.dmat,
3061 ss->rx_big.info[i].map);
3062 m_freem(ss->rx_big.info[i].m);
3063 ss->rx_big.info[i].m = NULL;
3066 for (i = 0; i <= ss->rx_small.mask; i++) {
3067 if (ss->rx_small.info[i].m == NULL)
3069 bus_dmamap_unload(ss->rx_small.dmat,
3070 ss->rx_small.info[i].map);
3071 m_freem(ss->rx_small.info[i].m);
3072 ss->rx_small.info[i].m = NULL;
3075 /* transmit ring used only on the first slice */
3076 if (ss->tx.info == NULL)
3079 for (i = 0; i <= ss->tx.mask; i++) {
3080 ss->tx.info[i].flag = 0;
3081 if (ss->tx.info[i].m == NULL)
3083 bus_dmamap_unload(ss->tx.dmat,
3084 ss->tx.info[i].map);
3085 m_freem(ss->tx.info[i].m);
3086 ss->tx.info[i].m = NULL;
/* Free the mbufs of every slice (see mxge_free_slice_mbufs). */
3091 mxge_free_mbufs(mxge_softc_t *sc)
3095 for (slice = 0; slice < sc->num_slices; slice++)
3096 mxge_free_slice_mbufs(&sc->ss[slice]);
3100 mxge_free_slice_rings(struct mxge_slice_state *ss)
3105 if (ss->rx_done.entry != NULL)
3106 mxge_dma_free(&ss->rx_done.dma);
3107 ss->rx_done.entry = NULL;
3109 if (ss->tx.req_bytes != NULL)
3110 kfree(ss->tx.req_bytes, M_DEVBUF);
3111 ss->tx.req_bytes = NULL;
3113 if (ss->tx.seg_list != NULL)
3114 kfree(ss->tx.seg_list, M_DEVBUF);
3115 ss->tx.seg_list = NULL;
3117 if (ss->rx_small.shadow != NULL)
3118 kfree(ss->rx_small.shadow, M_DEVBUF);
3119 ss->rx_small.shadow = NULL;
3121 if (ss->rx_big.shadow != NULL)
3122 kfree(ss->rx_big.shadow, M_DEVBUF);
3123 ss->rx_big.shadow = NULL;
3125 if (ss->tx.info != NULL) {
3126 if (ss->tx.dmat != NULL) {
3127 for (i = 0; i <= ss->tx.mask; i++) {
3128 bus_dmamap_destroy(ss->tx.dmat,
3129 ss->tx.info[i].map);
3131 bus_dma_tag_destroy(ss->tx.dmat);
3133 kfree(ss->tx.info, M_DEVBUF);
3137 if (ss->rx_small.info != NULL) {
3138 if (ss->rx_small.dmat != NULL) {
3139 for (i = 0; i <= ss->rx_small.mask; i++) {
3140 bus_dmamap_destroy(ss->rx_small.dmat,
3141 ss->rx_small.info[i].map);
3143 bus_dmamap_destroy(ss->rx_small.dmat,
3144 ss->rx_small.extra_map);
3145 bus_dma_tag_destroy(ss->rx_small.dmat);
3147 kfree(ss->rx_small.info, M_DEVBUF);
3149 ss->rx_small.info = NULL;
3151 if (ss->rx_big.info != NULL) {
3152 if (ss->rx_big.dmat != NULL) {
3153 for (i = 0; i <= ss->rx_big.mask; i++) {
3154 bus_dmamap_destroy(ss->rx_big.dmat,
3155 ss->rx_big.info[i].map);
3157 bus_dmamap_destroy(ss->rx_big.dmat,
3158 ss->rx_big.extra_map);
3159 bus_dma_tag_destroy(ss->rx_big.dmat);
3161 kfree(ss->rx_big.info, M_DEVBUF);
3163 ss->rx_big.info = NULL;
/* Free the ring resources of every slice (see mxge_free_slice_rings). */
3167 mxge_free_rings(mxge_softc_t *sc)
3171 for (slice = 0; slice < sc->num_slices; slice++)
3172 mxge_free_slice_rings(&sc->ss[slice]);
/*
 * Allocate one slice's ring state: shadow and host-info rings plus
 * busdma tags/maps for small and big RX buffers, and — on the first
 * slice only when IFNET_BUF_RING is not set — the TX request copy
 * block, segment list, info ring, tag and per-slot dmamaps.
 * Ring sizes must be powers of two; *.mask = entries - 1.
 */
3176 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3177 int tx_ring_entries)
3179 mxge_softc_t *sc = ss->sc;
3185 /* allocate per-slice receive resources */
3187 ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
/* RX-done ring holds twice as many slots as each receive ring */
3188 ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3190 /* allocate the rx shadow rings */
3191 bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3192 ss->rx_small.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3193 if (ss->rx_small.shadow == NULL)
3196 bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3197 ss->rx_big.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3198 if (ss->rx_big.shadow == NULL)
3201 /* allocate the rx host info rings */
3202 bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3203 ss->rx_small.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3204 if (ss->rx_small.info == NULL)
3207 bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3208 ss->rx_big.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3209 if (ss->rx_big.info == NULL)
3212 /* allocate the rx busdma resources */
3213 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3215 4096, /* boundary */
3216 BUS_SPACE_MAXADDR, /* low */
3217 BUS_SPACE_MAXADDR, /* high */
3218 NULL, NULL, /* filter */
3219 MHLEN, /* maxsize */
3221 MHLEN, /* maxsegsize */
3222 BUS_DMA_ALLOCNOW, /* flags */
3223 &ss->rx_small.dmat); /* tag */
3225 device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
/*
 * Big-RX tag parameters differ with MXGE_VIRT_JUMBOS: page-sized
 * segments vs. a single MJUM9BYTES cluster.
 */
3230 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3232 #if MXGE_VIRT_JUMBOS
3233 4096, /* boundary */
3237 BUS_SPACE_MAXADDR, /* low */
3238 BUS_SPACE_MAXADDR, /* high */
3239 NULL, NULL, /* filter */
3240 3*4096, /* maxsize */
3241 #if MXGE_VIRT_JUMBOS
3243 4096, /* maxsegsize*/
3246 MJUM9BYTES, /* maxsegsize*/
3248 BUS_DMA_ALLOCNOW, /* flags */
3249 &ss->rx_big.dmat); /* tag */
3251 device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3255 for (i = 0; i <= ss->rx_small.mask; i++) {
3256 err = bus_dmamap_create(ss->rx_small.dmat, 0,
3257 &ss->rx_small.info[i].map);
3259 device_printf(sc->dev, "Err %d rx_small dmamap\n",
3264 err = bus_dmamap_create(ss->rx_small.dmat, 0,
3265 &ss->rx_small.extra_map);
3267 device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3272 for (i = 0; i <= ss->rx_big.mask; i++) {
3273 err = bus_dmamap_create(ss->rx_big.dmat, 0,
3274 &ss->rx_big.info[i].map);
3276 device_printf(sc->dev, "Err %d rx_big dmamap\n",
3281 err = bus_dmamap_create(ss->rx_big.dmat, 0,
3282 &ss->rx_big.extra_map);
3284 device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3289 /* now allocate TX resouces */
3291 #ifndef IFNET_BUF_RING
3292 /* only use a single TX ring for now */
3293 if (ss != ss->sc->ss)
3297 ss->tx.mask = tx_ring_entries - 1;
3298 ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3301 /* allocate the tx request copy block */
3303 sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3304 ss->tx.req_bytes = kmalloc(bytes, M_DEVBUF, M_WAITOK);
3305 if (ss->tx.req_bytes == NULL)
3307 /* ensure req_list entries are aligned to 8 bytes */
3308 ss->tx.req_list = (mcp_kreq_ether_send_t *)
3309 ((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
3311 /* allocate the tx busdma segment list */
3312 bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3313 ss->tx.seg_list = (bus_dma_segment_t *)
3314 kmalloc(bytes, M_DEVBUF, M_WAITOK);
3315 if (ss->tx.seg_list == NULL)
3318 /* allocate the tx host info ring */
3319 bytes = tx_ring_entries * sizeof (*ss->tx.info);
3320 ss->tx.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3321 if (ss->tx.info == NULL)
3324 /* allocate the tx busdma resources */
3325 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3327 sc->tx_boundary, /* boundary */
3328 BUS_SPACE_MAXADDR, /* low */
3329 BUS_SPACE_MAXADDR, /* high */
3330 NULL, NULL, /* filter */
3331 65536 + 256, /* maxsize */
3332 ss->tx.max_desc - 2, /* num segs */
3333 sc->tx_boundary, /* maxsegsz */
3334 BUS_DMA_ALLOCNOW, /* flags */
3335 &ss->tx.dmat); /* tag */
3338 device_printf(sc->dev, "Err %d allocating tx dmat\n",
3343 /* now use these tags to setup dmamaps for each slot
3345 for (i = 0; i <= ss->tx.mask; i++) {
3346 err = bus_dmamap_create(ss->tx.dmat, 0,
3347 &ss->tx.info[i].map);
3349 device_printf(sc->dev, "Err %d tx dmamap\n",
/*
 * Query the firmware for the send ring size, derive per-ring entry
 * counts, size the ifnet send queue, then allocate rings for every
 * slice; on failure the already-allocated rings are freed.
 */
3359 mxge_alloc_rings(mxge_softc_t *sc)
3363 int tx_ring_entries, rx_ring_entries;
3366 /* get ring sizes */
3367 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3368 tx_ring_size = cmd.data0;
3370 device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3374 tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3375 rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3376 ifq_set_maxlen(&sc->ifp->if_snd, tx_ring_entries - 1);
3377 ifq_set_ready(&sc->ifp->if_snd);
3379 for (slice = 0; slice < sc->num_slices; slice++) {
3380 err = mxge_alloc_slice_rings(&sc->ss[slice],
3389 mxge_free_rings(sc);
/*
 * Pick RX buffer parameters for a given MTU: the firmware-visible big
 * buffer size, the mbuf cluster size to allocate, and how many
 * buffers a frame spans. Smallest cluster that fits the padded frame
 * wins; frames beyond MJUMPAGESIZE use 9k clusters (or, with
 * MXGE_VIRT_JUMBOS, multiple virtually-contiguous 4k pieces).
 */
3396 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3398 int bufsize = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN + MXGEFW_PAD;
3400 if (bufsize < MCLBYTES) {
3401 /* easy, everything fits in a single buffer */
3402 *big_buf_size = MCLBYTES;
3403 *cl_size = MCLBYTES;
3408 if (bufsize < MJUMPAGESIZE) {
3409 /* still easy, everything still fits in a single buffer */
3410 *big_buf_size = MJUMPAGESIZE;
3411 *cl_size = MJUMPAGESIZE;
3415 #if MXGE_VIRT_JUMBOS
3416 /* now we need to use virtually contiguous buffers */
3417 *cl_size = MJUM9BYTES;
3418 *big_buf_size = 4096;
3419 *nbufs = mtu / 4096 + 1;
3420 /* needs to be a power of two, so round up */
3424 *cl_size = MJUM9BYTES;
3425 *big_buf_size = MJUM9BYTES;
/*
 * Bring one slice up: seed its LRO free list, fetch the lanai (NIC
 * SRAM) pointers for the send/receive rings from firmware, then stock
 * both receive rings with freshly mapped mbufs.
 */
3431 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3436 struct lro_entry *lro_entry;
3441 slice = ss - sc->ss;
3443 SLIST_INIT(&ss->lro_free);
3444 SLIST_INIT(&ss->lro_active);
3446 for (i = 0; i < sc->lro_cnt; i++) {
3447 lro_entry = (struct lro_entry *)
3448 kmalloc(sizeof (*lro_entry), M_DEVBUF,
3450 if (lro_entry == NULL) {
3454 SLIST_INSERT_HEAD(&ss->lro_free, lro_entry, next);
3456 /* get the lanai pointers to the send and receive rings */
3459 #ifndef IFNET_BUF_RING
3460 /* We currently only send from the first slice */
3464 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3466 (volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
/* per-slice doorbells live at 64-byte strides in NIC SRAM */
3467 ss->tx.send_go = (volatile uint32_t *)
3468 (sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3469 ss->tx.send_stop = (volatile uint32_t *)
3470 (sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3471 #ifndef IFNET_BUF_RING
3475 err |= mxge_send_cmd(sc,
3476 MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3477 ss->rx_small.lanai =
3478 (volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3480 err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3482 (volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3485 device_printf(sc->dev,
3486 "failed to get ring sizes or locations\n");
3490 /* stock receive rings */
3491 for (i = 0; i <= ss->rx_small.mask; i++) {
3492 map = ss->rx_small.info[i].map;
3493 err = mxge_get_buf_small(ss, map, i);
3495 device_printf(sc->dev, "alloced %d/%d smalls\n",
3496 i, ss->rx_small.mask + 1);
/* poison shadow addresses so un-stocked slots are obvious to the NIC */
3500 for (i = 0; i <= ss->rx_big.mask; i++) {
3501 ss->rx_big.shadow[i].addr_low = 0xffffffff;
3502 ss->rx_big.shadow[i].addr_high = 0xffffffff;
3504 ss->rx_big.nbufs = nbufs;
3505 ss->rx_big.cl_size = cl_size;
3506 ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3507 EVL_ENCAPLEN + MXGEFW_PAD;
3508 for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3509 map = ss->rx_big.info[i].map;
3510 err = mxge_get_buf_big(ss, map, i);
3512 device_printf(sc->dev, "alloced %d/%d bigs\n",
3513 i, ss->rx_big.mask + 1);
/*
 * Bring the interface up: reset the NIC, program the RSS indirection
 * table (multi-slice only), tell firmware the MTU and buffer sizes,
 * point it at the per-slice stats DMA blocks, open every slice, issue
 * ETHERNET_UP, mark the ifnet RUNNING and start the watchdog callout.
 * Caller must hold the ifnet serializer.
 */
3521 mxge_open(mxge_softc_t *sc)
3524 int err, big_bytes, nbufs, slice, cl_size, i;
3526 volatile uint8_t *itable;
3527 struct mxge_slice_state *ss;
3529 ASSERT_SERIALIZED(sc->ifp->if_serializer);
3530 /* Copy the MAC address in case it was overridden */
3531 bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3533 err = mxge_reset(sc, 1);
3535 device_printf(sc->dev, "failed to reset\n");
3539 if (sc->num_slices > 1) {
3540 /* setup the indirection table */
3541 cmd.data0 = sc->num_slices;
3542 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3545 err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3548 device_printf(sc->dev,
3549 "failed to setup rss tables\n");
3553 /* just enable an identity mapping */
3554 itable = sc->sram + cmd.data0;
3555 for (i = 0; i < sc->num_slices; i++)
3556 itable[i] = (uint8_t)i;
3559 cmd.data1 = mxge_rss_hash_type;
3560 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3562 device_printf(sc->dev, "failed to enable slices\n");
3568 mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3571 err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3573 /* error is only meaningful if we're trying to set
3574 MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3575 if (err && nbufs > 1) {
3576 device_printf(sc->dev,
3577 "Failed to set alway-use-n to %d\n",
3581 /* Give the firmware the mtu and the big and small buffer
3582 sizes. The firmware wants the big buf size to be a power
3583 of two. Luckily, FreeBSD's clusters are powers of two */
3584 cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
3585 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3586 cmd.data0 = MHLEN - MXGEFW_PAD;
3587 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3589 cmd.data0 = big_bytes;
3590 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3593 device_printf(sc->dev, "failed to setup params\n");
3597 /* Now give him the pointer to the stats block */
3599 #ifdef IFNET_BUF_RING
3600 slice < sc->num_slices;
3605 ss = &sc->ss[slice];
3607 MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3609 MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3610 cmd.data2 = sizeof(struct mcp_irq_data);
/* slice id is carried in the upper 16 bits of data2 */
3611 cmd.data2 |= (slice << 16);
3612 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
/* fall back to the obsolete stats command for old firmware */
3616 bus = sc->ss->fw_stats_dma.bus_addr;
3617 bus += offsetof(struct mcp_irq_data, send_done_count);
3618 cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3619 cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3620 err = mxge_send_cmd(sc,
3621 MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3623 /* Firmware cannot support multicast without STATS_DMA_V2 */
3624 sc->fw_multicast_support = 0;
3626 sc->fw_multicast_support = 1;
3630 device_printf(sc->dev, "failed to setup params\n");
3634 for (slice = 0; slice < sc->num_slices; slice++) {
3635 err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3637 device_printf(sc->dev, "couldn't open slice %d\n",
3643 /* Finally, start the firmware running */
3644 err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3646 device_printf(sc->dev, "Couldn't bring up link\n");
3649 #ifdef IFNET_BUF_RING
3650 for (slice = 0; slice < sc->num_slices; slice++) {
3651 ss = &sc->ss[slice];
3652 ss->if_flags |= IFF_RUNNING;
3653 ss->if_flags &= ~IFF_OACTIVE;
3656 sc->ifp->if_flags |= IFF_RUNNING;
3657 sc->ifp->if_flags &= ~IFF_OACTIVE;
3658 callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
/* error path: release any mbufs stocked into the rings */
3664 mxge_free_mbufs(sc);
/*
 * Bring the interface down: stop the watchdog callout, clear RUNNING,
 * send ETHERNET_DOWN and wait (via down_cnt, bumped by the "link down"
 * interrupt) for the firmware to acknowledge, then free ring mbufs.
 * Caller must hold the ifnet serializer.
 */
3670 mxge_close(mxge_softc_t *sc)
3673 int err, old_down_cnt;
3674 #ifdef IFNET_BUF_RING
3675 struct mxge_slice_state *ss;
3679 ASSERT_SERIALIZED(sc->ifp->if_serializer);
3680 callout_stop(&sc->co_hdl);
3681 #ifdef IFNET_BUF_RING
3682 for (slice = 0; slice < sc->num_slices; slice++) {
3683 ss = &sc->ss[slice];
3684 ss->if_flags &= ~IFF_RUNNING;
3687 sc->ifp->if_flags &= ~IFF_RUNNING;
3688 old_down_cnt = sc->down_cnt;
3690 err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3692 device_printf(sc->dev, "Couldn't bring down link\n");
3694 if (old_down_cnt == sc->down_cnt) {
3695 /* wait for down irq */
3696 DELAY(10 * sc->intr_coal_delay);
3699 if (old_down_cnt == sc->down_cnt) {
3700 device_printf(sc->dev, "never got down irq\n");
3703 mxge_free_mbufs(sc);
/*
 * Program PCI config space: record the PCIe negotiated link width,
 * raise the max read request size to 4KB (Device Control bits 14:12 =
 * 5), and enable bus mastering plus memory-space decoding. Also called
 * after a NIC reboot to redo our config-space changes.
 */
3709 mxge_setup_cfg_space(mxge_softc_t *sc)
3711 device_t dev = sc->dev;
3713 uint16_t cmd, lnk, pectl;
3715 /* find the PCIe link width and set max read request to 4KB*/
3716 if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
/* offset 0x12 = PCIe Link Status; width in bits 9:4 */
3717 lnk = pci_read_config(dev, reg + 0x12, 2);
3718 sc->link_width = (lnk >> 4) & 0x3f;
/* offset 0x8 = PCIe Device Control */
3720 pectl = pci_read_config(dev, reg + 0x8, 2);
3721 pectl = (pectl & ~0x7000) | (5 << 12);
3722 pci_write_config(dev, reg + 0x8, pectl, 2);
3725 /* Enable DMA and Memory space access */
3726 pci_enable_busmaster(dev);
3727 cmd = pci_read_config(dev, PCIR_COMMAND, 2);
3728 cmd |= PCIM_CMD_MEMEN;
3729 pci_write_config(dev, PCIR_COMMAND, cmd, 2);
/*
 * Read the NIC's reboot-status register through the vendor-specific
 * PCI capability's indirect read32 window. Returns (uint32_t)-1 when
 * the capability cannot be located.
 */
3733 mxge_read_reboot(mxge_softc_t *sc)
3735 device_t dev = sc->dev;
3738 /* find the vendor specific offset */
3739 if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
3740 device_printf(sc->dev,
3741 "could not find vendor specific offset\n");
3742 return (uint32_t)-1;
3744 /* enable read32 mode */
3745 pci_write_config(dev, vs + 0x10, 0x3, 1);
3746 /* tell NIC which register to read */
3747 pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
/* data window at vs + 0x14 returns the selected register */
3748 return (pci_read_config(dev, vs + 0x14, 4));
/*
 * Recover from a watchdog-detected hang. If PCI config space reads as
 * all-ones, the NIC may still be mid-reboot: poll briefly before
 * giving up. If bus mastering was lost the NIC rebooted, so restore
 * config space and reopen; otherwise just log the TX ring state of the
 * offending slice and leave the NIC alone.
 */
3752 mxge_watchdog_reset(mxge_softc_t *sc, int slice)
3754 struct pci_devinfo *dinfo;
3762 device_printf(sc->dev, "Watchdog reset!\n");
3765 * check to see if the NIC rebooted. If it did, then all of
3766 * PCI config space has been reset, and things like the
3767 * busmaster bit will be zero. If this is the case, then we
3768 * must restore PCI config space before the NIC can be used
3771 cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3772 if (cmd == 0xffff) {
3774 * maybe the watchdog caught the NIC rebooting; wait
3775 * up to 100ms for it to finish. If it does not come
3776 * back, then give up
3779 cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3780 if (cmd == 0xffff) {
3781 device_printf(sc->dev, "NIC disappeared!\n");
3785 if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3786 /* print the reboot status */
3787 reboot = mxge_read_reboot(sc);
3788 device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3790 /* restore PCI configuration space */
3791 dinfo = device_get_ivars(sc->dev);
3792 pci_cfg_restore(sc->dev, dinfo);
3794 /* and redo any changes we made to our config space */
3795 mxge_setup_cfg_space(sc);
3797 if (sc->ifp->if_flags & IFF_RUNNING) {
3799 err = mxge_open(sc);
3802 tx = &sc->ss[slice].tx;
3803 device_printf(sc->dev,
3804 "NIC did not reboot, slice %d ring state:\n",
3806 device_printf(sc->dev,
3807 "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3808 tx->req, tx->done, tx->queue_active);
3809 device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3810 tx->activate, tx->deactivate);
3811 device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3813 be32toh(sc->ss->fw_stats->send_done_count));
3814 device_printf(sc->dev, "not resetting\n");
/*
 * Periodic TX-hang detector (invoked from mxge_tick). A slice whose TX
 * ring made no progress for a whole tick triggers a watchdog reset —
 * unless the firmware's dropped_pause counter also stalled, in which
 * case the link partner's flow control is blamed instead. Also runs a
 * deferred media probe when the interrupt handler requested one.
 */
3820 mxge_watchdog(mxge_softc_t *sc)
3823 uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3826 /* see if we have outstanding transmits, which
3827 have been pending for more than mxge_ticks */
3829 #ifdef IFNET_BUF_RING
3830 (i < sc->num_slices) && (err == 0);
3832 (i < 1) && (err == 0);
3836 if (tx->req != tx->done &&
3837 tx->watchdog_req != tx->watchdog_done &&
3838 tx->done == tx->watchdog_done) {
3839 /* check for pause blocking before resetting */
3840 if (tx->watchdog_rx_pause == rx_pause)
3841 err = mxge_watchdog_reset(sc, i);
3843 device_printf(sc->dev, "Flow control blocking "
3844 "xmits, check link partner\n");
/* snapshot state for next tick's comparison */
3847 tx->watchdog_req = tx->req;
3848 tx->watchdog_done = tx->done;
3849 tx->watchdog_rx_pause = rx_pause;
3852 if (sc->need_media_probe)
3853 mxge_media_probe(sc);
/*
 * Aggregate per-slice packet/error counters into the shared ifnet
 * statistics; byte/mcast/drop counters exist only with IFNET_BUF_RING.
 */
3858 mxge_update_stats(mxge_softc_t *sc)
3860 struct mxge_slice_state *ss;
3861 u_long ipackets = 0;
3862 u_long opackets = 0;
3863 #ifdef IFNET_BUF_RING
3871 for (slice = 0; slice < sc->num_slices; slice++) {
3872 ss = &sc->ss[slice];
3873 ipackets += ss->ipackets;
3874 opackets += ss->opackets;
3875 #ifdef IFNET_BUF_RING
3876 obytes += ss->obytes;
3877 omcasts += ss->omcasts;
3878 odrops += ss->tx.br->br_drops;
3880 oerrors += ss->oerrors;
3882 sc->ifp->if_ipackets = ipackets;
3883 sc->ifp->if_opackets = opackets;
3884 #ifdef IFNET_BUF_RING
3885 sc->ifp->if_obytes = obytes;
3886 sc->ifp->if_omcasts = omcasts;
3887 sc->ifp->if_snd.ifq_drops = odrops;
3889 sc->ifp->if_oerrors = oerrors;
/*
 * Periodic callout: refresh aggregate stats every tick and run the TX
 * watchdog every 5th tick (countdown starts at 4), then re-arm itself.
 * All work is done under the ifnet serializer.
 */
3893 mxge_tick(void *arg)
3895 mxge_softc_t *sc = arg;
3898 lwkt_serialize_enter(sc->ifp->if_serializer);
3899 /* aggregate stats from different slices */
3900 mxge_update_stats(sc);
3901 if (!sc->watchdog_countdown) {
3902 err = mxge_watchdog(sc);
3903 sc->watchdog_countdown = 4;
3905 sc->watchdog_countdown--;
3907 callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3908 lwkt_serialize_exit(sc->ifp->if_serializer);
3912 mxge_media_change(struct ifnet *ifp)
/*
 * Change the interface MTU. Rejects framed sizes above sc->max_mtu or
 * below 60 bytes; if the interface is running it is closed and
 * reopened, reverting to (and reopening with) the old MTU on failure.
 */
3918 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3920 struct ifnet *ifp = sc->ifp;
3921 int real_mtu, old_mtu;
3924 if (ifp->if_serializer)
3925 ASSERT_SERIALIZED(ifp->if_serializer);
3927 real_mtu = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
3928 if ((real_mtu > sc->max_mtu) || real_mtu < 60)
3930 old_mtu = ifp->if_mtu;
3932 if (ifp->if_flags & IFF_RUNNING) {
3934 err = mxge_open(sc);
3936 ifp->if_mtu = old_mtu;
3938 (void) mxge_open(sc);
/*
 * ifmedia status callback: report validity plus ACTIVE/FDX according
 * to the cached firmware link state.
 */
3945 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3947 mxge_softc_t *sc = ifp->if_softc;
3952 ifmr->ifm_status = IFM_AVALID;
3953 ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
3954 ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
3955 ifmr->ifm_active |= sc->link_state ? IFM_FDX : 0;
/*
 * Interface ioctl handler: MTU changes, up/down transitions (with
 * promisc/multicast re-sync), multicast list updates, capability
 * toggles (TX/RX csum, TSO4 — which requires TXCSUM —, LRO, VLAN
 * hwtagging) and media queries. Runs under the ifnet serializer.
 */
3959 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data, struct ucred *cr)
3961 mxge_softc_t *sc = ifp->if_softc;
3962 struct ifreq *ifr = (struct ifreq *)data;
3967 ASSERT_SERIALIZED(ifp->if_serializer);
3971 err = ether_ioctl(ifp, command, data);
3975 err = mxge_change_mtu(sc, ifr->ifr_mtu);
3982 if (ifp->if_flags & IFF_UP) {
3983 if (!(ifp->if_flags & IFF_RUNNING)) {
3984 err = mxge_open(sc);
3986 /* take care of promis can allmulti
3988 mxge_change_promisc(sc,
3989 ifp->if_flags & IFF_PROMISC);
3990 mxge_set_multicast_list(sc);
3993 if (ifp->if_flags & IFF_RUNNING) {
4001 mxge_set_multicast_list(sc);
4005 mask = ifr->ifr_reqcap ^ ifp->if_capenable;
4006 if (mask & IFCAP_TXCSUM) {
4007 if (IFCAP_TXCSUM & ifp->if_capenable) {
/* disabling TXCSUM also disables TSO4, which depends on it */
4008 ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
4009 ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
4012 ifp->if_capenable |= IFCAP_TXCSUM;
4013 ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
4015 } else if (mask & IFCAP_RXCSUM) {
4016 if (IFCAP_RXCSUM & ifp->if_capenable) {
4017 ifp->if_capenable &= ~IFCAP_RXCSUM;
4020 ifp->if_capenable |= IFCAP_RXCSUM;
4024 if (mask & IFCAP_TSO4) {
4025 if (IFCAP_TSO4 & ifp->if_capenable) {
4026 ifp->if_capenable &= ~IFCAP_TSO4;
4027 ifp->if_hwassist &= ~CSUM_TSO;
4028 } else if (IFCAP_TXCSUM & ifp->if_capenable) {
4029 ifp->if_capenable |= IFCAP_TSO4;
4030 ifp->if_hwassist |= CSUM_TSO;
4032 kprintf("mxge requires tx checksum offload"
4033 " be enabled to use TSO\n");
4037 if (mask & IFCAP_LRO) {
4038 if (IFCAP_LRO & ifp->if_capenable)
4039 err = mxge_change_lro_locked(sc, 0);
4041 err = mxge_change_lro_locked(sc, mxge_lro_cnt);
4043 if (mask & IFCAP_VLAN_HWTAGGING)
4044 ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
4045 VLAN_CAPABILITIES(ifp);
4050 err = ifmedia_ioctl(ifp, (struct ifreq *)data,
4051 &sc->media, command);
/*
 * Pull hw.mxge.* loader tunables into the module globals / softc, then
 * clamp them to sane ranges (coalescing delay, tick rate, RSS hash
 * type, initial MTU, flow control).
 */
4061 mxge_fetch_tunables(mxge_softc_t *sc)
4064 TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4065 TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4066 &mxge_flow_control);
4067 TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4068 &mxge_intr_coal_delay);
4069 TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4070 &mxge_nvidia_ecrc_enable);
4071 TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4072 &mxge_force_firmware);
4073 TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4074 &mxge_deassert_wait);
4075 TUNABLE_INT_FETCH("hw.mxge.verbose",
4077 TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4078 TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
4079 TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
4080 TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4081 TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4082 if (sc->lro_cnt != 0)
4083 mxge_lro_cnt = sc->lro_cnt;
4087 if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4088 mxge_intr_coal_delay = 30;
4089 if (mxge_ticks == 0)
4090 mxge_ticks = hz / 2;
4091 sc->pause = mxge_flow_control;
4092 if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4093 || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4094 mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
4096 if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4097 mxge_initial_mtu < ETHER_MIN_LEN)
4098 mxge_initial_mtu = ETHERMTU_JUMBO;
4103 mxge_free_slices(mxge_softc_t *sc)
4105 struct mxge_slice_state *ss;
4112 for (i = 0; i < sc->num_slices; i++) {
4114 if (ss->fw_stats != NULL) {
4115 mxge_dma_free(&ss->fw_stats_dma);
4116 ss->fw_stats = NULL;
4117 #ifdef IFNET_BUF_RING
4118 if (ss->tx.br != NULL) {
4119 drbr_free(ss->tx.br, M_DEVBUF);
4124 if (ss->rx_done.entry != NULL) {
4125 mxge_dma_free(&ss->rx_done.dma);
4126 ss->rx_done.entry = NULL;
4129 kfree(sc->ss, M_DEVBUF);
/*
 * Allocate the per-slice state array plus, for each slice, a 4KB-
 * aligned RX-done interrupt queue and a 64-byte-aligned firmware stats
 * block (and a 2048-entry buf_ring with IFNET_BUF_RING). Sized from
 * the firmware-reported RX ring size; frees everything on failure.
 */
4134 mxge_alloc_slices(mxge_softc_t *sc)
4137 struct mxge_slice_state *ss;
4139 int err, i, max_intr_slots;
4141 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4143 device_printf(sc->dev, "Cannot determine rx ring size\n");
4146 sc->rx_ring_size = cmd.data0;
4147 max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4149 bytes = sizeof (*sc->ss) * sc->num_slices;
4150 sc->ss = kmalloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4153 for (i = 0; i < sc->num_slices; i++) {
4158 /* allocate per-slice rx interrupt queues */
4160 bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4161 err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4164 ss->rx_done.entry = ss->rx_done.dma.addr;
4165 bzero(ss->rx_done.entry, bytes);
4168 * allocate the per-slice firmware stats; stats
4169 * (including tx) are used used only on the first
4172 #ifndef IFNET_BUF_RING
4177 bytes = sizeof (*ss->fw_stats);
4178 err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4179 sizeof (*ss->fw_stats), 64);
4182 ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4183 #ifdef IFNET_BUF_RING
4184 ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4192 mxge_free_slices(sc);
/*
 * Decide how many RSS slices to use. Requires the tunable enabled, an
 * SMP system and MSI-X vectors; loads the RSS-capable firmware, resets
 * it, sizes the interrupt queue, then clamps the firmware's maximum
 * queue count by MSI-X vectors, ncpus / the max_slices tunable, and
 * rounds down to a power of two. On any failure, falls back to the
 * original single-slice firmware.
 */
4197 mxge_slice_probe(mxge_softc_t *sc)
4201 int msix_cnt, status, max_intr_slots;
4205 * don't enable multiple slices if they are not enabled,
4206 * or if this is not an SMP system
4209 if (mxge_max_slices == 0 || mxge_max_slices == 1 || ncpus < 2)
4212 /* see how many MSI-X interrupts are available */
4213 msix_cnt = pci_msix_count(sc->dev);
4217 /* now load the slice aware firmware see what it supports */
4218 old_fw = sc->fw_name;
4219 if (old_fw == mxge_fw_aligned)
4220 sc->fw_name = mxge_fw_rss_aligned;
4222 sc->fw_name = mxge_fw_rss_unaligned;
4223 status = mxge_load_firmware(sc, 0);
4225 device_printf(sc->dev, "Falling back to a single slice\n");
4229 /* try to send a reset command to the card to see if it
4231 memset(&cmd, 0, sizeof (cmd));
4232 status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4234 device_printf(sc->dev, "failed reset\n");
4238 /* get rx ring size */
4239 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4241 device_printf(sc->dev, "Cannot determine rx ring size\n");
4244 max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4246 /* tell it the size of the interrupt queues */
4247 cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4248 status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4250 device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4254 /* ask the maximum number of slices it supports */
4255 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4257 device_printf(sc->dev,
4258 "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4261 sc->num_slices = cmd.data0;
4262 if (sc->num_slices > msix_cnt)
4263 sc->num_slices = msix_cnt;
4265 if (mxge_max_slices == -1) {
4266 /* cap to number of CPUs in system */
4267 if (sc->num_slices > ncpus)
4268 sc->num_slices = ncpus;
4270 if (sc->num_slices > mxge_max_slices)
4271 sc->num_slices = mxge_max_slices;
4273 /* make sure it is a power of two */
4274 while (sc->num_slices & (sc->num_slices - 1))
4278 device_printf(sc->dev, "using %d slices\n",
/* fallback: restore the single-slice firmware */
4284 sc->fw_name = old_fw;
4285 (void) mxge_load_firmware(sc, 0);
/*
 * Set up one MSI-X vector per slice: map the MSI-X table BAR, allocate
 * the vectors, then an IRQ resource and interrupt handler (mxge_intr,
 * with the slice as its argument) for each. Unwinds everything in
 * reverse order via the abort_with_* labels on failure.
 */
4289 mxge_add_msix_irqs(mxge_softc_t *sc)
4292 int count, err, i, rid;
4295 sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4298 if (sc->msix_table_res == NULL) {
4299 device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4303 count = sc->num_slices;
4304 err = pci_alloc_msix(sc->dev, &count);
4306 device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d"
4307 "err = %d \n", sc->num_slices, err);
4308 goto abort_with_msix_table;
4310 if (count < sc->num_slices) {
4311 device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4312 count, sc->num_slices);
4313 device_printf(sc->dev,
4314 "Try setting hw.mxge.max_slices to %d\n",
4317 goto abort_with_msix;
4319 bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4320 sc->msix_irq_res = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4321 if (sc->msix_irq_res == NULL) {
4323 goto abort_with_msix;
4326 for (i = 0; i < sc->num_slices; i++) {
4328 sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4331 if (sc->msix_irq_res[i] == NULL) {
4332 device_printf(sc->dev, "couldn't allocate IRQ res"
4333 " for message %d\n", i);
4335 goto abort_with_res;
4339 bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4340 sc->msix_ih = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4342 for (i = 0; i < sc->num_slices; i++) {
4343 err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4345 mxge_intr, &sc->ss[i], &sc->msix_ih[i],
4346 sc->ifp->if_serializer);
4348 device_printf(sc->dev, "couldn't setup intr for "
4350 goto abort_with_intr;
4355 device_printf(sc->dev, "using %d msix IRQs:",
4357 for (i = 0; i < sc->num_slices; i++)
4358 kprintf(" %ld", rman_get_start(sc->msix_irq_res[i]));
/* error unwind: teardown handlers, release IRQs, MSI-X, table BAR */
4364 for (i = 0; i < sc->num_slices; i++) {
4365 if (sc->msix_ih[i] != NULL) {
4366 bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4368 sc->msix_ih[i] = NULL;
4371 kfree(sc->msix_ih, M_DEVBUF);
4375 for (i = 0; i < sc->num_slices; i++) {
4377 if (sc->msix_irq_res[i] != NULL)
4378 bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4379 sc->msix_irq_res[i]);
4380 sc->msix_irq_res[i] = NULL;
4382 kfree(sc->msix_irq_res, M_DEVBUF);
4386 pci_release_msi(sc->dev);
4388 abort_with_msix_table:
4389 bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4390 sc->msix_table_res);
4396 mxge_add_single_irq(mxge_softc_t *sc)
4398 int count, err, rid;
4400 count = pci_msi_count(sc->dev);
4401 if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4407 sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0,
4408 1, RF_SHAREABLE | RF_ACTIVE);
4409 if (sc->irq_res == NULL) {
4410 device_printf(sc->dev, "could not alloc interrupt\n");
4414 device_printf(sc->dev, "using %s irq %ld\n",
4415 sc->legacy_irq ? "INTx" : "MSI",
4416 rman_get_start(sc->irq_res));
4417 err = bus_setup_intr(sc->dev, sc->irq_res,
4419 mxge_intr, &sc->ss[0], &sc->ih,
4420 sc->ifp->if_serializer);
4422 bus_release_resource(sc->dev, SYS_RES_IRQ,
4423 sc->legacy_irq ? 0 : 1, sc->irq_res);
4424 if (!sc->legacy_irq)
4425 pci_release_msi(sc->dev);
4431 mxge_rem_msix_irqs(mxge_softc_t *sc)
4435 for (i = 0; i < sc->num_slices; i++) {
4436 if (sc->msix_ih[i] != NULL) {
4437 bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4439 sc->msix_ih[i] = NULL;
4442 kfree(sc->msix_ih, M_DEVBUF);
4444 for (i = 0; i < sc->num_slices; i++) {
4446 if (sc->msix_irq_res[i] != NULL)
4447 bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4448 sc->msix_irq_res[i]);
4449 sc->msix_irq_res[i] = NULL;
4451 kfree(sc->msix_irq_res, M_DEVBUF);
4453 bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4454 sc->msix_table_res);
4456 pci_release_msi(sc->dev);
4461 mxge_rem_single_irq(mxge_softc_t *sc)
4463 bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4464 bus_release_resource(sc->dev, SYS_RES_IRQ,
4465 sc->legacy_irq ? 0 : 1, sc->irq_res);
4466 if (!sc->legacy_irq)
4467 pci_release_msi(sc->dev);
4471 mxge_rem_irq(mxge_softc_t *sc)
4473 if (sc->num_slices > 1)
4474 mxge_rem_msix_irqs(sc);
4476 mxge_rem_single_irq(sc);
4480 mxge_add_irq(mxge_softc_t *sc)
4484 if (sc->num_slices > 1)
4485 err = mxge_add_msix_irqs(sc);
4487 err = mxge_add_single_irq(sc);
4489 if (0 && err == 0 && sc->num_slices > 1) {
4490 mxge_rem_msix_irqs(sc);
4491 err = mxge_add_msix_irqs(sc);
4498 mxge_attach(device_t dev)
4500 mxge_softc_t *sc = device_get_softc(dev);
4501 struct ifnet *ifp = &sc->arpcom.ac_if;
4505 * avoid rewriting half the lines in this file to use
4506 * &sc->arpcom.ac_if instead
4510 mxge_fetch_tunables(sc);
4512 err = bus_dma_tag_create(NULL, /* parent */
4515 BUS_SPACE_MAXADDR, /* low */
4516 BUS_SPACE_MAXADDR, /* high */
4517 NULL, NULL, /* filter */
4518 65536 + 256, /* maxsize */
4519 MXGE_MAX_SEND_DESC, /* num segs */
4520 65536, /* maxsegsize */
4522 &sc->parent_dmat); /* tag */
4525 device_printf(sc->dev, "Err %d allocating parent dmat\n",
4527 goto abort_with_nothing;
4531 if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4533 callout_init_mp(&sc->co_hdl);
4535 mxge_setup_cfg_space(sc);
4537 /* Map the board into the kernel */
4539 sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
4541 if (sc->mem_res == NULL) {
4542 device_printf(dev, "could not map memory\n");
4544 goto abort_with_nothing;
4546 sc->sram = rman_get_virtual(sc->mem_res);
4547 sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4548 if (sc->sram_size > rman_get_size(sc->mem_res)) {
4549 device_printf(dev, "impossible memory region size %ld\n",
4550 rman_get_size(sc->mem_res));
4552 goto abort_with_mem_res;
4555 /* make NULL terminated copy of the EEPROM strings section of
4557 bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4558 bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4559 rman_get_bushandle(sc->mem_res),
4560 sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4562 MXGE_EEPROM_STRINGS_SIZE - 2);
4563 err = mxge_parse_strings(sc);
4565 goto abort_with_mem_res;
4567 /* Enable write combining for efficient use of PCIe bus */
4570 /* Allocate the out of band dma memory */
4571 err = mxge_dma_alloc(sc, &sc->cmd_dma,
4572 sizeof (mxge_cmd_t), 64);
4574 goto abort_with_mem_res;
4575 sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4576 err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4578 goto abort_with_cmd_dma;
4580 err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4582 goto abort_with_zeropad_dma;
4584 /* select & load the firmware */
4585 err = mxge_select_firmware(sc);
4587 goto abort_with_dmabench;
4588 sc->intr_coal_delay = mxge_intr_coal_delay;
4590 mxge_slice_probe(sc);
4591 err = mxge_alloc_slices(sc);
4593 goto abort_with_dmabench;
4595 err = mxge_reset(sc, 0);
4597 goto abort_with_slices;
4599 err = mxge_alloc_rings(sc);
4601 device_printf(sc->dev, "failed to allocate rings\n");
4602 goto abort_with_dmabench;
4605 ifp->if_baudrate = IF_Gbps(10UL);
4606 ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4609 ifp->if_capabilities |= IFCAP_LRO;
4612 #ifdef MXGE_NEW_VLAN_API
4613 ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4616 sc->max_mtu = mxge_max_mtu(sc);
4617 if (sc->max_mtu >= 9000)
4618 ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4620 device_printf(dev, "MTU limited to %d. Install "
4621 "latest firmware for 9000 byte jumbo support\n",
4622 sc->max_mtu - ETHER_HDR_LEN);
4623 ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4624 ifp->if_capenable = ifp->if_capabilities;
4625 if (sc->lro_cnt == 0)
4626 ifp->if_capenable &= ~IFCAP_LRO;
4628 ifp->if_init = mxge_init;
4630 ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4631 ifp->if_ioctl = mxge_ioctl;
4632 ifp->if_start = mxge_start;
4633 /* Initialise the ifmedia structure */
4634 ifmedia_init(&sc->media, 0, mxge_media_change,
4636 mxge_set_media(sc, IFM_ETHER | IFM_AUTO);
4637 mxge_media_probe(sc);
4639 ether_ifattach(ifp, sc->mac_addr, NULL);
4640 /* ether_ifattach sets mtu to ETHERMTU */
4641 if (mxge_initial_mtu != ETHERMTU) {
4642 lwkt_serialize_enter(ifp->if_serializer);
4643 mxge_change_mtu(sc, mxge_initial_mtu);
4644 lwkt_serialize_exit(ifp->if_serializer);
4646 /* must come after ether_ifattach() */
4647 err = mxge_add_irq(sc);
4649 device_printf(sc->dev, "failed to add irq\n");
4650 goto abort_with_rings;
4653 mxge_add_sysctls(sc);
4654 #ifdef IFNET_BUF_RING
4655 ifp->if_transmit = mxge_transmit;
4656 ifp->if_qflush = mxge_qflush;
4661 mxge_free_rings(sc);
4663 mxge_free_slices(sc);
4664 abort_with_dmabench:
4665 mxge_dma_free(&sc->dmabench_dma);
4666 abort_with_zeropad_dma:
4667 mxge_dma_free(&sc->zeropad_dma);
4669 mxge_dma_free(&sc->cmd_dma);
4671 bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4672 pci_disable_busmaster(dev);
4673 bus_dma_tag_destroy(sc->parent_dmat);
4679 mxge_detach(device_t dev)
4681 mxge_softc_t *sc = device_get_softc(dev);
4683 lwkt_serialize_enter(sc->ifp->if_serializer);
4685 if (sc->ifp->if_flags & IFF_RUNNING)
4688 * XXX: race: the callout callback could be spinning on
4689 * the serializer and run anyway
4691 callout_stop(&sc->co_hdl);
4692 lwkt_serialize_exit(sc->ifp->if_serializer);
4694 ether_ifdetach(sc->ifp);
4695 ifmedia_removeall(&sc->media);
4696 mxge_dummy_rdma(sc, 0);
4697 mxge_rem_sysctls(sc);
4699 mxge_free_rings(sc);
4700 mxge_free_slices(sc);
4701 mxge_dma_free(&sc->dmabench_dma);
4702 mxge_dma_free(&sc->zeropad_dma);
4703 mxge_dma_free(&sc->cmd_dma);
4704 bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4705 pci_disable_busmaster(dev);
4706 bus_dma_tag_destroy(sc->parent_dmat);
4711 mxge_shutdown(device_t dev)
/*
  This file uses Myri10GE driver indentation.

  Local Variables:
  c-file-style:"linux"
  c-indent-level:8
  End:
*/