1 /******************************************************************************
3 Copyright (c) 2006-2013, Myricom Inc.
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
9 1. Redistributions of source code must retain the above copyright notice,
10 this list of conditions and the following disclaimer.
12 2. Neither the name of the Myricom Inc, nor the names of its
13 contributors may be used to endorse or promote products derived from
14 this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
28 $FreeBSD: head/sys/dev/mxge/if_mxge.c 254263 2013-08-12 23:30:01Z scottl $
30 ***************************************************************************/
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/linker.h>
37 #include <sys/firmware.h>
38 #include <sys/endian.h>
39 #include <sys/in_cksum.h>
40 #include <sys/sockio.h>
42 #include <sys/malloc.h>
43 #include <sys/kernel.h>
44 #include <sys/module.h>
45 #include <sys/serialize.h>
46 #include <sys/socket.h>
47 #include <sys/sysctl.h>
50 #include <net/if_arp.h>
51 #include <net/ifq_var.h>
52 #include <net/ethernet.h>
53 #include <net/if_dl.h>
54 #include <net/if_media.h>
58 #include <net/if_types.h>
59 #include <net/vlan/if_vlan_var.h>
62 #include <netinet/in_systm.h>
63 #include <netinet/in.h>
64 #include <netinet/ip.h>
65 #include <netinet/tcp.h>
70 #include <bus/pci/pcireg.h>
71 #include <bus/pci/pcivar.h>
72 #include <bus/pci/pci_private.h> /* XXX for pci_cfg_restore */
74 #include <vm/vm.h> /* for pmap_mapdev() */
77 #if defined(__i386__) || defined(__x86_64__)
78 #include <machine/specialreg.h>
81 #include <dev/netif/mxge/mxge_mcp.h>
82 #include <dev/netif/mxge/mcp_gen_header.h>
83 #include <dev/netif/mxge/if_mxge_var.h>
86 static int mxge_nvidia_ecrc_enable = 1;
87 static int mxge_force_firmware = 0;
88 static int mxge_intr_coal_delay = 30;
89 static int mxge_deassert_wait = 1;
90 static int mxge_flow_control = 1;
91 static int mxge_verbose = 0;
92 static int mxge_ticks;
93 static int mxge_max_slices = 1;
94 static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_TCP_IPV4;
95 static int mxge_always_promisc = 0;
96 static int mxge_throttle = 0;
97 static char *mxge_fw_unaligned = "mxge_ethp_z8e";
98 static char *mxge_fw_aligned = "mxge_eth_z8e";
99 static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
100 static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
102 static int mxge_probe(device_t dev);
103 static int mxge_attach(device_t dev);
104 static int mxge_detach(device_t dev);
105 static int mxge_shutdown(device_t dev);
106 static void mxge_intr(void *arg);
108 static device_method_t mxge_methods[] = {
109 /* Device interface */
110 DEVMETHOD(device_probe, mxge_probe),
111 DEVMETHOD(device_attach, mxge_attach),
112 DEVMETHOD(device_detach, mxge_detach),
113 DEVMETHOD(device_shutdown, mxge_shutdown),
117 static driver_t mxge_driver = {
120 sizeof(mxge_softc_t),
123 static devclass_t mxge_devclass;
125 /* Declare ourselves to be a child of the PCI bus.*/
126 DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, NULL, NULL);
127 MODULE_DEPEND(mxge, firmware, 1, 1, 1);
128 MODULE_DEPEND(mxge, zlib, 1, 1, 1);
130 static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
131 static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
132 static int mxge_close(mxge_softc_t *sc, int down);
133 static int mxge_open(mxge_softc_t *sc);
134 static void mxge_tick(void *arg);
137 mxge_probe(device_t dev)
141 if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
142 ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
143 (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
144 rev = pci_get_revid(dev);
146 case MXGE_PCI_REV_Z8E:
147 device_set_desc(dev, "Myri10G-PCIE-8A");
149 case MXGE_PCI_REV_Z8ES:
150 device_set_desc(dev, "Myri10G-PCIE-8B");
153 device_set_desc(dev, "Myri10G-PCIE-8??");
154 device_printf(dev, "Unrecognized rev %d NIC\n",
164 mxge_enable_wc(mxge_softc_t *sc)
166 #if defined(__i386__) || defined(__x86_64__)
170 len = rman_get_size(sc->mem_res);
171 pmap_change_attr((vm_offset_t) sc->sram, len / PAGE_SIZE,
172 PAT_WRITE_COMBINING);
176 /* callback to get our DMA address */
178 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
181 *(bus_addr_t *) arg = segs->ds_addr;
186 mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
187 bus_size_t alignment)
190 device_t dev = sc->dev;
191 bus_size_t boundary, maxsegsize;
193 if (bytes > 4096 && alignment == 4096) {
201 /* allocate DMAable memory tags */
202 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
203 alignment, /* alignment */
204 boundary, /* boundary */
205 BUS_SPACE_MAXADDR, /* low */
206 BUS_SPACE_MAXADDR, /* high */
207 NULL, NULL, /* filter */
210 maxsegsize, /* maxsegsize */
211 BUS_DMA_COHERENT, /* flags */
212 &dma->dmat); /* tag */
214 device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
218 /* allocate DMAable memory & map */
219 err = bus_dmamem_alloc(dma->dmat, &dma->addr,
220 (BUS_DMA_WAITOK | BUS_DMA_COHERENT
221 | BUS_DMA_ZERO), &dma->map);
223 device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
224 goto abort_with_dmat;
227 /* load the memory */
228 err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
229 mxge_dmamap_callback,
230 (void *)&dma->bus_addr, 0);
232 device_printf(dev, "couldn't load map (err = %d)\n", err);
238 bus_dmamem_free(dma->dmat, dma->addr, dma->map);
240 (void)bus_dma_tag_destroy(dma->dmat);
245 mxge_dma_free(mxge_dma_t *dma)
247 bus_dmamap_unload(dma->dmat, dma->map);
248 bus_dmamem_free(dma->dmat, dma->addr, dma->map);
249 (void)bus_dma_tag_destroy(dma->dmat);
253 * The eeprom strings on the lanaiX have the format
259 mxge_parse_strings(mxge_softc_t *sc)
262 int i, found_mac, found_sn2;
265 ptr = sc->eeprom_strings;
268 while (*ptr != '\0') {
269 if (strncmp(ptr, "MAC=", 4) == 0) {
272 sc->mac_addr[i] = strtoul(ptr, &endptr, 16);
273 if (endptr - ptr != 2)
282 } else if (strncmp(ptr, "PC=", 3) == 0) {
284 strlcpy(sc->product_code_string, ptr,
285 sizeof(sc->product_code_string));
286 } else if (!found_sn2 && (strncmp(ptr, "SN=", 3) == 0)) {
288 strlcpy(sc->serial_number_string, ptr,
289 sizeof(sc->serial_number_string));
290 } else if (strncmp(ptr, "SN2=", 4) == 0) {
291 /* SN2 takes precedence over SN */
294 strlcpy(sc->serial_number_string, ptr,
295 sizeof(sc->serial_number_string));
297 while (*ptr++ != '\0') {}
304 device_printf(sc->dev, "failed to parse eeprom_strings\n");
308 #if defined(__i386__) || defined(__x86_64__)
311 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
314 unsigned long base, off;
316 device_t pdev, mcp55;
317 uint16_t vendor_id, device_id, word;
318 uintptr_t bus, slot, func, ivend, idev;
322 if (!mxge_nvidia_ecrc_enable)
325 pdev = device_get_parent(device_get_parent(sc->dev));
327 device_printf(sc->dev, "could not find parent?\n");
330 vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
331 device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
333 if (vendor_id != 0x10de)
338 if (device_id == 0x005d) {
339 /* ck804, base address is magic */
341 } else if (device_id >= 0x0374 && device_id <= 0x378) {
342 /* mcp55, base address stored in chipset */
343 mcp55 = pci_find_bsf(0, 0, 0);
345 0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
346 0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
347 word = pci_read_config(mcp55, 0x90, 2);
348 base = ((unsigned long)word & 0x7ffeU) << 25;
355 Test below is commented because it is believed that doing
356 config read/write beyond 0xff will access the config space
357 for the next larger function. Uncomment this and remove
358 the hacky pmap_mapdev() way of accessing config space when
359 FreeBSD grows support for extended pcie config space access
362 /* See if we can, by some miracle, access the extended
364 val = pci_read_config(pdev, 0x178, 4);
365 if (val != 0xffffffff) {
367 pci_write_config(pdev, 0x178, val, 4);
371 /* Rather than using normal pci config space writes, we must
372 * map the Nvidia config space ourselves. This is because on
373 * opteron/nvidia class machines the 0xe000000 mapping is
374 * handled by the nvidia chipset, which means the internal PCI
375 * device (the on-chip northbridge), or the amd-8131 bridge
376 * and things behind them are not visible by this method.
379 BUS_READ_IVAR(device_get_parent(pdev), pdev,
381 BUS_READ_IVAR(device_get_parent(pdev), pdev,
382 PCI_IVAR_SLOT, &slot);
383 BUS_READ_IVAR(device_get_parent(pdev), pdev,
384 PCI_IVAR_FUNCTION, &func);
385 BUS_READ_IVAR(device_get_parent(pdev), pdev,
386 PCI_IVAR_VENDOR, &ivend);
387 BUS_READ_IVAR(device_get_parent(pdev), pdev,
388 PCI_IVAR_DEVICE, &idev);
391 + 0x00100000UL * (unsigned long)bus
392 + 0x00001000UL * (unsigned long)(func
395 /* map it into the kernel */
396 va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
400 device_printf(sc->dev, "pmap_mapdev() failed\n");
403 /* get a pointer to the config space mapped into the kernel */
404 cfgptr = va + (off & PAGE_MASK);
406 /* make sure that we can really access it */
407 vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
408 device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
409 if (! (vendor_id == ivend && device_id == idev)) {
410 device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
411 vendor_id, device_id);
412 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
416 ptr32 = (uint32_t*)(cfgptr + 0x178);
419 if (val == 0xffffffff) {
420 device_printf(sc->dev, "extended mapping failed\n");
421 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
425 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
427 device_printf(sc->dev,
428 "Enabled ECRC on upstream Nvidia bridge "
430 (int)bus, (int)slot, (int)func);
434 #else /* __i386__ || __x86_64__ */
437 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
439 device_printf(sc->dev,
440 "Nforce 4 chipset on non-x86/x86_64!?!?!\n");
447 mxge_dma_test(mxge_softc_t *sc, int test_type)
450 bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
455 /* Run a small DMA test.
456 * The magic multipliers to the length tell the firmware
457 * to do DMA read, write, or read+write tests. The
458 * results are returned in cmd.data0. The upper 16
459 * bits of the return is the number of transfers completed.
460 * The lower 16 bits is the time in 0.5us ticks that the
461 * transfers took to complete.
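 *
 * That encoding makes the throughput math below:
 *   bytes moved   = transfers * len
 *   elapsed usecs = ticks * 0.5
 *   MB/s          = bytes / usecs = (transfers * len * 2) / ticks
 * The read+write benchmark multiplies by an extra 2, presumably
 * because each transfer moves len bytes in each direction.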
464 len = sc->tx_boundary;
466 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
467 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
468 cmd.data2 = len * 0x10000;
469 status = mxge_send_cmd(sc, test_type, &cmd);
474 sc->read_dma = ((cmd.data0>>16) * len * 2) /
475 (cmd.data0 & 0xffff);
476 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
477 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
478 cmd.data2 = len * 0x1;
479 status = mxge_send_cmd(sc, test_type, &cmd);
484 sc->write_dma = ((cmd.data0>>16) * len * 2) /
485 (cmd.data0 & 0xffff);
487 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
488 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
489 cmd.data2 = len * 0x10001;
490 status = mxge_send_cmd(sc, test_type, &cmd);
495 sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
496 (cmd.data0 & 0xffff);
499 if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
500 device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
507 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
508 * when the PCI-E Completion packets are aligned on an 8-byte
509 * boundary. Some PCI-E chip sets always align Completion packets; on
510 * the ones that do not, the alignment can be enforced by enabling
511 * ECRC generation (if supported).
513 * When PCI-E Completion packets are not aligned, it is actually more
514 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
516 * If the driver can neither enable ECRC nor verify that it has
517 * already been enabled, then it must use a firmware image which works
518 * around unaligned completion packets (ethp_z8e.dat), and it should
519 * also ensure that it never gives the device a Read-DMA which is
520 * larger than 2KB by setting the tx_boundary to 2KB. If ECRC is
521 * enabled, then the driver should use the aligned (eth_z8e.dat)
522 * firmware image, and set tx_boundary to 4KB.
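 *
 * In short, the selection made below is:
 *   aligned completions verified (or ECRC enabled): eth_z8e,  tx_boundary = 4096
 *   unaligned or unverifiable:                      ethp_z8e, tx_boundary = 2048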
525 mxge_firmware_probe(mxge_softc_t *sc)
527 device_t dev = sc->dev;
531 sc->tx_boundary = 4096;
533 * Verify the max read request size was set to 4KB
534 * before trying the test with 4KB.
536 if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
537 pectl = pci_read_config(dev, reg + 0x8, 2);
538 if ((pectl & (5 << 12)) != (5 << 12)) {
539 device_printf(dev, "Max Read Req. size != 4k (0x%x)\n",
541 sc->tx_boundary = 2048;
546 * load the optimized firmware (which assumes aligned PCIe
547 * completions) in order to see if it works on this host.
549 sc->fw_name = mxge_fw_aligned;
550 status = mxge_load_firmware(sc, 1);
556 * Enable ECRC if possible
558 mxge_enable_nvidia_ecrc(sc);
561 * Run a DMA test which watches for unaligned completions and
562 * aborts on the first one seen. Not required on Z8ES or newer.
564 if (pci_get_revid(sc->dev) >= MXGE_PCI_REV_Z8ES)
567 status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
569 return 0; /* keep the aligned firmware */
572 device_printf(dev, "DMA test failed: %d\n", status);
573 if (status == ENOSYS)
574 device_printf(dev, "Falling back to ethp! "
575 "Please install up to date fw\n");
580 mxge_select_firmware(mxge_softc_t *sc)
583 int force_firmware = mxge_force_firmware;
586 force_firmware = sc->throttle;
588 if (force_firmware != 0) {
589 if (force_firmware == 1)
594 device_printf(sc->dev,
595 "Assuming %s completions (forced)\n",
596 aligned ? "aligned" : "unaligned");
600 /* if the PCIe link width is 4 or less, we can use the aligned
601 firmware and skip any checks */
602 if (sc->link_width != 0 && sc->link_width <= 4) {
603 device_printf(sc->dev,
604 "PCIe x%d Link, expect reduced performance\n",
610 if (0 == mxge_firmware_probe(sc))
615 sc->fw_name = mxge_fw_aligned;
616 sc->tx_boundary = 4096;
618 sc->fw_name = mxge_fw_unaligned;
619 sc->tx_boundary = 2048;
621 return (mxge_load_firmware(sc, 0));
625 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
629 if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
630 device_printf(sc->dev, "Bad firmware type: 0x%x\n",
631 be32toh(hdr->mcp_type));
635 /* save firmware version for sysctl */
636 strncpy(sc->fw_version, hdr->version, sizeof(sc->fw_version));
638 device_printf(sc->dev, "firmware id: %s\n", hdr->version);
640 ksscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
641 &sc->fw_ver_minor, &sc->fw_ver_tiny);
643 if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
644 && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
645 device_printf(sc->dev, "Found firmware version %s\n",
647 device_printf(sc->dev, "Driver needs %d.%d\n",
648 MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
656 z_alloc(void *nil, u_int items, u_int size)
660 ptr = kmalloc(items * size, M_TEMP, M_NOWAIT);
665 z_free(void *nil, void *ptr)
671 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
674 char *inflate_buffer;
675 const struct firmware *fw;
676 const mcp_gen_header_t *hdr;
683 fw = firmware_get(sc->fw_name);
685 device_printf(sc->dev, "Could not find firmware image %s\n",
692 /* setup zlib and decompress f/w */
693 bzero(&zs, sizeof (zs));
696 status = inflateInit(&zs);
697 if (status != Z_OK) {
702 /* the uncompressed size is stored as the firmware version,
703 which would otherwise go unused */
704 fw_len = (size_t) fw->version;
705 inflate_buffer = kmalloc(fw_len, M_TEMP, M_NOWAIT);
706 if (inflate_buffer == NULL)
708 zs.avail_in = fw->datasize;
709 zs.next_in = __DECONST(char *, fw->data);
710 zs.avail_out = fw_len;
711 zs.next_out = inflate_buffer;
712 status = inflate(&zs, Z_FINISH);
713 if (status != Z_STREAM_END) {
714 device_printf(sc->dev, "zlib %d\n", status);
716 goto abort_with_buffer;
720 hdr_offset = htobe32(*(const uint32_t *)
721 (inflate_buffer + MCP_HEADER_PTR_OFFSET));
722 if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
723 device_printf(sc->dev, "Bad firmware file\n");
725 goto abort_with_buffer;
727 hdr = (const void*)(inflate_buffer + hdr_offset);
729 status = mxge_validate_firmware(sc, hdr);
731 goto abort_with_buffer;
733 /* Copy the inflated firmware to NIC SRAM. */
734 for (i = 0; i < fw_len; i += 256) {
735 mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
737 min(256U, (unsigned)(fw_len - i)));
746 kfree(inflate_buffer, M_TEMP);
750 firmware_put(fw, FIRMWARE_UNLOAD);
755 * Enable or disable periodic RDMAs from the host to make certain
756 * chipsets resend dropped PCIe messages
759 mxge_dummy_rdma(mxge_softc_t *sc, int enable)
762 volatile uint32_t *confirm;
763 volatile char *submit;
764 uint32_t *buf, dma_low, dma_high;
767 buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
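	/*
	 * The expression above rounds the address of buf_bytes up to the
	 * next 8-byte boundary, the same trick used in mxge_send_cmd() to
	 * guarantee an 8-byte-aligned source for the PIO copy below.
	 */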
769 /* clear confirmation addr */
770 confirm = (volatile uint32_t *)sc->cmd;
774 /* send an rdma command to the PCIe engine, and wait for the
775 response in the confirmation address. The firmware should
776 write a -1 there to indicate it is alive and well
779 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
780 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
781 buf[0] = htobe32(dma_high); /* confirm addr MSW */
782 buf[1] = htobe32(dma_low); /* confirm addr LSW */
783 buf[2] = htobe32(0xffffffff); /* confirm data */
784 dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
785 dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
786 buf[3] = htobe32(dma_high); /* dummy addr MSW */
787 buf[4] = htobe32(dma_low); /* dummy addr LSW */
788 buf[5] = htobe32(enable); /* enable? */
791 submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
793 mxge_pio_copy(submit, buf, 64);
798 while (*confirm != 0xffffffff && i < 20) {
802 if (*confirm != 0xffffffff) {
803 device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
804 (enable ? "enable" : "disable"), confirm,
811 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
814 char buf_bytes[sizeof(*buf) + 8];
815 volatile mcp_cmd_response_t *response = sc->cmd;
816 volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
817 uint32_t dma_low, dma_high;
818 int err, sleep_total = 0;
821 * We may be called during attach, before if_serializer is available.
822 * This is not a fast path, just check for NULL
825 if (sc->ifp->if_serializer)
826 ASSERT_SERIALIZED(sc->ifp->if_serializer);
828 /* ensure buf is aligned to 8 bytes */
829 buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
831 buf->data0 = htobe32(data->data0);
832 buf->data1 = htobe32(data->data1);
833 buf->data2 = htobe32(data->data2);
834 buf->cmd = htobe32(cmd);
835 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
836 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
838 buf->response_addr.low = htobe32(dma_low);
839 buf->response_addr.high = htobe32(dma_high);
841 response->result = 0xffffffff;
843 mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
845 /* wait up to 20ms */
847 for (sleep_total = 0; sleep_total < 20; sleep_total++) {
848 bus_dmamap_sync(sc->cmd_dma.dmat,
849 sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
851 switch (be32toh(response->result)) {
853 data->data0 = be32toh(response->data);
859 case MXGEFW_CMD_UNKNOWN:
862 case MXGEFW_CMD_ERROR_UNALIGNED:
865 case MXGEFW_CMD_ERROR_BUSY:
868 case MXGEFW_CMD_ERROR_I2C_ABSENT:
872 device_printf(sc->dev,
874 "failed, result = %d\n",
875 cmd, be32toh(response->result));
883 device_printf(sc->dev, "mxge: command %d timed out"
885 cmd, be32toh(response->result));
890 mxge_adopt_running_firmware(mxge_softc_t *sc)
892 struct mcp_gen_header *hdr;
893 const size_t bytes = sizeof (struct mcp_gen_header);
897 /* find running firmware header */
898 hdr_offset = htobe32(*(volatile uint32_t *)
899 (sc->sram + MCP_HEADER_PTR_OFFSET));
901 if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
902 device_printf(sc->dev,
903 "Running firmware has bad header offset (%d)\n",
908 /* copy header of running firmware from SRAM to host memory to
909 * validate firmware */
910 hdr = kmalloc(bytes, M_DEVBUF, M_NOWAIT);
912 device_printf(sc->dev, "could not kmalloc firmware hdr\n");
915 bus_space_read_region_1(rman_get_bustag(sc->mem_res),
916 rman_get_bushandle(sc->mem_res),
917 hdr_offset, (char *)hdr, bytes);
918 status = mxge_validate_firmware(sc, hdr);
919 kfree(hdr, M_DEVBUF);
922 * check to see if adopted firmware has bug where adopting
923 * it will cause broadcasts to be filtered unless the NIC
924 * is kept in ALLMULTI mode
926 if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
927 sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
928 sc->adopted_rx_filter_bug = 1;
929 device_printf(sc->dev, "Adopting fw %d.%d.%d: "
930 "working around rx filter bug\n",
931 sc->fw_ver_major, sc->fw_ver_minor,
939 mxge_load_firmware(mxge_softc_t *sc, int adopt)
941 volatile uint32_t *confirm;
942 volatile char *submit;
944 uint32_t *buf, size, dma_low, dma_high;
947 buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
949 size = sc->sram_size;
950 status = mxge_load_firmware_helper(sc, &size);
954 /* Try to use the currently running firmware, if
956 status = mxge_adopt_running_firmware(sc);
958 device_printf(sc->dev,
959 "failed to adopt running firmware\n");
962 device_printf(sc->dev,
963 "Successfully adopted running firmware\n");
964 if (sc->tx_boundary == 4096) {
965 device_printf(sc->dev,
966 "Using firmware currently running on NIC"
968 device_printf(sc->dev,
969 "performance consider loading optimized "
972 sc->fw_name = mxge_fw_unaligned;
973 sc->tx_boundary = 2048;
976 /* clear confirmation addr */
977 confirm = (volatile uint32_t *)sc->cmd;
980 /* send a reload command to the bootstrap MCP, and wait for the
981 response in the confirmation address. The firmware should
982 write a -1 there to indicate it is alive and well
985 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
986 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
988 buf[0] = htobe32(dma_high); /* confirm addr MSW */
989 buf[1] = htobe32(dma_low); /* confirm addr LSW */
990 buf[2] = htobe32(0xffffffff); /* confirm data */
992 /* FIX: All newest firmware should un-protect the bottom of
993 the sram before handoff. However, the very first interfaces
994 do not. Therefore the handoff copy must skip the first 8 bytes
996 /* where the code starts*/
997 buf[3] = htobe32(MXGE_FW_OFFSET + 8);
998 buf[4] = htobe32(size - 8); /* length of code */
999 buf[5] = htobe32(8); /* where to copy to */
1000 buf[6] = htobe32(0); /* where to jump to */
1002 submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
1003 mxge_pio_copy(submit, buf, 64);
1008 while (*confirm != 0xffffffff && i < 20) {
1011 bus_dmamap_sync(sc->cmd_dma.dmat,
1012 sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
1014 if (*confirm != 0xffffffff) {
1015 device_printf(sc->dev,"handoff failed (%p = 0x%x)",
1024 mxge_update_mac_address(mxge_softc_t *sc)
1027 uint8_t *addr = sc->mac_addr;
1031 cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1032 | (addr[2] << 8) | addr[3]);
1034 cmd.data1 = ((addr[4] << 8) | (addr[5]));
1036 status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
1041 mxge_change_pause(mxge_softc_t *sc, int pause)
1047 status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1050 status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1054 device_printf(sc->dev, "Failed to set flow control mode\n");
1062 mxge_change_promisc(mxge_softc_t *sc, int promisc)
1068 if (sc->ifp->if_serializer)
1069 ASSERT_SERIALIZED(sc->ifp->if_serializer);
1071 if (mxge_always_promisc)
1075 status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1078 status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1082 device_printf(sc->dev, "Failed to set promisc mode\n");
1087 mxge_set_multicast_list(mxge_softc_t *sc)
1090 struct ifmultiaddr *ifma;
1091 struct ifnet *ifp = sc->ifp;
1095 if (ifp->if_serializer)
1096 ASSERT_SERIALIZED(ifp->if_serializer);
1098 /* This firmware is known to not support multicast */
1099 if (!sc->fw_multicast_support)
1102 /* Disable multicast filtering while we play with the lists*/
1103 err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1105 device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
1106 " error status: %d\n", err);
1110 if (sc->adopted_rx_filter_bug)
1113 if (ifp->if_flags & IFF_ALLMULTI)
1114 /* request to disable multicast filtering, so quit here */
1117 /* Flush all the filters */
1119 err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1121 device_printf(sc->dev,
1122 "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
1123 ", error status: %d\n", err);
1127 /* Walk the multicast list, and add each address */
1129 TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1130 if (ifma->ifma_addr->sa_family != AF_LINK)
1132 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1134 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
1136 cmd.data0 = htonl(cmd.data0);
1137 cmd.data1 = htonl(cmd.data1);
1138 err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1140 device_printf(sc->dev, "Failed "
1141 "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
1143 /* abort, leaving multicast filtering off */
1147 /* Enable multicast filtering */
1148 err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1150 device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
1151 ", error status: %d\n", err);
1157 mxge_max_mtu(mxge_softc_t *sc)
1162 if (MJUMPAGESIZE - MXGEFW_PAD > MXGEFW_MAX_MTU)
1163 return MXGEFW_MAX_MTU - MXGEFW_PAD;
1165 /* try to set nbufs to see if we can
1166 use virtually contiguous jumbos */
1168 status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1171 return MXGEFW_MAX_MTU - MXGEFW_PAD;
1173 /* otherwise, we're limited to MJUMPAGESIZE */
1174 return MJUMPAGESIZE - MXGEFW_PAD;
1179 mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1181 struct mxge_slice_state *ss;
1182 mxge_rx_done_t *rx_done;
1183 volatile uint32_t *irq_claim;
1187 /* try to send a reset command to the card to see if it
1189 memset(&cmd, 0, sizeof (cmd));
1190 status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1192 device_printf(sc->dev, "failed reset\n");
1196 mxge_dummy_rdma(sc, 1);
1199 /* set the intrq size */
1200 cmd.data0 = sc->rx_ring_size;
1201 status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1204 * Even though we already know how many slices are supported
1205 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
1206 * has magic side effects, and must be called after a reset.
1207 * It must be called prior to calling any RSS related cmds,
1208 * including assigning an interrupt queue for anything but
1209 * slice 0. It must also be called *after*
1210 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1211 * the firmware to compute offsets.
1214 if (sc->num_slices > 1) {
1215 /* ask the maximum number of slices it supports */
1216 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
1219 device_printf(sc->dev,
1220 "failed to get number of slices\n");
1224 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1225 * to setting up the interrupt queue DMA
1227 cmd.data0 = sc->num_slices;
1228 cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
1229 #ifdef IFNET_BUF_RING
1230 cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1232 status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
1235 device_printf(sc->dev,
1236 "failed to set number of slices\n");
1242 if (interrupts_setup) {
1243 /* Now exchange information about interrupts */
1244 for (slice = 0; slice < sc->num_slices; slice++) {
1245 rx_done = &sc->ss[slice].rx_done;
1246 memset(rx_done->entry, 0, sc->rx_ring_size);
1247 cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
1248 cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
1250 status |= mxge_send_cmd(sc,
1251 MXGEFW_CMD_SET_INTRQ_DMA,
1256 status |= mxge_send_cmd(sc,
1257 MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
1260 sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1262 status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1263 irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1266 status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
1268 sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1270 device_printf(sc->dev, "failed set interrupt parameters\n");
1275 *sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1278 /* run a DMA benchmark */
1279 (void) mxge_dma_test(sc, MXGEFW_DMA_TEST);
1281 for (slice = 0; slice < sc->num_slices; slice++) {
1282 ss = &sc->ss[slice];
1284 ss->irq_claim = irq_claim + (2 * slice);
1285 /* reset mcp/driver shared state back to 0 */
1286 ss->rx_done.idx = 0;
1287 ss->rx_done.cnt = 0;
1290 ss->tx.pkt_done = 0;
1291 ss->tx.queue_active = 0;
1292 ss->tx.activate = 0;
1293 ss->tx.deactivate = 0;
1298 ss->rx_small.cnt = 0;
1299 if (ss->fw_stats != NULL) {
1300 bzero(ss->fw_stats, sizeof *ss->fw_stats);
1303 sc->rdma_tags_available = 15;
1304 status = mxge_update_mac_address(sc);
1305 mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
1306 mxge_change_pause(sc, sc->pause);
1307 mxge_set_multicast_list(sc);
1309 cmd.data0 = sc->throttle;
1310 if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR,
1312 device_printf(sc->dev,
1313 "can't enable throttle\n");
1320 mxge_change_throttle(SYSCTL_HANDLER_ARGS)
1325 unsigned int throttle;
1328 throttle = sc->throttle;
1329 err = sysctl_handle_int(oidp, &throttle, arg2, req);
1334 if (throttle == sc->throttle)
1337 if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
1341 lwkt_serialize_enter(sc->ifp->if_serializer);
1343 cmd.data0 = throttle;
1344 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
1346 sc->throttle = throttle;
1348 lwkt_serialize_exit(sc->ifp->if_serializer);
1353 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1356 unsigned int intr_coal_delay;
1360 intr_coal_delay = sc->intr_coal_delay;
1361 err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1365 if (intr_coal_delay == sc->intr_coal_delay)
1368 if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1372 lwkt_serialize_enter(sc->ifp->if_serializer);
1374 *sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1375 sc->intr_coal_delay = intr_coal_delay;
1377 lwkt_serialize_exit(sc->ifp->if_serializer);
1382 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1385 unsigned int enabled;
1389 enabled = sc->pause;
1390 err = sysctl_handle_int(oidp, &enabled, arg2, req);
1394 if (enabled == sc->pause)
1398 lwkt_serialize_enter(sc->ifp->if_serializer);
1399 err = mxge_change_pause(sc, enabled);
1400 lwkt_serialize_exit(sc->ifp->if_serializer);
1406 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1412 arg2 = be32toh(*(int *)arg1);
1414 err = sysctl_handle_int(oidp, arg1, arg2, req);
1420 mxge_rem_sysctls(mxge_softc_t *sc)
1422 struct mxge_slice_state *ss;
1425 if (sc->slice_sysctl_tree == NULL)
1428 for (slice = 0; slice < sc->num_slices; slice++) {
1429 ss = &sc->ss[slice];
1430 if (ss == NULL || ss->sysctl_tree == NULL)
1432 sysctl_ctx_free(&ss->sysctl_ctx);
1433 ss->sysctl_tree = NULL;
1435 sysctl_ctx_free(&sc->slice_sysctl_ctx);
1436 sc->slice_sysctl_tree = NULL;
1437 sysctl_ctx_free(&sc->sysctl_ctx);
1438 sc->sysctl_tree = NULL;
1442 mxge_add_sysctls(mxge_softc_t *sc)
1444 struct sysctl_ctx_list *ctx;
1445 struct sysctl_oid_list *children;
1447 struct mxge_slice_state *ss;
1451 ctx = &sc->sysctl_ctx;
1452 sysctl_ctx_init(ctx);
1453 sc->sysctl_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_STATIC_CHILDREN(_hw),
1455 device_get_nameunit(sc->dev),
1457 if (sc->sysctl_tree == NULL) {
1458 device_printf(sc->dev, "can't add sysctl node\n");
1462 children = SYSCTL_CHILDREN(sc->sysctl_tree);
1463 fw = sc->ss[0].fw_stats;
1465 /* random information */
1466 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1468 CTLFLAG_RD, &sc->fw_version,
1469 0, "firmware version");
1470 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1472 CTLFLAG_RD, &sc->serial_number_string,
1473 0, "serial number");
1474 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1476 CTLFLAG_RD, &sc->product_code_string,
1478 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1480 CTLFLAG_RD, &sc->link_width,
1482 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1484 CTLFLAG_RD, &sc->tx_boundary,
1486 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1488 CTLFLAG_RD, &sc->wc,
1489 0, "write combining PIO?");
1490 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1492 CTLFLAG_RD, &sc->read_dma,
1493 0, "DMA Read speed in MB/s");
1494 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1496 CTLFLAG_RD, &sc->write_dma,
1497 0, "DMA Write speed in MB/s");
1498 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1499 "read_write_dma_MBs",
1500 CTLFLAG_RD, &sc->read_write_dma,
1501 0, "DMA concurrent Read/Write speed in MB/s");
1502 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1504 CTLFLAG_RD, &sc->watchdog_resets,
1505 0, "Number of times NIC was reset");
1508 /* performance related tunables */
1509 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1511 CTLTYPE_INT|CTLFLAG_RW, sc,
1512 0, mxge_change_intr_coal,
1513 "I", "interrupt coalescing delay in usecs");
1515 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1517 CTLTYPE_INT|CTLFLAG_RW, sc,
1518 0, mxge_change_throttle,
1519 "I", "transmit throttling");
1521 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1522 "flow_control_enabled",
1523 CTLTYPE_INT|CTLFLAG_RW, sc,
1524 0, mxge_change_flow_control,
1525 "I", "interrupt coalescing delay in usecs");
1527 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1529 CTLFLAG_RW, &mxge_deassert_wait,
1530 0, "Wait for IRQ line to go low in ihandler");
1532 /* stats block from firmware is in network byte order.
1534 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1536 CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1537 0, mxge_handle_be32,
1539 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1540 "rdma_tags_available",
1541 CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1542 0, mxge_handle_be32,
1543 "I", "rdma_tags_available");
1544 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1545 "dropped_bad_crc32",
1546 CTLTYPE_INT|CTLFLAG_RD,
1547 &fw->dropped_bad_crc32,
1548 0, mxge_handle_be32,
1549 "I", "dropped_bad_crc32");
1550 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1552 CTLTYPE_INT|CTLFLAG_RD,
1553 &fw->dropped_bad_phy,
1554 0, mxge_handle_be32,
1555 "I", "dropped_bad_phy");
1556 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1557 "dropped_link_error_or_filtered",
1558 CTLTYPE_INT|CTLFLAG_RD,
1559 &fw->dropped_link_error_or_filtered,
1560 0, mxge_handle_be32,
1561 "I", "dropped_link_error_or_filtered");
1562 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1563 "dropped_link_overflow",
1564 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1565 0, mxge_handle_be32,
1566 "I", "dropped_link_overflow");
1567 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1568 "dropped_multicast_filtered",
1569 CTLTYPE_INT|CTLFLAG_RD,
1570 &fw->dropped_multicast_filtered,
1571 0, mxge_handle_be32,
1572 "I", "dropped_multicast_filtered");
1573 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1574 "dropped_no_big_buffer",
1575 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1576 0, mxge_handle_be32,
1577 "I", "dropped_no_big_buffer");
1578 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1579 "dropped_no_small_buffer",
1580 CTLTYPE_INT|CTLFLAG_RD,
1581 &fw->dropped_no_small_buffer,
1582 0, mxge_handle_be32,
1583 "I", "dropped_no_small_buffer");
1584 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1586 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1587 0, mxge_handle_be32,
1588 "I", "dropped_overrun");
1589 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1591 CTLTYPE_INT|CTLFLAG_RD,
1593 0, mxge_handle_be32,
1594 "I", "dropped_pause");
1595 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1597 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1598 0, mxge_handle_be32,
1599 "I", "dropped_runt");
1601 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1602 "dropped_unicast_filtered",
1603 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
1604 0, mxge_handle_be32,
1605 "I", "dropped_unicast_filtered");
1607 /* verbose printing? */
1608 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1610 CTLFLAG_RW, &mxge_verbose,
1611 0, "verbose printing");
1613 /* add counters exported for debugging from all slices */
1614 sysctl_ctx_init(&sc->slice_sysctl_ctx);
1615 sc->slice_sysctl_tree =
1616 SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1617 "slice", CTLFLAG_RD, 0, "");
1619 for (slice = 0; slice < sc->num_slices; slice++) {
1620 ss = &sc->ss[slice];
1621 sysctl_ctx_init(&ss->sysctl_ctx);
1622 ctx = &ss->sysctl_ctx;
1623 children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1624 ksprintf(slice_num, "%d", slice);
1626 SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1628 children = SYSCTL_CHILDREN(ss->sysctl_tree);
1629 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1631 CTLFLAG_RD, &ss->rx_small.cnt,
1633 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1635 CTLFLAG_RD, &ss->rx_big.cnt,
1638 #ifndef IFNET_BUF_RING
1639 /* only transmit from slice 0 for now */
1643 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1645 CTLFLAG_RD, &ss->tx.req,
1648 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1650 CTLFLAG_RD, &ss->tx.done,
1652 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1654 CTLFLAG_RD, &ss->tx.pkt_done,
1656 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1658 CTLFLAG_RD, &ss->tx.stall,
1660 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1662 CTLFLAG_RD, &ss->tx.wake,
1664 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1666 CTLFLAG_RD, &ss->tx.defrag,
1668 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1670 CTLFLAG_RD, &ss->tx.queue_active,
1671 0, "tx_queue_active");
1672 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1674 CTLFLAG_RD, &ss->tx.activate,
1676 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1678 CTLFLAG_RD, &ss->tx.deactivate,
1679 0, "tx_deactivate");
1684 * Copy an array of mcp_kreq_ether_send_t's to the mcp. Copy
1685 * backwards one at a time and handle ring wraps
1688 mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1689 mcp_kreq_ether_send_t *src, int cnt)
1691 int idx, starting_slot;
1692 starting_slot = tx->req;
1695 idx = (starting_slot + cnt) & tx->mask;
1696 mxge_pio_copy(&tx->lanai[idx],
1697 &src[cnt], sizeof(*src));
1703 * Copy an array of mcp_kreq_ether_send_t's to the mcp. Copy
1704 * at most 32 bytes at a time, so as to avoid involving the software
1705 * pio handler in the nic. We re-write the first segment's flags
1706 * to mark them valid only after writing the entire chain
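 *
 * In other words (sketching the ordering below): all descriptors are
 * written out first, and only then is the 32-bit word holding the
 * first descriptor's flags rewritten, so the NIC never sees a valid
 * first descriptor in front of a partially written chain.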
1709 mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src, int cnt)
1713 volatile uint32_t *dst_ints;
1714 mcp_kreq_ether_send_t *srcp;
1715 volatile mcp_kreq_ether_send_t *dstp, *dst;
1718 idx = tx->req & tx->mask;
1720 last_flags = src->flags;
1723 dst = dstp = &tx->lanai[idx];
1726 if ((idx + cnt) < tx->mask) {
1727 for (i = 0; i < (cnt - 1); i += 2) {
1728 mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1729 wmb(); /* force write every 32 bytes */
1734 /* submit all but the first request, and ensure
1735 that it is submitted below */
1736 mxge_submit_req_backwards(tx, src, cnt);
1740 /* submit the first request */
1741 mxge_pio_copy(dstp, srcp, sizeof(*src));
1742 wmb(); /* barrier before setting valid flag */
1745 /* re-write the last 32-bits with the valid flags */
1746 src->flags = last_flags;
1747 src_ints = (uint32_t *)src;
1749 dst_ints = (volatile uint32_t *)dst;
1751 *dst_ints = *src_ints;
1757 mxge_pullup_tso(struct mbuf **mp)
1759 int hoff, iphlen, thoff;
1763 KASSERT(M_WRITABLE(m), ("TSO mbuf not writable"));
1765 iphlen = m->m_pkthdr.csum_iphlen;
1766 thoff = m->m_pkthdr.csum_thlen;
1767 hoff = m->m_pkthdr.csum_lhlen;
1769 KASSERT(iphlen > 0, ("invalid ip hlen"));
1770 KASSERT(thoff > 0, ("invalid tcp hlen"));
1771 KASSERT(hoff > 0, ("invalid ether hlen"));
1773 if (__predict_false(m->m_len < hoff + iphlen + thoff)) {
1774 m = m_pullup(m, hoff + iphlen + thoff);
1785 mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1789 mcp_kreq_ether_send_t *req;
1790 bus_dma_segment_t *seg;
1791 uint32_t low, high_swapped;
1792 int len, seglen, cum_len, cum_len_next;
1793 int next_is_first, chop, cnt, rdma_count, small;
1794 uint16_t pseudo_hdr_offset, cksum_offset, mss;
1795 uint8_t flags, flags_next;
1798 mss = m->m_pkthdr.tso_segsz;
1800 /* negative cum_len signifies to the
1801 * send loop that we are still in the
1802 * header portion of the TSO packet.
1804 cum_len = -(m->m_pkthdr.csum_lhlen + m->m_pkthdr.csum_iphlen +
1805 m->m_pkthdr.csum_thlen);
1807 /* TSO implies checksum offload on this hardware */
1808 cksum_offset = m->m_pkthdr.csum_lhlen + m->m_pkthdr.csum_iphlen;
1809 flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1811 /* for TSO, pseudo_hdr_offset holds mss.
1812 * The firmware figures out where to put
1813 * the checksum by parsing the header. */
1814 pseudo_hdr_offset = htobe16(mss);
1821 /* "rdma_count" is the number of RDMAs belonging to the
1822 * current packet BEFORE the current send request. For
1823 * non-TSO packets, this is equal to "count".
1824 * For TSO packets, rdma_count needs to be reset
1825 * to 0 after a segment cut.
1827 * The rdma_count field of the send request is
1828 * the number of RDMAs of the packet starting at
1829 * that request. For TSO send requests with one or more cuts
1830 * in the middle, this is the number of RDMAs starting
1831 * after the last cut in the request. All previous
1832 * segments before the last cut implicitly have 1 RDMA.
1834 * Since the number of RDMAs is not known beforehand,
1835 * it must be filled-in retroactively - after each
1836 * segmentation cut or at the end of the entire packet.
1839 while (busdma_seg_cnt) {
1840 /* Break the busdma segment up into pieces*/
1841 low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1842 high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1846 flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1848 cum_len_next = cum_len + seglen;
1849 (req-rdma_count)->rdma_count = rdma_count + 1;
1850 if (__predict_true(cum_len >= 0)) {
1852 chop = (cum_len_next > mss);
1853 cum_len_next = cum_len_next % mss;
1854 next_is_first = (cum_len_next == 0);
1855 flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1856 flags_next |= next_is_first *
1858 rdma_count |= -(chop | next_is_first);
1859 rdma_count += chop & !next_is_first;
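			/*
			 * Branch-free reset of rdma_count (see the block
			 * comment above the loop): chop and next_is_first are
			 * 0 or 1, so the OR either leaves rdma_count untouched
			 * or forces it to -1, and the add then raises it to 0
			 * only in the chop-without-new-first case.
			 */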
1860 } else if (cum_len_next >= 0) {
1865 small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1866 flags_next = MXGEFW_FLAGS_TSO_PLD |
1867 MXGEFW_FLAGS_FIRST |
1868 (small * MXGEFW_FLAGS_SMALL);
1871 req->addr_high = high_swapped;
1872 req->addr_low = htobe32(low);
1873 req->pseudo_hdr_offset = pseudo_hdr_offset;
1875 req->rdma_count = 1;
1876 req->length = htobe16(seglen);
1877 req->cksum_offset = cksum_offset;
1878 req->flags = flags | ((cum_len & 1) *
1879 MXGEFW_FLAGS_ALIGN_ODD);
1882 cum_len = cum_len_next;
1887 if (__predict_false(cksum_offset > seglen))
1888 cksum_offset -= seglen;
1891 if (__predict_false(cnt > tx->max_desc))
1897 (req-rdma_count)->rdma_count = rdma_count;
1901 req->flags |= MXGEFW_FLAGS_TSO_LAST;
1902 } while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1904 tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1905 mxge_submit_req(tx, tx->req_list, cnt);
1906 #ifdef IFNET_BUF_RING
1907 if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1908 /* tell the NIC to start polling this slice */
1910 tx->queue_active = 1;
1918 bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1922 kprintf("tx->max_desc exceeded via TSO!\n");
1923 kprintf("mss = %d, %ld, %d!\n", mss,
1924 (long)seg - (long)tx->seg_list, tx->max_desc);
1930 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
1933 mcp_kreq_ether_send_t *req;
1934 bus_dma_segment_t *seg;
1936 int cnt, cum_len, err, i, idx, odd_flag;
1937 uint16_t pseudo_hdr_offset;
1938 uint8_t flags, cksum_offset;
1943 if (m->m_pkthdr.csum_flags & CSUM_TSO) {
1944 if (mxge_pullup_tso(&m))
1948 /* (try to) map the frame for DMA */
1949 idx = tx->req & tx->mask;
1950 err = bus_dmamap_load_mbuf_defrag(tx->dmat, tx->info[idx].map, &m,
1951 tx->seg_list, tx->max_desc - 2, &cnt, BUS_DMA_NOWAIT);
1952 if (__predict_false(err != 0))
1954 bus_dmamap_sync(tx->dmat, tx->info[idx].map, BUS_DMASYNC_PREWRITE);
1955 tx->info[idx].m = m;
1957 /* TSO is different enough, we handle it in another routine */
1958 if (m->m_pkthdr.csum_flags & CSUM_TSO) {
1959 mxge_encap_tso(ss, m, cnt);
1965 pseudo_hdr_offset = 0;
1966 flags = MXGEFW_FLAGS_NO_TSO;
1968 /* checksum offloading? */
1969 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
1970 cksum_offset = m->m_pkthdr.csum_lhlen + m->m_pkthdr.csum_iphlen;
1971 pseudo_hdr_offset = cksum_offset + m->m_pkthdr.csum_data;
1972 pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
1973 req->cksum_offset = cksum_offset;
1974 flags |= MXGEFW_FLAGS_CKSUM;
1975 odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
1979 if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
1980 flags |= MXGEFW_FLAGS_SMALL;
1982 /* convert segments into a request list */
1985 req->flags = MXGEFW_FLAGS_FIRST;
1986 for (i = 0; i < cnt; i++) {
1988 htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
1990 htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1991 req->length = htobe16(seg->ds_len);
1992 req->cksum_offset = cksum_offset;
1993 if (cksum_offset > seg->ds_len)
1994 cksum_offset -= seg->ds_len;
1997 req->pseudo_hdr_offset = pseudo_hdr_offset;
1998 req->pad = 0; /* complete solid 16-byte block */
1999 req->rdma_count = 1;
2000 req->flags |= flags | ((cum_len & 1) * odd_flag);
2001 cum_len += seg->ds_len;
2007 /* pad runts to 60 bytes */
2011 htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2013 htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2014 req->length = htobe16(60 - cum_len);
2015 req->cksum_offset = 0;
2016 req->pseudo_hdr_offset = pseudo_hdr_offset;
2017 req->pad = 0; /* complete solid 16-byte block */
2018 req->rdma_count = 1;
2019 req->flags |= flags | ((cum_len & 1) * odd_flag);
2023 tx->req_list[0].rdma_count = cnt;
2025 /* print what the firmware will see */
2026 for (i = 0; i < cnt; i++) {
2027 kprintf("%d: addr: 0x%x 0x%x len:%d pso%d,"
2028 "cso:%d, flags:0x%x, rdma:%d\n",
2029 i, (int)ntohl(tx->req_list[i].addr_high),
2030 (int)ntohl(tx->req_list[i].addr_low),
2031 (int)ntohs(tx->req_list[i].length),
2032 (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2033 tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2034 tx->req_list[i].rdma_count);
2036 kprintf("--------------\n");
2038 tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2039 mxge_submit_req(tx, tx->req_list, cnt);
2040 #ifdef IFNET_BUF_RING
2041 if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2042 /* tell the NIC to start polling this slice */
2044 tx->queue_active = 1;
2057 mxge_start_locked(struct mxge_slice_state *ss)
2067 while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2068 m = ifq_dequeue(&ifp->if_snd);
2072 /* let BPF see it */
2075 /* give it to the nic */
2079 /* ran out of transmit slots */
2080 ifq_set_oactive(&ifp->if_snd);
2084 mxge_start(struct ifnet *ifp, struct ifaltq_subque *ifsq)
2086 mxge_softc_t *sc = ifp->if_softc;
2087 struct mxge_slice_state *ss;
2089 ASSERT_ALTQ_SQ_DEFAULT(ifp, ifsq);
2090 ASSERT_SERIALIZED(sc->ifp->if_serializer);
2091 /* only use the first slice for now */
2093 mxge_start_locked(ss);
2097 * copy an array of mcp_kreq_ether_recv_t's to the mcp. Copy
2098 * at most 32 bytes at a time, so as to avoid involving the software
2099 * pio handler in the nic. We re-write the first segment's low
2100 * DMA address to mark it valid only after we write the entire chunk
2104 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2105 mcp_kreq_ether_recv_t *src)
2109 low = src->addr_low;
2110 src->addr_low = 0xffffffff;
2111 mxge_pio_copy(dst, src, 4 * sizeof (*src));
2113 mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2115 src->addr_low = low;
2116 dst->addr_low = low;
2121 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2123 bus_dma_segment_t seg;
2125 mxge_rx_ring_t *rx = &ss->rx_small;
2128 m = m_gethdr(MB_DONTWAIT, MT_DATA);
2134 m->m_len = m->m_pkthdr.len = MHLEN;
2135 err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2136 &seg, 1, &cnt, BUS_DMA_NOWAIT);
2138 kprintf("can't dmamap small (%d)\n", err);
2142 rx->info[idx].m = m;
2143 rx->shadow[idx].addr_low =
2144 htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2145 rx->shadow[idx].addr_high =
2146 htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2150 mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2155 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2157 bus_dma_segment_t seg[3];
2159 mxge_rx_ring_t *rx = &ss->rx_big;
2162 if (rx->cl_size == MCLBYTES)
2163 m = m_getcl(MB_DONTWAIT, MT_DATA, M_PKTHDR);
2166 m = m_getjcl(MB_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2169 * XXX: allocate normal sized buffers for big buffers.
2170 * We should be fine as long as we don't get any jumbo frames
2172 m = m_getcl(MB_DONTWAIT, MT_DATA, M_PKTHDR);
2180 m->m_len = m->m_pkthdr.len = rx->mlen;
2181 err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2182 seg, 1, &cnt, BUS_DMA_NOWAIT);
2184 kprintf("can't dmamap big (%d)\n", err);
2188 rx->info[idx].m = m;
2189 rx->shadow[idx].addr_low =
2190 htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2191 rx->shadow[idx].addr_high =
2192 htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2194 #if MXGE_VIRT_JUMBOS
2195 for (i = 1; i < cnt; i++) {
2196 rx->shadow[idx + i].addr_low =
2197 htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2198 rx->shadow[idx + i].addr_high =
2199 htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2204 for (i = 0; i < rx->nbufs; i++) {
2205 if ((idx & 7) == 7) {
2206 mxge_submit_8rx(&rx->lanai[idx - 7],
2207 &rx->shadow[idx - 7]);
2215 * Myri10GE hardware checksums are not valid if the sender
2216 * padded the frame with non-zero padding. This is because
2217 * the firmware just does a simple 16-bit 1s complement
2218 * checksum across the entire frame, excluding the first 14
2219 * bytes. It is best to simply check the checksum and
2220 * tell the stack about it only if the checksum is good
2222 static inline uint16_t
2223 mxge_rx_csum(struct mbuf *m, int csum)
2225 struct ether_header *eh;
2229 eh = mtod(m, struct ether_header *);
2231 /* only deal with IPv4 TCP & UDP for now */
2232 if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2234 ip = (struct ip *)(eh + 1);
2235 if (__predict_false(ip->ip_p != IPPROTO_TCP &&
2236 ip->ip_p != IPPROTO_UDP))
2239 c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2240 htonl(ntohs(csum) + ntohs(ip->ip_len) +
2241 - (ip->ip_hl << 2) + ip->ip_p));
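	/*
	 * Rationale: a valid IP header sums to 0xffff in ones-complement,
	 * so the firmware's checksum over "IP header + L4 segment" reduces
	 * to the sum over the L4 segment alone; folding in the pseudo-header
	 * fields (addresses, protocol, L4 length = ip_len - header length)
	 * should then yield the all-ones value for a packet whose TCP/UDP
	 * checksum is correct.
	 */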
2250 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2252 struct ether_vlan_header *evl;
2255 evl = mtod(m, struct ether_vlan_header *);
2258 * fix checksum by subtracting EVL_ENCAPLEN bytes
2259 * after what the firmware thought was the end of the ethernet
2263 /* put checksum into host byte order */
2264 *csum = ntohs(*csum);
2265 partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2266 (*csum) += ~partial;
2267 (*csum) += ((*csum) < ~partial);
2268 (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2269 (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
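	/*
	 * The lines above perform a ones-complement subtraction: adding
	 * ~partial together with the end-around carry removes those four
	 * bytes from the checksum, and the two folds reduce the 32-bit
	 * accumulator back to a 16-bit value.
	 */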
2271 /* restore checksum to network byte order;
2272 later consumers expect this */
2273 *csum = htons(*csum);
2276 m->m_pkthdr.ether_vlantag = ntohs(evl->evl_tag);
2277 m->m_flags |= M_VLANTAG;
2280 * Remove the 802.1q header by copying the Ethernet
2281 * addresses over it and adjusting the beginning of
2282 * the data in the mbuf. The encapsulated Ethernet
2283 * type field is already in place.
2285 bcopy((char *)evl, (char *)evl + EVL_ENCAPLEN,
2286 ETHER_HDR_LEN - ETHER_TYPE_LEN);
2287 m_adj(m, EVL_ENCAPLEN);
2292 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2297 struct ether_header *eh;
2299 bus_dmamap_t old_map;
2305 idx = rx->cnt & rx->mask;
2306 rx->cnt += rx->nbufs;
2307 /* save a pointer to the received mbuf */
2308 m = rx->info[idx].m;
2309 /* try to replace the received mbuf */
2310 if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2311 /* drop the frame -- the old mbuf is re-cycled */
2312 IFNET_STAT_INC(ifp, ierrors, 1);
2316 /* unmap the received buffer */
2317 old_map = rx->info[idx].map;
2318 bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2319 bus_dmamap_unload(rx->dmat, old_map);
2321 /* swap the bus_dmamap_t's */
2322 rx->info[idx].map = rx->extra_map;
2323 rx->extra_map = old_map;
2325 /* mcp implicitly skips 1st 2 bytes so that packet is properly
2327 m->m_data += MXGEFW_PAD;
2329 m->m_pkthdr.rcvif = ifp;
2330 m->m_len = m->m_pkthdr.len = len;
2332 eh = mtod(m, struct ether_header *);
2333 if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2334 mxge_vlan_tag_remove(m, &csum);
2336 /* if the checksum is valid, mark it in the mbuf header */
2337 if ((ifp->if_capenable & IFCAP_RXCSUM) &&
2338 0 == mxge_rx_csum(m, csum)) {
2339 /* Tell the stack that the checksum is good */
2340 m->m_pkthdr.csum_data = 0xffff;
2341 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2345 /* flowid only valid if RSS hashing is enabled */
2346 if (sc->num_slices > 1) {
2347 m->m_pkthdr.flowid = (ss - sc->ss);
2348 m->m_flags |= M_FLOWID;
2351 ifp->if_input(ifp, m);
2355 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2359 struct ether_header *eh;
2362 bus_dmamap_t old_map;
2368 idx = rx->cnt & rx->mask;
2370 /* save a pointer to the received mbuf */
2371 m = rx->info[idx].m;
2372 /* try to replace the received mbuf */
2373 if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2374 /* drop the frame -- the old mbuf is re-cycled */
2375 IFNET_STAT_INC(ifp, ierrors, 1);
2379 /* unmap the received buffer */
2380 old_map = rx->info[idx].map;
2381 bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2382 bus_dmamap_unload(rx->dmat, old_map);
2384 /* swap the bus_dmamap_t's */
2385 rx->info[idx].map = rx->extra_map;
2386 rx->extra_map = old_map;
2388 /* mcp implicitly skips 1st 2 bytes so that packet is properly
2390 m->m_data += MXGEFW_PAD;
2392 m->m_pkthdr.rcvif = ifp;
2393 m->m_len = m->m_pkthdr.len = len;
2395 eh = mtod(m, struct ether_header *);
2396 if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2397 mxge_vlan_tag_remove(m, &csum);
2399 /* if the checksum is valid, mark it in the mbuf header */
2400 if ((ifp->if_capenable & IFCAP_RXCSUM) &&
2401 0 == mxge_rx_csum(m, csum)) {
2402 /* Tell the stack that the checksum is good */
2403 m->m_pkthdr.csum_data = 0xffff;
2404 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2408 /* flowid only valid if RSS hashing is enabled */
2409 if (sc->num_slices > 1) {
2410 m->m_pkthdr.flowid = (ss - sc->ss);
2411 m->m_flags |= M_FLOWID;
2414 ifp->if_input(ifp, m);
2418 mxge_clean_rx_done(struct mxge_slice_state *ss)
2420 mxge_rx_done_t *rx_done = &ss->rx_done;
2425 while (rx_done->entry[rx_done->idx].length != 0) {
2426 length = ntohs(rx_done->entry[rx_done->idx].length);
2427 rx_done->entry[rx_done->idx].length = 0;
2428 checksum = rx_done->entry[rx_done->idx].checksum;
2429 if (length <= (MHLEN - MXGEFW_PAD))
2430 mxge_rx_done_small(ss, length, checksum);
2432 mxge_rx_done_big(ss, length, checksum);
2434 rx_done->idx = rx_done->cnt & rx_done->mask;
2436 /* limit potential for livelock */
2437 if (__predict_false(++limit > rx_done->mask / 2))
2443 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2453 ASSERT_SERIALIZED(ifp->if_serializer);
2454 while (tx->pkt_done != mcp_idx) {
2455 idx = tx->done & tx->mask;
2457 m = tx->info[idx].m;
2458 /* mbuf and DMA map only attached to the first
2461 ss->obytes += m->m_pkthdr.len;
2462 if (m->m_flags & M_MCAST)
2465 tx->info[idx].m = NULL;
2466 map = tx->info[idx].map;
2467 bus_dmamap_unload(tx->dmat, map);
2470 if (tx->info[idx].flag) {
2471 tx->info[idx].flag = 0;
2476 /* If we have space, clear OACTIVE to tell the stack that
2477 it's OK to send packets */
2478 if (tx->req - tx->done < (tx->mask + 1)/4)
2479 ifq_clr_oactive(&ifp->if_snd);
2481 if (!ifq_is_empty(&ifp->if_snd))
2484 #ifdef IFNET_BUF_RING
2485 if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2486 /* let the NIC stop polling this queue, since there
2487 * are no more transmits pending */
2488 if (tx->req == tx->done) {
2490 tx->queue_active = 0;
2498 static struct mxge_media_type mxge_xfp_media_types[] = {
2499 {IFM_10G_CX4, 0x7f, "10GBASE-CX4 (module)"},
2500 {IFM_10G_SR, (1 << 7), "10GBASE-SR"},
2501 {IFM_10G_LR, (1 << 6), "10GBASE-LR"},
2502 {0, (1 << 5), "10GBASE-ER"},
2503 {IFM_10G_LRM, (1 << 4), "10GBASE-LRM"},
2504 {0, (1 << 3), "10GBASE-SW"},
2505 {0, (1 << 2), "10GBASE-LW"},
2506 {0, (1 << 1), "10GBASE-EW"},
2507 {0, (1 << 0), "Reserved"}
2510 static struct mxge_media_type mxge_sfp_media_types[] = {
2511 {IFM_10G_TWINAX, 0, "10GBASE-Twinax"},
2512 {0, (1 << 7), "Reserved"},
2513 {IFM_10G_LRM, (1 << 6), "10GBASE-LRM"},
2514 {IFM_10G_LR, (1 << 5), "10GBASE-LR"},
2515 {IFM_10G_SR, (1 << 4), "10GBASE-SR"},
2516 {IFM_10G_TWINAX,(1 << 0), "10GBASE-Twinax"}
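/*
 * A worked example of how these tables are used by mxge_media_probe()
 * below: an SFP+ module whose compliance byte reads 0x10 has bit (1 << 4)
 * set and is reported as 10GBASE-SR; a byte of 0 matches the first
 * (Twinax) entry, which is compared by equality rather than by bitmask.
 */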
2520 mxge_media_set(mxge_softc_t *sc, int media_type)
2522 ifmedia_add(&sc->media, IFM_ETHER | IFM_FDX | media_type,
2524 ifmedia_set(&sc->media, IFM_ETHER | IFM_FDX | media_type);
2525 sc->current_media = media_type;
2526 sc->media.ifm_media = sc->media.ifm_cur->ifm_media;
2530 mxge_media_init(mxge_softc_t *sc)
2535 ifmedia_removeall(&sc->media);
2536 mxge_media_set(sc, IFM_AUTO);
2539 * parse the product code to determine the interface type
2540 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2541 * after the 3rd dash in the driver's cached copy of the
2542 * EEPROM's product code string.
2543 */
2544 ptr = sc->product_code_string;
2545 if (ptr == NULL) {
2546 device_printf(sc->dev, "Missing product code\n");
2547 return;
2548 }
2550 for (i = 0; i < 3; i++, ptr++) {
2551 ptr = strchr(ptr, '-');
2552 if (ptr == NULL) {
2553 device_printf(sc->dev,
2554 "only %d dashes in PC?!?\n", i);
2555 return;
2556 }
2558 if (*ptr == 'C' || *(ptr +1) == 'C') {
2560 sc->connector = MXGE_CX4;
2561 mxge_media_set(sc, IFM_10G_CX4);
2562 } else if (*ptr == 'Q') {
2563 /* -Q is Quad Ribbon Fiber */
2564 sc->connector = MXGE_QRF;
2565 device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2566 /* FreeBSD has no media type for Quad ribbon fiber */
2567 } else if (*ptr == 'R') {
2569 sc->connector = MXGE_XFP;
2570 } else if (*ptr == 'S' || *(ptr +1) == 'S') {
2571 /* -S or -2S is SFP+ */
2572 sc->connector = MXGE_SFP;
2574 device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2579 * Determine the media type for a NIC. Some XFPs will identify
2580 * themselves only when their link is up, so this is initiated via a
2581 * link up interrupt. However, this can potentially take up to
2582 * several milliseconds, so it is run via the watchdog routine, rather
2583 * than in the interrupt handler itself.
2586 mxge_media_probe(mxge_softc_t *sc)
2591 struct mxge_media_type *mxge_media_types = NULL;
2592 int i, err, ms, mxge_media_type_entries;
2595 sc->need_media_probe = 0;
2597 if (sc->connector == MXGE_XFP) {
2599 mxge_media_types = mxge_xfp_media_types;
2600 mxge_media_type_entries =
2601 sizeof (mxge_xfp_media_types) /
2602 sizeof (mxge_xfp_media_types[0]);
2603 byte = MXGE_XFP_COMPLIANCE_BYTE;
2605 } else if (sc->connector == MXGE_SFP) {
2606 /* -S or -2S is SFP+ */
2607 mxge_media_types = mxge_sfp_media_types;
2608 mxge_media_type_entries =
2609 sizeof (mxge_sfp_media_types) /
2610 sizeof (mxge_sfp_media_types[0]);
2614 /* nothing to do; media type cannot change */
2619 * At this point we know the NIC has an XFP cage, so now we
2620 * try to determine what is in the cage by using the
2621 * firmware's XFP I2C commands to read the XFP 10GbE compliance
2622 * register. We read just one byte, which may take over
2623 * a hundred ms.
2624 */
2626 cmd.data0 = 0; /* just fetch 1 byte, not all 256 */
2628 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2629 if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
2630 device_printf(sc->dev, "failed to read XFP\n");
2632 if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
2633 device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
2635 if (err != MXGEFW_CMD_OK) {
2639 /* now we wait for the data to be cached */
2641 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2642 for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
2645 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2647 if (err != MXGEFW_CMD_OK) {
2648 device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
2649 cage_type, err, ms);
2653 if (cmd.data0 == mxge_media_types[0].bitmask) {
2655 device_printf(sc->dev, "%s:%s\n", cage_type,
2656 mxge_media_types[0].name);
2657 if (sc->current_media != mxge_media_types[0].flag) {
2658 mxge_media_init(sc);
2659 mxge_media_set(sc, mxge_media_types[0].flag);
2663 for (i = 1; i < mxge_media_type_entries; i++) {
2664 if (cmd.data0 & mxge_media_types[i].bitmask) {
2666 device_printf(sc->dev, "%s:%s\n",
2667 cage_type,
2668 mxge_media_types[i].name);
2670 if (sc->current_media != mxge_media_types[i].flag) {
2671 mxge_media_init(sc);
2672 mxge_media_set(sc, mxge_media_types[i].flag);
2678 device_printf(sc->dev, "%s media 0x%x unknown\n",
2679 cage_type, cmd.data0);
2685 mxge_intr(void *arg)
2687 struct mxge_slice_state *ss = arg;
2688 mxge_softc_t *sc = ss->sc;
2689 mcp_irq_data_t *stats = ss->fw_stats;
2690 mxge_tx_ring_t *tx = &ss->tx;
2691 mxge_rx_done_t *rx_done = &ss->rx_done;
2692 uint32_t send_done_count;
2696 #ifndef IFNET_BUF_RING
2697 /* an interrupt on a non-zero slice is implicitly valid
2698 since MSI-X irqs are not shared */
2700 mxge_clean_rx_done(ss);
2701 *ss->irq_claim = be32toh(3);
2706 /* make sure the DMA has finished */
2707 if (!stats->valid) {
2710 valid = stats->valid;
2712 if (sc->irq_type == PCI_INTR_TYPE_LEGACY) {
2713 /* lower legacy IRQ */
2714 *sc->irq_deassert = 0;
2715 if (!mxge_deassert_wait)
2716 /* don't wait for confirmation that the irq is low */
2717 stats->valid = 0;
2722 /* loop while waiting for legacy irq deassertion */
2724 /* check for transmit completes and receives */
2725 send_done_count = be32toh(stats->send_done_count);
2726 while ((send_done_count != tx->pkt_done) ||
2727 (rx_done->entry[rx_done->idx].length != 0)) {
2728 if (send_done_count != tx->pkt_done)
2729 mxge_tx_done(ss, (int)send_done_count);
2730 mxge_clean_rx_done(ss);
2731 send_done_count = be32toh(stats->send_done_count);
2733 if (sc->irq_type == PCI_INTR_TYPE_LEGACY && mxge_deassert_wait)
2734 wmb();
2735 } while (*((volatile uint8_t *) &stats->valid));
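/*
 * For legacy (level-triggered) interrupts the loop above keeps draining tx
 * and rx work until stats->valid clears, confirming the line has been
 * deasserted; MSI/MSI-X interrupts skip the wait since they cannot be
 * shared.
 */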
2737 /* fw link & error stats meaningful only on the first slice */
2738 if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
2739 if (sc->link_state != stats->link_up) {
2740 sc->link_state = stats->link_up;
2741 if (sc->link_state) {
2742 sc->ifp->if_link_state = LINK_STATE_UP;
2743 if_link_state_change(sc->ifp);
2745 device_printf(sc->dev, "link up\n");
2747 sc->ifp->if_link_state = LINK_STATE_DOWN;
2748 if_link_state_change(sc->ifp);
2750 device_printf(sc->dev, "link down\n");
2752 sc->need_media_probe = 1;
2754 if (sc->rdma_tags_available !=
2755 be32toh(stats->rdma_tags_available)) {
2756 sc->rdma_tags_available =
2757 be32toh(stats->rdma_tags_available);
2758 device_printf(sc->dev, "RDMA timed out! %d tags "
2759 "left\n", sc->rdma_tags_available);
2762 if (stats->link_down) {
2763 sc->down_cnt += stats->link_down;
2765 sc->ifp->if_link_state = LINK_STATE_DOWN;
2766 if_link_state_change(sc->ifp);
2770 /* check to see if we have rx token to pass back */
2772 *ss->irq_claim = be32toh(3);
2773 *(ss->irq_claim + 1) = be32toh(3);
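/*
 * The two writes above appear to be the claim handshake: they hand the
 * receive and interrupt tokens back to the firmware, allowing it to raise
 * the next interrupt for this slice.
 */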
2777 mxge_init(void *arg)
2779 struct mxge_softc *sc = arg;
2781 ASSERT_SERIALIZED(sc->ifp->if_serializer);
2782 if ((sc->ifp->if_flags & IFF_RUNNING) == 0)
2783 mxge_open(sc);
2787 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
2791 for (i = 0; i <= ss->rx_big.mask; i++) {
2792 if (ss->rx_big.info[i].m == NULL)
2793 continue;
2794 bus_dmamap_unload(ss->rx_big.dmat,
2795 ss->rx_big.info[i].map);
2796 m_freem(ss->rx_big.info[i].m);
2797 ss->rx_big.info[i].m = NULL;
2800 for (i = 0; i <= ss->rx_small.mask; i++) {
2801 if (ss->rx_small.info[i].m == NULL)
2802 continue;
2803 bus_dmamap_unload(ss->rx_small.dmat,
2804 ss->rx_small.info[i].map);
2805 m_freem(ss->rx_small.info[i].m);
2806 ss->rx_small.info[i].m = NULL;
2809 /* transmit ring used only on the first slice */
2810 if (ss->tx.info == NULL)
2811 return;
2813 for (i = 0; i <= ss->tx.mask; i++) {
2814 ss->tx.info[i].flag = 0;
2815 if (ss->tx.info[i].m == NULL)
2816 continue;
2817 bus_dmamap_unload(ss->tx.dmat,
2818 ss->tx.info[i].map);
2819 m_freem(ss->tx.info[i].m);
2820 ss->tx.info[i].m = NULL;
2825 mxge_free_mbufs(mxge_softc_t *sc)
2829 for (slice = 0; slice < sc->num_slices; slice++)
2830 mxge_free_slice_mbufs(&sc->ss[slice]);
2834 mxge_free_slice_rings(struct mxge_slice_state *ss)
2839 if (ss->rx_done.entry != NULL)
2840 mxge_dma_free(&ss->rx_done.dma);
2841 ss->rx_done.entry = NULL;
2843 if (ss->tx.req_bytes != NULL)
2844 kfree(ss->tx.req_bytes, M_DEVBUF);
2845 ss->tx.req_bytes = NULL;
2847 if (ss->tx.seg_list != NULL)
2848 kfree(ss->tx.seg_list, M_DEVBUF);
2849 ss->tx.seg_list = NULL;
2851 if (ss->rx_small.shadow != NULL)
2852 kfree(ss->rx_small.shadow, M_DEVBUF);
2853 ss->rx_small.shadow = NULL;
2855 if (ss->rx_big.shadow != NULL)
2856 kfree(ss->rx_big.shadow, M_DEVBUF);
2857 ss->rx_big.shadow = NULL;
2859 if (ss->tx.info != NULL) {
2860 if (ss->tx.dmat != NULL) {
2861 for (i = 0; i <= ss->tx.mask; i++) {
2862 bus_dmamap_destroy(ss->tx.dmat,
2863 ss->tx.info[i].map);
2865 bus_dma_tag_destroy(ss->tx.dmat);
2867 kfree(ss->tx.info, M_DEVBUF);
2871 if (ss->rx_small.info != NULL) {
2872 if (ss->rx_small.dmat != NULL) {
2873 for (i = 0; i <= ss->rx_small.mask; i++) {
2874 bus_dmamap_destroy(ss->rx_small.dmat,
2875 ss->rx_small.info[i].map);
2877 bus_dmamap_destroy(ss->rx_small.dmat,
2878 ss->rx_small.extra_map);
2879 bus_dma_tag_destroy(ss->rx_small.dmat);
2881 kfree(ss->rx_small.info, M_DEVBUF);
2883 ss->rx_small.info = NULL;
2885 if (ss->rx_big.info != NULL) {
2886 if (ss->rx_big.dmat != NULL) {
2887 for (i = 0; i <= ss->rx_big.mask; i++) {
2888 bus_dmamap_destroy(ss->rx_big.dmat,
2889 ss->rx_big.info[i].map);
2891 bus_dmamap_destroy(ss->rx_big.dmat,
2892 ss->rx_big.extra_map);
2893 bus_dma_tag_destroy(ss->rx_big.dmat);
2895 kfree(ss->rx_big.info, M_DEVBUF);
2897 ss->rx_big.info = NULL;
2901 mxge_free_rings(mxge_softc_t *sc)
2905 for (slice = 0; slice < sc->num_slices; slice++)
2906 mxge_free_slice_rings(&sc->ss[slice]);
2910 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
2911 int tx_ring_entries)
2913 mxge_softc_t *sc = ss->sc;
2917 /* allocate per-slice receive resources */
2919 ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
2920 ss->rx_done.mask = (2 * rx_ring_entries) - 1;
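/*
 * The completion ring is twice as deep as each receive ring because small
 * and big receives share it; the mask arithmetic here assumes the
 * firmware-supplied ring sizes are powers of two.
 */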
2922 /* allocate the rx shadow rings */
2923 bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
2924 ss->rx_small.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2926 bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
2927 ss->rx_big.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2929 /* allocate the rx host info rings */
2930 bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
2931 ss->rx_small.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2933 bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
2934 ss->rx_big.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2936 /* allocate the rx busdma resources */
2937 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
2938 1, /* alignment */
2939 4096, /* boundary */
2940 BUS_SPACE_MAXADDR, /* low */
2941 BUS_SPACE_MAXADDR, /* high */
2942 NULL, NULL, /* filter */
2943 MHLEN, /* maxsize */
2944 1, /* num segs */
2945 MHLEN, /* maxsegsize */
2946 BUS_DMA_ALLOCNOW, /* flags */
2947 &ss->rx_small.dmat); /* tag */
2948 if (err != 0) {
2949 device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
2950 err);
2951 return err;
2952 }
2954 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
2955 1, /* alignment */
2956 #if MXGE_VIRT_JUMBOS
2957 4096, /* boundary */
2958 #else
2959 0, /* boundary */
2960 #endif
2961 BUS_SPACE_MAXADDR, /* low */
2962 BUS_SPACE_MAXADDR, /* high */
2963 NULL, NULL, /* filter */
2964 3*4096, /* maxsize */
2965 #if MXGE_VIRT_JUMBOS
2966 3, /* num segs */
2967 4096, /* maxsegsize */
2968 #else
2969 1, /* num segs */
2970 MJUM9BYTES, /* maxsegsize */
2972 BUS_DMA_ALLOCNOW, /* flags */
2973 &ss->rx_big.dmat); /* tag */
2974 if (err != 0) {
2975 device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
2976 err);
2977 return err;
2978 }
2979 for (i = 0; i <= ss->rx_small.mask; i++) {
2980 err = bus_dmamap_create(ss->rx_small.dmat, 0,
2981 &ss->rx_small.info[i].map);
2983 device_printf(sc->dev, "Err %d rx_small dmamap\n",
2988 err = bus_dmamap_create(ss->rx_small.dmat, 0,
2989 &ss->rx_small.extra_map);
2991 device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
2996 for (i = 0; i <= ss->rx_big.mask; i++) {
2997 err = bus_dmamap_create(ss->rx_big.dmat, 0,
2998 &ss->rx_big.info[i].map);
3000 device_printf(sc->dev, "Err %d rx_big dmamap\n",
3005 err = bus_dmamap_create(ss->rx_big.dmat, 0,
3006 &ss->rx_big.extra_map);
3008 device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3013 /* now allocate TX resources */
3015 #ifndef IFNET_BUF_RING
3016 /* only use a single TX ring for now */
3017 if (ss != ss->sc->ss)
3018 return 0;
3019 #endif
3021 ss->tx.mask = tx_ring_entries - 1;
3022 ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3025 /* allocate the tx request copy block */
3026 bytes = 8 +
3027 sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3028 ss->tx.req_bytes = kmalloc(bytes, M_DEVBUF, M_WAITOK);
3029 /* ensure req_list entries are aligned to 8 bytes */
3030 ss->tx.req_list = (mcp_kreq_ether_send_t *)
3031 ((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
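/*
 * Example of the fix-up above: if req_bytes were 0x1005, adding 7 gives
 * 0x100c and masking with ~7UL yields 0x1008, the first 8-byte aligned
 * address within the allocation; the 8 extra bytes requested above
 * guarantee the aligned pointer still leaves room for the list.
 */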
3033 /* allocate the tx busdma segment list */
3034 bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3035 ss->tx.seg_list = (bus_dma_segment_t *)
3036 kmalloc(bytes, M_DEVBUF, M_WAITOK);
3038 /* allocate the tx host info ring */
3039 bytes = tx_ring_entries * sizeof (*ss->tx.info);
3040 ss->tx.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3042 /* allocate the tx busdma resources */
3043 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3044 1, /* alignment */
3045 sc->tx_boundary, /* boundary */
3046 BUS_SPACE_MAXADDR, /* low */
3047 BUS_SPACE_MAXADDR, /* high */
3048 NULL, NULL, /* filter */
3049 65536 + 256, /* maxsize */
3050 ss->tx.max_desc - 2, /* num segs */
3051 sc->tx_boundary, /* maxsegsz */
3052 BUS_DMA_ALLOCNOW, /* flags */
3053 &ss->tx.dmat); /* tag */
3056 device_printf(sc->dev, "Err %d allocating tx dmat\n",
3061 /* now use these tags to setup dmamaps for each slot
3062 in the ring */
3063 for (i = 0; i <= ss->tx.mask; i++) {
3064 err = bus_dmamap_create(ss->tx.dmat, 0,
3065 &ss->tx.info[i].map);
3067 device_printf(sc->dev, "Err %d tx dmamap\n",
3077 mxge_alloc_rings(mxge_softc_t *sc)
3081 int tx_ring_entries, rx_ring_entries;
3084 /* get ring sizes */
3085 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3086 tx_ring_size = cmd.data0;
3088 device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3092 tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3093 rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3094 ifq_set_maxlen(&sc->ifp->if_snd, tx_ring_entries - 1);
3095 ifq_set_ready(&sc->ifp->if_snd);
3097 for (slice = 0; slice < sc->num_slices; slice++) {
3098 err = mxge_alloc_slice_rings(&sc->ss[slice],
3099 rx_ring_entries, tx_ring_entries);
3107 mxge_free_rings(sc);
3113 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3115 int bufsize = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN + MXGEFW_PAD;
3117 if (bufsize < MCLBYTES) {
3118 /* easy, everything fits in a single buffer */
3119 *big_buf_size = MCLBYTES;
3120 *cl_size = MCLBYTES;
3125 if (bufsize < MJUMPAGESIZE) {
3126 /* still easy, everything still fits in a single buffer */
3127 *big_buf_size = MJUMPAGESIZE;
3128 *cl_size = MJUMPAGESIZE;
3132 #if MXGE_VIRT_JUMBOS
3133 /* now we need to use virtually contiguous buffers */
3134 *cl_size = MJUM9BYTES;
3135 *big_buf_size = 4096;
3136 *nbufs = mtu / 4096 + 1;
3137 /* needs to be a power of two, so round up */
3138 if (*nbufs == 3)
3139 *nbufs = 4;
3140 #else
3141 *cl_size = MJUM9BYTES;
3142 *big_buf_size = MJUM9BYTES;
3143 *nbufs = 1;
3144 #endif
3145 }
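/*
 * For example, with a 1500 byte MTU, bufsize is 1500 + 14 (Ethernet
 * header) + 4 (VLAN encap) + MXGEFW_PAD, which fits in a single 2KB
 * MCLBYTES cluster; a 9000 byte MTU overflows MJUMPAGESIZE and falls
 * through to the 9KB (MJUM9BYTES) case above.
 */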
3148 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3156 slice = ss - sc->ss;
3158 /* get the lanai pointers to the send and receive rings */
3161 #ifndef IFNET_BUF_RING
3162 /* We currently only send from the first slice */
3166 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3168 (volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3169 ss->tx.send_go = (volatile uint32_t *)
3170 (sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3171 ss->tx.send_stop = (volatile uint32_t *)
3172 (sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3173 #ifndef IFNET_BUF_RING
3177 err |= mxge_send_cmd(sc,
3178 MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3179 ss->rx_small.lanai =
3180 (volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3182 err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3184 (volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3187 device_printf(sc->dev,
3188 "failed to get ring sizes or locations\n");
3192 /* stock receive rings */
3193 for (i = 0; i <= ss->rx_small.mask; i++) {
3194 map = ss->rx_small.info[i].map;
3195 err = mxge_get_buf_small(ss, map, i);
3197 device_printf(sc->dev, "alloced %d/%d smalls\n",
3198 i, ss->rx_small.mask + 1);
3202 for (i = 0; i <= ss->rx_big.mask; i++) {
3203 ss->rx_big.shadow[i].addr_low = 0xffffffff;
3204 ss->rx_big.shadow[i].addr_high = 0xffffffff;
3206 ss->rx_big.nbufs = nbufs;
3207 ss->rx_big.cl_size = cl_size;
3208 ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3209 EVL_ENCAPLEN + MXGEFW_PAD;
3210 for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3211 map = ss->rx_big.info[i].map;
3212 err = mxge_get_buf_big(ss, map, i);
3214 device_printf(sc->dev, "alloced %d/%d bigs\n",
3215 i, ss->rx_big.mask + 1);
3223 mxge_open(mxge_softc_t *sc)
3226 int err, big_bytes, nbufs, slice, cl_size, i;
3228 volatile uint8_t *itable;
3229 struct mxge_slice_state *ss;
3231 ASSERT_SERIALIZED(sc->ifp->if_serializer);
3232 /* Copy the MAC address in case it was overridden */
3233 bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3235 err = mxge_reset(sc, 1);
3237 device_printf(sc->dev, "failed to reset\n");
3241 if (sc->num_slices > 1) {
3242 /* setup the indirection table */
3243 cmd.data0 = sc->num_slices;
3244 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3247 err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3250 device_printf(sc->dev,
3251 "failed to setup rss tables\n");
3255 /* just enable an identity mapping */
3256 itable = sc->sram + cmd.data0;
3257 for (i = 0; i < sc->num_slices; i++)
3258 itable[i] = (uint8_t)i;
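/*
 * With the identity table each RSS hash bucket maps straight to the
 * slice of the same index, so e.g. bucket 2 always delivers into
 * slice 2's receive ring.
 */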
3261 cmd.data1 = mxge_rss_hash_type;
3262 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3264 device_printf(sc->dev, "failed to enable slices\n");
3269 cmd.data0 = MXGEFW_TSO_MODE_NDIS;
3270 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_TSO_MODE, &cmd);
3272 device_printf(sc->dev, "failed set TSO mode\n");
3273 sc->ifp->if_capenable &= ~IFCAP_TSO;
3274 sc->ifp->if_capabilities &= ~IFCAP_TSO;
3275 sc->ifp->if_hwassist &= ~CSUM_TSO;
3278 mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3280 cmd.data0 = nbufs;
3281 err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3282 &cmd);
3283 /* error is only meaningful if we're trying to set
3284 MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3285 if (err && nbufs > 1) {
3286 device_printf(sc->dev,
3287 "Failed to set always-use-n to %d\n",
3288 nbufs);
3289 return EIO;
3290 }
3291 /* Give the firmware the mtu and the big and small buffer
3292 sizes. The firmware wants the big buf size to be a power
3293 of two. Luckily, FreeBSD's clusters are powers of two */
3294 cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
3295 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3296 cmd.data0 = MHLEN - MXGEFW_PAD;
3297 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3299 cmd.data0 = big_bytes;
3300 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3303 device_printf(sc->dev, "failed to setup params\n");
3307 /* Now give the firmware the pointer to the stats block */
3308 for (slice = 0;
3309 #ifdef IFNET_BUF_RING
3310 slice < sc->num_slices;
3311 #else
3312 slice < 1;
3313 #endif
3314 slice++) {
3315 ss = &sc->ss[slice];
3316 cmd.data0 =
3317 MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3318 cmd.data1 =
3319 MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3320 cmd.data2 = sizeof(struct mcp_irq_data);
3321 cmd.data2 |= (slice << 16);
3322 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3326 bus = sc->ss->fw_stats_dma.bus_addr;
3327 bus += offsetof(struct mcp_irq_data, send_done_count);
3328 cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3329 cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3330 err = mxge_send_cmd(sc,
3331 MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3333 /* Firmware cannot support multicast without STATS_DMA_V2 */
3334 sc->fw_multicast_support = 0;
3335 } else {
3336 sc->fw_multicast_support = 1;
3337 }
3340 device_printf(sc->dev, "failed to setup params\n");
3344 for (slice = 0; slice < sc->num_slices; slice++) {
3345 err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3347 device_printf(sc->dev, "couldn't open slice %d\n",
3353 /* Finally, start the firmware running */
3354 err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3356 device_printf(sc->dev, "Couldn't bring up link\n");
3359 sc->ifp->if_flags |= IFF_RUNNING;
3360 ifq_clr_oactive(&sc->ifp->if_snd);
3366 mxge_free_mbufs(sc);
3372 mxge_close(mxge_softc_t *sc, int down)
3375 int err, old_down_cnt;
3377 ASSERT_SERIALIZED(sc->ifp->if_serializer);
3379 sc->ifp->if_flags &= ~IFF_RUNNING;
3381 old_down_cnt = sc->down_cnt;
3383 err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3385 device_printf(sc->dev,
3386 "Couldn't bring down link\n");
3388 if (old_down_cnt == sc->down_cnt) {
3389 /* wait for down irq */
3390 lwkt_serialize_exit(sc->ifp->if_serializer);
3391 DELAY(10 * sc->intr_coal_delay);
3392 lwkt_serialize_enter(sc->ifp->if_serializer);
3395 if (old_down_cnt == sc->down_cnt) {
3396 device_printf(sc->dev, "never got down irq\n");
3399 mxge_free_mbufs(sc);
3405 mxge_setup_cfg_space(mxge_softc_t *sc)
3407 device_t dev = sc->dev;
3409 uint16_t lnk, pectl;
3411 /* find the PCIe link width and set max read request to 4KB */
3412 if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
3413 lnk = pci_read_config(dev, reg + 0x12, 2);
3414 sc->link_width = (lnk >> 4) & 0x3f;
3416 if (sc->pectl == 0) {
3417 pectl = pci_read_config(dev, reg + 0x8, 2);
3418 pectl = (pectl & ~0x7000) | (5 << 12);
3419 pci_write_config(dev, reg + 0x8, pectl, 2);
3422 /* restore saved pectl after watchdog reset */
3423 pci_write_config(dev, reg + 0x8, sc->pectl, 2);
3427 /* Enable DMA and Memory space access */
3428 pci_enable_busmaster(dev);
3432 mxge_read_reboot(mxge_softc_t *sc)
3434 device_t dev = sc->dev;
3437 /* find the vendor specific offset */
3438 if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
3439 device_printf(sc->dev,
3440 "could not find vendor specific offset\n");
3441 return (uint32_t)-1;
3443 /* enable read32 mode */
3444 pci_write_config(dev, vs + 0x10, 0x3, 1);
3445 /* tell NIC which register to read */
3446 pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3447 return (pci_read_config(dev, vs + 0x14, 4));
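/*
 * The vendor-specific capability used above is an indirect register
 * window: writing 0x3 at offset 0x10 selects 32-bit reads, the address of
 * interest (0xfffffff0, the reboot status register) is written at 0x18,
 * and the value is read back through 0x14.
 */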
3451 mxge_watchdog_reset(mxge_softc_t *sc)
3453 struct pci_devinfo *dinfo;
3460 device_printf(sc->dev, "Watchdog reset!\n");
3463 * check to see if the NIC rebooted. If it did, then all of
3464 * PCI config space has been reset, and things like the
3465 * busmaster bit will be zero. If this is the case, then we
3466 * must restore PCI config space before the NIC can be used
3469 cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3470 if (cmd == 0xffff) {
3472 * maybe the watchdog caught the NIC rebooting; wait
3473 * up to 100ms for it to finish. If it does not come
3474 * back, then give up
3477 cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3478 if (cmd == 0xffff) {
3479 device_printf(sc->dev, "NIC disappeared!\n");
3482 if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3483 /* print the reboot status */
3484 reboot = mxge_read_reboot(sc);
3485 device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3487 running = sc->ifp->if_flags & IFF_RUNNING;
3491 * quiesce NIC so that TX routines will not try to
3492 * xmit after restoration of BAR
3495 /* Mark the link as down */
3496 if (sc->link_state) {
3497 sc->ifp->if_link_state = LINK_STATE_DOWN;
3498 if_link_state_change(sc->ifp);
3502 /* restore PCI configuration space */
3503 dinfo = device_get_ivars(sc->dev);
3504 pci_cfg_restore(sc->dev, dinfo);
3506 /* and redo any changes we made to our config space */
3507 mxge_setup_cfg_space(sc);
3510 err = mxge_load_firmware(sc, 0);
3512 device_printf(sc->dev,
3513 "Unable to re-load f/w\n");
3517 err = mxge_open(sc);
3518 if_devstart_sched(sc->ifp);
3521 sc->watchdog_resets++;
3523 device_printf(sc->dev,
3524 "NIC did not reboot, not resetting\n");
3528 device_printf(sc->dev, "watchdog reset failed\n");
3532 callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3537 mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
3539 tx = &sc->ss[slice].tx;
3540 device_printf(sc->dev, "slice %d stuck? ring state:\n", slice);
3541 device_printf(sc->dev,
3542 "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3543 tx->req, tx->done, tx->queue_active);
3544 device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3545 tx->activate, tx->deactivate);
3546 device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3547 tx->pkt_done,
3548 be32toh(sc->ss->fw_stats->send_done_count));
3552 mxge_watchdog(mxge_softc_t *sc)
3555 uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3558 /* see if we have outstanding transmits, which
3559 have been pending for more than mxge_ticks */
3560 for (i = 0;
3561 #ifdef IFNET_BUF_RING
3562 (i < sc->num_slices) && (err == 0);
3563 #else
3564 (i < 1) && (err == 0);
3565 #endif
3566 i++) {
3567 tx = &sc->ss[i].tx;
3568 if (tx->req != tx->done &&
3569 tx->watchdog_req != tx->watchdog_done &&
3570 tx->done == tx->watchdog_done) {
3571 /* check for pause blocking before resetting */
3572 if (tx->watchdog_rx_pause == rx_pause) {
3573 mxge_warn_stuck(sc, tx, i);
3574 mxge_watchdog_reset(sc);
3578 device_printf(sc->dev, "Flow control blocking "
3579 "xmits, check link partner\n");
3582 tx->watchdog_req = tx->req;
3583 tx->watchdog_done = tx->done;
3584 tx->watchdog_rx_pause = rx_pause;
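/*
 * In other words, a reset is only triggered when the ring shows no
 * progress between two consecutive watchdog ticks and the dropped-pause
 * counter has not moved either; a moving pause counter means the link
 * partner is flow-controlling us, not that the NIC is hung.
 */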
3587 if (sc->need_media_probe)
3588 mxge_media_probe(sc);
3593 mxge_update_stats(mxge_softc_t *sc)
3595 struct mxge_slice_state *ss;
3597 u_long ipackets = 0, old_ipackets;
3598 u_long opackets = 0, old_opackets;
3599 #ifdef IFNET_BUF_RING
3607 for (slice = 0; slice < sc->num_slices; slice++) {
3608 ss = &sc->ss[slice];
3609 ipackets += ss->ipackets;
3610 opackets += ss->opackets;
3611 #ifdef IFNET_BUF_RING
3612 obytes += ss->obytes;
3613 omcasts += ss->omcasts;
3614 odrops += ss->tx.br->br_drops;
3615 #endif
3616 oerrors += ss->oerrors;
3618 IFNET_STAT_GET(sc->ifp, ipackets, old_ipackets);
3619 IFNET_STAT_GET(sc->ifp, opackets, old_opackets);
3621 pkts = ipackets - old_ipackets;
3622 pkts += opackets - old_opackets;
3624 IFNET_STAT_SET(sc->ifp, ipackets, ipackets);
3625 IFNET_STAT_SET(sc->ifp, opackets, opackets);
3626 #ifdef IFNET_BUF_RING
3627 sc->ifp->if_obytes = obytes;
3628 sc->ifp->if_omcasts = omcasts;
3629 sc->ifp->if_snd.ifq_drops = odrops;
3630 #endif
3631 IFNET_STAT_SET(sc->ifp, oerrors, oerrors);
3636 mxge_tick(void *arg)
3638 mxge_softc_t *sc = arg;
3644 lwkt_serialize_enter(sc->ifp->if_serializer);
3647 running = sc->ifp->if_flags & IFF_RUNNING;
3649 /* aggregate stats from different slices */
3650 pkts = mxge_update_stats(sc);
3651 if (!sc->watchdog_countdown) {
3652 err = mxge_watchdog(sc);
3653 sc->watchdog_countdown = 4;
3655 sc->watchdog_countdown--;
3658 /* ensure NIC did not suffer h/w fault while idle */
3659 cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3660 if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3662 mxge_watchdog_reset(sc);
3665 /* look less often if NIC is idle */
3670 callout_reset(&sc->co_hdl, ticks, mxge_tick, sc);
3672 lwkt_serialize_exit(sc->ifp->if_serializer);
3676 mxge_media_change(struct ifnet *ifp)
3682 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3684 struct ifnet *ifp = sc->ifp;
3685 int real_mtu, old_mtu;
3688 if (ifp->if_serializer)
3689 ASSERT_SERIALIZED(ifp->if_serializer);
3691 real_mtu = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
3692 if ((real_mtu > sc->max_mtu) || real_mtu < 60)
3693 return EINVAL;
3694 old_mtu = ifp->if_mtu;
3695 ifp->if_mtu = mtu;
3696 if (ifp->if_flags & IFF_RUNNING) {
3697 mxge_close(sc, 0);
3698 err = mxge_open(sc);
3699 if (err != 0) {
3700 ifp->if_mtu = old_mtu;
3702 (void) mxge_open(sc);
3709 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3711 mxge_softc_t *sc = ifp->if_softc;
3716 ifmr->ifm_status = IFM_AVALID;
3717 ifmr->ifm_active = IFM_ETHER | IFM_FDX;
3718 ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
3719 ifmr->ifm_active |= sc->current_media;
3723 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data,
3724 struct ucred *cr __unused)
3726 mxge_softc_t *sc = ifp->if_softc;
3727 struct ifreq *ifr = (struct ifreq *)data;
3731 ASSERT_SERIALIZED(ifp->if_serializer);
3734 err = mxge_change_mtu(sc, ifr->ifr_mtu);
3741 if (ifp->if_flags & IFF_UP) {
3742 if (!(ifp->if_flags & IFF_RUNNING)) {
3743 err = mxge_open(sc);
3745 /* take care of promisc and allmulti
3746 flag changes */
3747 mxge_change_promisc(sc,
3748 ifp->if_flags & IFF_PROMISC);
3749 mxge_set_multicast_list(sc);
3752 if (ifp->if_flags & IFF_RUNNING) {
3760 mxge_set_multicast_list(sc);
3764 mask = ifr->ifr_reqcap ^ ifp->if_capenable;
3765 if (mask & IFCAP_TXCSUM) {
3766 ifp->if_capenable ^= IFCAP_TXCSUM;
3767 if (ifp->if_capenable & IFCAP_TXCSUM)
3768 ifp->if_hwassist |= CSUM_TCP | CSUM_UDP;
3770 ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP);
3772 if (mask & IFCAP_TSO) {
3773 ifp->if_capenable ^= IFCAP_TSO;
3774 if (ifp->if_capenable & IFCAP_TSO)
3775 ifp->if_hwassist |= CSUM_TSO;
3777 ifp->if_hwassist &= ~CSUM_TSO;
3779 if (mask & IFCAP_RXCSUM)
3780 ifp->if_capenable ^= IFCAP_RXCSUM;
3781 if (mask & IFCAP_VLAN_HWTAGGING)
3782 ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
3786 mxge_media_probe(sc);
3787 err = ifmedia_ioctl(ifp, (struct ifreq *)data,
3788 &sc->media, command);
3792 err = ether_ioctl(ifp, command, data);
3799 mxge_fetch_tunables(mxge_softc_t *sc)
3802 TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
3803 TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
3804 &mxge_flow_control);
3805 TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
3806 &mxge_intr_coal_delay);
3807 TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
3808 &mxge_nvidia_ecrc_enable);
3809 TUNABLE_INT_FETCH("hw.mxge.force_firmware",
3810 &mxge_force_firmware);
3811 TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
3812 &mxge_deassert_wait);
3813 TUNABLE_INT_FETCH("hw.mxge.verbose",
3814 &mxge_verbose);
3815 TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
3816 TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
3817 TUNABLE_INT_FETCH("hw.mxge.throttle", &mxge_throttle);
3821 if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
3822 mxge_intr_coal_delay = 30;
3823 if (mxge_ticks == 0)
3824 mxge_ticks = hz / 2;
3825 sc->pause = mxge_flow_control;
3827 if (mxge_throttle && mxge_throttle > MXGE_MAX_THROTTLE)
3828 mxge_throttle = MXGE_MAX_THROTTLE;
3829 if (mxge_throttle && mxge_throttle < MXGE_MIN_THROTTLE)
3830 mxge_throttle = MXGE_MIN_THROTTLE;
3831 sc->throttle = mxge_throttle;
3835 mxge_free_slices(mxge_softc_t *sc)
3837 struct mxge_slice_state *ss;
3844 for (i = 0; i < sc->num_slices; i++) {
3845 ss = &sc->ss[i];
3846 if (ss->fw_stats != NULL) {
3847 mxge_dma_free(&ss->fw_stats_dma);
3848 ss->fw_stats = NULL;
3850 if (ss->rx_done.entry != NULL) {
3851 mxge_dma_free(&ss->rx_done.dma);
3852 ss->rx_done.entry = NULL;
3855 kfree(sc->ss, M_DEVBUF);
3860 mxge_alloc_slices(mxge_softc_t *sc)
3863 struct mxge_slice_state *ss;
3865 int err, i, max_intr_slots;
3867 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
3869 device_printf(sc->dev, "Cannot determine rx ring size\n");
3872 sc->rx_ring_size = cmd.data0;
3873 max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
3875 bytes = sizeof (*sc->ss) * sc->num_slices;
3876 sc->ss = kmalloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
3879 for (i = 0; i < sc->num_slices; i++) {
3880 ss = &sc->ss[i];
3881 ss->sc = sc;
3884 /* allocate per-slice rx interrupt queues */
3886 bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
3887 err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
3890 ss->rx_done.entry = ss->rx_done.dma.addr;
3891 bzero(ss->rx_done.entry, bytes);
3894 * allocate the per-slice firmware stats; stats
3895 * (including tx) are used only on the first
3896 * slice for now
3897 */
3898 #ifndef IFNET_BUF_RING
3903 bytes = sizeof (*ss->fw_stats);
3904 err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
3905 sizeof (*ss->fw_stats), 64);
3908 ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
3914 mxge_free_slices(sc);
3919 mxge_slice_probe(mxge_softc_t *sc)
3923 int msix_cnt, status, max_intr_slots;
3927 * don't enable multiple slices if they have been disabled
3928 * via the tunable, or if this is not an SMP system
3929 */
3931 if (mxge_max_slices == 0 || mxge_max_slices == 1 || ncpus < 2)
3932 return;
3934 /* see how many MSI-X interrupts are available */
3935 msix_cnt = pci_msix_count(sc->dev);
3939 /* now load the slice-aware firmware and see what it supports */
3940 old_fw = sc->fw_name;
3941 if (old_fw == mxge_fw_aligned)
3942 sc->fw_name = mxge_fw_rss_aligned;
3944 sc->fw_name = mxge_fw_rss_unaligned;
3945 status = mxge_load_firmware(sc, 0);
3947 device_printf(sc->dev, "Falling back to a single slice\n");
3951 /* try to send a reset command to the card to see if it
3952 is alive */
3953 memset(&cmd, 0, sizeof (cmd));
3954 status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
3956 device_printf(sc->dev, "failed reset\n");
3960 /* get rx ring size */
3961 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
3963 device_printf(sc->dev, "Cannot determine rx ring size\n");
3966 max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
3968 /* tell it the size of the interrupt queues */
3969 cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
3970 status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
3972 device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
3976 /* ask for the maximum number of slices it supports */
3977 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
3979 device_printf(sc->dev,
3980 "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
3983 sc->num_slices = cmd.data0;
3984 if (sc->num_slices > msix_cnt)
3985 sc->num_slices = msix_cnt;
3987 if (mxge_max_slices == -1) {
3988 /* cap to number of CPUs in system */
3989 if (sc->num_slices > ncpus)
3990 sc->num_slices = ncpus;
3991 } else {
3992 if (sc->num_slices > mxge_max_slices)
3993 sc->num_slices = mxge_max_slices;
3995 /* make sure it is a power of two */
3996 while (sc->num_slices & (sc->num_slices - 1))
3997 sc->num_slices--;
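/*
 * Example of the rounding above: starting from 6 slices, 6 & 5 and then
 * 5 & 4 are nonzero, while 4 & 3 == 0, so the loop settles on 4, the
 * largest power of two not exceeding the requested count.
 */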
4000 device_printf(sc->dev, "using %d slices\n",
4001 sc->num_slices);
4006 sc->fw_name = old_fw;
4007 (void) mxge_load_firmware(sc, 0);
4012 mxge_add_msix_irqs(mxge_softc_t *sc)
4015 int count, err, i, rid;
4017 rid = PCIR_BAR(2);
4018 sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4019 &rid, 0);
4021 if (sc->msix_table_res == NULL) {
4022 device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4026 count = sc->num_slices;
4027 err = pci_alloc_msix(sc->dev, &count);
4029 device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d, "
4030 "err = %d\n", sc->num_slices, err);
4031 goto abort_with_msix_table;
4033 if (count < sc->num_slices) {
4034 device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4035 count, sc->num_slices);
4036 device_printf(sc->dev,
4037 "Try setting hw.mxge.max_slices to %d\n",
4038 count);
4040 goto abort_with_msix;
4042 bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4043 sc->msix_irq_res = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4044 if (sc->msix_irq_res == NULL) {
4046 goto abort_with_msix;
4049 for (i = 0; i < sc->num_slices; i++) {
4050 rid = i + 1;
4051 sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4052 SYS_RES_IRQ, &rid, RF_ACTIVE);
4054 if (sc->msix_irq_res[i] == NULL) {
4055 device_printf(sc->dev, "couldn't allocate IRQ res"
4056 " for message %d\n", i);
4058 goto abort_with_res;
4062 bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4063 sc->msix_ih = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4065 for (i = 0; i < sc->num_slices; i++) {
4066 err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4068 mxge_intr, &sc->ss[i], &sc->msix_ih[i],
4069 sc->ifp->if_serializer);
4071 device_printf(sc->dev, "couldn't setup intr for "
4072 "message %d\n", i);
4073 goto abort_with_intr;
4078 device_printf(sc->dev, "using %d msix IRQs:",
4079 sc->num_slices);
4080 for (i = 0; i < sc->num_slices; i++)
4081 kprintf(" %ld", rman_get_start(sc->msix_irq_res[i]));
4087 for (i = 0; i < sc->num_slices; i++) {
4088 if (sc->msix_ih[i] != NULL) {
4089 bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4090 sc->msix_ih[i]);
4091 sc->msix_ih[i] = NULL;
4094 kfree(sc->msix_ih, M_DEVBUF);
4098 for (i = 0; i < sc->num_slices; i++) {
4099 rid = i + 1;
4100 if (sc->msix_irq_res[i] != NULL)
4101 bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4102 sc->msix_irq_res[i]);
4103 sc->msix_irq_res[i] = NULL;
4105 kfree(sc->msix_irq_res, M_DEVBUF);
4109 pci_release_msi(sc->dev);
4111 abort_with_msix_table:
4112 bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4113 sc->msix_table_res);
4120 mxge_add_single_irq(mxge_softc_t *sc)
4125 sc->irq_type = pci_alloc_1intr(sc->dev, 1, &sc->irq_rid, &irq_flags);
4127 sc->irq_res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ,
4128 &sc->irq_rid, irq_flags);
4129 if (sc->irq_res == NULL) {
4130 device_printf(sc->dev, "could not alloc interrupt\n");
4134 err = bus_setup_intr(sc->dev, sc->irq_res,
4136 mxge_intr, &sc->ss[0], &sc->ih,
4137 sc->ifp->if_serializer);
4139 bus_release_resource(sc->dev, SYS_RES_IRQ, sc->irq_rid,
4141 if (sc->irq_type == PCI_INTR_TYPE_MSI)
4142 pci_release_msi(sc->dev);
4149 mxge_rem_msix_irqs(mxge_softc_t *sc)
4153 for (i = 0; i < sc->num_slices; i++) {
4154 if (sc->msix_ih[i] != NULL) {
4155 bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4156 sc->msix_ih[i]);
4157 sc->msix_ih[i] = NULL;
4160 kfree(sc->msix_ih, M_DEVBUF);
4162 for (i = 0; i < sc->num_slices; i++) {
4163 rid = i + 1;
4164 if (sc->msix_irq_res[i] != NULL)
4165 bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4166 sc->msix_irq_res[i]);
4167 sc->msix_irq_res[i] = NULL;
4169 kfree(sc->msix_irq_res, M_DEVBUF);
4171 bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4172 sc->msix_table_res);
4174 pci_release_msi(sc->dev);
4180 mxge_rem_single_irq(mxge_softc_t *sc)
4182 bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4183 bus_release_resource(sc->dev, SYS_RES_IRQ, sc->irq_rid, sc->irq_res);
4184 if (sc->irq_type == PCI_INTR_TYPE_MSI)
4185 pci_release_msi(sc->dev);
4189 mxge_rem_irq(mxge_softc_t *sc)
4192 if (sc->num_slices > 1)
4193 mxge_rem_msix_irqs(sc);
4196 mxge_rem_single_irq(sc);
4200 mxge_add_irq(mxge_softc_t *sc)
4205 if (sc->num_slices > 1)
4206 err = mxge_add_msix_irqs(sc);
4208 err = mxge_add_single_irq(sc);
4210 if (0 && err == 0 && sc->num_slices > 1) {
4211 mxge_rem_msix_irqs(sc);
4212 err = mxge_add_msix_irqs(sc);
4216 return mxge_add_single_irq(sc);
4221 mxge_attach(device_t dev)
4223 mxge_softc_t *sc = device_get_softc(dev);
4224 struct ifnet *ifp = &sc->arpcom.ac_if;
4227 /*
4228 * avoid rewriting half the lines in this file to use
4229 * &sc->arpcom.ac_if instead
4230 */
4231 sc->ifp = ifp;
4232 sc->dev = dev;
4233 mxge_fetch_tunables(sc);
4235 err = bus_dma_tag_create(NULL, /* parent */
4236 1, /* alignment */
4237 0, /* boundary */
4238 BUS_SPACE_MAXADDR, /* low */
4239 BUS_SPACE_MAXADDR, /* high */
4240 NULL, NULL, /* filter */
4241 65536 + 256, /* maxsize */
4242 MXGE_MAX_SEND_DESC, /* num segs */
4243 65536, /* maxsegsize */
4245 &sc->parent_dmat); /* tag */
4248 device_printf(sc->dev, "Err %d allocating parent dmat\n",
4250 goto abort_with_nothing;
4254 if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4256 callout_init_mp(&sc->co_hdl);
4258 mxge_setup_cfg_space(sc);
4260 /* Map the board into the kernel */
4262 sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
4264 if (sc->mem_res == NULL) {
4265 device_printf(dev, "could not map memory\n");
4267 goto abort_with_parent_dmat;
4269 sc->sram = rman_get_virtual(sc->mem_res);
4270 sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4271 if (sc->sram_size > rman_get_size(sc->mem_res)) {
4272 device_printf(dev, "impossible memory region size %ld\n",
4273 rman_get_size(sc->mem_res));
4275 goto abort_with_mem_res;
4278 /* make NULL terminated copy of the EEPROM strings section of
4279 lanai SRAM */
4280 bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4281 bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4282 rman_get_bushandle(sc->mem_res),
4283 sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4284 sc->eeprom_strings,
4285 MXGE_EEPROM_STRINGS_SIZE - 2);
4286 err = mxge_parse_strings(sc);
4288 goto abort_with_mem_res;
4290 /* Enable write combining for efficient use of PCIe bus */
4291 mxge_enable_wc(sc);
4293 /* Allocate the out of band dma memory */
4294 err = mxge_dma_alloc(sc, &sc->cmd_dma,
4295 sizeof (mxge_cmd_t), 64);
4297 goto abort_with_mem_res;
4298 sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4299 err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4301 goto abort_with_cmd_dma;
4303 err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4305 goto abort_with_zeropad_dma;
4307 /* select & load the firmware */
4308 err = mxge_select_firmware(sc);
4310 goto abort_with_dmabench;
4311 sc->intr_coal_delay = mxge_intr_coal_delay;
4313 mxge_slice_probe(sc);
4314 err = mxge_alloc_slices(sc);
4316 goto abort_with_dmabench;
4318 err = mxge_reset(sc, 0);
4320 goto abort_with_slices;
4322 err = mxge_alloc_rings(sc);
4324 device_printf(sc->dev, "failed to allocate rings\n");
4325 goto abort_with_slices;
4328 ifp->if_baudrate = IF_Gbps(10UL);
4329 ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO;
4330 ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4332 ifp->if_capabilities |= IFCAP_VLAN_MTU;
4334 /* Well, it's software, sigh */
4335 ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING;
4337 ifp->if_capenable = ifp->if_capabilities;
4340 ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4341 ifp->if_init = mxge_init;
4342 ifp->if_ioctl = mxge_ioctl;
4343 ifp->if_start = mxge_start;
4346 /* Initialise the ifmedia structure */
4347 ifmedia_init(&sc->media, 0, mxge_media_change,
4348 mxge_media_status);
4349 mxge_media_init(sc);
4350 mxge_media_probe(sc);
4352 ether_ifattach(ifp, sc->mac_addr, NULL);
4354 sc->max_mtu = ETHERMTU + EVL_ENCAPLEN;
4357 /* must come after ether_ifattach() */
4358 err = mxge_add_irq(sc);
4360 device_printf(sc->dev, "failed to add irq\n");
4361 goto abort_with_rings;
4363 ifq_set_cpuid(&ifp->if_snd, rman_get_cpuid(sc->irq_res));
4365 mxge_add_sysctls(sc);
4367 callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
4371 mxge_free_rings(sc);
4373 mxge_free_slices(sc);
4374 abort_with_dmabench:
4375 mxge_dma_free(&sc->dmabench_dma);
4376 abort_with_zeropad_dma:
4377 mxge_dma_free(&sc->zeropad_dma);
4379 mxge_dma_free(&sc->cmd_dma);
4381 bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4382 pci_disable_busmaster(dev);
4383 abort_with_parent_dmat:
4384 bus_dma_tag_destroy(sc->parent_dmat);
4390 mxge_detach(device_t dev)
4392 mxge_softc_t *sc = device_get_softc(dev);
4394 lwkt_serialize_enter(sc->ifp->if_serializer);
4396 if (sc->ifp->if_flags & IFF_RUNNING)
4397 mxge_close(sc, 0);
4398 /*
4399 * XXX: race: the callout callback could be spinning on
4400 * the serializer and run anyway
4401 */
4402 callout_stop(&sc->co_hdl);
4403 lwkt_serialize_exit(sc->ifp->if_serializer);
4405 callout_terminate(&sc->co_hdl);
4407 ether_ifdetach(sc->ifp);
4408 ifmedia_removeall(&sc->media);
4409 mxge_dummy_rdma(sc, 0);
4410 mxge_rem_sysctls(sc);
4412 mxge_free_rings(sc);
4413 mxge_free_slices(sc);
4414 mxge_dma_free(&sc->dmabench_dma);
4415 mxge_dma_free(&sc->zeropad_dma);
4416 mxge_dma_free(&sc->cmd_dma);
4417 bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4418 pci_disable_busmaster(dev);
4419 bus_dma_tag_destroy(sc->parent_dmat);
4424 mxge_shutdown(device_t dev)