Merge branch 'mxge'
authorAggelos Economopoulos <aoiko@cc.ece.ntua.gr>
Mon, 13 Jul 2009 18:47:37 +0000 (21:47 +0300)
committerAggelos Economopoulos <aoiko@cc.ece.ntua.gr>
Mon, 13 Jul 2009 18:47:37 +0000 (21:47 +0300)
13 files changed:
share/man/man4/Makefile
share/man/man4/mxge.4 [new file with mode: 0644]
sys/dev/netif/Makefile
sys/dev/netif/mxge/Makefile [new file with mode: 0644]
sys/dev/netif/mxge/if_mxge.c [new file with mode: 0644]
sys/dev/netif/mxge/if_mxge_var.h [new file with mode: 0644]
sys/dev/netif/mxge/mcp_gen_header.h [new file with mode: 0644]
sys/dev/netif/mxge/mxge_lro.c [new file with mode: 0644]
sys/dev/netif/mxge/mxge_mcp.h [new file with mode: 0644]
sys/net/ethernet.h
sys/net/if.h
sys/net/if_media.h
sys/sys/param.h

index c981d7e..bb7144a 100644 (file)
@@ -137,6 +137,7 @@ MAN=        aac.4 \
        msk.4 \
        mtio.4 \
        multicast.4 \
+       mxge.4 \
        my.4 \
        nata.4 \
        nataraid.4 \
diff --git a/share/man/man4/mxge.4 b/share/man/man4/mxge.4
new file mode 100644 (file)
index 0000000..23b7f85
--- /dev/null
@@ -0,0 +1,172 @@
+.\" Copyright (c) 2006, Myricom Inc
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions are met:
+.\"
+.\" 1. Redistributions of source code must retain the above copyright notice,
+.\"    this list of conditions and the following disclaimer.
+.\"
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\"    notice, this list of conditions and the following disclaimer in the
+.\"    documentation and/or other materials provided with the distribution.
+.\"
+.\" 3. Neither the name of the Myricom Inc nor the names of its
+.\"    contributors may be used to endorse or promote products derived from
+.\"    this software without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+.\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+.\" LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+.\" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+.\" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+.\" INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+.\" CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+.\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+.\" POSSIBILITY OF SUCH DAMAGE.
+.\"
+.\" * Other names and brands may be claimed as the property of others.
+.\"
+.\" $FreeBSD: src/share/man/man4/mxge.4,v 1.7.4.2.4.1 2009/04/15 03:14:26 kensmith Exp $
+.\"
+.Dd February 13, 2008
+.Dt MXGE 4
+.Os
+.Sh NAME
+.Nm mxge
+.Nd "Myricom Myri10GE 10 Gigabit Ethernet adapter driver"
+.Sh SYNOPSIS
+To compile this driver into the kernel,
+place the following lines in your
+kernel configuration file:
+.Bd -ragged -offset indent
+.Cd "device firmware"
+.Cd "device mxge"
+.Ed
+.Pp
+Alternatively, to load the driver as a
+module at boot time, place the following line in
+.Xr loader.conf 5 :
+.Bd -literal -offset indent
+if_mxge_load="YES"
+.Ed
+.Sh DESCRIPTION
+The
+.Nm
+driver provides support for PCI Express 10 Gigabit Ethernet adapters based on
+the Myricom LANai Z8E chip.
+The driver supports Transmit/Receive checksum offload and hardware VLAN
+tagging.
+Support for Jumbo frames is not yet implemented, and neither TCP
+Segmentation Offload (TSO) nor Large Receive Offload (LRO) is
+implemented in this port yet.
+For further hardware information, see
+.Pa http://www.myri.com/ .
+.Pp
+For questions related to hardware requirements,
+refer to the documentation supplied with your Myri10GE adapter.
+All hardware requirements listed apply to use with
+.Dx .
+.Sh HARDWARE
+The
+.Nm
+driver supports 10 Gigabit Ethernet adapters based on the
+Myricom LANai Z8E chips:
+.Pp
+.Bl -bullet -compact
+.It
+Myricom 10GBase-CX4 (10G-PCIE-8A-C, 10G-PCIE-8AL-C)
+.It
+Myricom 10GBase-R (10G-PCIE-8A-R, 10G-PCIE-8AL-R)
+.It
+Myricom 10G XAUI over ribbon fiber (10G-PCIE-8A-Q, 10G-PCIE-8AL-Q)
+.El
+.Sh LOADER TUNABLES
+Tunables can be set at the
+.Xr loader 8
+prompt before booting the kernel or stored in
+.Xr loader.conf 5 .
+.Bl -tag -width indent
+.It Va hw.mxge.flow_control_enabled
+Whether or not hardware flow control is enabled on the adapter.
+The default value is 1.
+.It Va hw.mxge.intr_coal_delay
+This value delays the generation of all interrupts in units of
+1 microsecond.
+The default value is 30.
+.It Va hw.mxge.skip_pio_read
+This value determines whether or not the driver may omit doing a
+PIO read in the interrupt handler which ensures that the interrupt
+line has been deasserted when using xPIC interrupts.
+A non-zero value
+may result in lower CPU overhead, however it may also result in
+spurious interrupts.
+The default value is 0.
+This tunable has no effect when the device is
+using MSI or MSI-X interrupts.
+.It Va hw.mxge.max_slices
+This value determines the maximum number of slices the driver
+will attempt to use.
+The default value is 1.
+A slice comprises a set of receive queues and an associated interrupt
+thread.
+When using multiple slices, the NIC hashes traffic to different slices
+based on the value of
+.Va hw.mxge.rss_hash_type .
+Using multiple slices requires that your motherboard and Myri10GE NIC
+both be capable of MSI-X.
+Older Myri10GE NICs can be field upgraded to add
+MSI-X using the "10G NIC Tool Kit" for FreeBSD which is available from
+.Pa http://www.myri.com/scs/download-10g-tools.html .
+.It Va hw.mxge.rss_hash_type
+This value determines how incoming traffic is steered to different
+slices.
+This tunable is ignored when using just a single slice.
+The legal values for this tunable are:
+.Bl -tag -width "XXXX"
+.It 1
+Hash on the source and destination IPv4 addresses.
+.It 2
+Hash on source and destination IPv4 addresses and if the packet
+is TCP, then also hash on the TCP source and destination ports.
+.It 4
+Hash on the TCP or UDP source ports.
+This is the default value.
+.El
+.El
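+.Pp
+As an example, the following
+.Xr loader.conf 5
+entries (with illustrative values) would request four slices hashed on
+TCP/UDP source port and a 25 microsecond interrupt coalescing delay:
+.Bd -literal -offset indent
+hw.mxge.max_slices="4"
+hw.mxge.rss_hash_type="4"
+hw.mxge.intr_coal_delay="25"
+.Ed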
+.Sh DIAGNOSTICS
+.Bl -diag
+.It "mxge%d: Unable to allocate bus resource: memory"
+A fatal initialization error has occurred.
+.It "mxge%d: Unable to allocate bus resource: interrupt"
+A fatal initialization error has occurred.
+.It "mxge%d: Could not find firmware image %s"
+The appropriate firmware kld module was not installed.
+This is a non-fatal initialization error, but will
+result in running in a reduced performance mode.
+.El
+.Sh SUPPORT
+For general information and support,
+go to the Myricom support website at:
+.Pa http://www.myri.com/scs/ .
+.Sh SEE ALSO
+.Xr altq 4 ,
+.Xr arp 4 ,
+.Xr netintro 4 ,
+.Xr ng_ether 4 ,
+.Xr ifconfig 8
+.Sh HISTORY
+The
+.Nm
+device driver first appeared in
+.Fx 6.3 .
+It was first ported to
+.Dx 2.4 .
+.Sh AUTHORS
+The
+.Nm
+driver was written by
+.An Andrew Gallatin Aq gallatin@FreeBSD.org .
index 0b9b6e1..effda69 100644 (file)
@@ -2,8 +2,8 @@
 #
 
 SUBDIR= an acx age ale ar ath aue axe bce bfe bge bwi cue dc ed em ep et fwe \
-       fxp iwi iwl jme kue lge lnc mii_layer my msk nfe nge pcn ral re rl \
-       rtw rue rum sbni sbsh sf sis sk sln sr ste stge ti tl tx txp ural vge \
-       vr vx wb wi xe xl ig_hal emx
+       fxp iwi iwl jme kue lge lnc mii_layer my msk mxge nfe nge pcn ral re \
+       rl rtw rue rum sbni sbsh sf sis sk sln sr ste stge ti tl tx txp ural \
+       vge vr vx wb wi xe xl ig_hal emx
 
 .include <bsd.subdir.mk>
diff --git a/sys/dev/netif/mxge/Makefile b/sys/dev/netif/mxge/Makefile
new file mode 100644 (file)
index 0000000..1aa00f6
--- /dev/null
@@ -0,0 +1,14 @@
+#$FreeBSD: src/sys/modules/em/Makefile,v 1.1.2.3 2002/06/18 21:00:56 pdeuskar Exp $
+#$DragonFly: src/sys/dev/netif/em/Makefile,v 1.10 2008/09/17 08:51:29 sephe Exp $
+
+KMOD=  if_mxge
+SRCS=  if_mxge.c if_mxge_var.h mcp_gen_header.h mxge_mcp.h
+SRCS+= device_if.h bus_if.h pci_if.h
+SRCS+= opt_polling.h
+
+.ifndef BUILDING_WITH_KERNEL
+opt_polling.h:
+       echo '#define DEVICE_POLLING 1' > ${.OBJDIR}/${.TARGET}
+.endif
+
+.include <bsd.kmod.mk>
diff --git a/sys/dev/netif/mxge/if_mxge.c b/sys/dev/netif/mxge/if_mxge.c
new file mode 100644 (file)
index 0000000..1b3758b
--- /dev/null
@@ -0,0 +1,4716 @@
+/******************************************************************************
+
+Copyright (c) 2006-2009, Myricom Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+    this list of conditions and the following disclaimer.
+
+ 2. Neither the name of the Myricom Inc, nor the names of its
+    contributors may be used to endorse or promote products derived from
+    this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+***************************************************************************/
+
+#include <sys/cdefs.h>
+/*__FBSDID("$FreeBSD: src/sys/dev/mxge/if_mxge.c,v 1.63 2009/06/26 11:45:06 rwatson Exp $");*/
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/linker.h>
+#include <sys/firmware.h>
+#include <sys/endian.h>
+#include <sys/in_cksum.h>
+#include <sys/sockio.h>
+#include <sys/mbuf.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/serialize.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+
+/* count xmits ourselves, rather than via drbr */
+#define NO_SLOW_STATS
+#include <net/if.h>
+#include <net/if_arp.h>
+#include <net/ifq_var.h>
+#include <net/ethernet.h>
+#include <net/if_dl.h>
+#include <net/if_media.h>
+
+#include <net/bpf.h>
+
+#include <net/if_types.h>
+#include <net/vlan/if_vlan_var.h>
+#include <net/zlib.h>
+
+#include <netinet/in_systm.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/tcp.h>
+
+#include <sys/bus.h>
+#include <sys/rman.h>
+
+#include <bus/pci/pcireg.h>
+#include <bus/pci/pcivar.h>
+#include <bus/pci/pci_private.h> /* XXX for pci_cfg_restore */
+
+#include <vm/vm.h>             /* for pmap_mapdev() */
+#include <vm/pmap.h>
+
+#if defined(__i386) || defined(__amd64)
+#include <machine/specialreg.h>
+#endif
+
+#include <dev/netif/mxge/mxge_mcp.h>
+#include <dev/netif/mxge/mcp_gen_header.h>
+/*#define MXGE_FAKE_IFP*/
+#include <dev/netif/mxge/if_mxge_var.h>
+#ifdef IFNET_BUF_RING
+#include <sys/buf_ring.h>
+#endif
+
+#include "opt_inet.h"
+
+/* tunable params */
+static int mxge_nvidia_ecrc_enable = 1;
+static int mxge_force_firmware = 0;
+static int mxge_intr_coal_delay = 30;
+static int mxge_deassert_wait = 1;
+static int mxge_flow_control = 1;
+static int mxge_verbose = 0;
+static int mxge_lro_cnt = 8;
+static int mxge_ticks;
+static int mxge_max_slices = 1;
+static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
+static int mxge_always_promisc = 0;
+/* XXX: not yet */
+/* static int mxge_initial_mtu = ETHERMTU_JUMBO; */
+static int mxge_initial_mtu = ETHERMTU;
+static char *mxge_fw_unaligned = "mxge_ethp_z8e";
+static char *mxge_fw_aligned = "mxge_eth_z8e";
+static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
+static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
+
+static int mxge_probe(device_t dev);
+static int mxge_attach(device_t dev);
+static int mxge_detach(device_t dev);
+static int mxge_shutdown(device_t dev);
+static void mxge_intr(void *arg);
+
+static device_method_t mxge_methods[] =
+{
+  /* Device interface */
+  DEVMETHOD(device_probe, mxge_probe),
+  DEVMETHOD(device_attach, mxge_attach),
+  DEVMETHOD(device_detach, mxge_detach),
+  DEVMETHOD(device_shutdown, mxge_shutdown),
+  {0, 0}
+};
+
+static driver_t mxge_driver =
+{
+  "mxge",
+  mxge_methods,
+  sizeof(mxge_softc_t),
+};
+
+static devclass_t mxge_devclass;
+
+/* Declare ourselves to be a child of the PCI bus.*/
+DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
+MODULE_DEPEND(mxge, firmware, 1, 1, 1);
+MODULE_DEPEND(mxge, zlib, 1, 1, 1);
+
+static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
+static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
+static int mxge_close(mxge_softc_t *sc);
+static int mxge_open(mxge_softc_t *sc);
+static void mxge_tick(void *arg);
+
+/* XXX: we don't have Large Receive Offload support yet */
+inline int
+mxge_lro_rx(struct mxge_slice_state *ss, struct mbuf *m_head, uint32_t csum)
+{
+       (void)ss;
+       (void)m_head;
+       (void)csum;
+       return 1;
+}
+
+inline void
+mxge_lro_flush(struct mxge_slice_state *ss, struct lro_entry *lro)
+{
+       (void)ss;
+       (void)lro;
+}
+
+static int
+mxge_probe(device_t dev)
+{
+       int rev;
+
+
+       if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
+           ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
+            (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
+               rev = pci_get_revid(dev);
+               switch (rev) {
+               case MXGE_PCI_REV_Z8E:
+                       device_set_desc(dev, "Myri10G-PCIE-8A");
+                       break;
+               case MXGE_PCI_REV_Z8ES:
+                       device_set_desc(dev, "Myri10G-PCIE-8B");
+                       break;
+               default:
+                       device_set_desc(dev, "Myri10G-PCIE-8??");
+                       device_printf(dev, "Unrecognized rev %d NIC\n",
+                                     rev);
+                       break;  
+               }
+               return 0;
+       }
+       return ENXIO;
+}
+
+static void
+mxge_enable_wc(mxge_softc_t *sc)
+{
+#if 0
+#if defined(__i386) || defined(__amd64)
+       vm_offset_t len;
+       int err;
+
+       sc->wc = 1;
+       len = rman_get_size(sc->mem_res);
+       err = pmap_change_attr((vm_offset_t) sc->sram,
+                              len, PAT_WRITE_COMBINING);
+       if (err != 0) {
+               device_printf(sc->dev, "pmap_change_attr failed, %d\n",
+                             err);
+               sc->wc = 0;
+       }
+#endif
+#else
+       sc->wc = 0;     /* TBD: PAT support */
+#endif
+}
+
+
+/* callback to get our DMA address */
+static void
+mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
+                        int error)
+{
+       if (error == 0) {
+               *(bus_addr_t *) arg = segs->ds_addr;
+       }
+}
+
+static int
+mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes, 
+                  bus_size_t alignment)
+{
+       int err;
+       device_t dev = sc->dev;
+       bus_size_t boundary, maxsegsize;
+
+       if (bytes > 4096 && alignment == 4096) {
+               boundary = 0;
+               maxsegsize = bytes;
+       } else {
+               boundary = 4096;
+               maxsegsize = 4096;
+       }
+
+       /* allocate DMAable memory tags */
+       err = bus_dma_tag_create(sc->parent_dmat,       /* parent */
+                                alignment,             /* alignment */
+                                boundary,              /* boundary */
+                                BUS_SPACE_MAXADDR,     /* low */
+                                BUS_SPACE_MAXADDR,     /* high */
+                                NULL, NULL,            /* filter */
+                                bytes,                 /* maxsize */
+                                1,                     /* num segs */
+                                maxsegsize,            /* maxsegsize */
+                                BUS_DMA_COHERENT,      /* flags */
+                                &dma->dmat);           /* tag */
+       if (err != 0) {
+               device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
+               return err;
+       }
+
+       /* allocate DMAable memory & map */
+       err = bus_dmamem_alloc(dma->dmat, &dma->addr, 
+                              (BUS_DMA_WAITOK | BUS_DMA_COHERENT 
+                               | BUS_DMA_ZERO),  &dma->map);
+       if (err != 0) {
+               device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
+               goto abort_with_dmat;
+       }
+
+       /* load the memory */
+       err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
+                             mxge_dmamap_callback,
+                             (void *)&dma->bus_addr, 0);
+       if (err != 0) {
+               device_printf(dev, "couldn't load map (err = %d)\n", err);
+               goto abort_with_mem;
+       }
+       return 0;
+
+abort_with_mem:
+       bus_dmamem_free(dma->dmat, dma->addr, dma->map);
+abort_with_dmat:
+       (void)bus_dma_tag_destroy(dma->dmat);
+       return err;
+}
+
+
+static void
+mxge_dma_free(mxge_dma_t *dma)
+{
+       bus_dmamap_unload(dma->dmat, dma->map);
+       bus_dmamem_free(dma->dmat, dma->addr, dma->map);
+       (void)bus_dma_tag_destroy(dma->dmat);
+}
+
+/*
+ * The eeprom strings on the lanaiX have the format
+ * SN=x\0
+ * MAC=x:x:x:x:x:x\0
+ * PC=text\0
+ */
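+/*
+ * A hypothetical example of such a string blob:
+ *   "SN=123456\0MAC=00:60:dd:47:ab:cd\0PC=10G-PCIE-8AL-C\0"
+ */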
+
+static int
+mxge_parse_strings(mxge_softc_t *sc)
+{
+#define MXGE_NEXT_STRING(p) while(ptr < limit && *ptr++)
+
+       char *ptr, *limit;
+       int i, found_mac;
+
+       ptr = sc->eeprom_strings;
+       limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
+       found_mac = 0;
+       while (ptr < limit && *ptr != '\0') {
+               if (memcmp(ptr, "MAC=", 4) == 0) {
+                       ptr += 1;
+                       sc->mac_addr_string = ptr;
+                       for (i = 0; i < 6; i++) {
+                               ptr += 3;
+                               if ((ptr + 2) > limit)
+                                       goto abort;
+                               sc->mac_addr[i] = strtoul(ptr, NULL, 16);
+                               found_mac = 1;
+                       }
+               } else if (memcmp(ptr, "PC=", 3) == 0) {
+                       ptr += 3;
+                       strncpy(sc->product_code_string, ptr,
+                               sizeof (sc->product_code_string) - 1);
+               } else if (memcmp(ptr, "SN=", 3) == 0) {
+                       ptr += 3;
+                       strncpy(sc->serial_number_string, ptr,
+                               sizeof (sc->serial_number_string) - 1);
+               }
+               MXGE_NEXT_STRING(ptr);
+       }
+
+       if (found_mac)
+               return 0;
+
+ abort:
+       device_printf(sc->dev, "failed to parse eeprom_strings\n");
+
+       return ENXIO;
+}
+
+#if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
+static void
+mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
+{
+       uint32_t val;
+       unsigned long base, off;
+       char *va, *cfgptr;
+       device_t pdev, mcp55;
+       uint16_t vendor_id, device_id, word;
+       uintptr_t bus, slot, func, ivend, idev;
+       uint32_t *ptr32;
+
+
+       if (!mxge_nvidia_ecrc_enable)
+               return;
+
+       pdev = device_get_parent(device_get_parent(sc->dev));
+       if (pdev == NULL) {
+               device_printf(sc->dev, "could not find parent?\n");
+               return;
+       }
+       vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
+       device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
+
+       if (vendor_id != 0x10de)
+               return;
+
+       base = 0;
+
+       if (device_id == 0x005d) {
+               /* ck804, base address is magic */
+               base = 0xe0000000UL;
+       } else if (device_id >= 0x0374 && device_id <= 0x378) {
+               /* mcp55, base address stored in chipset */
+               mcp55 = pci_find_bsf(0, 0, 0);
+               if (mcp55 &&
+                   0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
+                   0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
+                       word = pci_read_config(mcp55, 0x90, 2);
+                       base = ((unsigned long)word & 0x7ffeU) << 25;
+               }
+       }
+       if (!base)
+               return;
+
+       /* XXXX
+          Test below is commented because it is believed that doing
+          config read/write beyond 0xff will access the config space
+          for the next larger function.  Uncomment this and remove 
+          the hacky pmap_mapdev() way of accessing config space when
+          FreeBSD grows support for extended pcie config space access
+       */
+#if 0  
+       /* See if we can, by some miracle, access the extended
+          config space */
+       val = pci_read_config(pdev, 0x178, 4);
+       if (val != 0xffffffff) {
+               val |= 0x40;
+               pci_write_config(pdev, 0x178, val, 4);
+               return;
+       }
+#endif
+       /* Rather than using normal pci config space writes, we must
+        * map the Nvidia config space ourselves.  This is because on
+        * opteron/nvidia class machines the 0xe0000000 mapping is
+        * handled by the nvidia chipset, which means the internal PCI
+        * device (the on-chip northbridge), or the amd-8131 bridge
+        * and things behind them, are not visible by this method.
+        */
+
+       BUS_READ_IVAR(device_get_parent(pdev), pdev,
+                     PCI_IVAR_BUS, &bus);
+       BUS_READ_IVAR(device_get_parent(pdev), pdev,
+                     PCI_IVAR_SLOT, &slot);
+       BUS_READ_IVAR(device_get_parent(pdev), pdev,
+                     PCI_IVAR_FUNCTION, &func);
+       BUS_READ_IVAR(device_get_parent(pdev), pdev,
+                     PCI_IVAR_VENDOR, &ivend);
+       BUS_READ_IVAR(device_get_parent(pdev), pdev,
+                     PCI_IVAR_DEVICE, &idev);
+                                       
+       off =  base
+               + 0x00100000UL * (unsigned long)bus
+               + 0x00001000UL * (unsigned long)(func
+                                                + 8 * slot);
+
+       /* map it into the kernel */
+       va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
+       
+
+       if (va == NULL) {
+               device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
+               return;
+       }
+       /* get a pointer to the config space mapped into the kernel */
+       cfgptr = va + (off & PAGE_MASK);
+
+       /* make sure that we can really access it */
+       vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
+       device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
+       if (! (vendor_id == ivend && device_id == idev)) {
+               device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
+                             vendor_id, device_id);
+               pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
+               return;
+       }
+
+       ptr32 = (uint32_t*)(cfgptr + 0x178);
+       val = *ptr32;
+
+       if (val == 0xffffffff) {
+               device_printf(sc->dev, "extended mapping failed\n");
+               pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
+               return;
+       }
+       *ptr32 = val | 0x40;
+       pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
+       if (mxge_verbose) 
+               device_printf(sc->dev,
+                             "Enabled ECRC on upstream Nvidia bridge "
+                             "at %d:%d:%d\n",
+                             (int)bus, (int)slot, (int)func);
+       return;
+}
+#else
+static void
+mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
+{
+       device_printf(sc->dev,
+                     "Nforce 4 chipset on non-x86/amd64!?!?!\n");
+       return;
+}
+#endif
+
+
+static int
+mxge_dma_test(mxge_softc_t *sc, int test_type)
+{
+       mxge_cmd_t cmd;
+       bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
+       int status;
+       uint32_t len;
+       char *test = " ";
+
+
+       /* Run a small DMA test.
+        * The magic multipliers to the length tell the firmware
+        * to do DMA read, write, or read+write tests.  The
+        * results are returned in cmd.data0.  The upper 16
+        * bits of the return is the number of transfers completed.
+        * The lower 16 bits is the time in 0.5us ticks that the
+        * transfers took to complete.
+        */
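+       /*
+        * Since each tick is 0.5us, (bytes * 2) / ticks yields bytes/us,
+        * i.e. MB/s; this is how the read_dma, write_dma and
+        * read_write_dma figures below are derived, the read/write test
+        * carrying an extra factor of 2 for data moving both ways.
+        */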
+
+       len = sc->tx_boundary;
+
+       cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
+       cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
+       cmd.data2 = len * 0x10000;
+       status = mxge_send_cmd(sc, test_type, &cmd);
+       if (status != 0) {
+               test = "read";
+               goto abort;
+       }
+       sc->read_dma = ((cmd.data0>>16) * len * 2) /
+               (cmd.data0 & 0xffff);
+       cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
+       cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
+       cmd.data2 = len * 0x1;
+       status = mxge_send_cmd(sc, test_type, &cmd);
+       if (status != 0) {
+               test = "write";
+               goto abort;
+       }
+       sc->write_dma = ((cmd.data0>>16) * len * 2) /
+               (cmd.data0 & 0xffff);
+
+       cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
+       cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
+       cmd.data2 = len * 0x10001;
+       status = mxge_send_cmd(sc, test_type, &cmd);
+       if (status != 0) {
+               test = "read/write";
+               goto abort;
+       }
+       sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
+               (cmd.data0 & 0xffff);
+
+abort:
+       if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
+               device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
+                             test, status);
+
+       return status;
+}
+
+/*
+ * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
+ * when the PCI-E Completion packets are aligned on an 8-byte
+ * boundary.  Some PCI-E chip sets always align Completion packets; on
+ * the ones that do not, the alignment can be enforced by enabling
+ * ECRC generation (if supported).
+ *
+ * When PCI-E Completion packets are not aligned, it is actually more
+ * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
+ *
+ * If the driver can neither enable ECRC nor verify that it has
+ * already been enabled, then it must use a firmware image which works
+ * around unaligned completion packets (ethp_z8e.dat), and it should
+ * also ensure that it never gives the device a Read-DMA which is
+ * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
+ * enabled, then the driver should use the aligned (eth_z8e.dat)
+ * firmware image, and set tx_boundary to 4KB.
+ */
+
+static int
+mxge_firmware_probe(mxge_softc_t *sc)
+{
+       device_t dev = sc->dev;
+       int reg, status;
+       uint16_t pectl;
+
+       sc->tx_boundary = 4096;
+       /*
+        * Verify the max read request size was set to 4KB
+        * before trying the test with 4KB.
+        */
+       if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
+               pectl = pci_read_config(dev, reg + 0x8, 2);
+               if ((pectl & (5 << 12)) != (5 << 12)) {
+                       device_printf(dev, "Max Read Req. size != 4k (0x%x\n",
+                                     pectl);
+                       sc->tx_boundary = 2048;
+               }
+       }
+
+       /* 
+        * load the optimized firmware (which assumes aligned PCIe
+        * completions) in order to see if it works on this host.
+        */
+       sc->fw_name = mxge_fw_aligned;
+       status = mxge_load_firmware(sc, 1);
+       if (status != 0) {
+               return status;
+       }
+
+       /* 
+        * Enable ECRC if possible
+        */
+       mxge_enable_nvidia_ecrc(sc);
+
+       /* 
+        * Run a DMA test which watches for unaligned completions and
+        * aborts on the first one seen.
+        */
+
+       status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
+       if (status == 0)
+               return 0; /* keep the aligned firmware */
+
+       if (status != E2BIG)
+               device_printf(dev, "DMA test failed: %d\n", status);
+       if (status == ENOSYS)
+               device_printf(dev, "Falling back to ethp! "
+                             "Please install up to date fw\n");
+       return status;
+}
+
+static int
+mxge_select_firmware(mxge_softc_t *sc)
+{
+       int aligned = 0;
+
+
+       if (mxge_force_firmware != 0) {
+               if (mxge_force_firmware == 1)
+                       aligned = 1;
+               else
+                       aligned = 0;
+               if (mxge_verbose)
+                       device_printf(sc->dev,
+                                     "Assuming %s completions (forced)\n",
+                                     aligned ? "aligned" : "unaligned");
+               goto abort;
+       }
+
+       /* if the PCIe link width is 4 or less, we can use the aligned
+          firmware and skip any checks */
+       if (sc->link_width != 0 && sc->link_width <= 4) {
+               device_printf(sc->dev,
+                             "PCIe x%d Link, expect reduced performance\n",
+                             sc->link_width);
+               aligned = 1;
+               goto abort;
+       }
+
+       if (0 == mxge_firmware_probe(sc))
+               return 0;
+
+abort:
+       if (aligned) {
+               sc->fw_name = mxge_fw_aligned;
+               sc->tx_boundary = 4096;
+       } else {
+               sc->fw_name = mxge_fw_unaligned;
+               sc->tx_boundary = 2048;
+       }
+       return (mxge_load_firmware(sc, 0));
+}
+
+union qualhack
+{
+        const char *ro_char;
+        char *rw_char;
+};
+
+static int
+mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
+{
+
+
+       if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
+               device_printf(sc->dev, "Bad firmware type: 0x%x\n", 
+                             be32toh(hdr->mcp_type));
+               return EIO;
+       }
+
+       /* save firmware version for sysctl */
+       strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
+       if (mxge_verbose)
+               device_printf(sc->dev, "firmware id: %s\n", hdr->version);
+
+       ksscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
+              &sc->fw_ver_minor, &sc->fw_ver_tiny);
+
+       if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
+             && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
+               device_printf(sc->dev, "Found firmware version %s\n",
+                             sc->fw_version);
+               device_printf(sc->dev, "Driver needs %d.%d\n",
+                             MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
+               return EINVAL;
+       }
+       return 0;
+
+}
+
+#if 0
+static void *
+z_alloc(void *nil, u_int items, u_int size)
+{
+        void *ptr;
+
+        ptr = kmalloc(items * size, M_TEMP, M_NOWAIT);
+        return ptr;
+}
+
+static void
+z_free(void *nil, void *ptr)
+{
+        kfree(ptr, M_TEMP);
+}
+#endif
+
+static int
+mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
+{
+       struct fw_image *fw;
+       const mcp_gen_header_t *hdr;
+       unsigned hdr_offset;
+       int status;
+       unsigned int i;
+       char dummy;
+       size_t fw_len;
+
+       fw = firmware_image_load(sc->fw_name, NULL);
+       if (fw == NULL) {
+               device_printf(sc->dev, "Could not find firmware image %s\n",
+                             sc->fw_name);
+               return ENOENT;
+       }
+#if 0
+       /* setup zlib and decompress f/w */
+       bzero(&zs, sizeof (zs));
+       zs.zalloc = z_alloc;
+       zs.zfree = z_free;
+       status = inflateInit(&zs);
+       if (status != Z_OK) {
+               status = EIO;
+               goto abort_with_fw;
+       }
+
+       /* the uncompressed size is stored as the firmware version,
+          which would otherwise go unused */
+       fw_len = (size_t) fw->version; 
+       inflate_buffer = kmalloc(fw_len, M_TEMP, M_NOWAIT);
+       if (inflate_buffer == NULL)
+               goto abort_with_zs;
+       zs.avail_in = fw->datasize;
+       zs.next_in = __DECONST(char *, fw->data);
+       zs.avail_out = fw_len;
+       zs.next_out = inflate_buffer;
+       status = inflate(&zs, Z_FINISH);
+       if (status != Z_STREAM_END) {
+               device_printf(sc->dev, "zlib %d\n", status);
+               status = EIO;
+               goto abort_with_buffer;
+       }
+#endif
+       fw_len = fw->fw_imglen;
+       /* check id */
+       hdr_offset = htobe32(*(const uint32_t *)
+                            (fw->fw_image + MCP_HEADER_PTR_OFFSET));
+       if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
+               device_printf(sc->dev, "Bad firmware file");
+               status = EIO;
+               goto abort_with_fw;
+       }
+       hdr = (const void*)(fw->fw_image + hdr_offset); 
+
+       status = mxge_validate_firmware(sc, hdr);
+       if (status != 0)
+               goto abort_with_fw;
+
+       /* Copy the firmware image to NIC SRAM. */
+       for (i = 0; i < fw_len; i += 256) {
+               mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
+                             fw->fw_image + i,
+                             min(256U, (unsigned)(fw_len - i)));
+               wmb();
+               dummy = *sc->sram;
+               wmb();
+       }
+
+       *limit = fw_len;
+       status = 0;
+#if 0
+abort_with_buffer:
+       kfree(inflate_buffer, M_TEMP);
+abort_with_zs:
+       inflateEnd(&zs);
+#endif
+abort_with_fw:
+       firmware_image_unload(fw);
+       return status;
+}
+
+/*
+ * Enable or disable periodic RDMAs from the host to make certain
+ * chipsets resend dropped PCIe messages
+ */
+
+static void
+mxge_dummy_rdma(mxge_softc_t *sc, int enable)
+{
+       char buf_bytes[72];
+       volatile uint32_t *confirm;
+       volatile char *submit;
+       uint32_t *buf, dma_low, dma_high;
+       int i;
+
+       buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
+
+       /* clear confirmation addr */
+       confirm = (volatile uint32_t *)sc->cmd;
+       *confirm = 0;
+       wmb();
+
+       /* send an rdma command to the PCIe engine, and wait for the
+          response in the confirmation address.  The firmware should
+          write a -1 there to indicate it is alive and well
+       */
+
+       dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
+       dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
+       buf[0] = htobe32(dma_high);             /* confirm addr MSW */
+       buf[1] = htobe32(dma_low);              /* confirm addr LSW */
+       buf[2] = htobe32(0xffffffff);           /* confirm data */
+       dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
+       dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
+       buf[3] = htobe32(dma_high);             /* dummy addr MSW */
+       buf[4] = htobe32(dma_low);              /* dummy addr LSW */
+       buf[5] = htobe32(enable);                       /* enable? */
+
+
+       submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
+
+       mxge_pio_copy(submit, buf, 64);
+       wmb();
+       DELAY(1000);
+       wmb();
+       i = 0;
+       while (*confirm != 0xffffffff && i < 20) {
+               DELAY(1000);
+               i++;
+       }
+       if (*confirm != 0xffffffff) {
+               device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)", 
+                             (enable ? "enable" : "disable"), confirm, 
+                             *confirm);
+       }
+       return;
+}
+
+static int 
+mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
+{
+       mcp_cmd_t *buf;
+       char buf_bytes[sizeof(*buf) + 8];
+       volatile mcp_cmd_response_t *response = sc->cmd;
+       volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
+       uint32_t dma_low, dma_high;
+       int err, sleep_total = 0;
+
+       /*
+        * We may be called during attach, before if_serializer is available.
+        * This is not a fast path, just check for NULL
+        */
+
+       if (sc->ifp->if_serializer)
+               ASSERT_SERIALIZED(sc->ifp->if_serializer);
+
+       /* ensure buf is aligned to 8 bytes */
+       buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
+
+       buf->data0 = htobe32(data->data0);
+       buf->data1 = htobe32(data->data1);
+       buf->data2 = htobe32(data->data2);
+       buf->cmd = htobe32(cmd);
+       dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
+       dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
+
+       buf->response_addr.low = htobe32(dma_low);
+       buf->response_addr.high = htobe32(dma_high);
+
+
+       response->result = 0xffffffff;
+       wmb();
+       mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
+
+       /* wait up to 20ms */
+       err = EAGAIN;
+       for (sleep_total = 0; sleep_total <  20; sleep_total++) {
+               bus_dmamap_sync(sc->cmd_dma.dmat, 
+                               sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
+               wmb();
+               switch (be32toh(response->result)) {
+               case 0:
+                       data->data0 = be32toh(response->data);
+                       err = 0;
+                       break;
+               case 0xffffffff:
+                       DELAY(1000);
+                       break;
+               case MXGEFW_CMD_UNKNOWN:
+                       err = ENOSYS;
+                       break;
+               case MXGEFW_CMD_ERROR_UNALIGNED:
+                       err = E2BIG;
+                       break;
+               case MXGEFW_CMD_ERROR_BUSY:
+                       err = EBUSY;
+                       break;
+               default:
+                       device_printf(sc->dev, 
+                                     "mxge: command %d "
+                                     "failed, result = %d\n",
+                                     cmd, be32toh(response->result));
+                       err = ENXIO;
+                       break;
+               }
+               if (err != EAGAIN)
+                       break;
+       }
+       if (err == EAGAIN)
+               device_printf(sc->dev, "mxge: command %d timed out"
+                             "result = %d\n",
+                             cmd, be32toh(response->result));
+       return err;
+}
+
+static int
+mxge_adopt_running_firmware(mxge_softc_t *sc)
+{
+       struct mcp_gen_header *hdr;
+       const size_t bytes = sizeof (struct mcp_gen_header);
+       size_t hdr_offset;
+       int status;
+
+       /* find running firmware header */
+       hdr_offset = htobe32(*(volatile uint32_t *)
+                            (sc->sram + MCP_HEADER_PTR_OFFSET));
+
+       if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
+               device_printf(sc->dev, 
+                             "Running firmware has bad header offset (%d)\n",
+                             (int)hdr_offset);
+               return EIO;
+       }
+
+       /* copy header of running firmware from SRAM to host memory to
+        * validate firmware */
+       hdr = kmalloc(bytes, M_DEVBUF, M_NOWAIT);
+       if (hdr == NULL) {
+               device_printf(sc->dev, "could not kmalloc firmware hdr\n");
+               return ENOMEM;
+       }
+       bus_space_read_region_1(rman_get_bustag(sc->mem_res),
+                               rman_get_bushandle(sc->mem_res),
+                               hdr_offset, (char *)hdr, bytes);
+       status = mxge_validate_firmware(sc, hdr);
+       kfree(hdr, M_DEVBUF);
+
+       /* 
+        * check to see if adopted firmware has bug where adopting
+        * it will cause broadcasts to be filtered unless the NIC
+        * is kept in ALLMULTI mode
+        */
+       if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
+           sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
+               sc->adopted_rx_filter_bug = 1;
+               device_printf(sc->dev, "Adopting fw %d.%d.%d: "
+                             "working around rx filter bug\n",
+                             sc->fw_ver_major, sc->fw_ver_minor,
+                             sc->fw_ver_tiny);
+       }
+
+       return status;
+}
+
+
+static int
+mxge_load_firmware(mxge_softc_t *sc, int adopt)
+{
+       volatile uint32_t *confirm;
+       volatile char *submit;
+       char buf_bytes[72];
+       uint32_t *buf, size, dma_low, dma_high;
+       int status, i;
+
+       buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
+
+       size = sc->sram_size;
+       status = mxge_load_firmware_helper(sc, &size);
+       if (status) {
+               if (!adopt)
+                       return status;
+               /* Try to use the currently running firmware, if
+                  it is new enough */
+               status = mxge_adopt_running_firmware(sc);
+               if (status) {
+                       device_printf(sc->dev,
+                                     "failed to adopt running firmware\n");
+                       return status;
+               }
+               device_printf(sc->dev,
+                             "Successfully adopted running firmware\n");
+               if (sc->tx_boundary == 4096) {
+                       device_printf(sc->dev,
+                               "Using firmware currently running on NIC"
+                                ".  For optimal\n");
+                       device_printf(sc->dev,
+                                "performance consider loading optimized "
+                                "firmware\n");
+               }
+               sc->fw_name = mxge_fw_unaligned;
+               sc->tx_boundary = 2048;
+               return 0;
+       }
+       /* clear confirmation addr */
+       confirm = (volatile uint32_t *)sc->cmd;
+       *confirm = 0;
+       wmb();
+       /* send a reload command to the bootstrap MCP, and wait for the
+          response in the confirmation address.  The firmware should
+          write a -1 there to indicate it is alive and well
+       */
+
+       dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
+       dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
+
+       buf[0] = htobe32(dma_high);     /* confirm addr MSW */
+       buf[1] = htobe32(dma_low);      /* confirm addr LSW */
+       buf[2] = htobe32(0xffffffff);   /* confirm data */
+
+       /* FIX: All newest firmware should un-protect the bottom of
+          the sram before handoff. However, the very first interfaces
+          do not. Therefore the handoff copy must skip the first 8 bytes
+       */
+                                       /* where the code starts*/
+       buf[3] = htobe32(MXGE_FW_OFFSET + 8);
+       buf[4] = htobe32(size - 8);     /* length of code */
+       buf[5] = htobe32(8);            /* where to copy to */
+       buf[6] = htobe32(0);            /* where to jump to */
+
+       submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
+       mxge_pio_copy(submit, buf, 64);
+       wmb();
+       DELAY(1000);
+       wmb();
+       i = 0;
+       while (*confirm != 0xffffffff && i < 20) {
+               DELAY(1000*10);
+               i++;
+               bus_dmamap_sync(sc->cmd_dma.dmat, 
+                               sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
+       }
+       if (*confirm != 0xffffffff) {
+               device_printf(sc->dev,"handoff failed (%p = 0x%x)", 
+                       confirm, *confirm);
+               
+               return ENXIO;
+       }
+       return 0;
+}
+
+static int
+mxge_update_mac_address(mxge_softc_t *sc)
+{
+       mxge_cmd_t cmd;
+       uint8_t *addr = sc->mac_addr;
+       int status;
+
+       
+       cmd.data0 = ((addr[0] << 24) | (addr[1] << 16) 
+                    | (addr[2] << 8) | addr[3]);
+
+       cmd.data1 = ((addr[4] << 8) | (addr[5]));
+
+       status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
+       return status;
+}
+
+static int
+mxge_change_pause(mxge_softc_t *sc, int pause)
+{      
+       mxge_cmd_t cmd;
+       int status;
+
+       if (pause)
+               status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
+                                      &cmd);
+       else
+               status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
+                                      &cmd);
+
+       if (status) {
+               device_printf(sc->dev, "Failed to set flow control mode\n");
+               return ENXIO;
+       }
+       sc->pause = pause;
+       return 0;
+}
+
+static void
+mxge_change_promisc(mxge_softc_t *sc, int promisc)
+{      
+       mxge_cmd_t cmd;
+       int status;
+
+       if (sc->ifp->if_serializer)
+               ASSERT_SERIALIZED(sc->ifp->if_serializer);
+       if (mxge_always_promisc)
+               promisc = 1;
+
+       if (promisc)
+               status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
+                                      &cmd);
+       else
+               status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
+                                      &cmd);
+
+       if (status) {
+               device_printf(sc->dev, "Failed to set promisc mode\n");
+       }
+}
+
+static void
+mxge_set_multicast_list(mxge_softc_t *sc)
+{
+       mxge_cmd_t cmd;
+       struct ifmultiaddr *ifma;
+       struct ifnet *ifp = sc->ifp;
+       int err;
+
+       if (ifp->if_serializer)
+               ASSERT_SERIALIZED(ifp->if_serializer);
+
+       /* This firmware is known to not support multicast */
+       if (!sc->fw_multicast_support)
+               return;
+
+       /* Disable multicast filtering while we play with the lists*/
+       err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
+       if (err != 0) {
+               device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
+                      " error status: %d\n", err);
+               return;
+       }
+       
+       if (sc->adopted_rx_filter_bug)
+               return;
+       
+       if (ifp->if_flags & IFF_ALLMULTI)
+               /* request to disable multicast filtering, so quit here */
+               return;
+
+       /* Flush all the filters */
+
+       err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
+       if (err != 0) {
+               device_printf(sc->dev, 
+                             "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
+                             ", error status: %d\n", err);
+               return;
+       }
+
+       /* Walk the multicast list, and add each address */
+
+       LIST_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
+               if (ifma->ifma_addr->sa_family != AF_LINK)
+                       continue;
+               bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
+                     &cmd.data0, 4);
+               bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
+                     &cmd.data1, 2);
+               cmd.data0 = htonl(cmd.data0);
+               cmd.data1 = htonl(cmd.data1);
+               err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
+               if (err != 0) {
+                       device_printf(sc->dev, "Failed "
+                              "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
+                              "%d\t", err);
+                       /* abort, leaving multicast filtering off */
+                       return;
+               }
+       }
+       /* Enable multicast filtering */
+       err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
+       if (err != 0) {
+               device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
+                      ", error status: %d\n", err);
+       }
+}
+
+static int
+mxge_max_mtu(mxge_softc_t *sc)
+{
+       mxge_cmd_t cmd;
+       int status;
+
+       if (MJUMPAGESIZE - MXGEFW_PAD >  MXGEFW_MAX_MTU)
+               return  MXGEFW_MAX_MTU - MXGEFW_PAD;
+
+       /* try to set nbufs to see if we can
+          use virtually contiguous jumbos */
+       cmd.data0 = 0;
+       status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
+                              &cmd);
+       if (status == 0)
+               return  MXGEFW_MAX_MTU - MXGEFW_PAD;
+
+       /* otherwise, we're limited to MJUMPAGESIZE */
+       return MJUMPAGESIZE - MXGEFW_PAD;
+}
+
+static int
+mxge_reset(mxge_softc_t *sc, int interrupts_setup)
+{
+       struct mxge_slice_state *ss;
+       mxge_rx_done_t *rx_done;
+       volatile uint32_t *irq_claim;
+       mxge_cmd_t cmd;
+       int slice, status;
+
+       /* try to send a reset command to the card to see if it
+          is alive */
+       memset(&cmd, 0, sizeof (cmd));
+       status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
+       if (status != 0) {
+               device_printf(sc->dev, "failed reset\n");
+               return ENXIO;
+       }
+
+       mxge_dummy_rdma(sc, 1);
+
+
+       /* set the intrq size */
+       cmd.data0 = sc->rx_ring_size;
+       status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
+
+       /* 
+        * Even though we already know how many slices are supported
+        * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
+        * has magic side effects, and must be called after a reset.
+        * It must be called prior to calling any RSS related cmds,
+        * including assigning an interrupt queue for anything but
+        * slice 0.  It must also be called *after*
+        * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
+        * the firmware to compute offsets.
+        */
+        
+       if (sc->num_slices > 1) {
+               /* ask the maximum number of slices it supports */
+               status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
+                                          &cmd);
+               if (status != 0) {
+                       device_printf(sc->dev, 
+                                     "failed to get number of slices\n");
+                       return status;
+               }
+               /* 
+                * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
+                * to setting up the interrupt queue DMA
+                */
+               cmd.data0 = sc->num_slices;
+               cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
+#ifdef IFNET_BUF_RING
+               cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
+#endif
+               status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
+                                          &cmd);
+               if (status != 0) {
+                       device_printf(sc->dev,
+                                     "failed to set number of slices\n");
+                       return status;
+               }
+       }
+
+
+       if (interrupts_setup) {
+               /* Now exchange information about interrupts  */
+               for (slice = 0; slice < sc->num_slices; slice++) {
+                       rx_done = &sc->ss[slice].rx_done;
+                       memset(rx_done->entry, 0, sc->rx_ring_size);
+                       cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
+                       cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
+                       cmd.data2 = slice;
+                       status |= mxge_send_cmd(sc,
+                                               MXGEFW_CMD_SET_INTRQ_DMA,
+                                               &cmd);
+               }
+       }
+
+       status |= mxge_send_cmd(sc, 
+                               MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
+       
+
+       sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
+
+       status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
+       irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
+
+
+       status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET, 
+                               &cmd);
+       sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
+       if (status != 0) {
+               device_printf(sc->dev, "failed set interrupt parameters\n");
+               return status;
+       }
+       
+
+       *sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
+
+       
+       /* run a DMA benchmark */
+       (void) mxge_dma_test(sc, MXGEFW_DMA_TEST);
+
+       for (slice = 0; slice < sc->num_slices; slice++) {
+               ss = &sc->ss[slice];
+
+               ss->irq_claim = irq_claim + (2 * slice);
+               /* reset mcp/driver shared state back to 0 */
+               ss->rx_done.idx = 0;
+               ss->rx_done.cnt = 0;
+               ss->tx.req = 0;
+               ss->tx.done = 0;
+               ss->tx.pkt_done = 0;
+               ss->tx.queue_active = 0;
+               ss->tx.activate = 0;
+               ss->tx.deactivate = 0;
+               ss->tx.wake = 0;
+               ss->tx.defrag = 0;
+               ss->tx.stall = 0;
+               ss->rx_big.cnt = 0;
+               ss->rx_small.cnt = 0;
+               ss->lro_bad_csum = 0;
+               ss->lro_queued = 0;
+               ss->lro_flushed = 0;
+               if (ss->fw_stats != NULL) {
+                       ss->fw_stats->valid = 0;
+                       ss->fw_stats->send_done_count = 0;
+               }
+       }
+       sc->rdma_tags_available = 15;
+       status = mxge_update_mac_address(sc);
+       mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
+       mxge_change_pause(sc, sc->pause);
+       mxge_set_multicast_list(sc);
+       return status;
+}
+
+static int
+mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
+{
+        mxge_softc_t *sc;
+        unsigned int intr_coal_delay;
+        int err;
+
+        sc = arg1;
+        intr_coal_delay = sc->intr_coal_delay;
+        err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
+        if (err != 0) {
+                return err;
+        }
+        if (intr_coal_delay == sc->intr_coal_delay)
+                return 0;
+
+        if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
+                return EINVAL;
+
+       lwkt_serialize_enter(sc->ifp->if_serializer);
+       *sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
+       sc->intr_coal_delay = intr_coal_delay;
+
+       lwkt_serialize_exit(sc->ifp->if_serializer);
+        return err;
+}
+
+static int
+mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
+{
+        mxge_softc_t *sc;
+        unsigned int enabled;
+        int err;
+
+        sc = arg1;
+        enabled = sc->pause;
+        err = sysctl_handle_int(oidp, &enabled, arg2, req);
+        if (err != 0) {
+                return err;
+        }
+        if (enabled == sc->pause)
+                return 0;
+
+       lwkt_serialize_enter(sc->ifp->if_serializer);
+       err = mxge_change_pause(sc, enabled);
+       lwkt_serialize_exit(sc->ifp->if_serializer);
+        return err;
+}
+
+static int
+mxge_change_lro_locked(mxge_softc_t *sc, int lro_cnt)
+{
+       struct ifnet *ifp;
+       int err = 0;
+
+       ifp = sc->ifp;
+       if (lro_cnt == 0) 
+               ifp->if_capenable &= ~IFCAP_LRO;
+       else
+               ifp->if_capenable |= IFCAP_LRO;
+       sc->lro_cnt = lro_cnt;
+       if (ifp->if_flags & IFF_RUNNING) {
+               mxge_close(sc);
+               err = mxge_open(sc);
+       }
+       return err;
+}
+
+static int
+mxge_change_lro(SYSCTL_HANDLER_ARGS)
+{
+       mxge_softc_t *sc;
+       unsigned int lro_cnt;
+       int err;
+
+       sc = arg1;
+       lro_cnt = sc->lro_cnt;
+       err = sysctl_handle_int(oidp, &lro_cnt, arg2, req);
+       if (err != 0)
+               return err;
+
+       if (lro_cnt == sc->lro_cnt)
+               return 0;
+
+       if (lro_cnt > 128)
+               return EINVAL;
+
+       lwkt_serialize_enter(sc->ifp->if_serializer);
+       err = mxge_change_lro_locked(sc, lro_cnt);
+       lwkt_serialize_exit(sc->ifp->if_serializer);
+       return err;
+}
+
+static int
+mxge_handle_be32(SYSCTL_HANDLER_ARGS)
+{
+        int err;
+
+        if (arg1 == NULL)
+                return EFAULT;
+        arg2 = be32toh(*(int *)arg1);
+        arg1 = NULL;
+        err = sysctl_handle_int(oidp, arg1, arg2, req);
+
+        return err;
+}
+
+static void
+mxge_rem_sysctls(mxge_softc_t *sc)
+{
+       struct mxge_slice_state *ss;
+       int slice;
+
+       if (sc->slice_sysctl_tree == NULL)
+               return;
+
+       for (slice = 0; slice < sc->num_slices; slice++) {
+               ss = &sc->ss[slice];
+               if (ss == NULL || ss->sysctl_tree == NULL)
+                       continue;
+               sysctl_ctx_free(&ss->sysctl_ctx);
+               ss->sysctl_tree = NULL;
+       }
+       sysctl_ctx_free(&sc->slice_sysctl_ctx);
+       sc->slice_sysctl_tree = NULL;
+       sysctl_ctx_free(&sc->sysctl_ctx);
+       sc->sysctl_tree = NULL;
+}
+
+static void
+mxge_add_sysctls(mxge_softc_t *sc)
+{
+       struct sysctl_ctx_list *ctx;
+       struct sysctl_oid_list *children;
+       mcp_irq_data_t *fw;
+       struct mxge_slice_state *ss;
+       int slice;
+       char slice_num[8];
+
+       ctx = &sc->sysctl_ctx;
+       sysctl_ctx_init(ctx);
+       sc->sysctl_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_STATIC_CHILDREN(_hw),
+                                         OID_AUTO,
+                                         device_get_nameunit(sc->dev),
+                                         CTLFLAG_RD, 0, "");
+       if (sc->sysctl_tree == NULL) {
+               device_printf(sc->dev, "can't add sysctl node\n");
+               return;
+       }
+
+       children = SYSCTL_CHILDREN(sc->sysctl_tree);
+       fw = sc->ss[0].fw_stats;
+
+       /* random information */
+       SYSCTL_ADD_STRING(ctx, children, OID_AUTO, 
+                      "firmware_version",
+                      CTLFLAG_RD, &sc->fw_version,
+                      0, "firmware version");
+       SYSCTL_ADD_STRING(ctx, children, OID_AUTO, 
+                      "serial_number",
+                      CTLFLAG_RD, &sc->serial_number_string,
+                      0, "serial number");
+       SYSCTL_ADD_STRING(ctx, children, OID_AUTO, 
+                      "product_code",
+                      CTLFLAG_RD, &sc->product_code_string,
+                      0, "product_code");
+       SYSCTL_ADD_INT(ctx, children, OID_AUTO,
+                      "pcie_link_width",
+                      CTLFLAG_RD, &sc->link_width,
+                      0, "PCIe link width");
+       SYSCTL_ADD_INT(ctx, children, OID_AUTO, 
+                      "tx_boundary",
+                      CTLFLAG_RD, &sc->tx_boundary,
+                      0, "tx_boundary");
+       SYSCTL_ADD_INT(ctx, children, OID_AUTO, 
+                      "write_combine",
+                      CTLFLAG_RD, &sc->wc,
+                      0, "write combining PIO?");
+       SYSCTL_ADD_INT(ctx, children, OID_AUTO, 
+                      "read_dma_MBs",
+                      CTLFLAG_RD, &sc->read_dma,
+                      0, "DMA Read speed in MB/s");
+       SYSCTL_ADD_INT(ctx, children, OID_AUTO, 
+                      "write_dma_MBs",
+                      CTLFLAG_RD, &sc->write_dma,
+                      0, "DMA Write speed in MB/s");
+       SYSCTL_ADD_INT(ctx, children, OID_AUTO, 
+                      "read_write_dma_MBs",
+                      CTLFLAG_RD, &sc->read_write_dma,
+                      0, "DMA concurrent Read/Write speed in MB/s");
+
+       /* performance related tunables */
+       SYSCTL_ADD_PROC(ctx, children, OID_AUTO, 
+                       "intr_coal_delay",
+                       CTLTYPE_INT|CTLFLAG_RW, sc,
+                       0, mxge_change_intr_coal, 
+                       "I", "interrupt coalescing delay in usecs");
+
+       SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
+                       "flow_control_enabled",
+                       CTLTYPE_INT|CTLFLAG_RW, sc,
+                       0, mxge_change_flow_control,
+                       "I", "enable/disable flow control (pause frames)");
+
+       SYSCTL_ADD_INT(ctx, children, OID_AUTO, 
+                      "deassert_wait",
+                      CTLFLAG_RW, &mxge_deassert_wait,
+                      0, "Wait for IRQ line to go low in ihandler");
+
+       /* the stats block from the firmware is in network byte order,
+          so it needs to be byte swapped before it is exported */
+       SYSCTL_ADD_PROC(ctx, children, OID_AUTO, 
+                       "link_up",
+                       CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
+                       0, mxge_handle_be32,
+                       "I", "link up");
+       SYSCTL_ADD_PROC(ctx, children, OID_AUTO, 
+                       "rdma_tags_available",
+                       CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
+                       0, mxge_handle_be32,
+                       "I", "rdma_tags_available");
+       SYSCTL_ADD_PROC(ctx, children, OID_AUTO, 
+                       "dropped_bad_crc32",
+                       CTLTYPE_INT|CTLFLAG_RD, 
+                       &fw->dropped_bad_crc32,
+                       0, mxge_handle_be32,
+                       "I", "dropped_bad_crc32");
+       SYSCTL_ADD_PROC(ctx, children, OID_AUTO, 
+                       "dropped_bad_phy",
+                       CTLTYPE_INT|CTLFLAG_RD, 
+                       &fw->dropped_bad_phy,
+                       0, mxge_handle_be32,
+                       "I", "dropped_bad_phy");
+       SYSCTL_ADD_PROC(ctx, children, OID_AUTO, 
+                       "dropped_link_error_or_filtered",
+                       CTLTYPE_INT|CTLFLAG_RD, 
+                       &fw->dropped_link_error_or_filtered,
+                       0, mxge_handle_be32,
+                       "I", "dropped_link_error_or_filtered");
+       SYSCTL_ADD_PROC(ctx, children, OID_AUTO, 
+                       "dropped_link_overflow",
+                       CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
+                       0, mxge_handle_be32,
+                       "I", "dropped_link_overflow");
+       SYSCTL_ADD_PROC(ctx, children, OID_AUTO, 
+                       "dropped_multicast_filtered",
+                       CTLTYPE_INT|CTLFLAG_RD, 
+                       &fw->dropped_multicast_filtered,
+                       0, mxge_handle_be32,
+                       "I", "dropped_multicast_filtered");
+       SYSCTL_ADD_PROC(ctx, children, OID_AUTO, 
+                       "dropped_no_big_buffer",
+                       CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
+                       0, mxge_handle_be32,
+                       "I", "dropped_no_big_buffer");
+       SYSCTL_ADD_PROC(ctx, children, OID_AUTO, 
+                       "dropped_no_small_buffer",
+                       CTLTYPE_INT|CTLFLAG_RD, 
+                       &fw->dropped_no_small_buffer,
+                       0, mxge_handle_be32,
+                       "I", "dropped_no_small_buffer");
+       SYSCTL_ADD_PROC(ctx, children, OID_AUTO, 
+                       "dropped_overrun",
+                       CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
+                       0, mxge_handle_be32,
+                       "I", "dropped_overrun");
+       SYSCTL_ADD_PROC(ctx, children, OID_AUTO, 
+                       "dropped_pause",
+                       CTLTYPE_INT|CTLFLAG_RD, 
+                       &fw->dropped_pause,
+                       0, mxge_handle_be32,
+                       "I", "dropped_pause");
+       SYSCTL_ADD_PROC(ctx, children, OID_AUTO, 
+                       "dropped_runt",
+                       CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
+                       0, mxge_handle_be32,
+                       "I", "dropped_runt");
+
+       SYSCTL_ADD_PROC(ctx, children, OID_AUTO, 
+                       "dropped_unicast_filtered",
+                       CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
+                       0, mxge_handle_be32,
+                       "I", "dropped_unicast_filtered");
+
+       /* verbose printing? */
+       SYSCTL_ADD_INT(ctx, children, OID_AUTO, 
+                      "verbose",
+                      CTLFLAG_RW, &mxge_verbose,
+                      0, "verbose printing");
+
+       /* lro */
+       SYSCTL_ADD_PROC(ctx, children, OID_AUTO, 
+                       "lro_cnt",
+                       CTLTYPE_INT|CTLFLAG_RW, sc,
+                       0, mxge_change_lro,
+                       "I", "number of lro merge queues");
+
+       /* add counters exported for debugging from all slices */
+       sysctl_ctx_init(&sc->slice_sysctl_ctx);
+       sc->slice_sysctl_tree = 
+               SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
+                               "slice", CTLFLAG_RD, 0, "");
+
+       for (slice = 0; slice < sc->num_slices; slice++) {
+               ss = &sc->ss[slice];
+               sysctl_ctx_init(&ss->sysctl_ctx);
+               ctx = &ss->sysctl_ctx;
+               children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
+               ksprintf(slice_num, "%d", slice);
+               ss->sysctl_tree = 
+                       SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
+                                       CTLFLAG_RD, 0, "");
+               children = SYSCTL_CHILDREN(ss->sysctl_tree);
+               SYSCTL_ADD_INT(ctx, children, OID_AUTO, 
+                              "rx_small_cnt",
+                              CTLFLAG_RD, &ss->rx_small.cnt,
+                              0, "rx_small_cnt");
+               SYSCTL_ADD_INT(ctx, children, OID_AUTO,
+                              "rx_big_cnt",
+                              CTLFLAG_RD, &ss->rx_big.cnt,
+                              0, "rx_big_cnt");
+               SYSCTL_ADD_INT(ctx, children, OID_AUTO,
+                              "lro_flushed", CTLFLAG_RD, &ss->lro_flushed,
+                              0, "number of lro merge queues flushed");
+
+               SYSCTL_ADD_INT(ctx, children, OID_AUTO,
+                              "lro_queued", CTLFLAG_RD, &ss->lro_queued,
+                              0, "number of frames appended to lro merge "
+                              "queues");
+
+#ifndef IFNET_BUF_RING
+               /* only transmit from slice 0 for now */
+               if (slice > 0)
+                       continue;
+#endif
+               SYSCTL_ADD_INT(ctx, children, OID_AUTO, 
+                              "tx_req",
+                              CTLFLAG_RD, &ss->tx.req,
+                              0, "tx_req");
+
+               SYSCTL_ADD_INT(ctx, children, OID_AUTO, 
+                              "tx_done",
+                              CTLFLAG_RD, &ss->tx.done,
+                              0, "tx_done");
+               SYSCTL_ADD_INT(ctx, children, OID_AUTO,
+                              "tx_pkt_done",
+                              CTLFLAG_RD, &ss->tx.pkt_done,
+                              0, "tx_pkt_done");
+               SYSCTL_ADD_INT(ctx, children, OID_AUTO, 
+                              "tx_stall",
+                              CTLFLAG_RD, &ss->tx.stall,
+                              0, "tx_stall");
+               SYSCTL_ADD_INT(ctx, children, OID_AUTO, 
+                              "tx_wake",
+                              CTLFLAG_RD, &ss->tx.wake,
+                              0, "tx_wake");
+               SYSCTL_ADD_INT(ctx, children, OID_AUTO, 
+                              "tx_defrag",
+                              CTLFLAG_RD, &ss->tx.defrag,
+                              0, "tx_defrag");
+               SYSCTL_ADD_INT(ctx, children, OID_AUTO, 
+                              "tx_queue_active",
+                              CTLFLAG_RD, &ss->tx.queue_active,
+                              0, "tx_queue_active");
+               SYSCTL_ADD_INT(ctx, children, OID_AUTO, 
+                              "tx_activate",
+                              CTLFLAG_RD, &ss->tx.activate,
+                              0, "tx_activate");
+               SYSCTL_ADD_INT(ctx, children, OID_AUTO, 
+                              "tx_deactivate",
+                              CTLFLAG_RD, &ss->tx.deactivate,
+                              0, "tx_deactivate");
+       }
+}
+
+/* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy 
+   backwards one at a time and handle ring wraps */
+
+static inline void 
+mxge_submit_req_backwards(mxge_tx_ring_t *tx, 
+                           mcp_kreq_ether_send_t *src, int cnt)
+{
+       int idx, starting_slot;
+
+       starting_slot = tx->req;
+       while (cnt > 1) {
+               cnt--;
+               idx = (starting_slot + cnt) & tx->mask;
+               mxge_pio_copy(&tx->lanai[idx],
+                             &src[cnt], sizeof(*src));
+               wmb();
+       }
+}
+
+/*
+ * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
+ * at most 32 bytes at a time, so as to avoid involving the software
+ * pio handler in the nic.   We re-write the first segment's flags
+ * to mark them valid only after writing the entire chain 
+ */
+
+static inline void 
+mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src, 
+                  int cnt)
+{
+        int idx, i;
+        uint32_t *src_ints;
+       volatile uint32_t *dst_ints;
+        mcp_kreq_ether_send_t *srcp;
+       volatile mcp_kreq_ether_send_t *dstp, *dst;
+       uint8_t last_flags;
+        
+        idx = tx->req & tx->mask;
+
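+       /* temporarily clear the first request's flags so the NIC
+        * ignores the chain until the final flag write below */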
+       last_flags = src->flags;
+       src->flags = 0;
+        wmb();
+        dst = dstp = &tx->lanai[idx];
+        srcp = src;
+
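+       /* if the chain fits without wrapping the ring, stream it in
+        * 32-byte (two-request) bursts; otherwise copy it backwards,
+        * one request at a time */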
+        if ((idx + cnt) < tx->mask) {
+                for (i = 0; i < (cnt - 1); i += 2) {
+                        mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
+                        wmb(); /* force write every 32 bytes */
+                        srcp += 2;
+                        dstp += 2;
+                }
+        } else {
+                /* submit all but the first request, and ensure 
+                   that it is submitted below */
+                mxge_submit_req_backwards(tx, src, cnt);
+                i = 0;
+        }
+        if (i < cnt) {
+                /* submit the first request */
+                mxge_pio_copy(dstp, srcp, sizeof(*src));
+                wmb(); /* barrier before setting valid flag */
+        }
+
+        /* re-write the last 32-bits with the valid flags */
+        src->flags = last_flags;
+        src_ints = (uint32_t *)src;
+        src_ints+=3;
+        dst_ints = (volatile uint32_t *)dst;
+        dst_ints+=3;
+        *dst_ints =  *src_ints;
+        tx->req += cnt;
+        wmb();
+}
+
+#if IFCAP_TSO4
+
+static void
+mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
+              int busdma_seg_cnt, int ip_off)
+{
+       mxge_tx_ring_t *tx;
+       mcp_kreq_ether_send_t *req;
+       bus_dma_segment_t *seg;
+       struct ip *ip;
+       struct tcphdr *tcp;
+       uint32_t low, high_swapped;
+       int len, seglen, cum_len, cum_len_next;
+       int next_is_first, chop, cnt, rdma_count, small;
+       uint16_t pseudo_hdr_offset, cksum_offset, mss;
+       uint8_t flags, flags_next;
+       static int once;
+
+       mss = m->m_pkthdr.tso_segsz;
+
+       /* negative cum_len signifies to the
+        * send loop that we are still in the
+        * header portion of the TSO packet.
+        */
+
+       /* ensure we have the ethernet, IP and TCP
+          header together in the first mbuf, copy
+          it to a scratch buffer if not */
+       if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
+               m_copydata(m, 0, ip_off + sizeof (*ip),
+                          ss->scratch);
+               ip = (struct ip *)(ss->scratch + ip_off);
+       } else {
+               ip = (struct ip *)(mtod(m, char *) + ip_off);
+       }
+       if (__predict_false(m->m_len < ip_off + (ip->ip_hl << 2)
+                           + sizeof (*tcp))) {
+               m_copydata(m, 0, ip_off + (ip->ip_hl << 2)
+                          + sizeof (*tcp),  ss->scratch);
+               ip = (struct ip *)(ss->scratch + ip_off);
+       } 
+
+       tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
+       cum_len = -(ip_off + ((ip->ip_hl + tcp->th_off) << 2));
+
+       /* TSO implies checksum offload on this hardware */
+       cksum_offset = ip_off + (ip->ip_hl << 2);
+       flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
+
+       /* for TSO, pseudo_hdr_offset holds mss.
+        * The firmware figures out where to put
+        * the checksum by parsing the header. */
+       pseudo_hdr_offset = htobe16(mss);
+
+       tx = &ss->tx;
+       req = tx->req_list;
+       seg = tx->seg_list;
+       cnt = 0;
+       rdma_count = 0;
+       /* "rdma_count" is the number of RDMAs belonging to the
+        * current packet BEFORE the current send request. For
+        * non-TSO packets, this is equal to "count".
+        * For TSO packets, rdma_count needs to be reset
+        * to 0 after a segment cut.
+        *
+        * The rdma_count field of the send request is
+        * the number of RDMAs of the packet starting at
+        * that request. For TSO send requests with one or more cuts
+        * in the middle, this is the number of RDMAs starting
+        * after the last cut in the request. All previous
+        * segments before the last cut implicitly have 1 RDMA.
+        *
+        * Since the number of RDMAs is not known beforehand,
+        * it must be filled-in retroactively - after each
+        * segmentation cut or at the end of the entire packet.
+        */
+
+       while (busdma_seg_cnt) {
+               /* Break the busdma segment up into pieces */
+               low = MXGE_LOWPART_TO_U32(seg->ds_addr);
+               high_swapped =  htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
+               len = seg->ds_len;
+
+               while (len) {
+                       flags_next = flags & ~MXGEFW_FLAGS_FIRST;
+                       seglen = len;
+                       cum_len_next = cum_len + seglen;
+                       (req-rdma_count)->rdma_count = rdma_count + 1;
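+                       /* chop and next_is_first are 0 or 1 below, so
+                        * the flag updates are done branchlessly with
+                        * multiplies and masks in this inner loop */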
+                       if (__predict_true(cum_len >= 0)) {
+                               /* payload */
+                               chop = (cum_len_next > mss);
+                               cum_len_next = cum_len_next % mss;
+                               next_is_first = (cum_len_next == 0);
+                               flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
+                               flags_next |= next_is_first *
+                                       MXGEFW_FLAGS_FIRST;
+                               rdma_count |= -(chop | next_is_first);
+                               rdma_count += chop & !next_is_first;
+                       } else if (cum_len_next >= 0) {
+                               /* header ends */
+                               rdma_count = -1;
+                               cum_len_next = 0;
+                               seglen = -cum_len;
+                               small = (mss <= MXGEFW_SEND_SMALL_SIZE);
+                               flags_next = MXGEFW_FLAGS_TSO_PLD |
+                                       MXGEFW_FLAGS_FIRST | 
+                                       (small * MXGEFW_FLAGS_SMALL);
+                       }
+
+                       req->addr_high = high_swapped;
+                       req->addr_low = htobe32(low);
+                       req->pseudo_hdr_offset = pseudo_hdr_offset;
+                       req->pad = 0;
+                       req->rdma_count = 1;
+                       req->length = htobe16(seglen);
+                       req->cksum_offset = cksum_offset;
+                       req->flags = flags | ((cum_len & 1) *
+                                             MXGEFW_FLAGS_ALIGN_ODD);
+                       low += seglen;
+                       len -= seglen;
+                       cum_len = cum_len_next;
+                       flags = flags_next;
+                       req++;
+                       cnt++;
+                       rdma_count++;
+                       if (__predict_false(cksum_offset > seglen))
+                               cksum_offset -= seglen;
+                       else
+                               cksum_offset = 0;
+                       if (__predict_false(cnt > tx->max_desc))
+                               goto drop;
+               }
+               busdma_seg_cnt--;
+               seg++;
+       }
+       (req-rdma_count)->rdma_count = rdma_count;
+
+       do {
+               req--;
+               req->flags |= MXGEFW_FLAGS_TSO_LAST;
+       } while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
+
+       tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
+       mxge_submit_req(tx, tx->req_list, cnt);
+#ifdef IFNET_BUF_RING
+       if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
+               /* tell the NIC to start polling this slice */
+               *tx->send_go = 1;
+               tx->queue_active = 1;
+               tx->activate++;
+               wmb();
+       }
+#endif
+       return;
+
+drop:
+       bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
+       m_freem(m);
+       ss->oerrors++;
+       if (!once) {
+               kprintf("tx->max_desc exceeded via TSO!\n");
+               kprintf("mss = %d, %ld, %d!\n", mss,
+                      (long)seg - (long)tx->seg_list, tx->max_desc);
+               once = 1;
+       }
+       return;
+}
+
+#endif /* IFCAP_TSO4 */
+
+#ifdef MXGE_NEW_VLAN_API
+/* 
+ * We reproduce the software vlan tag insertion from
+ * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
+ * vlan tag insertion. We need to advertise this in order to have the
+ * vlan interface respect our csum offload flags.
+ */
+static struct mbuf *
+mxge_vlan_tag_insert(struct mbuf *m)
+{
+       struct ether_vlan_header *evl;
+
+       M_PREPEND(m, EVL_ENCAPLEN, MB_DONTWAIT);
+       if (__predict_false(m == NULL))
+               return NULL;
+       if (m->m_len < sizeof(*evl)) {
+               m = m_pullup(m, sizeof(*evl));
+               if (__predict_false(m == NULL))
+                       return NULL;
+       }
+       /*
+        * Transform the Ethernet header into an Ethernet header
+        * with 802.1Q encapsulation.
+        */
+       evl = mtod(m, struct ether_vlan_header *);
+       bcopy((char *)evl + EVL_ENCAPLEN,
+             (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
+       evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
+       evl->evl_tag = htons(m->m_pkthdr.ether_vlantag);
+       m->m_flags &= ~M_VLANTAG;
+       return m;
+}
+#endif /* MXGE_NEW_VLAN_API */
+
+static void
+mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
+{
+       mxge_softc_t *sc;
+       mcp_kreq_ether_send_t *req;
+       bus_dma_segment_t *seg;
+       struct mbuf *m_tmp;
+       struct ifnet *ifp;
+       mxge_tx_ring_t *tx;
+       struct ip *ip;
+       int cnt, cum_len, err, i, idx, odd_flag, ip_off;
+       uint16_t pseudo_hdr_offset;
+       uint8_t flags, cksum_offset;
+
+       sc = ss->sc;
+       ifp = sc->ifp;
+       tx = &ss->tx;
+
+       ip_off = sizeof (struct ether_header);
+#ifdef MXGE_NEW_VLAN_API
+       if (m->m_flags & M_VLANTAG) {
+               m = mxge_vlan_tag_insert(m);
+               if (__predict_false(m == NULL))
+                       goto drop;
+               ip_off += EVL_ENCAPLEN;
+       }
+#endif
+       /* (try to) map the frame for DMA */
+       idx = tx->req & tx->mask;
+       err = bus_dmamap_load_mbuf_segment(tx->dmat, tx->info[idx].map,
+                                          m, tx->seg_list, 1, &cnt, 
+                                          BUS_DMA_NOWAIT);
+       if (__predict_false(err == EFBIG)) {
+               /* Too many segments in the chain.  Try
+                  to defrag */
+               m_tmp = m_defrag(m, M_NOWAIT);
+               if (m_tmp == NULL) {
+                       goto drop;
+               }
+               ss->tx.defrag++;
+               m = m_tmp;
+               err = bus_dmamap_load_mbuf_segment(tx->dmat, 
+                                             tx->info[idx].map,
+                                             m, tx->seg_list, 1, &cnt, 
+                                             BUS_DMA_NOWAIT);
+       }
+       if (__predict_false(err != 0)) {
+               device_printf(sc->dev, "bus_dmamap_load_mbuf_segment returned %d"
+                             " packet len = %d\n", err, m->m_pkthdr.len);
+               goto drop;
+       }
+       bus_dmamap_sync(tx->dmat, tx->info[idx].map,
+                       BUS_DMASYNC_PREWRITE);
+       tx->info[idx].m = m;
+
+#if IFCAP_TSO4
+       /* TSO is different enough, we handle it in another routine */
+       if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
+               mxge_encap_tso(ss, m, cnt, ip_off);
+               return;
+       }
+#endif
+
+       req = tx->req_list;
+       cksum_offset = 0;
+       pseudo_hdr_offset = 0;
+       flags = MXGEFW_FLAGS_NO_TSO;
+
+       /* checksum offloading? */
+       if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
+               /* ensure ip header is in first mbuf, copy
+                  it to a scratch buffer if not */
+               if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
+                       m_copydata(m, 0, ip_off + sizeof (*ip),
+                                  ss->scratch);
+                       ip = (struct ip *)(ss->scratch + ip_off);
+               } else {
+                       ip = (struct ip *)(mtod(m, char *) + ip_off);
+               }
+               cksum_offset = ip_off + (ip->ip_hl << 2);
+               pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
+               pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
+               req->cksum_offset = cksum_offset;
+               flags |= MXGEFW_FLAGS_CKSUM;
+               odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
+       } else {
+               odd_flag = 0;
+       }
+       if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
+               flags |= MXGEFW_FLAGS_SMALL;
+
+       /* convert segments into a request list */
+       cum_len = 0;
+       seg = tx->seg_list;
+       req->flags = MXGEFW_FLAGS_FIRST;
+       for (i = 0; i < cnt; i++) {
+               req->addr_low = 
+                       htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
+               req->addr_high = 
+                       htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
+               req->length = htobe16(seg->ds_len);
+               req->cksum_offset = cksum_offset;
+               if (cksum_offset > seg->ds_len)
+                       cksum_offset -= seg->ds_len;
+               else
+                       cksum_offset = 0;
+               req->pseudo_hdr_offset = pseudo_hdr_offset;
+               req->pad = 0; /* complete solid 16-byte block */
+               req->rdma_count = 1;
+               req->flags |= flags | ((cum_len & 1) * odd_flag);
+               cum_len += seg->ds_len;
+               seg++;
+               req++;
+               req->flags = 0;
+       }
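+       /* the loop above zeroed the flags of one request past the last
+        * segment; back up so req points at the final segment */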
+       req--;
+       /* pad runts to 60 bytes */
+       if (cum_len < 60) {
+               req++;
+               req->addr_low = 
+                       htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
+               req->addr_high = 
+                       htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
+               req->length = htobe16(60 - cum_len);
+               req->cksum_offset = 0;
+               req->pseudo_hdr_offset = pseudo_hdr_offset;
+               req->pad = 0; /* complete solid 16-byte block */
+               req->rdma_count = 1;
+               req->flags |= flags | ((cum_len & 1) * odd_flag);
+               cnt++;
+       }
+
+       tx->req_list[0].rdma_count = cnt;
+#if 0
+       /* print what the firmware will see */
+       for (i = 0; i < cnt; i++) {
+               kprintf("%d: addr: 0x%x 0x%x len:%d pso%d,"
+                   "cso:%d, flags:0x%x, rdma:%d\n",
+                   i, (int)ntohl(tx->req_list[i].addr_high),
+                   (int)ntohl(tx->req_list[i].addr_low),
+                   (int)ntohs(tx->req_list[i].length),
+                   (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
+                   tx->req_list[i].cksum_offset, tx->req_list[i].flags,
+                   tx->req_list[i].rdma_count);
+       }
+       kprintf("--------------\n");
+#endif
+       tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
+       mxge_submit_req(tx, tx->req_list, cnt);
+#ifdef IFNET_BUF_RING
+       if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
+               /* tell the NIC to start polling this slice */
+               *tx->send_go = 1;
+               tx->queue_active = 1;
+               tx->activate++;
+               wmb();
+       }
+#endif
+       return;
+
+drop:
+       m_freem(m);
+       ss->oerrors++;
+       return;
+}
+
+#ifdef IFNET_BUF_RING
+static void
+mxge_qflush(struct ifnet *ifp)
+{
+       mxge_softc_t *sc = ifp->if_softc;
+       mxge_tx_ring_t *tx;
+       struct mbuf *m;
+       int slice;
+
+       for (slice = 0; slice < sc->num_slices; slice++) {
+               tx = &sc->ss[slice].tx;
+               lwkt_serialize_enter(sc->ifp->if_serializer);
+               while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
+                       m_freem(m);
+               lwkt_serialize_exit(sc->ifp->if_serializer);
+       }
+       if_qflush(ifp);
+}
+
+static inline void
+mxge_start_locked(struct mxge_slice_state *ss)
+{
+       mxge_softc_t *sc;
+       struct mbuf *m;
+       struct ifnet *ifp;
+       mxge_tx_ring_t *tx;
+
+       sc = ss->sc;
+       ifp = sc->ifp;
+       tx = &ss->tx;
+
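+       /* keep feeding the NIC while the ring has room for another
+        * worst-case (max_desc) request chain */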
+       while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
+               m = drbr_dequeue(ifp, tx->br);
+               if (m == NULL) {
+                       return;
+               }
+               /* let BPF see it */
+               BPF_MTAP(ifp, m);
+
+               /* give it to the nic */
+               mxge_encap(ss, m);
+       }
+       /* ran out of transmit slots */
+       if (((ss->if_flags & IFF_OACTIVE) == 0)
+           && (!drbr_empty(ifp, tx->br))) {
+               ss->if_flags |= IFF_OACTIVE;
+               tx->stall++;
+       }
+}
+
+static int
+mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
+{
+       mxge_softc_t *sc;
+       struct ifnet *ifp;
+       mxge_tx_ring_t *tx;
+       int err;
+
+       sc = ss->sc;
+       ifp = sc->ifp;
+       tx = &ss->tx;
+
+       if ((ss->if_flags & (IFF_RUNNING|IFF_OACTIVE)) !=
+           IFF_RUNNING) {
+               err = drbr_enqueue(ifp, tx->br, m);
+               return (err);
+       }
+
+       if (drbr_empty(ifp, tx->br) &&
+           ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
+               /* let BPF see it */
+               BPF_MTAP(ifp, m);
+               /* give it to the nic */
+               mxge_encap(ss, m);
+       } else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
+               return (err);
+       }
+       if (!drbr_empty(ifp, tx->br))
+               mxge_start_locked(ss);
+       return (0);
+}
+
+static int
+mxge_transmit(struct ifnet *ifp, struct mbuf *m)
+{
+       mxge_softc_t *sc = ifp->if_softc;
+       struct mxge_slice_state *ss;
+       mxge_tx_ring_t *tx;
+       int err = 0;
+       int slice;
+
+#if 0
+       slice = m->m_pkthdr.flowid;
+#else
+       slice = 0;      /* no flowid support yet; use the first slice */
+#endif
+       slice &= (sc->num_slices - 1);  /* num_slices always power of 2 */
+
+       ss = &sc->ss[slice];
+       tx = &ss->tx;
+
+       if (lwkt_serialize_try(ifp->if_serializer)) {
+               err = mxge_transmit_locked(ss, m);
+               lwkt_serialize_exit(ifp->if_serializer);
+       } else {
+               err = drbr_enqueue(ifp, tx->br, m);
+       }
+
+       return (err);
+}
+
+#else
+
+static inline void
+mxge_start_locked(struct mxge_slice_state *ss)
+{
+       mxge_softc_t *sc;
+       struct mbuf *m;
+       struct ifnet *ifp;
+       mxge_tx_ring_t *tx;
+
+       sc = ss->sc;
+       ifp = sc->ifp;
+       tx = &ss->tx;
+       while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
+               m = ifq_dequeue(&ifp->if_snd, NULL);
+               if (m == NULL) {
+                       return;
+               }
+               /* let BPF see it */
+               BPF_MTAP(ifp, m);
+
+               /* give it to the nic */
+               mxge_encap(ss, m);
+       }
+       /* ran out of transmit slots */
+       if ((sc->ifp->if_flags & IFF_OACTIVE) == 0) {
+               sc->ifp->if_flags |= IFF_OACTIVE;
+               tx->stall++;
+       }
+}
+#endif
+static void
+mxge_start(struct ifnet *ifp)
+{
+       mxge_softc_t *sc = ifp->if_softc;
+       struct mxge_slice_state *ss;
+
+       ASSERT_SERIALIZED(sc->ifp->if_serializer);
+       /* only use the first slice for now */
+       ss = &sc->ss[0];
+       mxge_start_locked(ss);
+}
+
+/*
+ * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
+ * at most 32 bytes at a time, so as to avoid involving the software
+ * pio handler in the nic.   We re-write the first segment's low
+ * DMA address to mark it valid only after we write the entire chunk
+ * in a burst
+ */
+static inline void
+mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
+               mcp_kreq_ether_recv_t *src)
+{
+       uint32_t low;
+
+       low = src->addr_low;
+       src->addr_low = 0xffffffff;
+       mxge_pio_copy(dst, src, 4 * sizeof (*src));
+       wmb();
+       mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
+       wmb();
+       src->addr_low = low;
+       dst->addr_low = low;
+       wmb();
+}
+
+static int
+mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
+{
+       bus_dma_segment_t seg;
+       struct mbuf *m;
+       mxge_rx_ring_t *rx = &ss->rx_small;
+       int cnt, err;
+
+       m = m_gethdr(MB_DONTWAIT, MT_DATA);
+       if (m == NULL) {
+               rx->alloc_fail++;
+               err = ENOBUFS;
+               goto done;
+       }
+       m->m_len = m->m_pkthdr.len = MHLEN;
+       err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m, 
+                                     &seg, 1, &cnt, BUS_DMA_NOWAIT);
+       if (err != 0) {
+               kprintf("can't dmamap small (%d)\n", err);
+               m_free(m);
+               goto done;
+       }
+       rx->info[idx].m = m;
+       rx->shadow[idx].addr_low = 
+               htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
+       rx->shadow[idx].addr_high = 
+               htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
+
+done:
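+       /* receive buffers are handed to the NIC in bursts of 8; submit
+        * the group once its last slot has been refilled */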
+       if ((idx & 7) == 7)
+               mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
+       return err;
+}
+
+
+static int
+mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
+{
+       bus_dma_segment_t seg[3];
+       struct mbuf *m;
+       mxge_rx_ring_t *rx = &ss->rx_big;
+       int cnt, err, i;
+
+       if (rx->cl_size == MCLBYTES)
+               m = m_getcl(MB_DONTWAIT, MT_DATA, M_PKTHDR);
+       else {
+#if 0
+               m = m_getjcl(MB_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
+#else
+               /*
+                * XXX: allocate normal sized buffers for big buffers.
+                * We should be fine as long as we don't get any jumbo frames
+                */
+               m = m_getcl(MB_DONTWAIT, MT_DATA, M_PKTHDR);
+#endif
+       }
+       if (m == NULL) {
+               rx->alloc_fail++;
+               err = ENOBUFS;
+               goto done;
+       }
+       m->m_len = m->m_pkthdr.len = rx->mlen;
+       err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m, 
+                                     seg, 1, &cnt, BUS_DMA_NOWAIT);
+       if (err != 0) {
+               kprintf("can't dmamap big (%d)\n", err);
+               m_free(m);
+               goto done;
+       }
+       rx->info[idx].m = m;
+       rx->shadow[idx].addr_low = 
+               htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
+       rx->shadow[idx].addr_high = 
+               htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
+
+#if MXGE_VIRT_JUMBOS
+       for (i = 1; i < cnt; i++) {
+               rx->shadow[idx + i].addr_low = 
+                       htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
+               rx->shadow[idx + i].addr_high = 
+                       htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
+       }
+#endif
+
+done:
+       for (i = 0; i < rx->nbufs; i++) {
+               if ((idx & 7) == 7) {
+                       mxge_submit_8rx(&rx->lanai[idx - 7],
+                                       &rx->shadow[idx - 7]);
+               }
+               idx++;
+       }
+       return err;
+}
+
+/* 
+ *  Myri10GE hardware checksums are not valid if the sender
+ *  padded the frame with non-zero padding.  This is because
+ *  the firmware just does a simple 16-bit 1s complement
+ *  checksum across the entire frame, excluding the first 14
+ *  bytes.  It is best to simply check the checksum and
+ *  tell the stack about it only if the checksum is good
+ */
+
+static inline uint16_t
+mxge_rx_csum(struct mbuf *m, int csum)
+{
+       struct ether_header *eh;
+       struct ip *ip;
+       uint16_t c;
+
+       eh = mtod(m, struct ether_header *);
+
+       /* only deal with IPv4 TCP & UDP for now */
+       if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
+               return 1;
+       ip = (struct ip *)(eh + 1);
+       if (__predict_false(ip->ip_p != IPPROTO_TCP &&
+                           ip->ip_p != IPPROTO_UDP))
+               return 1;
+#ifdef INET
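+       /* fold the IPv4 pseudo-header into the firmware's raw frame
+        * checksum; after the final xor, a zero result means the
+        * TCP/UDP checksum was good */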
+       c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
+                     htonl(ntohs(csum) + ntohs(ip->ip_len) -
+                           (ip->ip_hl << 2) + ip->ip_p));
+#else
+       c = 1;
+#endif
+       c ^= 0xffff;
+       return (c);
+}
+
+static void
+mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
+{
+       struct ether_vlan_header *evl;
+       struct ether_header *eh;
+       uint32_t partial;
+
+       evl = mtod(m, struct ether_vlan_header *);
+       eh = mtod(m, struct ether_header *);
+
+       /*
+        * fix checksum by subtracting EVL_ENCAPLEN bytes
+        * after what the firmware thought was the end of the ethernet
+        * header.
+        */
+
+       /* put checksum into host byte order */
+       *csum = ntohs(*csum); 
+       partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
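+       /* ones-complement subtract the 4 bytes of 802.1q tag that the
+        * firmware summed, folding the carries back into 16 bits */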
+       (*csum) += ~partial;
+       (*csum) +=  ((*csum) < ~partial);
+       (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
+       (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
+
+       /* restore checksum to network byte order; 
+          later consumers expect this */
+       *csum = htons(*csum);
+
+       /* save the tag */
+#ifdef MXGE_NEW_VLAN_API       
+       m->m_pkthdr.ether_vlantag = ntohs(evl->evl_tag);
+#else
+       {
+               struct m_tag *mtag;
+               mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
+                                  MB_DONTWAIT);
+               if (mtag == NULL)
+                       return;
+               VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
+               m_tag_prepend(m, mtag);
+       }
+
+#endif
+       m->m_flags |= M_VLANTAG;
+
+       /*
+        * Remove the 802.1q header by copying the Ethernet
+        * addresses over it and adjusting the beginning of
+        * the data in the mbuf.  The encapsulated Ethernet
+        * type field is already in place.
+        */
+       bcopy((char *)evl, (char *)evl + EVL_ENCAPLEN,
+             ETHER_HDR_LEN - ETHER_TYPE_LEN);
+       m_adj(m, EVL_ENCAPLEN);
+}
+
+static inline void
+mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum,
+                  struct mbuf_chain *chain)
+{
+       mxge_softc_t *sc;
+       struct ifnet *ifp;
+       struct mbuf *m;
+       struct ether_header *eh;
+       mxge_rx_ring_t *rx;
+       bus_dmamap_t old_map;
+       int idx;
+       uint16_t tcpudp_csum;
+
+       sc = ss->sc;
+       ifp = sc->ifp;
+       rx = &ss->rx_big;
+       idx = rx->cnt & rx->mask;
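+       /* a big receive may span several buffers (rx->nbufs), so the
+        * ring counter advances by the whole group */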
+       rx->cnt += rx->nbufs;
+       /* save a pointer to the received mbuf */
+       m = rx->info[idx].m;
+       /* try to replace the received mbuf */
+       if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
+               /* drop the frame -- the old mbuf is re-cycled */
+               ifp->if_ierrors++;
+               return;
+       }
+
+       /* unmap the received buffer */
+       old_map = rx->info[idx].map;
+       bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
+       bus_dmamap_unload(rx->dmat, old_map);
+
+       /* swap the bus_dmamap_t's */
+       rx->info[idx].map = rx->extra_map;
+       rx->extra_map = old_map;
+
+       /* mcp implicitly skips 1st 2 bytes so that packet is properly
+        * aligned */
+       m->m_data += MXGEFW_PAD;
+
+       m->m_pkthdr.rcvif = ifp;
+       m->m_len = m->m_pkthdr.len = len;
+       ss->ipackets++;
+       eh = mtod(m, struct ether_header *);
+       if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
+               mxge_vlan_tag_remove(m, &csum);
+       }
+       /* if the checksum is valid, mark it in the mbuf header */
+       if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
+               if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
+                       return;
+               /* otherwise, it was a UDP frame, or a TCP frame which
+                  we could not do LRO on.  Tell the stack that the
+                  checksum is good */
+               m->m_pkthdr.csum_data = 0xffff;
+               m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
+       }
+#if 0
+       /* flowid only valid if RSS hashing is enabled */
+       if (sc->num_slices > 1) {
+               m->m_pkthdr.flowid = (ss - sc->ss);
+               m->m_flags |= M_FLOWID;
+       }
+#endif
+       ether_input_chain(ifp, m, NULL, chain);
+}
+
+static inline void
+mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum,
+                  struct mbuf_chain *chain)
+{
+       mxge_softc_t *sc;
+       struct ifnet *ifp;
+       struct ether_header *eh;
+       struct mbuf *m;
+       mxge_rx_ring_t *rx;
+       bus_dmamap_t old_map;
+       int idx;
+       uint16_t tcpudp_csum;
+
+       sc = ss->sc;
+       ifp = sc->ifp;
+       rx = &ss->rx_small;
+       idx = rx->cnt & rx->mask;
+       rx->cnt++;
+       /* save a pointer to the received mbuf */
+       m = rx->info[idx].m;
+       /* try to replace the received mbuf */
+       if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
+               /* drop the frame -- the old mbuf is re-cycled */
+               ifp->if_ierrors++;
+               return;
+       }
+
+       /* unmap the received buffer */
+       old_map = rx->info[idx].map;
+       bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
+       bus_dmamap_unload(rx->dmat, old_map);
+
+       /* swap the bus_dmamap_t's */
+       rx->info[idx].map = rx->extra_map;
+       rx->extra_map = old_map;
+
+       /* mcp implicitly skips 1st 2 bytes so that packet is properly
+        * aligned */
+       m->m_data += MXGEFW_PAD;
+
+       m->m_pkthdr.rcvif = ifp;
+       m->m_len = m->m_pkthdr.len = len;
+       ss->ipackets++;
+       eh = mtod(m, struct ether_header *);
+       if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
+               mxge_vlan_tag_remove(m, &csum);
+       }
+       /* if the checksum is valid, mark it in the mbuf header */
+       if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
+               if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
+                       return;
+               /* otherwise, it was a UDP frame, or a TCP frame which
+                  we could not do LRO on.  Tell the stack that the
+                  checksum is good */
+               m->m_pkthdr.csum_data = 0xffff;
+               m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
+       }
+#if 0
+       /* flowid only valid if RSS hashing is enabled */
+       if (sc->num_slices > 1) {
+               m->m_pkthdr.flowid = (ss - sc->ss);
+               m->m_flags |= M_FLOWID;
+       }
+#endif
+       ether_input_chain(ifp, m, NULL, chain);
+}
+
+static inline void
+mxge_clean_rx_done(struct mxge_slice_state *ss)
+{
+       mxge_rx_done_t *rx_done = &ss->rx_done;
+       int limit = 0;
+       uint16_t length;
+       uint16_t checksum;
+       struct mbuf_chain chain[MAXCPU];
+
+       ether_input_chain_init(chain);
+       while (rx_done->entry[rx_done->idx].length != 0) {
+               length = ntohs(rx_done->entry[rx_done->idx].length);
+               rx_done->entry[rx_done->idx].length = 0;
+               checksum = rx_done->entry[rx_done->idx].checksum;
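+               /* frames that fit in a standard mbuf (less the 2-byte
+                * alignment pad) arrived on the small ring */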
+               if (length <= (MHLEN - MXGEFW_PAD))
+                       mxge_rx_done_small(ss, length, checksum, chain);
+               else
+                       mxge_rx_done_big(ss, length, checksum, chain);
+               rx_done->cnt++;
+               rx_done->idx = rx_done->cnt & rx_done->mask;
+
+               /* limit potential for livelock */
+               if (__predict_false(++limit > rx_done->mask / 2))
+                       break;
+       }
+       ether_input_dispatch(chain);
+#ifdef INET
+       while (!SLIST_EMPTY(&ss->lro_active)) {
+               struct lro_entry *lro = SLIST_FIRST(&ss->lro_active);
+               SLIST_REMOVE_HEAD(&ss->lro_active, next);
+               mxge_lro_flush(ss, lro);
+       }
+#endif
+}
+
+static inline void
+mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
+{
+       struct ifnet *ifp;
+       mxge_tx_ring_t *tx;
+       struct mbuf *m;
+       bus_dmamap_t map;
+       int idx;
+       int *flags;
+
+       tx = &ss->tx;
+       ifp = ss->sc->ifp;
+       ASSERT_SERIALIZED(ifp->if_serializer);
+       while (tx->pkt_done != mcp_idx) {
+               idx = tx->done & tx->mask;
+               tx->done++;
+               m = tx->info[idx].m;
+               /* mbuf and DMA map only attached to the first
+                  segment per-mbuf */
+               if (m != NULL) {
+                       ss->obytes += m->m_pkthdr.len;
+                       if (m->m_flags & M_MCAST)
+                               ss->omcasts++;
+                       ss->opackets++;
+                       tx->info[idx].m = NULL;
+                       map = tx->info[idx].map;
+                       bus_dmamap_unload(tx->dmat, map);
+                       m_freem(m);
+               }
+               if (tx->info[idx].flag) {
+                       tx->info[idx].flag = 0;
+                       tx->pkt_done++;
+               }
+       }
+       
+       /* If we have space, clear IFF_OACTIVE to tell the stack that
+          it's OK to send packets */
+#ifdef IFNET_BUF_RING
+       flags = &ss->if_flags;
+#else
+       flags = &ifp->if_flags;
+#endif
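+       /* only clear OACTIVE once the ring is no more than 1/4 full,
+        * to avoid thrashing the flag on every completion */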
+       if ((*flags) & IFF_OACTIVE &&
+           tx->req - tx->done < (tx->mask + 1)/4) {
+               *(flags) &= ~IFF_OACTIVE;
+               ss->tx.wake++;
+               mxge_start_locked(ss);
+       }
+#ifdef IFNET_BUF_RING
+       if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
+               /* let the NIC stop polling this queue, since there
+                * are no more transmits pending */
+               *tx->send_stop = 1;
+               tx->queue_active = 0;
+               tx->deactivate++;
+               wmb();
+       }
+#endif
+}
+
+static struct mxge_media_type mxge_xfp_media_types[] =
+{
+       {IFM_10G_CX4,   0x7f,           "10GBASE-CX4 (module)"},
+       {IFM_10G_SR,    (1 << 7),       "10GBASE-SR"},
+       {IFM_10G_LR,    (1 << 6),       "10GBASE-LR"},
+       {0,             (1 << 5),       "10GBASE-ER"},
+       {IFM_10G_LRM,   (1 << 4),       "10GBASE-LRM"},
+       {0,             (1 << 3),       "10GBASE-SW"},
+       {0,             (1 << 2),       "10GBASE-LW"},
+       {0,             (1 << 1),       "10GBASE-EW"},
+       {0,             (1 << 0),       "Reserved"}
+};
+static struct mxge_media_type mxge_sfp_media_types[] =
+{
+       {0,             (1 << 7),       "Reserved"},
+       {IFM_10G_LRM,   (1 << 6),       "10GBASE-LRM"},
+       {IFM_10G_LR,    (1 << 5),       "10GBASE-LR"},
+       {IFM_10G_SR,    (1 << 4),       "10GBASE-SR"}
+};
+
+static void
+mxge_set_media(mxge_softc_t *sc, int type)
+{
+       sc->media_flags |= type;
+       ifmedia_add(&sc->media, sc->media_flags, 0, NULL);
+       ifmedia_set(&sc->media, sc->media_flags);
+}
+
+/*
+ * Determine the media type for a NIC.  Some XFPs will identify
+ * themselves only when their link is up, so this is initiated via a
+ * link up interrupt.  However, this can potentially take up to
+ * several milliseconds, so it is run via the watchdog routine, rather
+ * than in the interrupt handler itself.   This need only be done
+ * once, not each time the link is up.
+ */
+static void
+mxge_media_probe(mxge_softc_t *sc)
+{
+       mxge_cmd_t cmd;
+       char *cage_type;
+       char *ptr;
+       struct mxge_media_type *mxge_media_types = NULL;
+       int i, err, ms, mxge_media_type_entries;
+       uint32_t byte;
+
+       sc->need_media_probe = 0;
+
+       /* if we've already set a media type, we're done */
+       if (sc->media_flags  != (IFM_ETHER | IFM_AUTO))
+               return;
+
+       /* 
+        * parse the product code to determine the interface type
+        * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
+        * after the 3rd dash in the driver's cached copy of the
+        * EEPROM's product code string.
+        */
+       ptr = sc->product_code_string;
+       if (ptr == NULL) {
+               device_printf(sc->dev, "Missing product code\n");
+       }
+
+       for (i = 0; i < 3; i++, ptr++) {
+               ptr = index(ptr, '-');
+               if (ptr == NULL) {
+                       device_printf(sc->dev,
+                                     "only %d dashes in PC?!?\n", i);
+                       return;
+               }
+       }
+       if (*ptr == 'C') {
+               /* -C is CX4 */
+               mxge_set_media(sc, IFM_10G_CX4);
+               return;
+       }
+       else if (*ptr == 'Q') {
+               /* -Q is Quad Ribbon Fiber */
+               device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
+               /* FreeBSD has no media type for Quad ribbon fiber */
+               return;
+       }
+
+       if (*ptr == 'R') {
+               /* -R is XFP */
+               mxge_media_types = mxge_xfp_media_types;
+               mxge_media_type_entries = 
+                       sizeof (mxge_xfp_media_types) /
+                       sizeof (mxge_xfp_media_types[0]);
+               byte = MXGE_XFP_COMPLIANCE_BYTE;
+               cage_type = "XFP";
+       }
+
+       if (*ptr == 'S' || *(ptr +1) == 'S') {
+               /* -S or -2S is SFP+ */
+               mxge_media_types = mxge_sfp_media_types;
+               mxge_media_type_entries = 
+                       sizeof (mxge_sfp_media_types) /
+                       sizeof (mxge_sfp_media_types[0]);
+               cage_type = "SFP+";
+               byte = 3;
+       }
+
+       if (mxge_media_types == NULL) {
+               device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
+               return;
+       }
+
+       /*
+        * At this point we know the NIC has an XFP cage, so now we
+        * try to determine what is in the cage by using the
+        * firmware's XFP I2C commands to read the XFP 10GbE compliance
+        * register.  We read just one byte, which may take over
+        * a millisecond
+        */
+
+       cmd.data0 = 0;   /* just fetch 1 byte, not all 256 */
+       cmd.data1 = byte;
+       err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
+       if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
+               device_printf(sc->dev, "failed to read XFP\n");
+       }
+       if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
+               device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
+       }
+       if (err != MXGEFW_CMD_OK) {
+               return;
+       }
+
+       /* now we wait for the data to be cached */
+       cmd.data0 = byte;
+       err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
+       for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
+               DELAY(1000);
+               cmd.data0 = byte;
+               err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
+       }
+       if (err != MXGEFW_CMD_OK) {
+               device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
+                             cage_type, err, ms);
+               return;
+       }
+               
+       if (cmd.data0 == mxge_media_types[0].bitmask) {
+               if (mxge_verbose)
+                       device_printf(sc->dev, "%s:%s\n", cage_type,
+                                     mxge_media_types[0].name);
+               mxge_set_media(sc, IFM_10G_CX4);
+               return;
+       }
+       for (i = 1; i < mxge_media_type_entries; i++) {
+               if (cmd.data0 & mxge_media_types[i].bitmask) {
+                       if (mxge_verbose)
+                               device_printf(sc->dev, "%s:%s\n",
+                                             cage_type,
+                                             mxge_media_types[i].name);
+
+                       mxge_set_media(sc, mxge_media_types[i].flag);
+                       return;
+               }
+       }
+       device_printf(sc->dev, "%s media 0x%x unknown\n", cage_type,
+                     cmd.data0);
+
+       return;
+}
+
+static void
+mxge_intr(void *arg)
+{
+       struct mxge_slice_state *ss = arg;
+       mxge_softc_t *sc = ss->sc;
+       mcp_irq_data_t *stats = ss->fw_stats;
+       mxge_tx_ring_t *tx = &ss->tx;
+       mxge_rx_done_t *rx_done = &ss->rx_done;
+       uint32_t send_done_count;
+       uint8_t valid;
+
+#ifndef IFNET_BUF_RING
+       /* an interrupt on a non-zero slice is implicitly valid
+          since MSI-X irqs are not shared */
+       if (ss != sc->ss) {
+               mxge_clean_rx_done(ss);
+               *ss->irq_claim = be32toh(3);
+               return;
+       }
+#endif
+
+       /* make sure the DMA has finished */
+       if (!stats->valid) {
+               return;
+       }
+       valid = stats->valid;
+
+       if (sc->legacy_irq) {
+               /* lower legacy IRQ  */
+               *sc->irq_deassert = 0;
+               if (!mxge_deassert_wait)
+                       /* don't wait for conf. that irq is low */
+                       stats->valid = 0;
+       } else {
+               stats->valid = 0;
+       }
+
+       /* loop while waiting for legacy irq deassertion */
+       do {
+               /* check for transmit completes and receives */
+               send_done_count = be32toh(stats->send_done_count);
+               while ((send_done_count != tx->pkt_done) ||
+                      (rx_done->entry[rx_done->idx].length != 0)) {
+                       if (send_done_count != tx->pkt_done)
+                               mxge_tx_done(ss, (int)send_done_count);
+                       mxge_clean_rx_done(ss);
+                       send_done_count = be32toh(stats->send_done_count);
+               }
+               if (sc->legacy_irq && mxge_deassert_wait)
+                       wmb();
+       } while (*((volatile uint8_t *) &stats->valid));
+
+       /* fw link & error stats meaningful only on the first slice */
+       if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
+               if (sc->link_state != stats->link_up) {
+                       sc->link_state = stats->link_up;
+                       if (sc->link_state) {
+                               sc->ifp->if_link_state = LINK_STATE_UP;
+                               if_link_state_change(sc->ifp);
+                               if (mxge_verbose)
+                                       device_printf(sc->dev, "link up\n");
+                       } else {
+                               sc->ifp->if_link_state = LINK_STATE_DOWN;
+                               if_link_state_change(sc->ifp);
+                               if (mxge_verbose)
+                                       device_printf(sc->dev, "link down\n");
+                       }
+                       sc->need_media_probe = 1;
+               }
+               if (sc->rdma_tags_available !=
+                   be32toh(stats->rdma_tags_available)) {
+                       sc->rdma_tags_available = 
+                               be32toh(stats->rdma_tags_available);
+                       device_printf(sc->dev, "RDMA timed out! %d tags "
+                                     "left\n", sc->rdma_tags_available);
+               }
+
+               if (stats->link_down) {
+                       sc->down_cnt += stats->link_down;
+                       sc->link_state = 0;
+                       sc->ifp->if_link_state = LINK_STATE_DOWN;
+                       if_link_state_change(sc->ifp);
+               }
+       }
+
+       /* check to see if we have rx token to pass back */
+       if (valid & 0x1)
+           *ss->irq_claim = be32toh(3);
+       *(ss->irq_claim + 1) = be32toh(3);
+}
+
+static void
+mxge_init(void *arg)
+{
+}
+
+static void
+mxge_free_slice_mbufs(struct mxge_slice_state *ss)
+{
+       struct lro_entry *lro_entry;
+       int i;
+
+       while (!SLIST_EMPTY(&ss->lro_free)) {
+               lro_entry = SLIST_FIRST(&ss->lro_free);
+               SLIST_REMOVE_HEAD(&ss->lro_free, next);
+               kfree(lro_entry, M_DEVBUF);
+       }
+
+       for (i = 0; i <= ss->rx_big.mask; i++) {
+               if (ss->rx_big.info[i].m == NULL)
+                       continue;
+               bus_dmamap_unload(ss->rx_big.dmat,
+                                 ss->rx_big.info[i].map);
+               m_freem(ss->rx_big.info[i].m);
+               ss->rx_big.info[i].m = NULL;
+       }
+
+       for (i = 0; i <= ss->rx_small.mask; i++) {
+               if (ss->rx_small.info[i].m == NULL)
+                       continue;
+               bus_dmamap_unload(ss->rx_small.dmat,
+                                 ss->rx_small.info[i].map);
+               m_freem(ss->rx_small.info[i].m);
+               ss->rx_small.info[i].m = NULL;
+       }
+
+       /* transmit ring used only on the first slice */
+       if (ss->tx.info == NULL)
+               return;
+
+       for (i = 0; i <= ss->tx.mask; i++) {
+               ss->tx.info[i].flag = 0;
+               if (ss->tx.info[i].m == NULL)
+                       continue;
+               bus_dmamap_unload(ss->tx.dmat,
+                                 ss->tx.info[i].map);
+               m_freem(ss->tx.info[i].m);
+               ss->tx.info[i].m = NULL;
+       }
+}
+
+static void
+mxge_free_mbufs(mxge_softc_t *sc)
+{
+       int slice;
+
+       for (slice = 0; slice < sc->num_slices; slice++)
+               mxge_free_slice_mbufs(&sc->ss[slice]);
+}
+
+static void
+mxge_free_slice_rings(struct mxge_slice_state *ss)
+{
+       int i;
+
+       if (ss->rx_done.entry != NULL)
+               mxge_dma_free(&ss->rx_done.dma);
+       ss->rx_done.entry = NULL;
+
+       if (ss->tx.req_bytes != NULL)
+               kfree(ss->tx.req_bytes, M_DEVBUF);
+       ss->tx.req_bytes = NULL;
+
+       if (ss->tx.seg_list != NULL)
+               kfree(ss->tx.seg_list, M_DEVBUF);
+       ss->tx.seg_list = NULL;
+
+       if (ss->rx_small.shadow != NULL)
+               kfree(ss->rx_small.shadow, M_DEVBUF);
+       ss->rx_small.shadow = NULL;
+
+       if (ss->rx_big.shadow != NULL)
+               kfree(ss->rx_big.shadow, M_DEVBUF);
+       ss->rx_big.shadow = NULL;
+
+       if (ss->tx.info != NULL) {
+               if (ss->tx.dmat != NULL) {
+                       for (i = 0; i <= ss->tx.mask; i++) {
+                               bus_dmamap_destroy(ss->tx.dmat,
+                                                  ss->tx.info[i].map);
+                       }
+                       bus_dma_tag_destroy(ss->tx.dmat);
+               }
+               kfree(ss->tx.info, M_DEVBUF);
+       }
+       ss->tx.info = NULL;
+
+       if (ss->rx_small.info != NULL) {
+               if (ss->rx_small.dmat != NULL) {
+                       for (i = 0; i <= ss->rx_small.mask; i++) {
+                               bus_dmamap_destroy(ss->rx_small.dmat,
+                                                  ss->rx_small.info[i].map);
+                       }
+                       bus_dmamap_destroy(ss->rx_small.dmat,
+                                          ss->rx_small.extra_map);
+                       bus_dma_tag_destroy(ss->rx_small.dmat);
+               }
+               kfree(ss->rx_small.info, M_DEVBUF);
+       }
+       ss->rx_small.info = NULL;
+
+       if (ss->rx_big.info != NULL) {
+               if (ss->rx_big.dmat != NULL) {
+                       for (i = 0; i <= ss->rx_big.mask; i++) {
+                               bus_dmamap_destroy(ss->rx_big.dmat,
+                                                  ss->rx_big.info[i].map);
+                       }
+                       bus_dmamap_destroy(ss->rx_big.dmat,
+                                          ss->rx_big.extra_map);
+                       bus_dma_tag_destroy(ss->rx_big.dmat);
+               }
+               kfree(ss->rx_big.info, M_DEVBUF);
+       }
+       ss->rx_big.info = NULL;
+}
+
+static void
+mxge_free_rings(mxge_softc_t *sc)
+{
+       int slice;
+
+       for (slice = 0; slice < sc->num_slices; slice++)
+               mxge_free_slice_rings(&sc->ss[slice]);
+}
+
+static int
+mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
+                      int tx_ring_entries)
+{
+       mxge_softc_t *sc = ss->sc;
+       size_t bytes;
+       int err, i;
+
+       err = ENOMEM;
+
+       /* allocate per-slice receive resources */
+
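+       /*
+        * ring entry counts are powers of two, so the size-1 masks
+        * below double as index wrap masks; the rx_done (completion)
+        * ring must hold an entry for every buffer in both the small
+        * and big rings, hence 2 * rx_ring_entries
+        */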
+       ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
+       ss->rx_done.mask = (2 * rx_ring_entries) - 1;
+
+       /* allocate the rx shadow rings */
+       bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
+       ss->rx_small.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
+       if (ss->rx_small.shadow == NULL)
+               return err;
+
+       bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
+       ss->rx_big.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
+       if (ss->rx_big.shadow == NULL)
+               return err;
+
+       /* allocate the rx host info rings */
+       bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
+       ss->rx_small.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
+       if (ss->rx_small.info == NULL)
+               return err;
+
+       bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
+       ss->rx_big.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
+       if (ss->rx_big.info == NULL)
+               return err;
+
+       /* allocate the rx busdma resources */
+       err = bus_dma_tag_create(sc->parent_dmat,       /* parent */
+                                1,                     /* alignment */
+                                4096,                  /* boundary */
+                                BUS_SPACE_MAXADDR,     /* low */
+                                BUS_SPACE_MAXADDR,     /* high */
+                                NULL, NULL,            /* filter */
+                                MHLEN,                 /* maxsize */
+                                1,                     /* num segs */
+                                MHLEN,                 /* maxsegsize */
+                                BUS_DMA_ALLOCNOW,      /* flags */
+                                &ss->rx_small.dmat);   /* tag */
+       if (err != 0) {
+               device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
+                             err);
+               return err;
+       }
+
+       err = bus_dma_tag_create(sc->parent_dmat,       /* parent */
+                                1,                     /* alignment */
+#if MXGE_VIRT_JUMBOS
+                                4096,                  /* boundary */
+#else
+                                0,                     /* boundary */
+#endif
+                                BUS_SPACE_MAXADDR,     /* low */
+                                BUS_SPACE_MAXADDR,     /* high */
+                                NULL, NULL,            /* filter */
+                                3*4096,                /* maxsize */
+#if MXGE_VIRT_JUMBOS
+                                3,                     /* num segs */
+                                4096,                  /* maxsegsize*/
+#else
+                                1,                     /* num segs */
+                                MJUM9BYTES,            /* maxsegsize*/
+#endif
+                                BUS_DMA_ALLOCNOW,      /* flags */
+                                &ss->rx_big.dmat);     /* tag */
+       if (err != 0) {
+               device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
+                             err);
+               return err;
+       }
+       for (i = 0; i <= ss->rx_small.mask; i++) {
+               err = bus_dmamap_create(ss->rx_small.dmat, 0, 
+                                       &ss->rx_small.info[i].map);
+               if (err != 0) {
+                       device_printf(sc->dev, "Err %d  rx_small dmamap\n",
+                                     err);
+                       return err;
+               }
+       }
+       err = bus_dmamap_create(ss->rx_small.dmat, 0, 
+                               &ss->rx_small.extra_map);
+       if (err != 0) {
+               device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
+                             err);
+               return err;
+       }
+
+       for (i = 0; i <= ss->rx_big.mask; i++) {
+               err = bus_dmamap_create(ss->rx_big.dmat, 0, 
+                                       &ss->rx_big.info[i].map);
+               if (err != 0) {
+                       device_printf(sc->dev, "Err %d  rx_big dmamap\n",
+                                     err);
+                       return err;
+               }
+       }
+       err = bus_dmamap_create(ss->rx_big.dmat, 0, 
+                               &ss->rx_big.extra_map);
+       if (err != 0) {
+               device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
+                             err);
+               return err;
+       }
+
+       /* now allocate TX resources */
+
+#ifndef IFNET_BUF_RING
+       /* only use a single TX ring for now */
+       if (ss != ss->sc->ss)
+               return 0;
+#endif
+
+       ss->tx.mask = tx_ring_entries - 1;
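+       /*
+        * limit the descriptors a single packet may consume to a
+        * quarter of the ring, presumably so that one large TSO
+        * send cannot starve other transmits
+        */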
+       ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
+
+       /* allocate the tx request copy block */
+       bytes = 8 + 
+               sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
+       ss->tx.req_bytes = kmalloc(bytes, M_DEVBUF, M_WAITOK);
+       if (ss->tx.req_bytes == NULL)
+               return err;
+       /* ensure req_list entries are aligned to 8 bytes */
+       ss->tx.req_list = (mcp_kreq_ether_send_t *)
+               ((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
+
+       /* allocate the tx busdma segment list */
+       bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
+       ss->tx.seg_list = (bus_dma_segment_t *) 
+               kmalloc(bytes, M_DEVBUF, M_WAITOK);
+       if (ss->tx.seg_list == NULL)
+               return err;
+
+       /* allocate the tx host info ring */
+       bytes = tx_ring_entries * sizeof (*ss->tx.info);
+       ss->tx.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
+       if (ss->tx.info == NULL)
+               return err;
+       
+       /* allocate the tx busdma resources */
+       err = bus_dma_tag_create(sc->parent_dmat,       /* parent */
+                                1,                     /* alignment */
+                                sc->tx_boundary,       /* boundary */
+                                BUS_SPACE_MAXADDR,     /* low */
+                                BUS_SPACE_MAXADDR,     /* high */
+                                NULL, NULL,            /* filter */
+                                65536 + 256,           /* maxsize */
+                                ss->tx.max_desc - 2,   /* num segs */
+                                sc->tx_boundary,       /* maxsegsz */
+                                BUS_DMA_ALLOCNOW,      /* flags */
+                                &ss->tx.dmat);         /* tag */
+       
+       if (err != 0) {
+               device_printf(sc->dev, "Err %d allocating tx dmat\n",
+                             err);
+               return err;
+       }
+
+       /* now use these tags to setup dmamaps for each slot
+          in the ring */
+       for (i = 0; i <= ss->tx.mask; i++) {
+               err = bus_dmamap_create(ss->tx.dmat, 0, 
+                                       &ss->tx.info[i].map);
+               if (err != 0) {
+                       device_printf(sc->dev, "Err %d  tx dmamap\n",
+                                     err);
+                       return err;
+               }
+       }
+       return 0;
+}
+
+static int
+mxge_alloc_rings(mxge_softc_t *sc)
+{
+       mxge_cmd_t cmd;
+       int tx_ring_size;
+       int tx_ring_entries, rx_ring_entries;
+       int err, slice;
+       
+       /* get ring sizes */
+       err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
+       tx_ring_size = cmd.data0;
+       if (err != 0) {
+               device_printf(sc->dev, "Cannot determine tx ring sizes\n");
+               goto abort;
+       }
+
+       tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
+       rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
+       ifq_set_maxlen(&sc->ifp->if_snd, tx_ring_entries - 1);
+       ifq_set_ready(&sc->ifp->if_snd);
+
+       for (slice = 0; slice < sc->num_slices; slice++) {
+               err = mxge_alloc_slice_rings(&sc->ss[slice],
+                                            rx_ring_entries,
+                                            tx_ring_entries);
+               if (err != 0)
+                       goto abort;
+       }
+       return 0;
+
+abort:
+       mxge_free_rings(sc);
+       return err;
+}
+
+static void
+mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
+{
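+       /*
+        * a receive buffer must fit the MTU plus an Ethernet header,
+        * a VLAN encapsulation, and the firmware's receive pad
+        */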
+       int bufsize = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN + MXGEFW_PAD;
+
+       if (bufsize < MCLBYTES) {
+               /* easy, everything fits in a single buffer */
+               *big_buf_size = MCLBYTES;
+               *cl_size = MCLBYTES;
+               *nbufs = 1;
+               return;
+       }
+
+       if (bufsize < MJUMPAGESIZE) {
+               /* still easy, everything still fits in a single buffer */
+               *big_buf_size = MJUMPAGESIZE;
+               *cl_size = MJUMPAGESIZE;
+               *nbufs = 1;
+               return;
+       }
+#if MXGE_VIRT_JUMBOS
+       /* now we need to use virtually contiguous buffers */
+       *cl_size = MJUM9BYTES;
+       *big_buf_size = 4096;
+       *nbufs = mtu / 4096 + 1;
+       /* needs to be a power of two, so round up */
+       if (*nbufs == 3)
+               *nbufs = 4;
+#else
+       *cl_size = MJUM9BYTES;
+       *big_buf_size = MJUM9BYTES;
+       *nbufs = 1;
+#endif
+}
+
+static int
+mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
+{
+       mxge_softc_t *sc;
+       mxge_cmd_t cmd;
+       bus_dmamap_t map;
+       struct lro_entry *lro_entry;    
+       int err, i, slice;
+
+       sc = ss->sc;
+       slice = ss - sc->ss;
+
+       SLIST_INIT(&ss->lro_free);
+       SLIST_INIT(&ss->lro_active);
+
+       for (i = 0; i < sc->lro_cnt; i++) {
+               lro_entry = (struct lro_entry *)
+                       kmalloc(sizeof (*lro_entry), M_DEVBUF,
+                              M_NOWAIT | M_ZERO);
+               if (lro_entry == NULL) {
+                       sc->lro_cnt = i;
+                       break;
+               }
+               SLIST_INSERT_HEAD(&ss->lro_free, lro_entry, next);
+       }
+       /* get the lanai pointers to the send and receive rings */
+
+       err = 0;
+#ifndef IFNET_BUF_RING
+       /* We currently only send from the first slice */
+       if (slice == 0) {
+#endif
+               cmd.data0 = slice;
+               err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
+               ss->tx.lanai = 
+                       (volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
+               ss->tx.send_go = (volatile uint32_t *)
+                       (sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
+               ss->tx.send_stop = (volatile uint32_t *)
+                       (sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
+#ifndef IFNET_BUF_RING
+       }
+#endif
+       cmd.data0 = slice;
+       err |= mxge_send_cmd(sc, 
+                            MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
+       ss->rx_small.lanai = 
+               (volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
+       cmd.data0 = slice;
+       err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
+       ss->rx_big.lanai = 
+               (volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
+
+       if (err != 0) {
+               device_printf(sc->dev, 
+                             "failed to get ring sizes or locations\n");
+               return EIO;
+       }
+
+       /* stock receive rings */
+       for (i = 0; i <= ss->rx_small.mask; i++) {
+               map = ss->rx_small.info[i].map;
+               err = mxge_get_buf_small(ss, map, i);
+               if (err) {
+                       device_printf(sc->dev, "alloced %d/%d smalls\n",
+                                     i, ss->rx_small.mask + 1);
+                       return ENOMEM;
+               }
+       }
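+       /*
+        * seed the big-ring shadow with all-ones addresses,
+        * presumably so unfilled slots are recognizably invalid
+        */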
+       for (i = 0; i <= ss->rx_big.mask; i++) {
+               ss->rx_big.shadow[i].addr_low = 0xffffffff;
+               ss->rx_big.shadow[i].addr_high = 0xffffffff;
+       }
+       ss->rx_big.nbufs = nbufs;
+       ss->rx_big.cl_size = cl_size;
+       ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
+               EVL_ENCAPLEN + MXGEFW_PAD;
+       for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
+               map = ss->rx_big.info[i].map;
+               err = mxge_get_buf_big(ss, map, i);
+               if (err) {
+                       device_printf(sc->dev, "alloced %d/%d bigs\n",
+                                     i, ss->rx_big.mask + 1);
+                       return ENOMEM;
+               }
+       }
+       return 0;
+}
+
+static int 
+mxge_open(mxge_softc_t *sc)
+{
+       mxge_cmd_t cmd;
+       int err, big_bytes, nbufs, slice, cl_size, i;
+       bus_addr_t bus;
+       volatile uint8_t *itable;
+       struct mxge_slice_state *ss;
+
+       ASSERT_SERIALIZED(sc->ifp->if_serializer);
+       /* Copy the MAC address in case it was overridden */
+       bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
+
+       err = mxge_reset(sc, 1);
+       if (err != 0) {
+               device_printf(sc->dev, "failed to reset\n");
+               return EIO;
+       }
+
+       if (sc->num_slices > 1) {
+               /* setup the indirection table */
+               cmd.data0 = sc->num_slices;
+               err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
+                                   &cmd);
+
+               err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
+                                    &cmd);
+               if (err != 0) {
+                       device_printf(sc->dev,
+                                     "failed to setup rss tables\n");
+                       return err;
+               }
+
+               /* just enable an identity mapping */
+               itable = sc->sram + cmd.data0;
+               for (i = 0; i < sc->num_slices; i++)
+                       itable[i] = (uint8_t)i;
+
+               cmd.data0 = 1;
+               cmd.data1 = mxge_rss_hash_type;
+               err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
+               if (err != 0) {
+                       device_printf(sc->dev, "failed to enable slices\n");
+                       return err;
+               }
+       }
+
+       mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
+
+       cmd.data0 = nbufs;
+       err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
+                           &cmd);
+       /* error is only meaningful if we're trying to set 
+          MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
+       if (err && nbufs > 1) {
+               device_printf(sc->dev,
+                             "Failed to set always-use-n to %d\n",
+                             nbufs);
+               return EIO;
+       }
+       /* Give the firmware the mtu and the big and small buffer
+          sizes.  The firmware wants the big buf size to be a power
+          of two. Luckily, FreeBSD's clusters are powers of two */
+       cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
+       err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
+       cmd.data0 = MHLEN - MXGEFW_PAD;
+       err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
+                            &cmd);
+       cmd.data0 = big_bytes;
+       err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
+
+       if (err != 0) {
+               device_printf(sc->dev, "failed to setup params\n");
+               goto abort;
+       }
+
+       /* Now give the firmware the pointer to the stats block */
+       for (slice = 0; 
+#ifdef IFNET_BUF_RING
+            slice < sc->num_slices;
+#else
+            slice < 1;
+#endif
+            slice++) {
+               ss = &sc->ss[slice];
+               cmd.data0 =
+                       MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
+               cmd.data1 =
+                       MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
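+               /*
+                * data2 packs the stats block size into its low 16
+                * bits and the slice index into the high 16 bits
+                */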
+               cmd.data2 = sizeof(struct mcp_irq_data);
+               cmd.data2 |= (slice << 16);
+               err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
+       }
+
+       if (err != 0) {
+               bus = sc->ss->fw_stats_dma.bus_addr;
+               bus += offsetof(struct mcp_irq_data, send_done_count);
+               cmd.data0 = MXGE_LOWPART_TO_U32(bus);
+               cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
+               err = mxge_send_cmd(sc,
+                                   MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
+                                   &cmd);
+               /* Firmware cannot support multicast without STATS_DMA_V2 */
+               sc->fw_multicast_support = 0;
+       } else {
+               sc->fw_multicast_support = 1;
+       }
+
+       if (err != 0) {
+               device_printf(sc->dev, "failed to setup params\n");
+               goto abort;
+       }
+
+       for (slice = 0; slice < sc->num_slices; slice++) {
+               err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
+               if (err != 0) {
+                       device_printf(sc->dev, "couldn't open slice %d\n",
+                                     slice);
+                       goto abort;
+               }
+       }
+
+       /* Finally, start the firmware running */
+       err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
+       if (err) {
+               device_printf(sc->dev, "Couldn't bring up link\n");
+               goto abort;
+       }
+#ifdef IFNET_BUF_RING
+       for (slice = 0; slice < sc->num_slices; slice++) {
+               ss = &sc->ss[slice];
+               ss->if_flags |= IFF_RUNNING;
+               ss->if_flags &= ~IFF_OACTIVE;
+       }
+#endif
+       sc->ifp->if_flags |= IFF_RUNNING;
+       sc->ifp->if_flags &= ~IFF_OACTIVE;
+       callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
+
+       return 0;
+
+abort:
+       mxge_free_mbufs(sc);
+
+       return err;
+}
+
+static int
+mxge_close(mxge_softc_t *sc)
+{
+       mxge_cmd_t cmd;
+       int err, old_down_cnt;
+#ifdef IFNET_BUF_RING
+       struct mxge_slice_state *ss;    
+       int slice;
+#endif
+
+       ASSERT_SERIALIZED(sc->ifp->if_serializer);
+       callout_stop(&sc->co_hdl);
+#ifdef IFNET_BUF_RING
+       for (slice = 0; slice < sc->num_slices; slice++) {
+               ss = &sc->ss[slice];
+               ss->if_flags &= ~IFF_RUNNING;
+       }
+#endif
+       sc->ifp->if_flags &= ~IFF_RUNNING;
+       old_down_cnt = sc->down_cnt;
+       wmb();
+       err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
+       if (err) {
+               device_printf(sc->dev, "Couldn't bring down link\n");
+       }
+       if (old_down_cnt == sc->down_cnt) {
+               /* wait for down irq */
+               DELAY(10 * sc->intr_coal_delay);
+       }
+       wmb();
+       if (old_down_cnt == sc->down_cnt) {
+               device_printf(sc->dev, "never got down irq\n");
+       }
+
+       mxge_free_mbufs(sc);
+
+       return 0;
+}
+
+static void
+mxge_setup_cfg_space(mxge_softc_t *sc)
+{
+       device_t dev = sc->dev;
+       int reg;
+       uint16_t cmd, lnk, pectl;
+
+       /* find the PCIe link width and set max read request to 4KB*/
+       if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
+               lnk = pci_read_config(dev, reg + 0x12, 2);
+               sc->link_width = (lnk >> 4) & 0x3f;
+               
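+               /*
+                * the PCIe Device Control register is at cap + 0x8;
+                * bits 14:12 hold the max read request size, and the
+                * value 5 selects 4096 bytes
+                */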
+               pectl = pci_read_config(dev, reg + 0x8, 2);
+               pectl = (pectl & ~0x7000) | (5 << 12);
+               pci_write_config(dev, reg + 0x8, pectl, 2);
+       }
+
+       /* Enable DMA and Memory space access */
+       pci_enable_busmaster(dev);
+       cmd = pci_read_config(dev, PCIR_COMMAND, 2);
+       cmd |= PCIM_CMD_MEMEN;
+       pci_write_config(dev, PCIR_COMMAND, cmd, 2);
+}
+
+static uint32_t
+mxge_read_reboot(mxge_softc_t *sc)
+{
+       device_t dev = sc->dev;
+       uint32_t vs;
+
+       /* find the vendor specific offset */
+       if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
+               device_printf(sc->dev,
+                             "could not find vendor specific offset\n");
+               return (uint32_t)-1;
+       }
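+       /*
+        * fetch the reboot status word (at NIC address 0xfffffff0)
+        * indirectly through the vendor specific capability
+        */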
+       /* enable read32 mode */
+       pci_write_config(dev, vs + 0x10, 0x3, 1);
+       /* tell NIC which register to read */
+       pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
+       return (pci_read_config(dev, vs + 0x14, 4));
+}
+
+static int
+mxge_watchdog_reset(mxge_softc_t *sc, int slice)
+{
+       struct pci_devinfo *dinfo;
+       mxge_tx_ring_t *tx;
+       int err;
+       uint32_t reboot;
+       uint16_t cmd;
+
+       err = ENXIO;
+
+       device_printf(sc->dev, "Watchdog reset!\n");
+
+       /* 
+        * check to see if the NIC rebooted.  If it did, then all of
+        * PCI config space has been reset, and things like the
+        * busmaster bit will be zero.  If this is the case, then we
+        * must restore PCI config space before the NIC can be used
+        * again
+        */
+       cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
+       if (cmd == 0xffff) {
+               /* 
+                * maybe the watchdog caught the NIC rebooting; wait
+                * up to 100ms for it to finish.  If it does not come
+                * back, then give up 
+                */
+               DELAY(1000*100);
+               cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
+               if (cmd == 0xffff) {
+                       device_printf(sc->dev, "NIC disappeared!\n");
+                       return (err);
+               }
+       }
+       if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
+               /* print the reboot status */
+               reboot = mxge_read_reboot(sc);
+               device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
+                             reboot);
+               /* restore PCI configuration space */
+               dinfo = device_get_ivars(sc->dev);
+               pci_cfg_restore(sc->dev, dinfo);
+
+               /* and redo any changes we made to our config space */
+               mxge_setup_cfg_space(sc);
+
+               if (sc->ifp->if_flags & IFF_RUNNING) {
+                       mxge_close(sc);
+                       err = mxge_open(sc);
+               }
+       } else {
+               tx = &sc->ss[slice].tx;
+               device_printf(sc->dev,
+                             "NIC did not reboot, slice %d ring state:\n",
+                             slice);
+               device_printf(sc->dev,
+                             "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
+                             tx->req, tx->done, tx->queue_active);
+               device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
+                             tx->activate, tx->deactivate);
+               device_printf(sc->dev, "pkt_done=%d fw=%d\n",
+                             tx->pkt_done,
+                             be32toh(sc->ss->fw_stats->send_done_count));
+               device_printf(sc->dev, "not resetting\n");
+       }
+       return (err);
+}
+
+static int
+mxge_watchdog(mxge_softc_t *sc)
+{
+       mxge_tx_ring_t *tx;
+       uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
+       int i, err = 0;
+
+       /* see if we have outstanding transmits, which
+          have been pending for more than mxge_ticks */
+       for (i = 0; 
+#ifdef IFNET_BUF_RING
+            (i < sc->num_slices) && (err == 0);
+#else
+            (i < 1) && (err == 0);
+#endif
+            i++) {
+               tx = &sc->ss[i].tx;             
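+               /*
+                * a slice is considered wedged if it had requests
+                * pending at the last tick (watchdog_req !=
+                * watchdog_done) and has made no completion progress
+                * since (done == watchdog_done)
+                */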
+               if (tx->req != tx->done &&
+                   tx->watchdog_req != tx->watchdog_done &&
+                   tx->done == tx->watchdog_done) {
+                       /* check for pause blocking before resetting */
+                       if (tx->watchdog_rx_pause == rx_pause)
+                               err = mxge_watchdog_reset(sc, i);
+                       else
+                               device_printf(sc->dev, "Flow control blocking "
+                                             "xmits, check link partner\n");
+               }
+
+               tx->watchdog_req = tx->req;
+               tx->watchdog_done = tx->done;
+               tx->watchdog_rx_pause = rx_pause;
+       }
+
+       if (sc->need_media_probe)
+               mxge_media_probe(sc);
+       return (err);
+}
+
+static void
+mxge_update_stats(mxge_softc_t *sc)
+{
+       struct mxge_slice_state *ss;
+       u_long ipackets = 0;
+       u_long opackets = 0;
+#ifdef IFNET_BUF_RING
+       u_long obytes = 0;
+       u_long omcasts = 0;
+       u_long odrops = 0;
+#endif
+       u_long oerrors = 0;
+       int slice;
+
+       for (slice = 0; slice < sc->num_slices; slice++) {
+               ss = &sc->ss[slice];
+               ipackets += ss->ipackets;
+               opackets += ss->opackets;
+#ifdef IFNET_BUF_RING
+               obytes += ss->obytes;
+               omcasts += ss->omcasts;
+               odrops += ss->tx.br->br_drops;
+#endif
+               oerrors += ss->oerrors;
+       }
+       sc->ifp->if_ipackets = ipackets;
+       sc->ifp->if_opackets = opackets;
+#ifdef IFNET_BUF_RING
+       sc->ifp->if_obytes = obytes;
+       sc->ifp->if_omcasts = omcasts;
+       sc->ifp->if_snd.ifq_drops = odrops;
+#endif
+       sc->ifp->if_oerrors = oerrors;
+}
+
+static void
+mxge_tick(void *arg)
+{
+       mxge_softc_t *sc = arg;
+       int err = 0;
+
+       lwkt_serialize_enter(sc->ifp->if_serializer);
+       /* aggregate stats from different slices */
+       mxge_update_stats(sc);
+       if (!sc->watchdog_countdown) {
+               err = mxge_watchdog(sc);
+               sc->watchdog_countdown = 4;
+       }
+       sc->watchdog_countdown--;
+       if (err == 0)
+               callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
+       lwkt_serialize_exit(sc->ifp->if_serializer);
+}
+
+static int
+mxge_media_change(struct ifnet *ifp)
+{
+       return EINVAL;
+}
+
+static int
+mxge_change_mtu(mxge_softc_t *sc, int mtu)
+{
+       struct ifnet *ifp = sc->ifp;
+       int real_mtu, old_mtu;
+       int err = 0;
+
+       if (ifp->if_serializer)
+               ASSERT_SERIALIZED(ifp->if_serializer);
+
+       real_mtu = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
+       if ((real_mtu > sc->max_mtu) || real_mtu < 60)
+               return EINVAL;
+       old_mtu = ifp->if_mtu;
+       ifp->if_mtu = mtu;
+       if (ifp->if_flags & IFF_RUNNING) {
+               mxge_close(sc);
+               err = mxge_open(sc);
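+               /*
+                * if the interface would not come back up with the
+                * new MTU, fall back to the old MTU and reopen
+                */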
+               if (err != 0) {
+                       ifp->if_mtu = old_mtu;
+                       mxge_close(sc);
+                       (void) mxge_open(sc);
+               }
+       }
+       return err;
+}
+
+static void
+mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
+{
+       mxge_softc_t *sc = ifp->if_softc;
+
+       if (sc == NULL)
+               return;
+       ifmr->ifm_status = IFM_AVALID;
+       ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
+       ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
+       ifmr->ifm_active |= sc->link_state ? IFM_FDX : 0;
+}
+
+static int
+mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data, struct ucred *cr)
+{
+       mxge_softc_t *sc = ifp->if_softc;
+       struct ifreq *ifr = (struct ifreq *)data;
+       int err, mask;
+
+       (void)cr;
+       err = 0;
+       ASSERT_SERIALIZED(ifp->if_serializer);
+       switch (command) {
+       case SIOCSIFADDR:
+       case SIOCGIFADDR:
+               err = ether_ioctl(ifp, command, data);
+               break;
+
+       case SIOCSIFMTU:
+               err = mxge_change_mtu(sc, ifr->ifr_mtu);
+               break;
+
+       case SIOCSIFFLAGS:
+               if (sc->dying) {
+                       return EINVAL;
+               }
+               if (ifp->if_flags & IFF_UP) {
+                       if (!(ifp->if_flags & IFF_RUNNING)) {
+                               err = mxge_open(sc);
+                       } else {
+                               /* take care of promisc and allmulti
+                                  flag changes */
+                               mxge_change_promisc(sc, 
+                                                   ifp->if_flags & IFF_PROMISC);
+                               mxge_set_multicast_list(sc);
+                       }
+               } else {
+                       if (ifp->if_flags & IFF_RUNNING) {
+                               mxge_close(sc);
+                       }
+               }
+               break;
+
+       case SIOCADDMULTI:
+       case SIOCDELMULTI:
+               mxge_set_multicast_list(sc);
+               break;
+
+       case SIOCSIFCAP:
+               mask = ifr->ifr_reqcap ^ ifp->if_capenable;
+               if (mask & IFCAP_TXCSUM) {
+                       if (IFCAP_TXCSUM & ifp->if_capenable) {
+                               ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
+                               ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
+                                                     | CSUM_TSO);
+                       } else {
+                               ifp->if_capenable |= IFCAP_TXCSUM;
+                               ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
+                       }
+               } else if (mask & IFCAP_RXCSUM) {
+                       if (IFCAP_RXCSUM & ifp->if_capenable) {
+                               ifp->if_capenable &= ~IFCAP_RXCSUM;
+                               sc->csum_flag = 0;
+                       } else {
+                               ifp->if_capenable |= IFCAP_RXCSUM;
+                               sc->csum_flag = 1;
+                       }
+               }
+               if (mask & IFCAP_TSO4) {
+                       if (IFCAP_TSO4 & ifp->if_capenable) {
+                               ifp->if_capenable &= ~IFCAP_TSO4;
+                               ifp->if_hwassist &= ~CSUM_TSO;
+                       } else if (IFCAP_TXCSUM & ifp->if_capenable) {
+                               ifp->if_capenable |= IFCAP_TSO4;
+                               ifp->if_hwassist |= CSUM_TSO;
+                       } else {
+                               kprintf("mxge requires tx checksum offload"
+                                      " be enabled to use TSO\n");
+                               err = EINVAL;
+                       }
+               }
+               if (mask & IFCAP_LRO) {
+                       if (IFCAP_LRO & ifp->if_capenable) 
+                               err = mxge_change_lro_locked(sc, 0);
+                       else
+                               err = mxge_change_lro_locked(sc, mxge_lro_cnt);
+               }
+               if (mask & IFCAP_VLAN_HWTAGGING)
+                       ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
+               VLAN_CAPABILITIES(ifp);
+
+               break;
+
+       case SIOCGIFMEDIA:
+               err = ifmedia_ioctl(ifp, (struct ifreq *)data, 
+                                   &sc->media, command);
+               break;
+
+       default:
+               err = ENOTTY;
+       }
+       return err;
+}
+
+static void
+mxge_fetch_tunables(mxge_softc_t *sc)
+{
+
+       TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
+       TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled", 
+                         &mxge_flow_control);
+       TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay", 
+                         &mxge_intr_coal_delay);       
+       TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable", 
+                         &mxge_nvidia_ecrc_enable);    
+       TUNABLE_INT_FETCH("hw.mxge.force_firmware", 
+                         &mxge_force_firmware);        
+       TUNABLE_INT_FETCH("hw.mxge.deassert_wait", 
+                         &mxge_deassert_wait); 
+       TUNABLE_INT_FETCH("hw.mxge.verbose", 
+                         &mxge_verbose);       
+       TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
+       TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
+       TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
+       TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
+       TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
+       if (sc->lro_cnt != 0)
+               mxge_lro_cnt = sc->lro_cnt;
+
+       if (bootverbose)
+               mxge_verbose = 1;
+       if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
+               mxge_intr_coal_delay = 30;
+       if (mxge_ticks == 0)
+               mxge_ticks = hz / 2;
+       sc->pause = mxge_flow_control;
+       if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4 
+           || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
+               mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
+       }
+       if (mxge_initial_mtu > ETHERMTU_JUMBO ||
+           mxge_initial_mtu < ETHER_MIN_LEN)
+               mxge_initial_mtu = ETHERMTU_JUMBO;
+}
+
+static void
+mxge_free_slices(mxge_softc_t *sc)
+{
+       struct mxge_slice_state *ss;
+       int i;
+
+       if (sc->ss == NULL)
+               return;
+
+       for (i = 0; i < sc->num_slices; i++) {
+               ss = &sc->ss[i];
+               if (ss->fw_stats != NULL) {
+                       mxge_dma_free(&ss->fw_stats_dma);
+                       ss->fw_stats = NULL;
+#ifdef IFNET_BUF_RING
+                       if (ss->tx.br != NULL) {
+                               drbr_free(ss->tx.br, M_DEVBUF);
+                               ss->tx.br = NULL;
+                       }
+#endif
+               }
+               if (ss->rx_done.entry != NULL) {
+                       mxge_dma_free(&ss->rx_done.dma);
+                       ss->rx_done.entry = NULL;
+               }
+       }
+       kfree(sc->ss, M_DEVBUF);
+       sc->ss = NULL;
+}
+
+static int
+mxge_alloc_slices(mxge_softc_t *sc)
+{
+       mxge_cmd_t cmd;
+       struct mxge_slice_state *ss;
+       size_t bytes;
+       int err, i, max_intr_slots;
+
+       err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
+       if (err != 0) {
+               device_printf(sc->dev, "Cannot determine rx ring size\n");
+               return err;
+       }
+       sc->rx_ring_size = cmd.data0;
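+       /*
+        * the interrupt queue needs a slot for every possible
+        * receive completion: one per entry in each of the small
+        * and big rx rings
+        */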
+       max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
+       
+       bytes = sizeof (*sc->ss) * sc->num_slices;
+       sc->ss = kmalloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
+       if (sc->ss == NULL)
+               return (ENOMEM);
+       for (i = 0; i < sc->num_slices; i++) {
+               ss = &sc->ss[i];
+
+               ss->sc = sc;
+
+               /* allocate per-slice rx interrupt queues */
+               
+               bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
+               err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
+               if (err != 0)
+                       goto abort;
+               ss->rx_done.entry = ss->rx_done.dma.addr;
+               bzero(ss->rx_done.entry, bytes);
+
+               /* 
+                * allocate the per-slice firmware stats; stats
+                * (including tx) are used only on the first
+                * slice for now
+                */
+#ifndef IFNET_BUF_RING
+               if (i > 0)
+                       continue;
+#endif
+
+               bytes = sizeof (*ss->fw_stats);
+               err = mxge_dma_alloc(sc, &ss->fw_stats_dma, 
+                                    sizeof (*ss->fw_stats), 64);
+               if (err != 0)
+                       goto abort;
+               ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
+#ifdef IFNET_BUF_RING
+               ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
+                                          &ss->tx.lock);
+#endif
+       }
+
+       return (0);
+
+abort:
+       mxge_free_slices(sc);
+       return (ENOMEM);
+}
+
+static void
+mxge_slice_probe(mxge_softc_t *sc)
+{
+       mxge_cmd_t cmd;
+       char *old_fw;
+       int msix_cnt, status, max_intr_slots;
+
+       sc->num_slices = 1;
+       /* 
+        *  don't enable multiple slices if the tunable disables
+        *  them, or if this is not an SMP system
+        */
+       
+       if (mxge_max_slices == 0 || mxge_max_slices == 1 || ncpus < 2)
+               return;
+
+       /* see how many MSI-X interrupts are available */
+       msix_cnt = pci_msix_count(sc->dev);
+       if (msix_cnt < 2)
+               return;
+
+       /* now load the slice-aware firmware to see what it supports */
+       old_fw = sc->fw_name;
+       if (old_fw == mxge_fw_aligned)
+               sc->fw_name = mxge_fw_rss_aligned;
+       else
+               sc->fw_name = mxge_fw_rss_unaligned;
+       status = mxge_load_firmware(sc, 0);
+       if (status != 0) {
+               device_printf(sc->dev, "Falling back to a single slice\n");
+               return;
+       }
+       
+       /* try to send a reset command to the card to see if it
+          is alive */
+       memset(&cmd, 0, sizeof (cmd));
+       status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
+       if (status != 0) {
+               device_printf(sc->dev, "failed reset\n");
+               goto abort_with_fw;
+       }
+
+       /* get rx ring size */
+       status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
+       if (status != 0) {
+               device_printf(sc->dev, "Cannot determine rx ring size\n");
+               goto abort_with_fw;
+       }
+       max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
+
+       /* tell it the size of the interrupt queues */
+       cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
+       status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
+       if (status != 0) {
+               device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
+               goto abort_with_fw;
+       }
+
+       /* ask the firmware for the maximum number of slices it supports */
+       status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
+       if (status != 0) {
+               device_printf(sc->dev,
+                             "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
+               goto abort_with_fw;
+       }
+       sc->num_slices = cmd.data0;
+       if (sc->num_slices > msix_cnt)
+               sc->num_slices = msix_cnt;
+
+       if (mxge_max_slices == -1) {
+               /* cap to number of CPUs in system */
+               if (sc->num_slices > ncpus)
+                       sc->num_slices = ncpus;
+       } else {
+               if (sc->num_slices > mxge_max_slices)
+                       sc->num_slices = mxge_max_slices;
+       }
+       /* make sure it is a power of two */
+       while (sc->num_slices & (sc->num_slices - 1))
+               sc->num_slices--;
+
+       if (mxge_verbose)
+               device_printf(sc->dev, "using %d slices\n",
+                             sc->num_slices);
+       
+       return;
+
+abort_with_fw:
+       sc->fw_name = old_fw;
+       (void) mxge_load_firmware(sc, 0);
+}
+
+static int
+mxge_add_msix_irqs(mxge_softc_t *sc)
+{
+       size_t bytes;
+       int count, err, i, rid;
+
+       rid = PCIR_BAR(2);
+       sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
+                                                   &rid, RF_ACTIVE);
+
+       if (sc->msix_table_res == NULL) {
+               device_printf(sc->dev, "couldn't alloc MSIX table res\n");
+               return ENXIO;
+       }
+
+       count = sc->num_slices;
+       err = pci_alloc_msix(sc->dev, &count);
+       if (err != 0) {
+               device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d, "
+                             "err = %d\n", sc->num_slices, err);
+               goto abort_with_msix_table;
+       }
+       if (count < sc->num_slices) {
+               device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
+                             count, sc->num_slices);
+               device_printf(sc->dev,
+                             "Try setting hw.mxge.max_slices to %d\n",
+                             count);
+               err = ENOSPC;
+               goto abort_with_msix;
+       }
+       bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
+       sc->msix_irq_res = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
+       if (sc->msix_irq_res == NULL) {
+               err = ENOMEM;
+               goto abort_with_msix;
+       }
+
+       for (i = 0; i < sc->num_slices; i++) {
+               rid = i + 1;
+               sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
+                                                         SYS_RES_IRQ,
+                                                         &rid, RF_ACTIVE);
+               if (sc->msix_irq_res[i] == NULL) {
+                       device_printf(sc->dev, "couldn't allocate IRQ res"
+                                     " for message %d\n", i);
+                       err = ENXIO;
+                       goto abort_with_res;
+               }
+       }
+
+       bytes = sizeof (*sc->msix_ih) * sc->num_slices;
+       sc->msix_ih = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
+       if (sc->msix_ih == NULL) {
+               err = ENOMEM;
+               goto abort_with_res;
+       }
+
+       for (i = 0; i < sc->num_slices; i++) {
+               err = bus_setup_intr(sc->dev, sc->msix_irq_res[i], 
+                                    INTR_MPSAFE,
+                                    mxge_intr, &sc->ss[i], &sc->msix_ih[i],
+                                    sc->ifp->if_serializer);
+               if (err != 0) {
+                       device_printf(sc->dev, "couldn't setup intr for "
+                                     "message %d\n", i);
+                       goto abort_with_intr;
+               }
+       }
+
+       if (mxge_verbose) {
+               device_printf(sc->dev, "using %d msix IRQs:",
+                             sc->num_slices);
+               for (i = 0; i < sc->num_slices; i++)
+                       kprintf(" %ld", rman_get_start(sc->msix_irq_res[i]));
+               kprintf("\n");
+       }
+       return (0);
+
+abort_with_intr:
+       for (i = 0; i < sc->num_slices; i++) {
+               if (sc->msix_ih[i] != NULL) {
+                       bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
+                                         sc->msix_ih[i]);
+                       sc->msix_ih[i] = NULL;
+               }
+       }
+       kfree(sc->msix_ih, M_DEVBUF);
+
+abort_with_res:
+       for (i = 0; i < sc->num_slices; i++) {
+               rid = i + 1;
+               if (sc->msix_irq_res[i] != NULL)
+                       bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
+                                            sc->msix_irq_res[i]);
+               sc->msix_irq_res[i] = NULL;
+       }
+       kfree(sc->msix_irq_res, M_DEVBUF);
+
+abort_with_msix:
+       pci_release_msi(sc->dev);
+
+abort_with_msix_table:
+       bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
+                            sc->msix_table_res);
+
+       return err;
+}
+
+static int
+mxge_add_single_irq(mxge_softc_t *sc)
+{
+       int count, err, rid;
+
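+       /*
+        * prefer a single MSI vector (rid 1), falling back to the
+        * shared legacy INTx line (rid 0)
+        */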
+       count = pci_msi_count(sc->dev);
+       if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
+               rid = 1;
+       } else {
+               rid = 0;
+               sc->legacy_irq = 1;
+       }
+       sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0,
+                                        1, RF_SHAREABLE | RF_ACTIVE);
+       if (sc->irq_res == NULL) {
+               device_printf(sc->dev, "could not alloc interrupt\n");
+               return ENXIO;
+       }
+       if (mxge_verbose)
+               device_printf(sc->dev, "using %s irq %ld\n",
+                             sc->legacy_irq ? "INTx" : "MSI",
+                             rman_get_start(sc->irq_res));
+       err = bus_setup_intr(sc->dev, sc->irq_res, 
+                            INTR_MPSAFE,
+                            mxge_intr, &sc->ss[0], &sc->ih,
+                            sc->ifp->if_serializer);
+       if (err != 0) {
+               bus_release_resource(sc->dev, SYS_RES_IRQ,
+                                    sc->legacy_irq ? 0 : 1, sc->irq_res);
+               if (!sc->legacy_irq)
+                       pci_release_msi(sc->dev);
+       }
+       return err;
+}
+
+static void
+mxge_rem_msix_irqs(mxge_softc_t *sc)
+{
+       int i, rid;
+
+       for (i = 0; i < sc->num_slices; i++) {
+               if (sc->msix_ih[i] != NULL) {
+                       bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
+                                         sc->msix_ih[i]);
+                       sc->msix_ih[i] = NULL;
+               }
+       }
+       kfree(sc->msix_ih, M_DEVBUF);
+
+       for (i = 0; i < sc->num_slices; i++) {
+               rid = i + 1;
+               if (sc->msix_irq_res[i] != NULL)
+                       bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
+                                            sc->msix_irq_res[i]);
+               sc->msix_irq_res[i] = NULL;
+       }
+       kfree(sc->msix_irq_res, M_DEVBUF);
+
+       bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
+                            sc->msix_table_res);
+
+       pci_release_msi(sc->dev);
+}
+
+static void
+mxge_rem_single_irq(mxge_softc_t *sc)
+{
+       bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
+       bus_release_resource(sc->dev, SYS_RES_IRQ,
+                            sc->legacy_irq ? 0 : 1, sc->irq_res);
+       if (!sc->legacy_irq)
+               pci_release_msi(sc->dev);
+}
+
+static void
+mxge_rem_irq(mxge_softc_t *sc)
+{
+       if (sc->num_slices > 1)
+               mxge_rem_msix_irqs(sc);
+       else
+               mxge_rem_single_irq(sc);
+}
+
+static int
+mxge_add_irq(mxge_softc_t *sc)
+{
+       int err;
+
+       if (sc->num_slices > 1)
+               err = mxge_add_msix_irqs(sc);
+       else
+               err = mxge_add_single_irq(sc);
+       
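+       /*
+        * deliberately disabled via the if (0 && ...) guard:
+        * tearing down and re-probing the MSI-X irqs after a
+        * successful setup is currently stubbed out
+        */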
+       if (0 && err == 0 && sc->num_slices > 1) {
+               mxge_rem_msix_irqs(sc);
+               err = mxge_add_msix_irqs(sc);
+       }
+       return err;
+}
+
+static int
+mxge_attach(device_t dev)
+{
+       mxge_softc_t *sc = device_get_softc(dev);
+       struct ifnet *ifp = &sc->arpcom.ac_if;
+       int err, rid;
+
+       /*
+        * avoid rewriting half the lines in this file to use
+        * &sc->arpcom.ac_if instead
+        */
+       sc->ifp = ifp;
+       sc->dev = dev;
+       mxge_fetch_tunables(sc);
+
+       err = bus_dma_tag_create(NULL,                  /* parent */
+                                1,                     /* alignment */
+                                0,                     /* boundary */
+                                BUS_SPACE_MAXADDR,     /* low */
+                                BUS_SPACE_MAXADDR,     /* high */
+                                NULL, NULL,            /* filter */
+                                65536 + 256,           /* maxsize */
+                                MXGE_MAX_SEND_DESC,    /* num segs */
+                                65536,                 /* maxsegsize */
+                                0,                     /* flags */
+                                &sc->parent_dmat);     /* tag */
+
+       if (err != 0) {
+               device_printf(sc->dev, "Err %d allocating parent dmat\n",
+                             err);
+               goto abort_with_nothing;
+       }
+
+       if_initname(ifp, device_get_name(dev), device_get_unit(dev));
+
+       callout_init_mp(&sc->co_hdl);
+
+       mxge_setup_cfg_space(sc);
+       
+       /* Map the board into the kernel */
+       rid = PCIR_BARS;
+       sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
+                                        ~0, 1, RF_ACTIVE);
+       if (sc->mem_res == NULL) {
+               device_printf(dev, "could not map memory\n");
+               err = ENXIO;
+               goto abort_with_nothing;
+       }
+       sc->sram = rman_get_virtual(sc->mem_res);
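+       /*
+        * usable SRAM is the 2MB aperture less space reserved at
+        * the top: apparently two 48KB blocks, one 32KB block, and
+        * a final 0x100 bytes
+        */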
+       sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
+       if (sc->sram_size > rman_get_size(sc->mem_res)) {
+               device_printf(dev, "impossible memory region size %ld\n",
+                             rman_get_size(sc->mem_res));
+               err = ENXIO;
+               goto abort_with_mem_res;
+       }
+
+       /* make NULL terminated copy of the EEPROM strings section of
+          lanai SRAM */
+       bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
+       bus_space_read_region_1(rman_get_bustag(sc->mem_res),
+                               rman_get_bushandle(sc->mem_res),
+                               sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
+                               sc->eeprom_strings, 
+                               MXGE_EEPROM_STRINGS_SIZE - 2);
+       err = mxge_parse_strings(sc);
+       if (err != 0)
+               goto abort_with_mem_res;
+
+       /* Enable write combining for efficient use of PCIe bus */
+       mxge_enable_wc(sc);
+
+       /* Allocate the out of band dma memory */
+       err = mxge_dma_alloc(sc, &sc->cmd_dma, 
+                            sizeof (mxge_cmd_t), 64);
+       if (err != 0) 
+               goto abort_with_mem_res;
+       sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
+       err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
+       if (err != 0) 
+               goto abort_with_cmd_dma;
+
+       err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
+       if (err != 0)
+               goto abort_with_zeropad_dma;
+
+       /* select & load the firmware */
+       err = mxge_select_firmware(sc);
+       if (err != 0)
+               goto abort_with_dmabench;
+       sc->intr_coal_delay = mxge_intr_coal_delay;
+
+       mxge_slice_probe(sc);
+       err = mxge_alloc_slices(sc);
+       if (err != 0)
+               goto abort_with_dmabench;
+
+       err = mxge_reset(sc, 0);
+       if (err != 0)
+               goto abort_with_slices;
+
+       err = mxge_alloc_rings(sc);
+       if (err != 0) {
+               device_printf(sc->dev, "failed to allocate rings\n");
+               goto abort_with_dmabench;
+       }
+
+       ifp->if_baudrate = IF_Gbps(10UL);
+       ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
+               IFCAP_VLAN_MTU;
+#ifdef INET
+       ifp->if_capabilities |= IFCAP_LRO;
+#endif
+
+#ifdef MXGE_NEW_VLAN_API
+       ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
+#endif
+
+       sc->max_mtu = mxge_max_mtu(sc);
+       if (sc->max_mtu >= 9000)
+               ifp->if_capabilities |= IFCAP_JUMBO_MTU;
+       else
+               device_printf(dev, "MTU limited to %d.  Install "
+                             "latest firmware for 9000 byte jumbo support\n",
+                             sc->max_mtu - ETHER_HDR_LEN);
+       ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
+       ifp->if_capenable = ifp->if_capabilities;
+       if (sc->lro_cnt == 0)
+               ifp->if_capenable &= ~IFCAP_LRO;
+       sc->csum_flag = 1;
+       ifp->if_init = mxge_init;
+       ifp->if_softc = sc;
+       ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
+       ifp->if_ioctl = mxge_ioctl;
+       ifp->if_start = mxge_start;
+       /* Initialise the ifmedia structure */
+       ifmedia_init(&sc->media, 0, mxge_media_change, 
+                    mxge_media_status);
+       mxge_set_media(sc, IFM_ETHER | IFM_AUTO);
+       mxge_media_probe(sc);
+       sc->dying = 0;
+       ether_ifattach(ifp, sc->mac_addr, NULL);
+       /* ether_ifattach sets mtu to ETHERMTU */
+       if (mxge_initial_mtu != ETHERMTU) {
+               lwkt_serialize_enter(ifp->if_serializer);
+               mxge_change_mtu(sc, mxge_initial_mtu);
+               lwkt_serialize_exit(ifp->if_serializer);
+       }
+       /* must come after ether_ifattach() */
+       err = mxge_add_irq(sc);
+       if (err != 0) {
+               device_printf(sc->dev, "failed to add irq\n");
+               goto abort_with_rings;
+       }
+
+       mxge_add_sysctls(sc);
+#ifdef IFNET_BUF_RING
+       ifp->if_transmit = mxge_transmit;
+       ifp->if_qflush = mxge_qflush;
+#endif
+       return 0;
+
+abort_with_rings:
+       mxge_free_rings(sc);
+abort_with_slices:
+       mxge_free_slices(sc);
+abort_with_dmabench:
+       mxge_dma_free(&sc->dmabench_dma);
+abort_with_zeropad_dma:
+       mxge_dma_free(&sc->zeropad_dma);
+abort_with_cmd_dma:
+       mxge_dma_free(&sc->cmd_dma);
+abort_with_mem_res:
+       bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
+       pci_disable_busmaster(dev);
+       bus_dma_tag_destroy(sc->parent_dmat);
+abort_with_nothing:
+       return err;
+}
+
+static int
+mxge_detach(device_t dev)
+{
+       mxge_softc_t *sc = device_get_softc(dev);
+
+       lwkt_serialize_enter(sc->ifp->if_serializer);
+       sc->dying = 1;
+       if (sc->ifp->if_flags & IFF_RUNNING)
+               mxge_close(sc);
+       /*
+        * XXX: race: the callout callback could be spinning on
+        * the serializer and run anyway
+        */
+       callout_stop(&sc->co_hdl);
+       lwkt_serialize_exit(sc->ifp->if_serializer);
+
+       ether_ifdetach(sc->ifp);
+       ifmedia_removeall(&sc->media);
+       mxge_dummy_rdma(sc, 0);
+       mxge_rem_sysctls(sc);
+       mxge_rem_irq(sc);
+       mxge_free_rings(sc);
+       mxge_free_slices(sc);
+       mxge_dma_free(&sc->dmabench_dma);
+       mxge_dma_free(&sc->zeropad_dma);
+       mxge_dma_free(&sc->cmd_dma);
+       bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
+       pci_disable_busmaster(dev);
+       bus_dma_tag_destroy(sc->parent_dmat);
+       return 0;
+}
+
+static int
+mxge_shutdown(device_t dev)
+{
+       return 0;
+}
+
+/*
+  This file uses Myri10GE driver indentation.
+
+  Local Variables:
+  c-file-style:"linux"
+  tab-width:8
+  End:
+*/
diff --git a/sys/dev/netif/mxge/if_mxge_var.h b/sys/dev/netif/mxge/if_mxge_var.h
new file mode 100644 (file)
index 0000000..93a98bd
--- /dev/null
@@ -0,0 +1,349 @@
+/*******************************************************************************
+
+Copyright (c) 2006-2009, Myricom Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+    this list of conditions and the following disclaimer.
+
+ 2. Neither the name of the Myricom Inc, nor the names of its
+    contributors may be used to endorse or promote products derived from
+    this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+$FreeBSD: src/sys/dev/mxge/if_mxge_var.h,v 1.30 2009/06/24 21:09:56 gallatin Exp $
+
+***************************************************************************/
+
+#define MXGE_ETH_STOPPED 0
+#define MXGE_ETH_STOPPING 1
+#define MXGE_ETH_STARTING 2
+#define MXGE_ETH_RUNNING 3
+#define MXGE_ETH_OPEN_FAILED 4
+
+#define MXGE_FW_OFFSET (1024*1024)
+#define MXGE_EEPROM_STRINGS_SIZE 256
+#define MXGE_MAX_SEND_DESC 128
+
+#if ((__FreeBSD_version > 800000 && __FreeBSD_version < 800005) \
+     || __FreeBSD_version < 700111)
+#define MXGE_VIRT_JUMBOS 1
+#else
+#define MXGE_VIRT_JUMBOS 0
+#endif
+
+#if (__FreeBSD_version > 800082)
+#define IFNET_BUF_RING 1
+#endif
+
+#ifndef VLAN_CAPABILITIES
+#define VLAN_CAPABILITIES(ifp)
+#define mxge_vlans_active(sc) (sc)->ifp->if_nvlans
+#else
+#define mxge_vlans_active(sc) (sc)->ifp->if_vlantrunk
+#endif
+
+#ifndef VLAN_TAG_VALUE
+#define MXGE_NEW_VLAN_API
+#endif
+
+#ifndef IFCAP_LRO
+#define IFCAP_LRO 0
+#endif
+
+#ifndef IFCAP_TSO
+#define IFCAP_TSO 0
+#endif
+#ifndef IFCAP_TSO4
+#define IFCAP_TSO4 0
+#endif
+
+#ifndef CSUM_TSO
+#define CSUM_TSO 0
+#endif
+
+
+typedef struct {
+       void *addr;
+       bus_addr_t bus_addr;
+       bus_dma_tag_t dmat;
+       bus_dmamap_t map;
+} mxge_dma_t;
+
+
+typedef struct {
+       mcp_slot_t *entry;
+       mxge_dma_t dma;
+       int cnt;
+       int idx;
+       int mask;
+} mxge_rx_done_t;
+
+typedef struct
+{
+  uint32_t data0;
+  uint32_t data1;
+  uint32_t data2;
+} mxge_cmd_t;
+
+struct mxge_rx_buffer_state {
+       struct mbuf *m;
+       bus_dmamap_t map;
+};
+
+struct mxge_tx_buffer_state {
+       struct mbuf *m;
+       bus_dmamap_t map;
+       int flag;
+};
+
+typedef struct
+{
+       volatile mcp_kreq_ether_recv_t *lanai;  /* lanai ptr for recv ring */
+       mcp_kreq_ether_recv_t *shadow;  /* host shadow of recv ring */
+       struct mxge_rx_buffer_state *info;
+       bus_dma_tag_t dmat;
+       bus_dmamap_t extra_map;
+       int cnt;
+       int nbufs;
+       int cl_size;
+       int alloc_fail;
+       int mask;                       /* number of rx slots -1 */
+       int mlen;
+} mxge_rx_ring_t;
+
+typedef struct
+{
+#ifdef IFNET_BUF_RING
+       struct buf_ring *br;
+#endif
+       volatile mcp_kreq_ether_send_t *lanai;  /* lanai ptr for sendq  */
+       volatile uint32_t *send_go;             /* doorbell for sendq */
+       volatile uint32_t *send_stop;           /* doorbell for sendq */
+       mcp_kreq_ether_send_t *req_list;        /* host shadow of sendq */
+       char *req_bytes;
+       bus_dma_segment_t *seg_list;
+       struct mxge_tx_buffer_state *info;
+       bus_dma_tag_t dmat;
+       int req;                        /* transmits submitted  */
+       int mask;                       /* number of transmit slots -1 */
+       int done;                       /* transmits completed  */
+       int pkt_done;                   /* packets completed */
+       int max_desc;                   /* max descriptors per xmit */
+       int queue_active;               /* fw currently polling this queue */
+       int activate;
+       int deactivate;
+       int stall;                      /* #times hw queue exhausted */
+       int wake;                       /* #times irq re-enabled xmit */
+       int watchdog_req;               /* cache of req */
+       int watchdog_done;              /* cache of done */
+       int watchdog_rx_pause;          /* cache of pause rq recvd */
+       int defrag;
+} mxge_tx_ring_t;
+
+struct lro_entry;
+struct lro_entry
+{
+       SLIST_ENTRY(lro_entry) next;
+       struct mbuf     *m_head;
+       struct mbuf     *m_tail;
+       int             timestamp;
+       struct ip       *ip;
+       uint32_t        tsval;
+       uint32_t        tsecr;
+       uint32_t        source_ip;
+       uint32_t        dest_ip;
+       uint32_t        next_seq;
+       uint32_t        ack_seq;
+       uint32_t        len;
+       uint32_t        data_csum;
+       uint16_t        window;
+       uint16_t        source_port;
+       uint16_t        dest_port;
+       uint16_t        append_cnt;
+       uint16_t        mss;
+};
+SLIST_HEAD(lro_head, lro_entry);
+
+struct mxge_softc;
+typedef struct mxge_softc mxge_softc_t;
+
+struct mxge_slice_state {
+       mxge_softc_t *sc;
+       mxge_tx_ring_t tx;              /* transmit ring        */
+       mxge_rx_ring_t rx_small;
+       mxge_rx_ring_t rx_big;
+       mxge_rx_done_t rx_done;
+       mcp_irq_data_t *fw_stats;
+       volatile uint32_t *irq_claim;
+       u_long ipackets;
+       u_long opackets;
+       u_long obytes;
+       u_long omcasts;
+       u_long oerrors;
+       int if_drv_flags;
+       struct lro_head lro_active;
+       struct lro_head lro_free;
+       int lro_queued;
+       int lro_flushed;
+       int lro_bad_csum;
+       mxge_dma_t fw_stats_dma;
+       struct sysctl_oid *sysctl_tree;
+       struct sysctl_ctx_list sysctl_ctx;
+       char scratch[256];
+};
+
+struct mxge_softc {
+       struct arpcom arpcom;
+       struct ifnet* ifp;              /* points to arpcom.ac_if */
+       struct mxge_slice_state *ss;
+       int csum_flag;                  /* rx_csums?            */
+       int tx_boundary;                /* boundary transmits cannot cross*/
+       int lro_cnt;
+       bus_dma_tag_t   parent_dmat;
+       volatile uint8_t *sram;
+       int sram_size;
+       volatile uint32_t *irq_deassert;
+       mcp_cmd_response_t *cmd;
+       mxge_dma_t cmd_dma;
+       mxge_dma_t zeropad_dma;
+       struct pci_dev *pdev;
+       int legacy_irq;
+       int link_state;
+       unsigned int rdma_tags_available;
+       int intr_coal_delay;
+       volatile uint32_t *intr_coal_delay_ptr;
+       int wc;
+       int wake_queue;
+       int stop_queue;
+       int down_cnt;
+       int watchdog_resets;
+       int watchdog_countdown;
+       int pause;
+       struct resource *mem_res;
+       struct resource *irq_res;
+       struct resource **msix_irq_res;
+       struct resource *msix_table_res;
+       struct resource *msix_pba_res;
+       void *ih; 
+       void **msix_ih;
+       char *fw_name;
+       char eeprom_strings[MXGE_EEPROM_STRINGS_SIZE];
+       char fw_version[128];
+       int fw_ver_major;
+       int fw_ver_minor;
+       int fw_ver_tiny;
+       int adopted_rx_filter_bug;
+       device_t dev;
+       struct ifmedia media;
+       int read_dma;
+       int write_dma;
+       int read_write_dma;
+       int fw_multicast_support;
+       int link_width;
+       int max_mtu;
+       int tx_defrag;
+       int media_flags;
+       int need_media_probe;
+       int num_slices;
+       int rx_ring_size;
+       int dying;
+       mxge_dma_t dmabench_dma;
+       struct callout co_hdl;
+       struct sysctl_ctx_list sysctl_ctx;
+       struct sysctl_oid *sysctl_tree;
+       struct sysctl_oid *slice_sysctl_tree;
+       struct sysctl_ctx_list slice_sysctl_ctx;
+       char *mac_addr_string;
+       uint8_t mac_addr[6];            /* eeprom mac address */
+       char product_code_string[64];
+       char serial_number_string[64];
+};
+
+#define MXGE_PCI_VENDOR_MYRICOM        0x14c1
+#define MXGE_PCI_DEVICE_Z8E    0x0008
+#define MXGE_PCI_DEVICE_Z8E_9  0x0009
+#define MXGE_PCI_REV_Z8E       0
+#define MXGE_PCI_REV_Z8ES      1
+#define MXGE_XFP_COMPLIANCE_BYTE       131
+#define MXGE_SFP_COMPLIANCE_BYTE         3
+
+#define MXGE_HIGHPART_TO_U32(X) \
+       ((sizeof (X) == 8) ? ((uint32_t)((uint64_t)(X) >> 32)) : 0)
+#define MXGE_LOWPART_TO_U32(X) ((uint32_t)(X))
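+
+/*
+ * Usage sketch (illustrative only): firmware commands and DMA
+ * descriptors carry bus addresses as two 32-bit halves, e.g.
+ *
+ *	cmd.data0 = MXGE_LOWPART_TO_U32(dma->bus_addr);
+ *	cmd.data1 = MXGE_HIGHPART_TO_U32(dma->bus_addr);
+ *
+ * On platforms where bus_addr_t is 32 bits wide, the HIGHPART macro
+ * folds to the constant 0.
+ */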
+
+struct mxge_media_type
+{
+       int flag;
+       uint8_t bitmask;
+       char *name;
+};
+
+/* implement our own memory barriers, since bus_space_barrier
+   cannot handle write-combining regions */
+
+#if __FreeBSD_version < 800053
+
+#if defined (__GNUC__)
+  #if #cpu(i386) || defined __i386 || defined i386 || defined __i386__ || #cpu(x86_64) || defined __x86_64__
+    #define wmb()  __asm__ __volatile__ ("sfence;": : :"memory")
+  #elif #cpu(sparc64) || defined sparc64 || defined __sparcv9 
+    #define wmb()  __asm__ __volatile__ ("membar #MemIssue": : :"memory")
+  #elif #cpu(sparc) || defined sparc || defined __sparc__
+    #define wmb()  __asm__ __volatile__ ("stbar;": : :"memory")
+  #else
+    #define wmb()      /* XXX just to make this compile */
+  #endif
+#else
+  #error "unknown compiler"
+#endif
+
+#endif
+
+static inline void
+mxge_pio_copy(volatile void *to_v, void *from_v, size_t size)
+{
+  register volatile uintptr_t *to;
+  volatile uintptr_t *from;
+  size_t i;
+
+  to = (volatile uintptr_t *) to_v;
+  from = from_v;
+  for (i = (size / sizeof (uintptr_t)); i; i--) {
+         *to = *from;
+         to++;
+         from++;
+  }
+}
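+
+/*
+ * Ordering sketch (illustrative, not a verbatim excerpt): the copy
+ * above targets the write-combining window, so callers are expected
+ * to fence before the final doorbell write that hands a request to
+ * the NIC, along the lines of:
+ *
+ *	mxge_pio_copy(dst, req, cnt * sizeof(*req));
+ *	wmb();
+ */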
+
+void mxge_lro_flush(struct mxge_slice_state *ss, struct lro_entry *lro);
+int mxge_lro_rx(struct mxge_slice_state *ss, struct mbuf *m_head,
+               uint32_t csum);
+
+/*
+  This file uses Myri10GE driver indentation.
+
+  Local Variables:
+  c-file-style:"linux"
+  tab-width:8
+  End:
+*/
diff --git a/sys/dev/netif/mxge/mcp_gen_header.h b/sys/dev/netif/mxge/mcp_gen_header.h
new file mode 100644 (file)
index 0000000..58a09fa
--- /dev/null
@@ -0,0 +1,103 @@
+/*******************************************************************************
+
+Copyright (c) 2006-2007, Myricom Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+    this list of conditions and the following disclaimer.
+
+ 2. Neither the name of the Myricom Inc, nor the names of its
+    contributors may be used to endorse or promote products derived from
+    this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+$FreeBSD: src/sys/dev/mxge/mcp_gen_header.h,v 1.2 2007/07/12 16:04:55 gallatin Exp $
+***************************************************************************/
+
+#ifndef _mcp_gen_header_h
+#define _mcp_gen_header_h
+
+/* This file defines a standard header used as the first entry point
+   for exchanging information between the firmware (MCP) and the
+   driver.  The header structure can be anywhere in the mcp.  It will
+   usually be in the .data section, because some fields need to be
+   initialized at compile time.
+   The 32-bit word at offset MCP_HEADER_PTR_OFFSET in the mcp must
+   contain the location of the header.
+
+   Typically a MCP will start with the following:
+   .text
+     .space 52    ! to help catch MEMORY_INT errors
+     bt start     ! jump to real code
+     nop
+     .long _gen_mcp_header
+   
+   The source will have a definition like:
+
+   mcp_gen_header_t gen_mcp_header = {
+      .header_length = sizeof(mcp_gen_header_t),
+      .mcp_type = MCP_TYPE_XXX,
+      .version = "something $Id: mcp_gen_header.h,v 1.1 2005/12/23 02:10:44 gallatin Exp $",
+      .mcp_globals = (unsigned)&Globals
+   };
+*/
+
+
+#define MCP_HEADER_PTR_OFFSET  0x3c
+
+#define MCP_TYPE_MX 0x4d582020 /* "MX  " */
+#define MCP_TYPE_PCIE 0x70636965 /* "PCIE" pcie-only MCP */
+#define MCP_TYPE_ETH 0x45544820 /* "ETH " */
+#define MCP_TYPE_MCP0 0x4d435030 /* "MCP0" */
+
+
+typedef struct mcp_gen_header {
+  /* the first 4 fields are filled at compile time */
+  unsigned header_length;
+  unsigned mcp_type;
+  char version[128];
+  unsigned mcp_globals; /* pointer to mcp-type specific structure */
+
+  /* filled by the MCP at run-time */
+  unsigned sram_size;
+  unsigned string_specs;  /* either the original STRING_SPECS or a superset */
+  unsigned string_specs_len;
+
+  /* Fields above this comment are guaranteed to be present.
+
+     Fields below this comment are extensions added in later versions
+     of this struct; drivers should compare header_length against
+     offsetof(field) to check whether a given MCP implements them.
+
+     Never remove any field.  Keep everything naturally aligned.
+  */
+} mcp_gen_header_t;
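+
+/*
+ * Locating the header (illustrative sketch, not part of the ABI
+ * defined here): a driver reads the 32-bit word stored at
+ * MCP_HEADER_PTR_OFFSET in the firmware image (the LANai is
+ * big-endian, so little-endian hosts byte-swap it) and bounds-checks
+ * the result, roughly:
+ *
+ *	hdr_offset = ntohl(*(const uint32_t *)(image + MCP_HEADER_PTR_OFFSET));
+ *	if (hdr_offset + sizeof(struct mcp_gen_header) <= image_size)
+ *		hdr = (const struct mcp_gen_header *)(image + hdr_offset);
+ *
+ * "image" and "image_size" are assumed names for the loaded firmware
+ * blob and its length.
+ */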
+
+/* Macro to create a simple mcp header */
+#define MCP_GEN_HEADER_DECL(type, version_str, global_ptr)     \
+  struct mcp_gen_header mcp_gen_header = {                     \
+    sizeof (struct mcp_gen_header),                            \
+    (type),                                                    \
+    version_str,                                               \
+    (global_ptr),                                              \
+    SRAM_SIZE,                                                 \
+    (unsigned int) STRING_SPECS,                               \
+    256                                                                \
+  }
+
+
+#endif /* _mcp_gen_header_h */
diff --git a/sys/dev/netif/mxge/mxge_lro.c b/sys/dev/netif/mxge/mxge_lro.c
new file mode 100644 (file)
index 0000000..462164b
--- /dev/null
@@ -0,0 +1,355 @@
+/******************************************************************************
+
+Copyright (c) 2007-2008, Myricom Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+    this list of conditions and the following disclaimer.
+
+ 2. Neither the name of the Myricom Inc, nor the names of its
+    contributors may be used to endorse or promote products derived from
+    this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+***************************************************************************/
+
+#include <sys/cdefs.h>
+/*__FBSDID("$FreeBSD: src/sys/dev/mxge/mxge_lro.c,v 1.8 2009/06/23 17:42:06 gallatin Exp $");*/
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/endian.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/bus.h>
+
+#include <net/if.h>
+#include <net/ethernet.h>
+#include <net/if_media.h>
+
+#include <netinet/in_systm.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/tcp.h>
+
+#include <machine/bus.h>
+#include <machine/in_cksum.h>
+
+#include <dev/netif/mxge/mxge_mcp.h>
+#include <dev/netif/mxge/if_mxge_var.h>
+
+#include "opt_inet.h"
+
+#ifdef INET
+
+/* Assume len is a multiple of 4 */
+static uint16_t
+mxge_csum_generic(uint16_t *raw, int len)
+{
+       uint32_t csum;
+       csum = 0;
+       while (len > 0) {
+               csum += *raw;
+               raw++;
+               csum += *raw;
+               raw++;
+               len -= 4;
+       }
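+       /* fold the 32-bit accumulator to 16 bits; fold twice, since
+          the first fold can itself carry into the upper half */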
+       csum = (csum >> 16) + (csum & 0xffff);
+       csum = (csum >> 16) + (csum & 0xffff);
+       return (uint16_t)csum;
+}
+
+
+void
+mxge_lro_flush(struct mxge_slice_state *ss, struct lro_entry *lro)
+{
+       mxge_softc_t *mgp = ss->sc;
+       struct ifnet *ifp;
+       struct ip *ip;
+       struct tcphdr *tcp;
+       uint32_t *ts_ptr;
+       uint32_t tcplen, tcp_csum;
+
+       if (lro->append_cnt) {
+               /* incorporate the new len into the ip header and
+                * re-calculate the checksum */
+               ip = lro->ip;
+               ip->ip_len = htons(lro->len - ETHER_HDR_LEN);
+               ip->ip_sum = 0;
+               ip->ip_sum = 0xffff ^ 
+                       mxge_csum_generic((uint16_t*)ip,
+                                             sizeof (*ip));
+
+               lro->m_head->m_pkthdr.csum_flags = CSUM_IP_CHECKED |
+                       CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
+               lro->m_head->m_pkthdr.csum_data = 0xffff;
+               lro->m_head->m_pkthdr.len = lro->len;
+
+               /* incorporate the latest ack into the tcp header */
+               tcp = (struct tcphdr *) (ip + 1);
+               tcp->th_ack = lro->ack_seq;
+               tcp->th_win = lro->window;
+               /* incorporate latest timestamp into the tcp header */
+               if (lro->timestamp) {
+                       ts_ptr = (uint32_t *)(tcp + 1);
+                       ts_ptr[1] = htonl(lro->tsval);
+                       ts_ptr[2] = lro->tsecr;
+               }
+               /* 
+                * update checksum in tcp header by re-calculating the
+                * tcp pseudoheader checksum, and adding it to the checksum
+                * of the tcp payload data 
+                */
+               tcp->th_sum = 0;
+               tcplen = lro->len - sizeof(*ip) - ETHER_HDR_LEN;
+               tcp_csum = lro->data_csum;
+               tcp_csum += in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
+                                     htons(tcplen + IPPROTO_TCP));
+               tcp_csum += mxge_csum_generic((uint16_t*)tcp,
+                                                 tcp->th_off << 2);
+               tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16);
+               tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16);
+#if 0
+               IOLog("pseudo = 0x%x, generic = 0x%x, sum = %x\n", 
+                     in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
+                               htons(tcplen + IPPROTO_TCP)),
+                     mxge_csum_generic((uint16_t*)tcp,
+                                           tcp->th_off << 2),
+                     htons(0xffff ^ tcp_csum));
+#endif
+               tcp->th_sum = 0xffff ^ tcp_csum;
+       }
+       ifp = mgp->ifp;
+       (*ifp->if_input)(mgp->ifp, lro->m_head);
+       ss->lro_queued += lro->append_cnt + 1;
+       ss->lro_flushed++;
+       lro->m_head = NULL;
+       lro->timestamp = 0;
+       lro->append_cnt = 0;
+       SLIST_INSERT_HEAD(&ss->lro_free, lro, next);
+}
+
+int
+mxge_lro_rx(struct mxge_slice_state *ss, struct mbuf *m_head, uint32_t csum)
+{
+       struct ether_header *eh;
+       struct ip *ip;
+       struct tcphdr *tcp;
+       uint32_t *ts_ptr;
+       struct mbuf *m_nxt, *m_tail;
+       struct lro_entry *lro;
+       int hlen, ip_len, tcp_hdr_len, tcp_data_len, tot_len;
+       int opt_bytes, trim;
+       uint32_t seq, tmp_csum, device_mtu;
+
+       eh = mtod(m_head, struct ether_header *);
+       if (eh->ether_type != htons(ETHERTYPE_IP))
+               return 1;
+       ip = (struct ip *) (eh + 1);
+       if (ip->ip_p != IPPROTO_TCP)
+               return 1;
+       
+       /* ensure there are no options */
+       if ((ip->ip_hl << 2) != sizeof (*ip))
+               return -1;
+
+       /* .. and the packet is not fragmented */
+       if (ip->ip_off & htons(IP_MF|IP_OFFMASK))
+               return -1;
+
+       /* verify that the IP header checksum is correct */
+       tmp_csum = mxge_csum_generic((uint16_t *)ip, sizeof (*ip));
+       if (__predict_false((tmp_csum ^ 0xffff) != 0)) {
+               ss->lro_bad_csum++;
+               return -1;
+       }
+
+       /* find the TCP header */
+       tcp = (struct tcphdr *) (ip + 1);
+
+       /* ensure no bits set besides ack or psh */
+       if ((tcp->th_flags & ~(TH_ACK | TH_PUSH)) != 0)
+               return -1;
+
+       /* check for timestamps.  Since the only option we handle is
+          timestamps, we only have to handle the simple case of
+          aligned timestamps */
+
+       opt_bytes = (tcp->th_off << 2) - sizeof (*tcp);
+       tcp_hdr_len =  sizeof (*tcp) + opt_bytes;
+       ts_ptr = (uint32_t *)(tcp + 1);
+       if (opt_bytes != 0) {
+               if (__predict_false(opt_bytes != TCPOLEN_TSTAMP_APPA) ||
+                   (*ts_ptr !=  ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))
+                       return -1;
+       }
+
+       ip_len = ntohs(ip->ip_len);
+       tcp_data_len = ip_len - (tcp->th_off << 2) - sizeof (*ip);
+       
+
+       /* 
+        * If frame is padded beyond the end of the IP packet,
+        * then we must trim the extra bytes off the end.
+        */
+       tot_len = m_head->m_pkthdr.len;
+       trim = tot_len - (ip_len + ETHER_HDR_LEN);
+       if (trim != 0) {
+               if (trim < 0) {
+                       /* truncated packet */
+                       return -1;
+               }
+               m_adj(m_head, -trim);
+               tot_len = m_head->m_pkthdr.len;
+       }
+
+       m_nxt = m_head;
+       m_tail = NULL; /* -Wuninitialized */
+       while (m_nxt != NULL) {
+               m_tail = m_nxt;
+               m_nxt = m_tail->m_next;
+       }
+
+       hlen = ip_len + ETHER_HDR_LEN - tcp_data_len;
+       seq = ntohl(tcp->th_seq);
+
+       SLIST_FOREACH(lro, &ss->lro_active, next) {
+               if (lro->source_port == tcp->th_sport && 
+                   lro->dest_port == tcp->th_dport &&
+                   lro->source_ip == ip->ip_src.s_addr && 
+                   lro->dest_ip == ip->ip_dst.s_addr) {
+                       /* Try to append it */
+
+                       if (__predict_false(seq != lro->next_seq)) {
+                               /* out of order packet */
+                               SLIST_REMOVE(&ss->lro_active, lro,
+                                            lro_entry, next);
+                               mxge_lro_flush(ss, lro);
+                               return -1;
+                       }
+
+                       if (opt_bytes) {
+                               uint32_t tsval = ntohl(*(ts_ptr + 1));
+                               /* make sure timestamp values are increasing */
+                               if (__predict_false(lro->tsval > tsval || 
+                                            *(ts_ptr + 2) == 0)) {
+                                       return -1;
+                               }
+                               lro->tsval = tsval;
+                               lro->tsecr = *(ts_ptr + 2);
+                       }
+
+                       lro->next_seq += tcp_data_len;
+                       lro->ack_seq = tcp->th_ack;
+                       lro->window = tcp->th_win;
+                       lro->append_cnt++;
+                       if (tcp_data_len == 0) {
+                               m_freem(m_head);
+                               return 0;
+                       }
+                       /* subtract off the checksum of the tcp header
+                        * from the hardware checksum, and add it to the
+                        * stored tcp data checksum.  Byteswap the checksum
+                        * if the total length so far is odd.
+                        */
+                       tmp_csum = mxge_csum_generic((uint16_t*)tcp,
+                                                        tcp_hdr_len);
+                       csum = csum + (tmp_csum ^ 0xffff);
+                       csum = (csum & 0xffff) + (csum >> 16);
+                       csum = (csum & 0xffff) + (csum >> 16);
+                       if (lro->len & 0x1) {
+                               /* Odd number of bytes so far, flip bytes */
+                               csum = ((csum << 8) | (csum >> 8)) & 0xffff;
+                       }
+                       csum = csum + lro->data_csum;
+                       csum = (csum & 0xffff) + (csum >> 16);
+                       csum = (csum & 0xffff) + (csum >> 16);
+                       lro->data_csum = csum;
+
+                       lro->len += tcp_data_len;
+
+                       /* adjust mbuf so that m->m_data points to
+                          the first byte of the payload */
+                       m_adj(m_head, hlen);
+                       /* append mbuf chain */
+                       lro->m_tail->m_next = m_head;
+                       /* advance the last pointer */
+                       lro->m_tail = m_tail;
+                       /* flush packet if required */
+                       device_mtu = ss->sc->ifp->if_mtu;
+                       if (lro->len > (65535 - device_mtu)) {
+                               SLIST_REMOVE(&ss->lro_active, lro,
+                                            lro_entry, next);
+                               mxge_lro_flush(ss, lro);
+                       }
+                       return 0;
+               }
+       }
+
+       if (SLIST_EMPTY(&ss->lro_free))
+           return -1;
+
+       /* start a new chain */
+       lro = SLIST_FIRST(&ss->lro_free);
+       SLIST_REMOVE_HEAD(&ss->lro_free, next);
+       SLIST_INSERT_HEAD(&ss->lro_active, lro, next);
+       lro->source_port = tcp->th_sport;
+       lro->dest_port = tcp->th_dport;
+       lro->source_ip = ip->ip_src.s_addr;
+       lro->dest_ip = ip->ip_dst.s_addr;
+       lro->next_seq = seq + tcp_data_len;
+       lro->mss = tcp_data_len;
+       lro->ack_seq = tcp->th_ack;
+       lro->window = tcp->th_win;
+
+       /* save the checksum of just the TCP payload by
+        * subtracting off the checksum of the TCP header from
+        * the entire hardware checksum.
+        * Since the IP header checksum is correct, the checksum
+        * over the IP header is -0.  Subtracting -0 is unnecessary.
+        */
+       tmp_csum = mxge_csum_generic((uint16_t*)tcp, tcp_hdr_len);
+       csum = csum + (tmp_csum ^ 0xffff);
+       csum = (csum & 0xffff) + (csum >> 16);
+       csum = (csum & 0xffff) + (csum >> 16);
+       lro->data_csum = csum;
+       
+       lro->ip = ip;
+       /* record timestamp if it is present */
+       if (opt_bytes) {
+               lro->timestamp = 1;
+               lro->tsval = ntohl(*(ts_ptr + 1));
+               lro->tsecr = *(ts_ptr + 2);
+       }
+       lro->len = tot_len;
+       lro->m_head = m_head;
+       lro->m_tail = m_tail;
+       return 0;
+}
+
+#endif /* INET */
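+
+/*
+ * Caller contract (illustrative summary): the receive path offers
+ * each candidate packet and its hardware checksum to mxge_lro_rx().
+ * A zero return means the mbuf was consumed (aggregated into an LRO
+ * chain, or freed); a non-zero return means the caller still owns it
+ * and should hand it to the stack, e.g.
+ *
+ *	if (mxge_lro_rx(ss, m, csum) != 0)
+ *		(*ifp->if_input)(ifp, m);
+ */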
+/*
+  This file uses Myri10GE driver indentation.
+
+  Local Variables:
+  c-file-style:"linux"
+  tab-width:8
+  End:
+*/
diff --git a/sys/dev/netif/mxge/mxge_mcp.h b/sys/dev/netif/mxge/mxge_mcp.h
new file mode 100644 (file)
index 0000000..88a990b
--- /dev/null
@@ -0,0 +1,522 @@
+/*******************************************************************************
+
+Copyright (c) 2006-2009, Myricom Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+    this list of conditions and the following disclaimer.
+
+ 2. Neither the name of the Myricom Inc, nor the names of its
+    contributors may be used to endorse or promote products derived from
+    this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+$FreeBSD: src/sys/dev/mxge/mxge_mcp.h,v 1.11 2009/02/17 22:15:58 gallatin Exp $
+***************************************************************************/
+
+#ifndef _myri10ge_mcp_h
+#define _myri10ge_mcp_h
+
+#define MXGEFW_VERSION_MAJOR   1
+#define MXGEFW_VERSION_MINOR   4
+
+#if defined MXGEFW && !defined _stdint_h_
+typedef signed char          int8_t;
+typedef signed short        int16_t;
+typedef signed int          int32_t;
+typedef signed long long    int64_t;
+typedef unsigned char       uint8_t;
+typedef unsigned short     uint16_t;
+typedef unsigned int       uint32_t;
+typedef unsigned long long uint64_t;
+#endif
+
+/* 8 Bytes */
+struct mcp_dma_addr {
+  uint32_t high;
+  uint32_t low;
+};
+typedef struct mcp_dma_addr mcp_dma_addr_t;
+
+/* 4 Bytes */
+struct mcp_slot {
+  uint16_t checksum;
+  uint16_t length;
+};
+typedef struct mcp_slot mcp_slot_t;
+
+#ifdef MXGEFW_NDIS
+/* 8-byte descriptor, exclusively used by NDIS drivers. */
+struct mcp_slot_8 {
+  /* Place hash value at the top so it gets written before length.
+   * The driver polls length.
+   */
+  uint32_t hash;
+  uint16_t checksum;
+  uint16_t length;
+};
+typedef struct mcp_slot_8 mcp_slot_8_t;
+
+/* Two bits of length in mcp_slot are used to indicate hash type. */
+#define MXGEFW_RSS_HASH_NULL (0 << 14) /* bit 15:14 = 00 */
+#define MXGEFW_RSS_HASH_IPV4 (1 << 14) /* bit 15:14 = 01 */
+#define MXGEFW_RSS_HASH_TCP_IPV4 (2 << 14) /* bit 15:14 = 10 */
+#define MXGEFW_RSS_HASH_MASK (3 << 14) /* bit 15:14 = 11 */
+#endif
+
+/* 64 Bytes */
+struct mcp_cmd {
+  uint32_t cmd;
+  uint32_t data0;      /* will be low portion if data > 32 bits */
+  /* 8 */
+  uint32_t data1;      /* will be high portion if data > 32 bits */
+  uint32_t data2;      /* currently unused.. */
+  /* 16 */
+  struct mcp_dma_addr response_addr;
+  /* 24 */
+  uint8_t pad[40];
+};
+typedef struct mcp_cmd mcp_cmd_t;
+
+/* 8 Bytes */
+struct mcp_cmd_response {
+  uint32_t data;
+  uint32_t result;
+};
+typedef struct mcp_cmd_response mcp_cmd_response_t;
+
+
+
+/*
+   flags used in mcp_kreq_ether_send_t:
+
+   The SMALL flag is only needed in the first segment.  It is raised
+   for packets whose total length is 512 bytes or less.
+
+   The CKSUM flag must be set in all segments.
+
+   The PADDED flag is set if the packet needs to be padded, and it
+   must be set for all segments.
+
+   The MXGEFW_FLAGS_ALIGN_ODD flag must be set if the cumulative
+   length of all previous segments was odd.
+*/
+
+
+#define MXGEFW_FLAGS_SMALL      0x1
+#define MXGEFW_FLAGS_TSO_HDR    0x1
+#define MXGEFW_FLAGS_FIRST      0x2
+#define MXGEFW_FLAGS_ALIGN_ODD  0x4
+#define MXGEFW_FLAGS_CKSUM      0x8
+#define MXGEFW_FLAGS_TSO_LAST   0x8
+#define MXGEFW_FLAGS_NO_TSO     0x10
+#define MXGEFW_FLAGS_TSO_CHOP   0x10
+#define MXGEFW_FLAGS_TSO_PLD    0x20
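+
+/*
+ * Flag selection sketch for a non-TSO transmit (illustrative; the
+ * real logic lives in the driver's encap path, not in this header):
+ *
+ *	flags = MXGEFW_FLAGS_NO_TSO;
+ *	if (csum_offload_requested)
+ *		flags |= MXGEFW_FLAGS_CKSUM;
+ *	if (pkt_len < MXGEFW_SEND_SMALL_SIZE)
+ *		flags |= MXGEFW_FLAGS_SMALL;
+ *	first_seg->flags = flags | MXGEFW_FLAGS_FIRST;
+ *
+ * "csum_offload_requested", "pkt_len" and "first_seg" are assumed
+ * names used only for this sketch.
+ */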
+
+#define MXGEFW_SEND_SMALL_SIZE  1520
+#define MXGEFW_MAX_MTU          9400
+
+union mcp_pso_or_cumlen {
+  uint16_t pseudo_hdr_offset;
+  uint16_t cum_len;
+};
+typedef union mcp_pso_or_cumlen mcp_pso_or_cumlen_t;
+
+#define        MXGEFW_MAX_SEND_DESC 12
+#define MXGEFW_PAD         2
+
+/* 16 Bytes */
+struct mcp_kreq_ether_send {
+  uint32_t addr_high;
+  uint32_t addr_low;
+  uint16_t pseudo_hdr_offset;
+  uint16_t length;
+  uint8_t  pad;
+  uint8_t  rdma_count;
+  uint8_t  cksum_offset;       /* where to start computing cksum */
+  uint8_t  flags;              /* as defined above */
+};
+typedef struct mcp_kreq_ether_send mcp_kreq_ether_send_t;
+
+/* 8 Bytes */
+struct mcp_kreq_ether_recv {
+  uint32_t addr_high;
+  uint32_t addr_low;
+};
+typedef struct mcp_kreq_ether_recv mcp_kreq_ether_recv_t;
+
+
+/* Commands */
+
+#define        MXGEFW_BOOT_HANDOFF     0xfc0000
+#define        MXGEFW_BOOT_DUMMY_RDMA  0xfc01c0
+
+#define        MXGEFW_ETH_CMD          0xf80000
+#define        MXGEFW_ETH_SEND_4       0x200000
+#define        MXGEFW_ETH_SEND_1       0x240000
+#define        MXGEFW_ETH_SEND_2       0x280000
+#define        MXGEFW_ETH_SEND_3       0x2c0000
+#define        MXGEFW_ETH_RECV_SMALL   0x300000
+#define        MXGEFW_ETH_RECV_BIG     0x340000
+#define        MXGEFW_ETH_SEND_GO      0x380000
+#define        MXGEFW_ETH_SEND_STOP    0x3C0000
+
+#define        MXGEFW_ETH_SEND(n)              (0x200000 + (((n) & 0x03) * 0x40000))
+#define        MXGEFW_ETH_SEND_OFFSET(n)       (MXGEFW_ETH_SEND(n) - MXGEFW_ETH_SEND_4)
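+
+/*
+ * Worked example: MXGEFW_ETH_SEND(2) == 0x200000 + (2 & 0x03) * 0x40000
+ * == 0x280000, i.e. MXGEFW_ETH_SEND_2; the macro selects the per-slice
+ * send command address, wrapping modulo 4.
+ */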
+
+enum myri10ge_mcp_cmd_type {
+  MXGEFW_CMD_NONE = 0,
+  /* Reset the mcp; it is left in a safe state, waiting
+     for the driver to set all its parameters */
+  MXGEFW_CMD_RESET = 1,
+
+  /* get the version number of the current firmware
+     (it may also be available in the eeprom strings) */
+  MXGEFW_GET_MCP_VERSION = 2,
+
+
+  /* Parameters which must be set by the driver before it can
+     issue MXGEFW_CMD_ETHERNET_UP. They persist until the next
+     MXGEFW_CMD_RESET is issued */
+
+  MXGEFW_CMD_SET_INTRQ_DMA = 3,
+  /* data0 = LSW of the host address
+   * data1 = MSW of the host address
+   * data2 = slice number if multiple slices are used
+   */
+  
+  MXGEFW_CMD_SET_BIG_BUFFER_SIZE = 4,  /* in bytes, power of 2 */
+  MXGEFW_CMD_SET_SMALL_BUFFER_SIZE = 5,        /* in bytes */
+  
+
+  /* Parameters which refer to lanai SRAM addresses where the 
+     driver must issue PIO writes for various things */
+
+  MXGEFW_CMD_GET_SEND_OFFSET = 6,
+  MXGEFW_CMD_GET_SMALL_RX_OFFSET = 7,
+  MXGEFW_CMD_GET_BIG_RX_OFFSET = 8,
+  /* data0 = slice number if multiple slices are used */
+  
+  MXGEFW_CMD_GET_IRQ_ACK_OFFSET = 9,
+  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET = 10,
+
+  /* Parameters which refer to rings stored on the MCP,
+     and whose size is controlled by the mcp */
+
+  MXGEFW_CMD_GET_SEND_RING_SIZE = 11,  /* in bytes */
+  MXGEFW_CMD_GET_RX_RING_SIZE = 12,    /* in bytes */
+
+  /* Parameters which refer to rings stored in the host,
+     and whose size is controlled by the host.  Note that
+     all must be physically contiguous and must contain 
+     a power of 2 number of entries.  */
+
+  MXGEFW_CMD_SET_INTRQ_SIZE = 13,      /* in bytes */
+#define MXGEFW_CMD_SET_INTRQ_SIZE_FLAG_NO_STRICT_SIZE_CHECK  (1U << 31)
+
+  /* command to bring ethernet interface up.  Above parameters
+     (plus mtu & mac address) must have been exchanged prior
+     to issuing this command  */
+  MXGEFW_CMD_ETHERNET_UP = 14,
+
+  /* command to bring ethernet interface down.  No further sends
+     or receives may be processed until an MXGEFW_CMD_ETHERNET_UP
+     is issued, and all interrupt queues must be flushed prior
+     to ack'ing this command */
+
+  MXGEFW_CMD_ETHERNET_DOWN = 15,
+
+  /* commands the driver may issue live, without resetting
+     the nic.  Note that increasing the mtu "live" should
+     only be done if the driver has already supplied buffers
+     sufficiently large to handle the new mtu.  Decreasing
+     the mtu live is safe */
+
+  MXGEFW_CMD_SET_MTU = 16,
+  MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET = 17,  /* in microseconds */
+  MXGEFW_CMD_SET_STATS_INTERVAL = 18,   /* in microseconds */
+  MXGEFW_CMD_SET_STATS_DMA_OBSOLETE = 19, /* replaced by SET_STATS_DMA_V2 */
+
+  MXGEFW_ENABLE_PROMISC = 20,
+  MXGEFW_DISABLE_PROMISC = 21,
+  MXGEFW_SET_MAC_ADDRESS = 22,
+
+  MXGEFW_ENABLE_FLOW_CONTROL = 23,
+  MXGEFW_DISABLE_FLOW_CONTROL = 24,
+
+  /* do a DMA test
+     data0,data1 = DMA address
+     data2       = RDMA length (MSH), WDMA length (LSH)
+     command return data = repetitions (MSH), 0.5-ms ticks (LSH)
+  */
+  MXGEFW_DMA_TEST = 25,
+
+  MXGEFW_ENABLE_ALLMULTI = 26,
+  MXGEFW_DISABLE_ALLMULTI = 27,
+
+  /* returns MXGEFW_CMD_ERROR_MULTICAST
+     if there is no room in the cache
+     data0,MSH(data1) = multicast group address */
+  MXGEFW_JOIN_MULTICAST_GROUP = 28,
+  /* returns MXGEFW_CMD_ERROR_MULTICAST
+     if the address is not in the cache,
+     or is equal to FF-FF-FF-FF-FF-FF
+     data0,MSH(data1) = multicast group address */
+  MXGEFW_LEAVE_MULTICAST_GROUP = 29,
+  MXGEFW_LEAVE_ALL_MULTICAST_GROUPS = 30,
+
+  MXGEFW_CMD_SET_STATS_DMA_V2 = 31,
+  /* data0, data1 = bus addr,
+   * data2 = sizeof(struct mcp_irq_data) from driver point of view, allows
+   * adding new stuff to mcp_irq_data without changing the ABI
+   *
+   * If multiple slices are used, data2 contains both the size of the
+   * structure (in the lower 16 bits) and the slice number
+   * (in the upper 16 bits).
+   */
+
+  MXGEFW_CMD_UNALIGNED_TEST = 32,
+  /* same as DMA_TEST (same args), but aborts with UNALIGNED on an
+     unaligned chipset */
+
+  MXGEFW_CMD_UNALIGNED_STATUS = 33,
+  /* return data = boolean, true if the chipset is known to be unaligned */
+
+  MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS = 34,
+  /* data0 = number of big buffers to use.  It must be 0 or a power of 2.
+   * 0 indicates that the NIC consumes as many buffers as are required
+   * for each packet.  This is the default behavior.
+   * A power of 2 number indicates that the NIC always uses the specified
+   * number of buffers for each big receive packet.
+   * It is up to the driver to ensure that this value is big enough for
+   * the NIC to be able to receive maximum-sized packets.
+   */
+
+  MXGEFW_CMD_GET_MAX_RSS_QUEUES = 35,
+  MXGEFW_CMD_ENABLE_RSS_QUEUES = 36,
+  /* data0 = number of slices n (0, 1, ..., n-1) to enable
+   * data1 = interrupt mode | use of multiple transmit queues.
+   * 0=share one INTx/MSI.
+   * 1=use one MSI-X per queue.
+   * If all queues share one interrupt, the driver must have set
+   * RSS_SHARED_INTERRUPT_DMA before enabling queues.
+   * 2=enable both receive and send queues.
+   * Without this bit set, only one send queue (slice 0's send queue)
+   * is enabled.  The receive queues are always enabled.
+   */
+#define MXGEFW_SLICE_INTR_MODE_SHARED          0x0
+#define MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE   0x1
+#define MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES 0x2
+  
+  MXGEFW_CMD_GET_RSS_SHARED_INTERRUPT_MASK_OFFSET = 37,
+  MXGEFW_CMD_SET_RSS_SHARED_INTERRUPT_DMA = 38,
+  /* data0, data1 = bus address lsw, msw */
+  MXGEFW_CMD_GET_RSS_TABLE_OFFSET = 39,
+  /* get the offset of the indirection table */
+  MXGEFW_CMD_SET_RSS_TABLE_SIZE = 40,
+  /* set the size of the indirection table */
+  MXGEFW_CMD_GET_RSS_KEY_OFFSET = 41,
+  /* get the offset of the secret key */
+  MXGEFW_CMD_RSS_KEY_UPDATED = 42,
+  /* tell nic that the secret key's been updated */
+  MXGEFW_CMD_SET_RSS_ENABLE = 43,
+  /* data0 = enable/disable rss
+   * 0: disable rss.  nic does not distribute receive packets.
+   * 1: enable rss.  nic distributes receive packets among queues.
+   * data1 = hash type
+   * 1: IPV4            (required by RSS)
+   * 2: TCP_IPV4        (required by RSS)
+   * 3: IPV4 | TCP_IPV4 (required by RSS)
+   * 4: source port
+   * 5: source port + destination port
+   */
+#define MXGEFW_RSS_HASH_TYPE_IPV4      0x1
+#define MXGEFW_RSS_HASH_TYPE_TCP_IPV4  0x2
+#define MXGEFW_RSS_HASH_TYPE_SRC_PORT  0x4
+#define MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT 0x5
+#define MXGEFW_RSS_HASH_TYPE_MAX 0x5
+  
+  MXGEFW_CMD_GET_MAX_TSO6_HDR_SIZE = 44,
+  /* Return data = the max. size of the entire headers of an IPv6 TSO packet.
+   * If the header size of an IPv6 TSO packet is larger than the specified
+   * value, then the driver must not use TSO.
+   * This size restriction only applies to IPv6 TSO.
+   * For IPv4 TSO, the maximum size of the headers is fixed, and the NIC
+   * always has enough header buffer to store maximum-sized headers.
+   */
+  
+  MXGEFW_CMD_SET_TSO_MODE = 45,
+  /* data0 = TSO mode.
+   * 0: Linux/FreeBSD style (NIC default)
+   * 1: NDIS/NetBSD style
+   */
+#define MXGEFW_TSO_MODE_LINUX  0
+#define MXGEFW_TSO_MODE_NDIS   1
+
+  MXGEFW_CMD_MDIO_READ = 46,
+  /* data0 = dev_addr (PMA/PMD or PCS ...), data1 = register/addr */
+  MXGEFW_CMD_MDIO_WRITE = 47,
+  /* data0 = dev_addr,  data1 = register/addr, data2 = value  */
+
+  MXGEFW_CMD_I2C_READ = 48,
+  /* Start fetching a fresh copy of one byte, or of the whole module
+   * i2c table; the data obtained is cached inside the xaui-xfi chip:
+   *   data0 :  0 => get one byte, 1=> get 256 bytes
+   *   data1 :  If data0 == 0: location to refresh
+   *               bit 7:0  register location
+   *               bit 8:15 is the i2c slave addr (0 is interpreted as 0xA1)
+   *               bit 23:16 is the i2c bus number (for multi-port NICs)
+   *            If data0 == 1: unused
+   * The operation might take ~1ms for a single byte or ~65ms when refreshing all 256 bytes
+   * During the i2c operation,  MXGEFW_CMD_I2C_READ or MXGEFW_CMD_I2C_BYTE attempts
+   *  will return MXGEFW_CMD_ERROR_BUSY
+   */
+  MXGEFW_CMD_I2C_BYTE = 49,
+  /* Return the last obtained copy of a given byte in the xfp i2c table
+   * (copy cached during the last relevant MXGEFW_CMD_I2C_READ)
+   *   data0 : index of the desired table entry
+   *  Return data = the byte stored at the requested index in the table
+   */
+
+  MXGEFW_CMD_GET_VPUMP_OFFSET = 50,
+  /* Return data = NIC memory offset of mcp_vpump_public_global */
+  MXGEFW_CMD_RESET_VPUMP = 51,
+  /* Resets the VPUMP state */
+
+  MXGEFW_CMD_SET_RSS_MCP_SLOT_TYPE = 52,
+  /* data0 = mcp_slot type to use.
+   * 0 = the default 4B mcp_slot
+   * 1 = 8B mcp_slot_8
+   */
+#define MXGEFW_RSS_MCP_SLOT_TYPE_MIN        0
+#define MXGEFW_RSS_MCP_SLOT_TYPE_WITH_HASH  1
+
+  MXGEFW_CMD_SET_THROTTLE_FACTOR = 53,
+  /* set the throttle factor for ethp_z8e
+     data0 = throttle_factor
+     throttle_factor = 256 * pcie-raw-speed / tx_speed
+     tx_speed = 256 * pcie-raw-speed / throttle_factor
+
+     For PCI-E x8: pcie-raw-speed == 16Gb/s
+     For PCI-E x4: pcie-raw-speed == 8Gb/s
+
+     ex1: throttle_factor == 0x1a0 (416), tx_speed == 1.23GB/s == 9.846 Gb/s
+     ex2: throttle_factor == 0x200 (512), tx_speed == 1.0GB/s == 8 Gb/s
+
+     with tx_boundary == 2048, max-throttle-factor == 8191 => min-speed == 500Mb/s
+     with tx_boundary == 4096, max-throttle-factor == 4095 => min-speed == 1Gb/s
+  */
+  
+  MXGEFW_CMD_VPUMP_UP = 54,
+  /* Allocates VPump Connection, Send Request and Zero copy buffer address tables */
+  MXGEFW_CMD_GET_VPUMP_CLK = 55,
+  /* Get the lanai clock */
+
+  MXGEFW_CMD_GET_DCA_OFFSET = 56,
+  /* offset of dca control for WDMAs */
+
+  /* VMWare NetQueue commands */
+  MXGEFW_CMD_NETQ_GET_FILTERS_PER_QUEUE = 57,
+  MXGEFW_CMD_NETQ_ADD_FILTER = 58,
+  /* data0 = filter_id << 16 | queue << 8 | type */
+  /* data1 = MS4 of MAC Addr */
+  /* data2 = LS2_MAC << 16 | VLAN_tag */
+  MXGEFW_CMD_NETQ_DEL_FILTER = 59,
+  /* data0 = filter_id */
+  MXGEFW_CMD_NETQ_QUERY1 = 60,
+  MXGEFW_CMD_NETQ_QUERY2 = 61,
+  MXGEFW_CMD_NETQ_QUERY3 = 62,
+  MXGEFW_CMD_NETQ_QUERY4 = 63,
+
+  MXGEFW_CMD_RELAX_RXBUFFER_ALIGNMENT = 64,
+  /* When set, small receive buffers can cross page boundaries.
+   * Both small and big receive buffers may start at any address.
+   * This option has performance implications, so use with caution.
+   */
+};
+typedef enum myri10ge_mcp_cmd_type myri10ge_mcp_cmd_type_t;
+
+
+enum myri10ge_mcp_cmd_status {
+  MXGEFW_CMD_OK = 0,
+  MXGEFW_CMD_UNKNOWN = 1,
+  MXGEFW_CMD_ERROR_RANGE = 2,
+  MXGEFW_CMD_ERROR_BUSY = 3,
+  MXGEFW_CMD_ERROR_EMPTY = 4,
+  MXGEFW_CMD_ERROR_CLOSED = 5,
+  MXGEFW_CMD_ERROR_HASH_ERROR = 6,
+  MXGEFW_CMD_ERROR_BAD_PORT = 7,
+  MXGEFW_CMD_ERROR_RESOURCES = 8,
+  MXGEFW_CMD_ERROR_MULTICAST = 9,
+  MXGEFW_CMD_ERROR_UNALIGNED = 10,
+  MXGEFW_CMD_ERROR_NO_MDIO = 11,
+  MXGEFW_CMD_ERROR_I2C_FAILURE = 12,
+  MXGEFW_CMD_ERROR_I2C_ABSENT = 13,
+  MXGEFW_CMD_ERROR_BAD_PCIE_LINK = 14
+};
+typedef enum myri10ge_mcp_cmd_status myri10ge_mcp_cmd_status_t;
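+
+/*
+ * Command round-trip sketch (illustrative; mxge_send_cmd() and
+ * mxge_cmd_t live on the driver side, in if_mxge.c / if_mxge_var.h,
+ * not in this header):
+ *
+ *	mxge_cmd_t cmd;
+ *	int status;
+ *
+ *	cmd.data0 = MXGE_LOWPART_TO_U32(dma->bus_addr);
+ *	cmd.data1 = MXGE_HIGHPART_TO_U32(dma->bus_addr);
+ *	cmd.data2 = sizeof(struct mcp_irq_data);
+ *	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
+ *	if (status != MXGEFW_CMD_OK)
+ *		... map the myri10ge_mcp_cmd_status value to an errno ...
+ */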
+
+
+#define MXGEFW_OLD_IRQ_DATA_LEN 40
+
+struct mcp_irq_data {
+  /* add new counters at the beginning */
+  uint32_t future_use[1];
+  uint32_t dropped_pause;
+  uint32_t dropped_unicast_filtered;
+  uint32_t dropped_bad_crc32;
+  uint32_t dropped_bad_phy;
+  uint32_t dropped_multicast_filtered;
+/* 40 Bytes */
+  uint32_t send_done_count;
+
+#define MXGEFW_LINK_DOWN 0
+#define MXGEFW_LINK_UP 1
+#define MXGEFW_LINK_MYRINET 2
+#define MXGEFW_LINK_UNKNOWN 3
+  uint32_t link_up;
+  uint32_t dropped_link_overflow;
+  uint32_t dropped_link_error_or_filtered;
+  uint32_t dropped_runt;
+  uint32_t dropped_overrun;
+  uint32_t dropped_no_small_buffer;
+  uint32_t dropped_no_big_buffer;
+  uint32_t rdma_tags_available;
+
+  uint8_t tx_stopped;
+  uint8_t link_down;
+  uint8_t stats_updated;
+  uint8_t valid;
+};
+typedef struct mcp_irq_data mcp_irq_data_t;
+
+#ifdef MXGEFW_NDIS
+/* Exclusively used by NDIS drivers */
+struct mcp_rss_shared_interrupt {
+  uint8_t pad[2];
+  uint8_t queue;
+  uint8_t valid;
+};
+#endif
+
+/* definitions for NETQ filter type */
+#define MXGEFW_NETQ_FILTERTYPE_NONE 0
+#define MXGEFW_NETQ_FILTERTYPE_MACADDR 1
+#define MXGEFW_NETQ_FILTERTYPE_VLAN 2
+#define MXGEFW_NETQ_FILTERTYPE_VLANMACADDR 3
+
+#endif /* _myri10ge_mcp_h */
index 13b55f0..bdc13a2 100644 (file)
  * The maximum packet length.
  */
 #define        ETHER_MAX_LEN           1518
-
-/*
- * A macro to validate a length with
- */
-#define        ETHER_IS_VALID_LEN(foo) \
-       ((foo) >= ETHER_MIN_LEN && (foo) <= ETHER_MAX_LEN)
+#define        ETHER_MAX_LEN_JUMBO     9018    /* max jumbo frame len, including CRC */
 
 /*
  * Ethernet CRC32 polynomials (big- and little-endian verions).
@@ -353,6 +348,7 @@ extern const uint8_t        etherbroadcastaddr[ETHER_ADDR_LEN];
 
 #define        ETHERMTU        (ETHER_MAX_LEN-ETHER_HDR_LEN-ETHER_CRC_LEN)
 #define        ETHERMIN        (ETHER_MIN_LEN-ETHER_HDR_LEN-ETHER_CRC_LEN)
+#define        ETHERMTU_JUMBO  (ETHER_MAX_LEN_JUMBO - ETHER_HDR_LEN - ETHER_CRC_LEN)
 
 #ifdef _KERNEL
 
index 7830c2d..f32ca0d 100644 (file)
@@ -162,6 +162,7 @@ struct if_data {
 #define IFCAP_VLAN_HWTAGGING   0x0010  /* hardware VLAN tag support */
 #define IFCAP_JUMBO_MTU                0x0020  /* 9000 byte MTU support */
 #define IFCAP_RSS              0x0040  /* Receive Side Scaling for IPv4 */
+#define IFCAP_VLAN_HWCSUM      0x0080  /* can do IFCAP_HWCSUM on VLANs */
 
 #define IFCAP_HWCSUM           (IFCAP_RXCSUM | IFCAP_TXCSUM)
 
index baebed2..6f23989 100644 (file)
@@ -152,6 +152,15 @@ int        ifmedia_baudrate(int);
 #define IFM_1000_CX    16              /* 1000BaseCX 150ohm STP */
 #define IFM_1000_T     17              /* 1000BaseT 4 pair cat 5 */
 #define IFM_HPNA_1     18              /* HomePNA media for ethernet frames */
+#define        IFM_10G_LR      19              /* 10GBase-LR 1310nm Single-mode */
+#define        IFM_10G_SR      20              /* 10GBase-SR 850nm Multi-mode */
+#define        IFM_10G_CX4     21              /* 10GBase CX4 copper */
+#define        IFM_2500_SX     22              /* 2500BaseSX - multi-mode fiber */
+#define        IFM_10G_TWINAX  23              /* 10GBase Twinax copper */
+#define        IFM_10G_TWINAX_LONG 24          /* 10GBase Twinax Long copper */
+#define        IFM_10G_LRM     25              /* 10GBase-LRM 850nm Multi-mode */
+#define        IFM_UNKNOWN     26              /* media types not defined yet */
+#define        IFM_10G_T       27              /* 10GBase-T - RJ45 */
 
 #define        IFM_ETH_MASTER  0x00000100      /* master mode (1000baseT) */
 #define        IFM_ETH_RXPAUSE 0x00000200      /* receive PAUSE frames */
index fd1689a..97b09d8 100644 (file)
 #define ctodb(db)                      /* calculates pages to devblks */ \
        ((db) << (PAGE_SHIFT - DEV_BSHIFT))
 
-
+#define MJUMPAGESIZE   PAGE_SIZE       /* jumbo cluster 4k */
+#define MJUM9BYTES     (9 * 1024)      /* jumbo cluster 9k */
+#define MJUM16BYTES    (16 * 1024)     /* jumbo cluster 16k */
 /*
  * Make this available for most of the kernel.  There were too many
  * things that included sys/systm.h just for panic().