From b42386ee03a4e688c864ba8d7094064c63d93dce Mon Sep 17 00:00:00 2001 From: Sepherosa Ziehau Date: Wed, 27 Feb 2013 21:53:37 +0800 Subject: [PATCH] bce: Implement multiple TX/RX rings and MSI-X support for 5709/5716 5709/5716 support 9 RX rings, 9 TX rings and 9 MSI-X vectors; each MSI-X vector has its own interrupt moderation parameters. (N + 1) RX rings, N TX rings and (N + 1) MSI-X vectors will be enabled; N is ncpus2 if ncpus2 is less than 8, else N is 8. The extra RX ring is enabled mainly because the first RX ring is only used for packets whose RSS hash could not be calculated, so it is actually _not_ involved in RSS. This extra RX ring is used for the packets whose masked RSS hash equals 0. An extra MSI-X vector is allocated for this extra RX ring; there is no corresponding TX ring for this extra RX ring. In polling(4), this extra RX ring is polled along with the first RX ring in the first RX polling handler, in which the packets whose RSS hash equals 0 are processed. Hardware-provided RSS hash and packet type are not utilized yet; they will be supported in upcoming commits. Related hardware registers and the hardware initialization order are inferred from Linux's bnx2 (*); NetXtremeII-PG203-R.pdf provided on Broadcom's website does not contain enough information for multiple rings and MSI-X to function. (*) Unlike Linux's bnx2, which limits the number of RX rings to 8, DragonFly limits the number of RX rings to 9, and 9 RX rings actually work quite well. --- sys/conf/options | 3 +- sys/config/LINT | 3 +- sys/config/LINT64 | 3 +- sys/dev/netif/bce/Makefile | 6 +- sys/dev/netif/bce/if_bce.c | 915 ++++++++++++++++++++++++++++++---- sys/dev/netif/bce/if_bcereg.h | 144 +++++- 6 files changed, 965 insertions(+), 109 deletions(-) diff --git a/sys/conf/options b/sys/conf/options index e1d3639f44..f32f390fc0 100644 --- a/sys/conf/options +++ b/sys/conf/options @@ -612,7 +612,8 @@ ATH_TXBUF opt_ath.h AH_SUPPORT_AR5416 opt_ah.h # bce driver -BCE_DEBUG opt_bce.h +BCE_RSS_DEBUG opt_bce.h +BCE_TSS_DEBUG opt_bce.h # bnx driver BNX_TSO_DEBUG opt_bnx.h diff --git a/sys/config/LINT b/sys/config/LINT index 673cf98d03..12f1b95dbc 100644 --- a/sys/config/LINT +++ b/sys/config/LINT @@ -2460,7 +2460,8 @@ options DEBUG options DEBUG_CRIT_SECTIONS options DEBUG_INTERRUPTS #options DISABLE_PSE -options BCE_DEBUG +options BCE_RSS_DEBUG +options BCE_TSS_DEBUG options BNX_TSO_DEBUG options EMX_RSS_DEBUG options EMX_TSO_DEBUG diff --git a/sys/config/LINT64 b/sys/config/LINT64 index 2cff8a418e..592caef32c 100644 --- a/sys/config/LINT64 +++ b/sys/config/LINT64 @@ -2239,7 +2239,8 @@ options DEBUG options DEBUG_CRIT_SECTIONS options DEBUG_INTERRUPTS #options DISABLE_PSE -options BCE_DEBUG +options BCE_RSS_DEBUG +options BCE_TSS_DEBUG options BNX_TSO_DEBUG options EMX_RSS_DEBUG options EMX_TSO_DEBUG diff --git a/sys/dev/netif/bce/Makefile b/sys/dev/netif/bce/Makefile index 9af5c5964d..5331248457 100644 --- a/sys/dev/netif/bce/Makefile +++ b/sys/dev/netif/bce/Makefile @@ -5,12 +5,14 @@ SRCS= if_bce.c SRCS+= miibus_if.h device_if.h bus_if.h pci_if.h SRCS+= opt_bce.h opt_ifpoll.h -#ifndef BUILDING_WITH_KERNEL +.ifndef BUILDING_WITH_KERNEL + opt_ifpoll.h: echo '#define IFPOLL_ENABLE 1' > ${.OBJDIR}/${.TARGET} opt_bce.h: touch ${.OBJDIR}/${.TARGET} -#endif + +.endif .include <bsd.kmod.mk> diff --git a/sys/dev/netif/bce/if_bce.c b/sys/dev/netif/bce/if_bce.c index 4fac572fc8..06627afbd0 100644 --- a/sys/dev/netif/bce/if_bce.c +++ b/sys/dev/netif/bce/if_bce.c @@ -46,6 +46,29 @@ * BCM5708S A0, B0 * BCM5709C A0, B0, B1 * BCM5709S A0, A1,
B0, B1, B2, C0 + * + * + * Note about MSI-X on 5709/5716: + * - 9 MSI-X vectors are supported. + * - The association between MSI-X vectors, RX/TX rings and + * status blocks is fixed: + * o The first RX ring and the first TX ring use the first + * status block. + * o The first MSI-X vector is associated with the first + * status block. + * o The second RX ring and the second TX ring use the second + * status block. + * o The second MSI-X vector is associated with the second + * status block. + * ... + * and so forth. + * - Status blocks must reside in physically contiguous memory + * and each status block consumes 128 bytes. In addition to + * this, the memory for the status blocks is aligned to 128 bytes + * in this driver. (see bce_dma_alloc() and HC_CONFIG) + * - Each status block has its own coalesce parameters, which also + * serve as the related MSI-X vector's interrupt moderation + * parameters. (see bce_coal_change()) */ #include "opt_bce.h" @@ -77,6 +100,8 @@ #include #include #include +#include +#include #include #include @@ -94,6 +119,16 @@ #define BCE_MSI_CKINTVL ((10 * hz) / 1000) /* 10ms */ +#ifdef BCE_RSS_DEBUG +#define BCE_RSS_DPRINTF(sc, lvl, fmt, ...) \ +do { \ + if (sc->rss_debug >= lvl) \ + if_printf(&sc->arpcom.ac_if, fmt, __VA_ARGS__); \ +} while (0) +#else /* !BCE_RSS_DEBUG */ +#define BCE_RSS_DPRINTF(sc, lvl, fmt, ...) ((void)0) +#endif /* BCE_RSS_DEBUG */ + /****************************************************************************/ /* PCI Device ID Table */ /* */ @@ -331,6 +366,8 @@ static void bce_init_tpat_cpu(struct bce_softc *); static void bce_init_cp_cpu(struct bce_softc *); static void bce_init_com_cpu(struct bce_softc *); static void bce_init_cpus(struct bce_softc *); +static void bce_setup_msix_table(struct bce_softc *); +static void bce_init_rss(struct bce_softc *); static void bce_stop(struct bce_softc *); static int bce_reset(struct bce_softc *, uint32_t); @@ -345,10 +382,20 @@ static void bce_get_mac_addr(struct bce_softc *); static void bce_set_mac_addr(struct bce_softc *); static void bce_set_rx_mode(struct bce_softc *); static void bce_coal_change(struct bce_softc *); +static void bce_npoll_coal_change(struct bce_softc *); static void bce_setup_serialize(struct bce_softc *); static void bce_serialize_skipmain(struct bce_softc *); static void bce_deserialize_skipmain(struct bce_softc *); static void bce_set_timer_cpuid(struct bce_softc *, boolean_t); +static int bce_alloc_intr(struct bce_softc *); +static void bce_free_intr(struct bce_softc *); +static void bce_try_alloc_msix(struct bce_softc *); +static void bce_free_msix(struct bce_softc *, boolean_t); +static void bce_setup_ring_cnt(struct bce_softc *); +static int bce_setup_intr(struct bce_softc *); +static void bce_teardown_intr(struct bce_softc *); +static int bce_setup_msix(struct bce_softc *); +static void bce_teardown_msix(struct bce_softc *, int); static int bce_create_tx_ring(struct bce_tx_ring *); static void bce_destroy_tx_ring(struct bce_tx_ring *); @@ -381,6 +428,7 @@ static void bce_npoll(struct ifnet *, struct ifpoll_info *); static void bce_npoll_rx(struct ifnet *, void *, int); static void bce_npoll_tx(struct ifnet *, void *, int); static void bce_npoll_status(struct ifnet *); +static void bce_npoll_rx_pack(struct ifnet *, void *, int); #endif static void bce_serialize(struct ifnet *, enum ifnet_serialize); static void bce_deserialize(struct ifnet *, enum ifnet_serialize); @@ -394,6 +442,8 @@ static void bce_intr(struct bce_softc *); static void bce_intr_legacy(void *); static void
bce_intr_msi(void *); static void bce_intr_msi_oneshot(void *); +static void bce_intr_msix_rxtx(void *); +static void bce_intr_msix_rx(void *); static void bce_tx_intr(struct bce_tx_ring *, uint16_t); static void bce_rx_intr(struct bce_rx_ring *, int, uint16_t); static void bce_phy_intr(struct bce_softc *); @@ -441,10 +491,14 @@ static uint32_t bce_rx_ticks = 150; /* bcm: 18 */ static int bce_tx_wreg = 8; static int bce_msi_enable = 1; +static int bce_msix_enable = 1; static int bce_rx_pages = RX_PAGES_DEFAULT; static int bce_tx_pages = TX_PAGES_DEFAULT; +static int bce_rx_rings = 0; /* auto */ +static int bce_tx_rings = 0; /* auto */ + TUNABLE_INT("hw.bce.tx_bds_int", &bce_tx_bds_int); TUNABLE_INT("hw.bce.tx_bds", &bce_tx_bds); TUNABLE_INT("hw.bce.tx_ticks_int", &bce_tx_ticks_int); @@ -454,9 +508,12 @@ TUNABLE_INT("hw.bce.rx_bds", &bce_rx_bds); TUNABLE_INT("hw.bce.rx_ticks_int", &bce_rx_ticks_int); TUNABLE_INT("hw.bce.rx_ticks", &bce_rx_ticks); TUNABLE_INT("hw.bce.msi.enable", &bce_msi_enable); +TUNABLE_INT("hw.bce.msix.enable", &bce_msix_enable); TUNABLE_INT("hw.bce.rx_pages", &bce_rx_pages); TUNABLE_INT("hw.bce.tx_pages", &bce_tx_pages); TUNABLE_INT("hw.bce.tx_wreg", &bce_tx_wreg); +TUNABLE_INT("hw.bce.tx_rings", &bce_tx_rings); +TUNABLE_INT("hw.bce.rx_rings", &bce_rx_rings); /****************************************************************************/ /* DragonFly device dispatch table. */ @@ -640,8 +697,6 @@ bce_attach(device_t dev) struct bce_softc *sc = device_get_softc(dev); struct ifnet *ifp = &sc->arpcom.ac_if; uint32_t val; - u_int irq_flags; - void (*irq_handle)(void *); int rid, rc = 0; int i, j; struct mii_probe_args mii_args; @@ -654,6 +709,12 @@ bce_attach(device_t dev) if_initname(ifp, device_get_name(dev), device_get_unit(dev)); lwkt_serialize_init(&sc->main_serialize); + for (i = 0; i < BCE_MSIX_MAX; ++i) { + struct bce_msix_data *msix = &sc->bce_msix[i]; + + msix->msix_cpuid = -1; + msix->msix_rid = -1; + } pci_enable_busmaster(dev); @@ -877,8 +938,7 @@ bce_attach(device_t dev) bce_get_media(sc); /* Find out RX/TX ring count */ - sc->rx_ring_cnt = 1; /* XXX */ - sc->tx_ring_cnt = 1; /* XXX */ + bce_setup_ring_cnt(sc); /* Allocate DMA memory resources. */ rc = bce_dma_alloc(sc); @@ -891,13 +951,13 @@ bce_attach(device_t dev) /* * NPOLLING RX/TX CPU offset */ - if (sc->rx_ring_cnt == ncpus2) { + if (sc->rx_ring_cnt2 == ncpus2) { offset = 0; } else { - offset_def = (sc->rx_ring_cnt * device_get_unit(dev)) % ncpus2; + offset_def = (sc->rx_ring_cnt2 * device_get_unit(dev)) % ncpus2; offset = device_getenv_int(dev, "npoll.offset", offset_def); if (offset >= ncpus2 || - offset % sc->rx_ring_cnt != 0) { + offset % sc->rx_ring_cnt2 != 0) { device_printf(dev, "invalid npoll.offset %d, use %d\n", offset, offset_def); offset = offset_def; @@ -907,31 +967,9 @@ bce_attach(device_t dev) #endif /* Allocate PCI IRQ resources. 
*/ - sc->bce_irq_type = pci_alloc_1intr(dev, bce_msi_enable, - &sc->bce_irq_rid, &irq_flags); - - sc->bce_res_irq = bus_alloc_resource_any(dev, SYS_RES_IRQ, - &sc->bce_irq_rid, irq_flags); - if (sc->bce_res_irq == NULL) { - device_printf(dev, "PCI map interrupt failed\n"); - rc = ENXIO; + rc = bce_alloc_intr(sc); + if (rc != 0) goto fail; - } - - if (sc->bce_irq_type == PCI_INTR_TYPE_LEGACY) { - irq_handle = bce_intr_legacy; - } else if (sc->bce_irq_type == PCI_INTR_TYPE_MSI) { - if (BCE_CHIP_NUM(sc) == BCE_CHIP_NUM_5709) { - irq_handle = bce_intr_msi_oneshot; - sc->bce_flags |= BCE_ONESHOT_MSI_FLAG; - } else { - irq_handle = bce_intr_msi; - sc->bce_flags |= BCE_CHECK_MSI_FLAG; - } - } else { - panic("%s: unsupported intr type %d", - device_get_nameunit(dev), sc->bce_irq_type); - } /* Setup serializer */ bce_setup_serialize(sc); @@ -966,6 +1004,11 @@ ifq_set_ready(&ifp->if_snd); ifq_set_subq_cnt(&ifp->if_snd, sc->tx_ring_cnt); + if (sc->tx_ring_cnt > 1) { + ifp->if_mapsubq = ifq_mapsubq_mask; + ifq_set_subq_mask(&ifp->if_snd, sc->tx_ring_cnt - 1); + } + /* * Look for our PHY. */ @@ -987,22 +1030,18 @@ callout_init_mp(&sc->bce_pulse_callout); callout_init_mp(&sc->bce_ckmsi_callout); - /* Hookup IRQ last. */ - rc = bus_setup_intr(dev, sc->bce_res_irq, INTR_MPSAFE, irq_handle, sc, - &sc->bce_intrhand, &sc->main_serialize); + rc = bce_setup_intr(sc); if (rc != 0) { device_printf(dev, "Failed to setup IRQ!\n"); ether_ifdetach(ifp); goto fail; } - sc->bce_intr_cpuid = rman_get_cpuid(sc->bce_res_irq); - for (i = 0; i < sc->tx_ring_cnt; ++i) { struct ifaltq_subque *ifsq = ifq_get_subq(&ifp->if_snd, i); struct bce_tx_ring *txr = &sc->tx_rings[i]; - ifsq_set_cpuid(ifsq, sc->bce_intr_cpuid); /* XXX */ + ifsq_set_cpuid(ifsq, sc->bce_msix[i].msix_cpuid); ifsq_set_priv(ifsq, txr); txr->ifsq = ifsq; @@ -1063,7 +1102,8 @@ bce_detach(device_t dev) else msg = BCE_DRV_MSG_CODE_UNLOAD; bce_reset(sc, msg); - bus_teardown_intr(dev, sc->bce_res_irq, sc->bce_intrhand); + + bce_teardown_intr(sc); ifnet_deserialize_all(ifp); @@ -1075,13 +1115,7 @@ device_delete_child(dev, sc->bce_miibus); bus_generic_detach(dev); - if (sc->bce_res_irq != NULL) { - bus_release_resource(dev, SYS_RES_IRQ, sc->bce_irq_rid, - sc->bce_res_irq); - } - - if (sc->bce_irq_type == PCI_INTR_TYPE_MSI) - pci_release_msi(dev); + bce_free_intr(sc); if (sc->bce_res_mem != NULL) { bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BAR(0), @@ -2393,7 +2427,7 @@ bce_dma_alloc(struct bce_softc *sc) struct ifnet *ifp = &sc->arpcom.ac_if; int i, rc = 0; bus_addr_t busaddr, max_busaddr; - bus_size_t status_align, stats_align; + bus_size_t status_align, stats_align, status_size; /* * The embedded PCIe to PCI-X bridge (EPB) @@ -2425,6 +2459,17 @@ stats_align = 8; } + /* + * Each MSI-X vector needs a status block; each status block + * consumes 128 bytes and is aligned to 128 bytes. + */ + if (sc->rx_ring_cnt > 1) { + status_size = BCE_MSIX_MAX * BCE_STATUS_BLK_MSIX_ALIGN; + status_align = BCE_STATUS_BLK_MSIX_ALIGN; + } else { + status_size = BCE_STATUS_BLK_SZ; + } + /* * Allocate the parent bus DMA tag appropriate for PCI. */ @@ -2443,7 +2488,7 @@ * Allocate status block.
*/ sc->status_block = bus_dmamem_coherent_any(sc->parent_tag, - status_align, BCE_STATUS_BLK_SZ, + status_align, status_size, BUS_DMA_WAITOK | BUS_DMA_ZERO, &sc->status_tag, &sc->status_map, &sc->status_block_paddr); @@ -2517,13 +2562,20 @@ M_WAITOK | M_ZERO); for (i = 0; i < sc->tx_ring_cnt; ++i) { sc->tx_rings[i].sc = sc; - - /* - * TODO - */ - sc->tx_rings[i].tx_cid = TX_CID; - sc->tx_rings[i].tx_hw_cons = - &sc->status_block->status_tx_quick_consumer_index0; + if (i == 0) { + sc->tx_rings[i].tx_cid = TX_CID; + sc->tx_rings[i].tx_hw_cons = + &sc->status_block->status_tx_quick_consumer_index0; + } else { + struct status_block_msix *sblk = + (struct status_block_msix *) + (((uint8_t *)(sc->status_block)) + + (i * BCE_STATUS_BLK_MSIX_ALIGN)); + + sc->tx_rings[i].tx_cid = TX_TSS_CID + i - 1; + sc->tx_rings[i].tx_hw_cons = + &sblk->status_tx_quick_consumer_index; + } rc = bce_create_tx_ring(&sc->tx_rings[i]); if (rc != 0) { @@ -2538,15 +2590,24 @@ M_WAITOK | M_ZERO); for (i = 0; i < sc->rx_ring_cnt; ++i) { sc->rx_rings[i].sc = sc; - - /* - * TODO - */ - sc->rx_rings[i].rx_cid = RX_CID; - sc->rx_rings[i].rx_hw_cons = - &sc->status_block->status_rx_quick_consumer_index0; - sc->rx_rings[i].hw_status_idx = - &sc->status_block->status_idx; + sc->rx_rings[i].idx = i; + if (i == 0) { + sc->rx_rings[i].rx_cid = RX_CID; + sc->rx_rings[i].rx_hw_cons = + &sc->status_block->status_rx_quick_consumer_index0; + sc->rx_rings[i].hw_status_idx = + &sc->status_block->status_idx; + } else { + struct status_block_msix *sblk = + (struct status_block_msix *) + (((uint8_t *)(sc->status_block)) + + (i * BCE_STATUS_BLK_MSIX_ALIGN)); + + sc->rx_rings[i].rx_cid = RX_RSS_CID + i - 1; + sc->rx_rings[i].rx_hw_cons = + &sblk->status_rx_quick_consumer_index; + sc->rx_rings[i].hw_status_idx = &sblk->status_idx; + } rc = bce_create_rx_ring(&sc->rx_rings[i]); if (rc != 0) { @@ -3574,6 +3635,14 @@ if_printf(&sc->arpcom.ac_if, "Firmware did not complete initialization!\n"); } + + if (sc->bce_irq_type == PCI_INTR_TYPE_MSIX) { + bce_setup_msix_table(sc); + /* Prevent MSIX table reads and writes from timing out */ + REG_WR(sc, BCE_MISC_ECO_HW_CTL, + BCE_MISC_ECO_HW_CTL_LARGE_GRC_TMOUT_EN); + + } return rc; } @@ -3690,6 +3759,7 @@ static int bce_blockinit(struct bce_softc *sc) { uint32_t reg, val; + int i; /* Load the hardware default MAC address.
*/ bce_set_mac_addr(sc); @@ -3734,14 +3804,50 @@ bce_blockinit(struct bce_softc *sc) REG_WR(sc, BCE_HC_STATS_TICKS, (sc->bce_stats_ticks & 0xffff00)); REG_WR(sc, BCE_HC_STAT_COLLECT_TICKS, 0xbb8); /* 3ms */ + if (sc->bce_irq_type == PCI_INTR_TYPE_MSIX) + REG_WR(sc, BCE_HC_MSIX_BIT_VECTOR, BCE_HC_MSIX_BIT_VECTOR_VAL); + val = BCE_HC_CONFIG_TX_TMR_MODE | BCE_HC_CONFIG_COLLECT_STATS; - if (sc->bce_flags & BCE_ONESHOT_MSI_FLAG) { - if (bootverbose) - if_printf(&sc->arpcom.ac_if, "oneshot MSI\n"); + if ((sc->bce_flags & BCE_ONESHOT_MSI_FLAG) || + sc->bce_irq_type == PCI_INTR_TYPE_MSIX) { + if (bootverbose) { + if (sc->bce_irq_type == PCI_INTR_TYPE_MSIX) { + if_printf(&sc->arpcom.ac_if, + "using MSI-X\n"); + } else { + if_printf(&sc->arpcom.ac_if, + "using oneshot MSI\n"); + } + } val |= BCE_HC_CONFIG_ONE_SHOT | BCE_HC_CONFIG_USE_INT_PARAM; + if (sc->bce_irq_type == PCI_INTR_TYPE_MSIX) + val |= BCE_HC_CONFIG_SB_ADDR_INC_128B; } REG_WR(sc, BCE_HC_CONFIG, val); + for (i = 1; i < sc->rx_ring_cnt; ++i) { + uint32_t base; + + base = ((i - 1) * BCE_HC_SB_CONFIG_SIZE) + BCE_HC_SB_CONFIG_1; + KKASSERT(base <= BCE_HC_SB_CONFIG_8); + + REG_WR(sc, base, + BCE_HC_SB_CONFIG_1_TX_TMR_MODE | + /* BCE_HC_SB_CONFIG_1_RX_TMR_MODE | */ + BCE_HC_SB_CONFIG_1_ONE_SHOT); + + REG_WR(sc, base + BCE_HC_TX_QUICK_CONS_TRIP_OFF, + (sc->bce_tx_quick_cons_trip_int << 16) | + sc->bce_tx_quick_cons_trip); + REG_WR(sc, base + BCE_HC_RX_QUICK_CONS_TRIP_OFF, + (sc->bce_rx_quick_cons_trip_int << 16) | + sc->bce_rx_quick_cons_trip); + REG_WR(sc, base + BCE_HC_TX_TICKS_OFF, + (sc->bce_tx_ticks_int << 16) | sc->bce_tx_ticks); + REG_WR(sc, base + BCE_HC_RX_TICKS_OFF, + (sc->bce_rx_ticks_int << 16) | sc->bce_rx_ticks); + } + /* Clear the internal statistics counters. */ REG_WR(sc, BCE_HC_COMMAND, BCE_HC_COMMAND_CLR_STAT_NOW); @@ -4462,6 +4568,7 @@ bce_rx_int_next_rx: l2fhdr->l2_fhdr_vlan_tag; } ifp->if_input(ifp, m); + rxr->rx_pkts++; } } @@ -4530,6 +4637,9 @@ bce_tx_intr(struct bce_tx_ring *txr, uint16_t hw_tx_cons) txr->tx_mbuf_ptr[sw_tx_chain_cons] = NULL; IFNET_STAT_INC(ifp, opackets, 1); +#ifdef BCE_TSS_DEBUG + txr->tx_pkts++; +#endif } txr->used_tx_bd--; @@ -4557,7 +4667,13 @@ bce_tx_intr(struct bce_tx_ring *txr, uint16_t hw_tx_cons) static void bce_disable_intr(struct bce_softc *sc) { - REG_WR(sc, BCE_PCICFG_INT_ACK_CMD, BCE_PCICFG_INT_ACK_CMD_MASK_INT); + int i; + + for (i = 0; i < sc->rx_ring_cnt; ++i) { + REG_WR(sc, BCE_PCICFG_INT_ACK_CMD, + (sc->rx_rings[i].idx << 24) | + BCE_PCICFG_INT_ACK_CMD_MASK_INT); + } REG_RD(sc, BCE_PCICFG_INT_ACK_CMD); callout_stop(&sc->bce_ckmsi_callout); @@ -4566,7 +4682,8 @@ bce_disable_intr(struct bce_softc *sc) sc->bce_check_tx_cons = 0; sc->bce_check_status_idx = 0xffff; - lwkt_serialize_handler_disable(&sc->main_serialize); + for (i = 0; i < sc->rx_ring_cnt; ++i) + lwkt_serialize_handler_disable(sc->bce_msix[i].msix_serialize); } @@ -4579,16 +4696,22 @@ bce_disable_intr(struct bce_softc *sc) static void bce_enable_intr(struct bce_softc *sc) { - struct bce_rx_ring *rxr = &sc->rx_rings[0]; /* XXX */ + int i; - lwkt_serialize_handler_enable(&sc->main_serialize); + for (i = 0; i < sc->rx_ring_cnt; ++i) + lwkt_serialize_handler_enable(sc->bce_msix[i].msix_serialize); - REG_WR(sc, BCE_PCICFG_INT_ACK_CMD, - BCE_PCICFG_INT_ACK_CMD_INDEX_VALID | - BCE_PCICFG_INT_ACK_CMD_MASK_INT | rxr->last_status_idx); - REG_WR(sc, BCE_PCICFG_INT_ACK_CMD, - BCE_PCICFG_INT_ACK_CMD_INDEX_VALID | rxr->last_status_idx); + for (i = 0; i < sc->rx_ring_cnt; ++i) { + struct bce_rx_ring *rxr = &sc->rx_rings[i]; + REG_WR(sc, 
BCE_PCICFG_INT_ACK_CMD, (rxr->idx << 24) | + BCE_PCICFG_INT_ACK_CMD_INDEX_VALID | + BCE_PCICFG_INT_ACK_CMD_MASK_INT | + rxr->last_status_idx); + REG_WR(sc, BCE_PCICFG_INT_ACK_CMD, (rxr->idx << 24) | + BCE_PCICFG_INT_ACK_CMD_INDEX_VALID | + rxr->last_status_idx); + } REG_WR(sc, BCE_HC_COMMAND, sc->hc_command | BCE_HC_COMMAND_COAL_NOW); if (sc->bce_flags & BCE_CHECK_MSI_FLAG) { @@ -4601,7 +4724,7 @@ if_printf(&sc->arpcom.ac_if, "check msi\n"); callout_reset_bycpu(&sc->bce_ckmsi_callout, BCE_MSI_CKINTVL, - bce_check_msi, sc, sc->bce_intr_cpuid); + bce_check_msi, sc, sc->bce_msix[0].msix_cpuid); } } @@ -4615,7 +4738,7 @@ static void bce_reenable_intr(struct bce_rx_ring *rxr) { - REG_WR(rxr->sc, BCE_PCICFG_INT_ACK_CMD, + REG_WR(rxr->sc, BCE_PCICFG_INT_ACK_CMD, (rxr->idx << 24) | BCE_PCICFG_INT_ACK_CMD_INDEX_VALID | rxr->last_status_idx); } @@ -4688,14 +4811,31 @@ bce_init(void *xsc) /* Program appropriate promiscuous/multicast filtering. */ bce_set_rx_mode(sc); - /* Init RX buffer descriptor chain. */ + /* + * Init RX buffer descriptor chain. + */ + REG_WR(sc, BCE_RLUP_RSS_CONFIG, 0); + bce_reg_wr_ind(sc, BCE_RXP_SCRATCH_RSS_TBL_SZ, 0); + for (i = 0; i < sc->rx_ring_cnt; ++i) bce_init_rx_chain(&sc->rx_rings[i]); /* XXX return value */ - /* Init TX buffer descriptor chain. */ + if (sc->rx_ring_cnt > 1) + bce_init_rss(sc); + + /* + * Init TX buffer descriptor chain. + */ + REG_WR(sc, BCE_TSCH_TSS_CFG, 0); + for (i = 0; i < sc->tx_ring_cnt; ++i) bce_init_tx_chain(&sc->tx_rings[i]); + if (sc->tx_ring_cnt > 1) { + REG_WR(sc, BCE_TSCH_TSS_CFG, + ((sc->tx_ring_cnt - 1) << 24) | (TX_TSS_CID << 7)); + } + polling = FALSE; #ifdef IFPOLL_ENABLE if (ifp->if_flags & IFF_NPOLLING) @@ -4706,10 +4846,8 @@ /* Disable interrupts if we are polling. */ bce_disable_intr(sc); - REG_WR(sc, BCE_HC_RX_QUICK_CONS_TRIP, - (1 << 16) | sc->bce_rx_quick_cons_trip); - REG_WR(sc, BCE_HC_TX_QUICK_CONS_TRIP, - (1 << 16) | sc->bce_tx_quick_cons_trip); + /* Change coalesce parameters */ + bce_npoll_coal_change(sc); } else { /* Enable host interrupts. */ bce_enable_intr(sc); @@ -5162,6 +5300,26 @@ bce_npoll_rx(struct ifnet *ifp, void *arg, int count) bce_rx_intr(rxr, count, hw_rx_cons); } +static void +bce_npoll_rx_pack(struct ifnet *ifp, void *arg, int count) +{ + struct bce_rx_ring *rxr = arg; + + KASSERT(rxr->idx == 0, ("not the first RX ring, but %d", rxr->idx)); + bce_npoll_rx(ifp, rxr, count); + + KASSERT(rxr->sc->rx_ring_cnt != rxr->sc->rx_ring_cnt2, + ("RX ring count %d, count2 %d", rxr->sc->rx_ring_cnt, + rxr->sc->rx_ring_cnt2)); + + /* Last ring carries packets whose masked hash is 0 */ + rxr = &rxr->sc->rx_rings[rxr->sc->rx_ring_cnt - 1]; + + lwkt_serialize_enter(&rxr->rx_serialize); + bce_npoll_rx(ifp, rxr, count); + lwkt_serialize_exit(&rxr->rx_serialize); +} + static void bce_npoll_tx(struct ifnet *ifp, void *arg, int count __unused) { @@ -5203,12 +5361,29 @@ bce_npoll(struct ifnet *ifp, struct ifpoll_info *info) ifsq_set_cpuid(txr->ifsq, idx); } - for (i = 0; i < sc->rx_ring_cnt; ++i) { + for (i = 0; i < sc->rx_ring_cnt2; ++i) { struct bce_rx_ring *rxr = &sc->rx_rings[i]; int idx = i + sc->npoll_ofs; KKASSERT(idx < ncpus2); - info->ifpi_rx[idx].poll_func = bce_npoll_rx; + if (i == 0 && sc->rx_ring_cnt2 != sc->rx_ring_cnt) { + /* + * If RSS is enabled, the packets whose + * masked hash is 0 are queued to the + * last RX ring; piggyback the last RX + * ring's processing in the first RX + * polling handler.
(see also: comment + * in bce_setup_ring_cnt()) + */ + if (bootverbose) { + if_printf(ifp, "npoll pack last " + "RX ring on cpu%d\n", idx); + } + info->ifpi_rx[idx].poll_func = + bce_npoll_rx_pack; + } else { + info->ifpi_rx[idx].poll_func = bce_npoll_rx; + } info->ifpi_rx[idx].arg = rxr; info->ifpi_rx[idx].serializer = &rxr->rx_serialize; } @@ -5216,28 +5391,21 @@ bce_npoll(struct ifnet *ifp, struct ifpoll_info *info) if (ifp->if_flags & IFF_RUNNING) { bce_set_timer_cpuid(sc, TRUE); bce_disable_intr(sc); - - REG_WR(sc, BCE_HC_RX_QUICK_CONS_TRIP, - (1 << 16) | sc->bce_rx_quick_cons_trip); - REG_WR(sc, BCE_HC_TX_QUICK_CONS_TRIP, - (1 << 16) | sc->bce_tx_quick_cons_trip); + bce_npoll_coal_change(sc); } } else { for (i = 0; i < sc->tx_ring_cnt; ++i) { ifsq_set_cpuid(sc->tx_rings[i].ifsq, - sc->bce_intr_cpuid); /* XXX */ + sc->bce_msix[i].msix_cpuid); } if (ifp->if_flags & IFF_RUNNING) { bce_set_timer_cpuid(sc, FALSE); bce_enable_intr(sc); - REG_WR(sc, BCE_HC_TX_QUICK_CONS_TRIP, - (sc->bce_tx_quick_cons_trip_int << 16) | - sc->bce_tx_quick_cons_trip); - REG_WR(sc, BCE_HC_RX_QUICK_CONS_TRIP, - (sc->bce_rx_quick_cons_trip_int << 16) | - sc->bce_rx_quick_cons_trip); + sc->bce_coalchg_mask |= BCE_COALMASK_TX_BDS_INT | + BCE_COALMASK_RX_BDS_INT; + bce_coal_change(sc); } } } @@ -5395,6 +5563,72 @@ bce_intr_msi_oneshot(void *xsc) bce_reenable_intr(&sc->rx_rings[0]); } +static void +bce_intr_msix_rxtx(void *xrxr) +{ + struct bce_rx_ring *rxr = xrxr; + struct bce_tx_ring *txr; + uint16_t hw_rx_cons, hw_tx_cons; + + ASSERT_SERIALIZED(&rxr->rx_serialize); + + KKASSERT(rxr->idx < rxr->sc->tx_ring_cnt); + txr = &rxr->sc->tx_rings[rxr->idx]; + + /* + * Save the status block index value for use during + * the next interrupt. + */ + rxr->last_status_idx = *rxr->hw_status_idx; + + /* Make sure status index is extracted before RX/TX cons */ + cpu_lfence(); + + /* Check if the hardware has finished any work. */ + hw_rx_cons = bce_get_hw_rx_cons(rxr); + if (hw_rx_cons != rxr->rx_cons) + bce_rx_intr(rxr, -1, hw_rx_cons); + + /* Check for any completed TX frames. */ + hw_tx_cons = bce_get_hw_tx_cons(txr); + lwkt_serialize_enter(&txr->tx_serialize); + if (hw_tx_cons != txr->tx_cons) { + bce_tx_intr(txr, hw_tx_cons); + if (!ifsq_is_empty(txr->ifsq)) + ifsq_devstart(txr->ifsq); + } + lwkt_serialize_exit(&txr->tx_serialize); + + /* Re-enable interrupts */ + bce_reenable_intr(rxr); +} + +static void +bce_intr_msix_rx(void *xrxr) +{ + struct bce_rx_ring *rxr = xrxr; + uint16_t hw_rx_cons; + + ASSERT_SERIALIZED(&rxr->rx_serialize); + + /* + * Save the status block index value for use during + * the next interrupt. + */ + rxr->last_status_idx = *rxr->hw_status_idx; + + /* Make sure status index is extracted before RX cons */ + cpu_lfence(); + + /* Check if the hardware has finished any work. */ + hw_rx_cons = bce_get_hw_rx_cons(rxr); + if (hw_rx_cons != rxr->rx_cons) + bce_rx_intr(rxr, -1, hw_rx_cons); + + /* Re-enable interrupts */ + bce_reenable_intr(rxr); +} + /****************************************************************************/ /* Programs the various packet receive modes (broadcast and multicast). 
*/ @@ -5766,7 +6000,7 @@ bce_check_msi(void *xsc) lwkt_serialize_enter(&sc->main_serialize); - KKASSERT(mycpuid == sc->bce_intr_cpuid); + KKASSERT(mycpuid == sc->bce_msix[0].msix_cpuid); if ((ifp->if_flags & (IFF_RUNNING | IFF_NPOLLING)) != IFF_RUNNING) { lwkt_serialize_exit(&sc->main_serialize); @@ -5876,6 +6110,10 @@ bce_add_sysctls(struct bce_softc *sc) { struct sysctl_ctx_list *ctx; struct sysctl_oid_list *children; +#if defined(BCE_TSS_DEBUG) || defined(BCE_RSS_DEBUG) + char node[32]; + int i; +#endif sysctl_ctx_init(&sc->bce_sysctl_ctx); sc->bce_sysctl_tree = SYSCTL_ADD_NODE(&sc->bce_sysctl_ctx, @@ -5925,8 +6163,13 @@ bce_add_sysctls(struct bce_softc *sc) sc, 0, bce_sysctl_rx_ticks, "I", "Receive coalescing ticks"); + SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_rings", + CTLFLAG_RD, &sc->rx_ring_cnt, 0, "# of RX rings"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_pages", CTLFLAG_RD, &sc->rx_rings[0].rx_pages, 0, "# of RX pages"); + + SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_rings", + CTLFLAG_RD, &sc->tx_ring_cnt, 0, "# of TX rings"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_pages", CTLFLAG_RD, &sc->tx_rings[0].tx_pages, 0, "# of TX pages"); @@ -5940,6 +6183,26 @@ bce_add_sysctls(struct bce_softc *sc) "I", "NPOLLING cpu offset"); #endif +#ifdef BCE_RSS_DEBUG + SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rss_debug", + CTLFLAG_RW, &sc->rss_debug, 0, "RSS debug level"); + for (i = 0; i < sc->rx_ring_cnt; ++i) { + ksnprintf(node, sizeof(node), "rx%d_pkt", i); + SYSCTL_ADD_ULONG(ctx, children, OID_AUTO, node, + CTLFLAG_RW, &sc->rx_rings[i].rx_pkts, + "RXed packets"); + } +#endif + +#ifdef BCE_TSS_DEBUG + for (i = 0; i < sc->tx_ring_cnt; ++i) { + ksnprintf(node, sizeof(node), "tx%d_pkt", i); + SYSCTL_ADD_ULONG(ctx, children, OID_AUTO, node, + CTLFLAG_RW, &sc->tx_rings[i].tx_pkts, + "TXed packets"); + } +#endif + SYSCTL_ADD_ULONG(ctx, children, OID_AUTO, "stat_IfHCInOctets", CTLFLAG_RD, &sc->stat_IfHCInOctets, @@ -6298,7 +6561,7 @@ bce_sysctl_rx_ticks(SYSCTL_HANDLER_ARGS) static int bce_sysctl_coal_change(SYSCTL_HANDLER_ARGS, uint32_t *coal, - uint32_t coalchg_mask) + uint32_t coalchg_mask) { struct bce_softc *sc = arg1; struct ifnet *ifp = &sc->arpcom.ac_if; @@ -6328,6 +6591,7 @@ static void bce_coal_change(struct bce_softc *sc) { struct ifnet *ifp = &sc->arpcom.ac_if; + int i; ASSERT_SERIALIZED(&sc->main_serialize); @@ -6341,6 +6605,15 @@ bce_coal_change(struct bce_softc *sc) REG_WR(sc, BCE_HC_TX_QUICK_CONS_TRIP, (sc->bce_tx_quick_cons_trip_int << 16) | sc->bce_tx_quick_cons_trip); + for (i = 1; i < sc->rx_ring_cnt; ++i) { + uint32_t base; + + base = ((i - 1) * BCE_HC_SB_CONFIG_SIZE) + + BCE_HC_SB_CONFIG_1; + REG_WR(sc, base + BCE_HC_TX_QUICK_CONS_TRIP_OFF, + (sc->bce_tx_quick_cons_trip_int << 16) | + sc->bce_tx_quick_cons_trip); + } if (bootverbose) { if_printf(ifp, "tx_bds %u, tx_bds_int %u\n", sc->bce_tx_quick_cons_trip, @@ -6352,6 +6625,14 @@ bce_coal_change(struct bce_softc *sc) (BCE_COALMASK_TX_TICKS | BCE_COALMASK_TX_TICKS_INT)) { REG_WR(sc, BCE_HC_TX_TICKS, (sc->bce_tx_ticks_int << 16) | sc->bce_tx_ticks); + for (i = 1; i < sc->rx_ring_cnt; ++i) { + uint32_t base; + + base = ((i - 1) * BCE_HC_SB_CONFIG_SIZE) + + BCE_HC_SB_CONFIG_1; + REG_WR(sc, base + BCE_HC_TX_TICKS_OFF, + (sc->bce_tx_ticks_int << 16) | sc->bce_tx_ticks); + } if (bootverbose) { if_printf(ifp, "tx_ticks %u, tx_ticks_int %u\n", sc->bce_tx_ticks, sc->bce_tx_ticks_int); @@ -6363,6 +6644,15 @@ bce_coal_change(struct bce_softc *sc) REG_WR(sc, BCE_HC_RX_QUICK_CONS_TRIP, (sc->bce_rx_quick_cons_trip_int << 16) | 
sc->bce_rx_quick_cons_trip); + for (i = 1; i < sc->rx_ring_cnt; ++i) { + uint32_t base; + + base = ((i - 1) * BCE_HC_SB_CONFIG_SIZE) + + BCE_HC_SB_CONFIG_1; + REG_WR(sc, base + BCE_HC_RX_QUICK_CONS_TRIP_OFF, + (sc->bce_rx_quick_cons_trip_int << 16) | + sc->bce_rx_quick_cons_trip); + } if (bootverbose) { if_printf(ifp, "rx_bds %u, rx_bds_int %u\n", sc->bce_rx_quick_cons_trip, @@ -6374,6 +6664,14 @@ bce_coal_change(struct bce_softc *sc) (BCE_COALMASK_RX_TICKS | BCE_COALMASK_RX_TICKS_INT)) { REG_WR(sc, BCE_HC_RX_TICKS, (sc->bce_rx_ticks_int << 16) | sc->bce_rx_ticks); + for (i = 1; i < sc->rx_ring_cnt; ++i) { + uint32_t base; + + base = ((i - 1) * BCE_HC_SB_CONFIG_SIZE) + + BCE_HC_SB_CONFIG_1; + REG_WR(sc, base + BCE_HC_RX_TICKS_OFF, + (sc->bce_rx_ticks_int << 16) | sc->bce_rx_ticks); + } if (bootverbose) { if_printf(ifp, "rx_ticks %u, rx_ticks_int %u\n", sc->bce_rx_ticks, sc->bce_rx_ticks_int); @@ -6538,7 +6836,7 @@ bce_sysctl_npoll_offset(SYSCTL_HANDLER_ARGS) return EINVAL; ifnet_serialize_all(ifp); - if (off >= ncpus2 || off % sc->rx_ring_cnt != 0) { + if (off >= ncpus2 || off % sc->rx_ring_cnt2 != 0) { error = EINVAL; } else { error = 0; @@ -6557,5 +6855,420 @@ bce_set_timer_cpuid(struct bce_softc *sc, boolean_t polling) if (polling) sc->bce_timer_cpuid = 0; /* XXX */ else - sc->bce_timer_cpuid = rman_get_cpuid(sc->bce_res_irq); + sc->bce_timer_cpuid = sc->bce_msix[0].msix_cpuid; +} + +static int +bce_alloc_intr(struct bce_softc *sc) +{ + u_int irq_flags; + + bce_try_alloc_msix(sc); + if (sc->bce_irq_type == PCI_INTR_TYPE_MSIX) + return 0; + + sc->bce_irq_type = pci_alloc_1intr(sc->bce_dev, bce_msi_enable, + &sc->bce_irq_rid, &irq_flags); + + sc->bce_res_irq = bus_alloc_resource_any(sc->bce_dev, SYS_RES_IRQ, + &sc->bce_irq_rid, irq_flags); + if (sc->bce_res_irq == NULL) { + device_printf(sc->bce_dev, "PCI map interrupt failed\n"); + return ENXIO; + } + return 0; +} + +static void +bce_try_alloc_msix(struct bce_softc *sc) +{ + struct bce_msix_data *msix; + int offset, i, error; + boolean_t setup = FALSE; + + if (sc->rx_ring_cnt == 1) + return; + + if (sc->rx_ring_cnt2 == ncpus2) { + offset = 0; + } else { + int offset_def = + (sc->rx_ring_cnt2 * device_get_unit(sc->bce_dev)) % ncpus2; + + offset = device_getenv_int(sc->bce_dev, + "msix.offset", offset_def); + if (offset >= ncpus2 || offset % sc->rx_ring_cnt2 != 0) { + device_printf(sc->bce_dev, + "invalid msix.offset %d, use %d\n", + offset, offset_def); + offset = offset_def; + } + } + + msix = &sc->bce_msix[0]; + msix->msix_serialize = &sc->main_serialize; + msix->msix_func = bce_intr_msi_oneshot; + msix->msix_arg = sc; + KKASSERT(offset < ncpus2); + msix->msix_cpuid = offset; + ksnprintf(msix->msix_desc, sizeof(msix->msix_desc), "%s combo", + device_get_nameunit(sc->bce_dev)); + + for (i = 1; i < sc->rx_ring_cnt; ++i) { + struct bce_rx_ring *rxr = &sc->rx_rings[i]; + + msix = &sc->bce_msix[i]; + + msix->msix_serialize = &rxr->rx_serialize; + msix->msix_arg = rxr; + msix->msix_cpuid = offset + (i % sc->rx_ring_cnt2); + KKASSERT(msix->msix_cpuid < ncpus2); + + if (i < sc->tx_ring_cnt) { + msix->msix_func = bce_intr_msix_rxtx; + ksnprintf(msix->msix_desc, sizeof(msix->msix_desc), + "%s rxtx%d", device_get_nameunit(sc->bce_dev), i); + } else { + msix->msix_func = bce_intr_msix_rx; + ksnprintf(msix->msix_desc, sizeof(msix->msix_desc), + "%s rx%d", device_get_nameunit(sc->bce_dev), i); + } + } + + /* + * Setup MSI-X table + */ + bce_setup_msix_table(sc); + REG_WR(sc, BCE_PCI_MSIX_CONTROL, BCE_MSIX_MAX - 1); + REG_WR(sc, 
BCE_PCI_MSIX_TBL_OFF_BIR, BCE_PCI_GRC_WINDOW2_BASE); + REG_WR(sc, BCE_PCI_MSIX_PBA_OFF_BIT, BCE_PCI_GRC_WINDOW3_BASE); + /* Flush */ + REG_RD(sc, BCE_PCI_MSIX_CONTROL); + + error = pci_setup_msix(sc->bce_dev); + if (error) { + device_printf(sc->bce_dev, "Setup MSI-X failed\n"); + goto back; + } + setup = TRUE; + + for (i = 0; i < sc->rx_ring_cnt; ++i) { + msix = &sc->bce_msix[i]; + + error = pci_alloc_msix_vector(sc->bce_dev, i, &msix->msix_rid, + msix->msix_cpuid); + if (error) { + device_printf(sc->bce_dev, + "Unable to allocate MSI-X %d on cpu%d\n", + i, msix->msix_cpuid); + goto back; + } + + msix->msix_res = bus_alloc_resource_any(sc->bce_dev, + SYS_RES_IRQ, &msix->msix_rid, RF_ACTIVE); + if (msix->msix_res == NULL) { + device_printf(sc->bce_dev, + "Unable to allocate MSI-X %d resource\n", i); + error = ENOMEM; + goto back; + } + } + + pci_enable_msix(sc->bce_dev); + sc->bce_irq_type = PCI_INTR_TYPE_MSIX; +back: + if (error) + bce_free_msix(sc, setup); +} + +static void +bce_setup_ring_cnt(struct bce_softc *sc) +{ + int msix_enable, ring_max, msix_cnt2, msix_cnt, i; + + sc->rx_ring_cnt = 1; + sc->rx_ring_cnt2 = 1; + sc->tx_ring_cnt = 1; + + if (BCE_CHIP_NUM(sc) != BCE_CHIP_NUM_5709) + return; + + msix_enable = device_getenv_int(sc->bce_dev, "msix.enable", + bce_msix_enable); + if (!msix_enable) + return; + + if (ncpus2 == 1) + return; + + msix_cnt = pci_msix_count(sc->bce_dev); + if (msix_cnt <= 1) + return; + + i = 0; + while ((1 << (i + 1)) <= msix_cnt) + ++i; + msix_cnt2 = 1 << i; + + /* + * One extra RX ring will be needed (see below), so make sure + * that there are enough MSI-X vectors. + */ + if (msix_cnt == msix_cnt2) { + /* + * XXX + * This probably will not happen; 5709/5716 + * come with 9 MSI-X vectors. + */ + msix_cnt2 >>= 1; + if (msix_cnt2 <= 1) { + device_printf(sc->bce_dev, + "MSI-X count %d could not be used\n", msix_cnt); + return; + } + device_printf(sc->bce_dev, "MSI-X count %d is a power of 2\n", + msix_cnt); + } + + /* + * Setup RX ring count + */ + ring_max = BCE_RX_RING_MAX; + if (ring_max > msix_cnt2) + ring_max = msix_cnt2; + sc->rx_ring_cnt2 = device_getenv_int(sc->bce_dev, "rx_rings", + bce_rx_rings); + sc->rx_ring_cnt2 = if_ring_count2(sc->rx_ring_cnt2, ring_max); + + /* + * One extra RX ring is allocated, since the first RX ring + * cannot be used for RSS-hashed packets whose masked + * hash is 0. The first RX ring is only used for packets + * whose RSS hash could not be calculated, e.g. ARP packets. + * This extra RX ring will be used for packets whose masked + * hash is 0. The effective RX ring count involved in RSS + * is still sc->rx_ring_cnt2. + */ + KKASSERT(sc->rx_ring_cnt2 + 1 <= msix_cnt); + sc->rx_ring_cnt = sc->rx_ring_cnt2 + 1; + + /* + * Setup TX ring count + * + * NOTE: + * TX ring count must be no more than the effective RSS RX ring + * count, since we use the RX ring software data struct to save + * the status index and various other MSI-X related stuff.
+ */ + ring_max = BCE_TX_RING_MAX; + if (ring_max > msix_cnt2) + ring_max = msix_cnt2; + if (ring_max > sc->rx_ring_cnt2) + ring_max = sc->rx_ring_cnt2; + sc->tx_ring_cnt = device_getenv_int(sc->bce_dev, "tx_rings", + bce_tx_rings); + sc->tx_ring_cnt = if_ring_count2(sc->tx_ring_cnt, ring_max); +} + +static void +bce_free_msix(struct bce_softc *sc, boolean_t setup) +{ + int i; + + KKASSERT(sc->rx_ring_cnt > 1); + + for (i = 0; i < sc->rx_ring_cnt; ++i) { + struct bce_msix_data *msix = &sc->bce_msix[i]; + + if (msix->msix_res != NULL) { + bus_release_resource(sc->bce_dev, SYS_RES_IRQ, + msix->msix_rid, msix->msix_res); + } + if (msix->msix_rid >= 0) + pci_release_msix_vector(sc->bce_dev, msix->msix_rid); + } + if (setup) + pci_teardown_msix(sc->bce_dev); +} + +static void +bce_free_intr(struct bce_softc *sc) +{ + if (sc->bce_irq_type != PCI_INTR_TYPE_MSIX) { + if (sc->bce_res_irq != NULL) { + bus_release_resource(sc->bce_dev, SYS_RES_IRQ, + sc->bce_irq_rid, sc->bce_res_irq); + } + if (sc->bce_irq_type == PCI_INTR_TYPE_MSI) + pci_release_msi(sc->bce_dev); + } else { + bce_free_msix(sc, TRUE); + } +} + +static void +bce_setup_msix_table(struct bce_softc *sc) +{ + REG_WR(sc, BCE_PCI_GRC_WINDOW_ADDR, BCE_PCI_GRC_WINDOW_ADDR_SEP_WIN); + REG_WR(sc, BCE_PCI_GRC_WINDOW2_ADDR, BCE_MSIX_TABLE_ADDR); + REG_WR(sc, BCE_PCI_GRC_WINDOW3_ADDR, BCE_MSIX_PBA_ADDR); +} + +static int +bce_setup_intr(struct bce_softc *sc) +{ + void (*irq_handle)(void *); + int error; + + if (sc->bce_irq_type == PCI_INTR_TYPE_MSIX) + return bce_setup_msix(sc); + + if (sc->bce_irq_type == PCI_INTR_TYPE_LEGACY) { + irq_handle = bce_intr_legacy; + } else if (sc->bce_irq_type == PCI_INTR_TYPE_MSI) { + if (BCE_CHIP_NUM(sc) == BCE_CHIP_NUM_5709) { + irq_handle = bce_intr_msi_oneshot; + sc->bce_flags |= BCE_ONESHOT_MSI_FLAG; + } else { + irq_handle = bce_intr_msi; + sc->bce_flags |= BCE_CHECK_MSI_FLAG; + } + } else { + panic("%s: unsupported intr type %d", + device_get_nameunit(sc->bce_dev), sc->bce_irq_type); + } + + error = bus_setup_intr(sc->bce_dev, sc->bce_res_irq, INTR_MPSAFE, + irq_handle, sc, &sc->bce_intrhand, &sc->main_serialize); + if (error != 0) { + device_printf(sc->bce_dev, "Failed to setup IRQ!\n"); + return error; + } + sc->bce_msix[0].msix_cpuid = rman_get_cpuid(sc->bce_res_irq); + sc->bce_msix[0].msix_serialize = &sc->main_serialize; + + return 0; +} + +static void +bce_teardown_intr(struct bce_softc *sc) +{ + if (sc->bce_irq_type != PCI_INTR_TYPE_MSIX) + bus_teardown_intr(sc->bce_dev, sc->bce_res_irq, sc->bce_intrhand); + else + bce_teardown_msix(sc, sc->rx_ring_cnt); +} + +static int +bce_setup_msix(struct bce_softc *sc) +{ + int i; + + for (i = 0; i < sc->rx_ring_cnt; ++i) { + struct bce_msix_data *msix = &sc->bce_msix[i]; + int error; + + error = bus_setup_intr_descr(sc->bce_dev, msix->msix_res, + INTR_MPSAFE, msix->msix_func, msix->msix_arg, + &msix->msix_handle, msix->msix_serialize, msix->msix_desc); + if (error) { + device_printf(sc->bce_dev, "could not set up %s " + "interrupt handler.\n", msix->msix_desc); + bce_teardown_msix(sc, i); + return error; + } + } + return 0; +} + +static void +bce_teardown_msix(struct bce_softc *sc, int msix_cnt) +{ + int i; + + for (i = 0; i < msix_cnt; ++i) { + struct bce_msix_data *msix = &sc->bce_msix[i]; + + bus_teardown_intr(sc->bce_dev, msix->msix_res, + msix->msix_handle); + } +} + +static void +bce_init_rss(struct bce_softc *sc) +{ + uint8_t key[BCE_RLUP_RSS_KEY_CNT * BCE_RLUP_RSS_KEY_SIZE]; + uint32_t tbl = 0; + int i; + + KKASSERT(sc->rx_ring_cnt > 2); + + /* + * 
Configure RSS keys + */ + toeplitz_get_key(key, sizeof(key)); + for (i = 0; i < BCE_RLUP_RSS_KEY_CNT; ++i) { + uint32_t rss_key; + + rss_key = BCE_RLUP_RSS_KEYVAL(key, i); + BCE_RSS_DPRINTF(sc, 1, "rss_key%d 0x%08x\n", i, rss_key); + + REG_WR(sc, BCE_RLUP_RSS_KEY(i), rss_key); + } + + /* + * Configure the redirect table + * + * NOTE: + * - The "queue ID" in the redirect table is the software RX ring's + * index _minus_ one. + * - The last RX ring, whose "queue ID" is (sc->rx_ring_cnt - 2), + * will be used for packets whose masked hash is 0. + * (see also: comment in bce_setup_ring_cnt()) + * + * The redirect table is configured in the following fashion, except + * for the masked hash 0, which is noted above: + * (hash & ring_cnt_mask) == rdr_table[(hash & rdr_table_mask)] + */ + for (i = 0; i < BCE_RXP_SCRATCH_RSS_TBL_MAX_ENTRIES; i++) { + int shift = (i % 8) << 2, qid; + + qid = i % sc->rx_ring_cnt2; + if (qid > 0) + --qid; + else + qid = sc->rx_ring_cnt - 2; + KKASSERT(qid < (sc->rx_ring_cnt - 1)); + + tbl |= qid << shift; + if (i % 8 == 7) { + BCE_RSS_DPRINTF(sc, 1, "tbl 0x%08x\n", tbl); + REG_WR(sc, BCE_RLUP_RSS_DATA, tbl); + REG_WR(sc, BCE_RLUP_RSS_COMMAND, (i >> 3) | + BCE_RLUP_RSS_COMMAND_RSS_WRITE_MASK | + BCE_RLUP_RSS_COMMAND_WRITE | + BCE_RLUP_RSS_COMMAND_HASH_MASK); + tbl = 0; + } + } + REG_WR(sc, BCE_RLUP_RSS_CONFIG, + BCE_RLUP_RSS_CONFIG_IPV4_RSS_TYPE_ALL_XI); +} + +static void +bce_npoll_coal_change(struct bce_softc *sc) +{ + uint32_t old_rx_cons, old_tx_cons; + + old_rx_cons = sc->bce_rx_quick_cons_trip_int; + old_tx_cons = sc->bce_tx_quick_cons_trip_int; + sc->bce_rx_quick_cons_trip_int = 1; + sc->bce_tx_quick_cons_trip_int = 1; + + sc->bce_coalchg_mask |= BCE_COALMASK_TX_BDS_INT | + BCE_COALMASK_RX_BDS_INT; + bce_coal_change(sc); + + sc->bce_rx_quick_cons_trip_int = old_rx_cons; + sc->bce_tx_quick_cons_trip_int = old_tx_cons; } diff --git a/sys/dev/netif/bce/if_bcereg.h b/sys/dev/netif/bce/if_bcereg.h index 8d12088553..c3fecf3bcb 100644 --- a/sys/dev/netif/bce/if_bcereg.h +++ b/sys/dev/netif/bce/if_bcereg.h @@ -698,6 +698,31 @@ struct status_block { }; +struct status_block_msix { +#if BYTE_ORDER == BIG_ENDIAN + uint16_t status_tx_quick_consumer_index; + uint16_t status_rx_quick_consumer_index; + uint16_t status_completion_producer_index; + uint16_t status_cmd_consumer_index; + uint32_t status_unused; + uint16_t status_idx; + uint8_t status_unused2; + uint8_t status_blk_num; +#else + uint16_t status_rx_quick_consumer_index; + uint16_t status_tx_quick_consumer_index; + uint16_t status_cmd_consumer_index; + uint16_t status_completion_producer_index; + uint32_t status_unused; + uint8_t status_blk_num; + uint8_t status_unused2; + uint16_t status_idx; +#endif +}; + +#define BCE_STATUS_BLK_MSIX_ALIGN 128 + + /* * statistics_block definition */ @@ -1005,8 +1030,12 @@ struct l2_fhdr { * pci_reg definition * offset: 0x400 */ -#define BCE_PCI_GRC_WINDOW_ADDR 0x00000400 -#define BCE_PCI_GRC_WINDOW_ADDR_PCI_GRC_WINDOW_ADDR_VALUE (0x3ffffL<<8) +#define BCE_PCI_GRC_WINDOW_ADDR 0x00000400 +#define BCE_PCI_GRC_WINDOW_ADDR_VALUE (0x1ffL<<13) +#define BCE_PCI_GRC_WINDOW_ADDR_SEP_WIN (1L<<31) + +#define BCE_PCI_GRC_WINDOW2_BASE 0xc000 +#define BCE_PCI_GRC_WINDOW3_BASE 0xe000 #define BCE_PCI_CONFIG_1 0x00000404 #define BCE_PCI_CONFIG_1_READ_BOUNDARY (0x7L<<8) @@ -1182,6 +1211,26 @@ struct l2_fhdr { #define BCE_PCI_MSI_ADDR_H 0x00000454 #define BCE_PCI_MSI_ADDR_L 0x00000458 +#define BCE_PCI_MSIX_CONTROL 0x000004c0 +#define BCE_PCI_MSIX_CONTROL_MSIX_TBL_SIZ (0x7ffL<<0) +#define
BCE_PCI_MSIX_CONTROL_RESERVED0 (0x1fffffL<<11) + +#define BCE_PCI_MSIX_TBL_OFF_BIR 0x000004c4 +#define BCE_PCI_MSIX_TBL_OFF_BIR_MSIX_TBL_BIR (0x7L<<0) +#define BCE_PCI_MSIX_TBL_OFF_BIR_MSIX_TBL_OFF (0x1fffffffL<<3) + +#define BCE_PCI_MSIX_PBA_OFF_BIT 0x000004c8 +#define BCE_PCI_MSIX_PBA_OFF_BIT_MSIX_PBA_BIR (0x7L<<0) +#define BCE_PCI_MSIX_PBA_OFF_BIT_MSIX_PBA_OFF (0x1fffffffL<<3) + +#define BCE_PCI_GRC_WINDOW2_ADDR 0x00000614 +#define BCE_PCI_GRC_WINDOW2_ADDR_VALUE (0x1ffL<<13) + +#define BCE_PCI_GRC_WINDOW3_ADDR 0x00000618 +#define BCE_PCI_GRC_WINDOW3_ADDR_VALUE (0x1ffL<<13) + +#define BCE_MSIX_TABLE_ADDR 0x318000 +#define BCE_MSIX_PBA_ADDR 0x31c000 /* * misc_reg definition @@ -3480,6 +3529,38 @@ struct l2_fhdr { * rlup_reg definition * offset: 0x2000 */ +#define BCE_RLUP_RSS_CONFIG 0x0000201c +#define BCE_RLUP_RSS_CONFIG_IPV4_RSS_TYPE_XI (0x3L<<0) +#define BCE_RLUP_RSS_CONFIG_IPV4_RSS_TYPE_OFF_XI (0L<<0) +#define BCE_RLUP_RSS_CONFIG_IPV4_RSS_TYPE_ALL_XI (1L<<0) +#define BCE_RLUP_RSS_CONFIG_IPV4_RSS_TYPE_IP_ONLY_XI (2L<<0) +#define BCE_RLUP_RSS_CONFIG_IPV4_RSS_TYPE_RES_XI (3L<<0) +#define BCE_RLUP_RSS_CONFIG_IPV6_RSS_TYPE_XI (0x3L<<2) +#define BCE_RLUP_RSS_CONFIG_IPV6_RSS_TYPE_OFF_XI (0L<<2) +#define BCE_RLUP_RSS_CONFIG_IPV6_RSS_TYPE_ALL_XI (1L<<2) +#define BCE_RLUP_RSS_CONFIG_IPV6_RSS_TYPE_IP_ONLY_XI (2L<<2) +#define BCE_RLUP_RSS_CONFIG_IPV6_RSS_TYPE_RES_XI (3L<<2) + +#define BCE_RLUP_RSS_KEY_BASE 0x00002020 +#define BCE_RLUP_RSS_KEY_SIZE 4 +#define BCE_RLUP_RSS_KEY_CNT 10 +#define BCE_RLUP_RSS_KEY(i) \ + (BCE_RLUP_RSS_KEY_BASE + (i * BCE_RLUP_RSS_KEY_SIZE)) +#define BCE_RLUP_RSS_KEYVAL(key, i) \ + (key[(i) * BCE_RLUP_RSS_KEY_SIZE] << 24 | \ + key[(i) * BCE_RLUP_RSS_KEY_SIZE + 1] << 16 | \ + key[(i) * BCE_RLUP_RSS_KEY_SIZE + 2] << 8 | \ + key[(i) * BCE_RLUP_RSS_KEY_SIZE + 3]) + +#define BCE_RLUP_RSS_COMMAND 0x00002048 +#define BCE_RLUP_RSS_COMMAND_RSS_IND_TABLE_ADDR (0xfUL<<0) +#define BCE_RLUP_RSS_COMMAND_RSS_WRITE_MASK (0xffUL<<4) +#define BCE_RLUP_RSS_COMMAND_WRITE (1UL<<12) +#define BCE_RLUP_RSS_COMMAND_READ (1UL<<13) +#define BCE_RLUP_RSS_COMMAND_HASH_MASK (0x7UL<<14) + +#define BCE_RLUP_RSS_DATA 0x0000204c + #define BCE_RLUP_FTQ_CMD 0x000023f8 #define BCE_RLUP_FTQ_CTL 0x000023fc #define BCE_RLUP_FTQ_CTL_MAX_DEPTH (0x3ffL<<12) @@ -3869,6 +3950,15 @@ struct l2_fhdr { #define BCE_CSCH_CH_FTQ_CTL_CUR_DEPTH (0x3ffL<<22) +/* + * tsch_reg definition + * offset: 0x4c00 + */ +#define BCE_TSCH_TSS_CFG 0x00004c1c +#define BCE_TSCH_TSS_CFG_TSS_START_CID (0x7ffL<<8) +#define BCE_TSCH_TSS_CFG_NUM_OF_TSS_CON (0xfL<<24) + + /* * tbdr_reg definition * offset: 0x5000 @@ -4920,6 +5010,23 @@ struct l2_fhdr { #define BCE_HC_PERIODIC_TICKS_8_HC_PERIODIC_TICKS (0xffffL<<0) #define BCE_HC_PERIODIC_TICKS_8_HC_INT_PERIODIC_TICKS (0xffffL<<16) +#define BCE_HC_SB_CONFIG_SIZE \ + (BCE_HC_SB_CONFIG_2 - BCE_HC_SB_CONFIG_1) +#define BCE_HC_COMP_PROD_TRIP_OFF \ + (BCE_HC_COMP_PROD_TRIP_1 - BCE_HC_SB_CONFIG_1) +#define BCE_HC_COM_TICKS_OFF \ + (BCE_HC_COM_TICKS_1 - BCE_HC_SB_CONFIG_1) +#define BCE_HC_CMD_TICKS_OFF \ + (BCE_HC_CMD_TICKS_1 - BCE_HC_SB_CONFIG_1) +#define BCE_HC_TX_QUICK_CONS_TRIP_OFF \ + (BCE_HC_TX_QUICK_CONS_TRIP_1 - BCE_HC_SB_CONFIG_1) +#define BCE_HC_TX_TICKS_OFF \ + (BCE_HC_TX_TICKS_1 - BCE_HC_SB_CONFIG_1) +#define BCE_HC_RX_QUICK_CONS_TRIP_OFF \ + (BCE_HC_RX_QUICK_CONS_TRIP_1 - BCE_HC_SB_CONFIG_1) +#define BCE_HC_RX_TICKS_OFF \ + (BCE_HC_RX_TICKS_1 - BCE_HC_SB_CONFIG_1) + /* * txp_reg definition @@ -5183,6 +5290,10 @@ struct l2_fhdr { #define BCE_RXP_FTQ_CTL_CUR_DEPTH (0x3ffL<<22) #define 
BCE_RXP_SCRATCH 0x000e0000 +#define BCE_RXP_SCRATCH_RXP_FLOOD 0x000e0024 +#define BCE_RXP_SCRATCH_RSS_TBL_SZ 0x000e0038 +#define BCE_RXP_SCRATCH_RSS_TBL 0x000e003c +#define BCE_RXP_SCRATCH_RSS_TBL_MAX_ENTRIES 128 /* @@ -5593,7 +5704,9 @@ struct l2_fhdr { #define MAX_CID_ADDR (GET_CID_ADDR(MAX_CID_CNT)) #define INVALID_CID_ADDR 0xffffffff +#define TX_TSS_CID 32 #define TX_CID 16 +#define RX_RSS_CID 4 #define RX_CID 0 /****************************************************************************/ @@ -5750,14 +5863,34 @@ struct bce_tx_ring { struct ifsubq_watchdog tx_watchdog; + u_long tx_pkts; + bus_dma_tag_t tx_bd_chain_tag; bus_dmamap_t *tx_bd_chain_map; /* tx_pages */ bus_addr_t *tx_bd_chain_paddr; /* tx_pages */ } __cachealign; +struct bce_msix_data { + struct lwkt_serialize *msix_serialize; + + driver_intr_t *msix_func; + void *msix_arg; + + int msix_cpuid; + char msix_desc[32]; + int msix_rid; + struct resource *msix_res; + void *msix_handle; +}; + +#define BCE_RX_RING_MAX 8 +#define BCE_TX_RING_MAX 8 +#define BCE_MSIX_MAX 9 + struct bce_rx_ring { struct lwkt_serialize rx_serialize; struct bce_softc *sc; + int idx; volatile uint16_t *hw_status_idx; uint16_t last_status_idx; @@ -5782,6 +5915,8 @@ struct bce_rx_ring { struct mbuf **rx_mbuf_ptr; /* TOTAL_RX_BD */ bus_addr_t *rx_mbuf_paddr; /* TOTAL_RX_BD */ + u_long rx_pkts; + bus_dma_tag_t rx_bd_chain_tag; bus_dmamap_t *rx_bd_chain_map; /* rx_pages */ bus_addr_t *rx_bd_chain_paddr; /* rx_pages */ @@ -5896,7 +6031,6 @@ struct bce_softc { struct callout bce_tick_callout; struct callout bce_pulse_callout; - int bce_intr_cpuid; boolean_t bce_msi_maylose; uint16_t bce_check_rx_cons; uint16_t bce_check_tx_cons; @@ -5936,9 +6070,11 @@ struct bce_softc { struct lwkt_serialize **serializes; struct lwkt_serialize main_serialize; + int rss_debug; int npoll_ofs; int tx_ring_cnt; int rx_ring_cnt; + int rx_ring_cnt2; struct bce_tx_ring *tx_rings; struct bce_rx_ring *rx_rings; @@ -6005,6 +6141,8 @@ struct bce_softc { /* Provides access to certain firmware statistics. */ uint32_t com_no_buffers; + + struct bce_msix_data bce_msix[BCE_MSIX_MAX]; }; #define BCE_COALMASK_TX_BDS_INT 0x01 -- 2.41.0
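P.S. For readers following the RSS redirect-table setup in bce_init_rss() above, here is a minimal userland sketch of the same queue-ID mapping. It only mirrors the table-fill arithmetic, not the register writes, and the ring counts chosen here are hypothetical (4 RSS rings plus the extra ring for masked hash 0); the 128-entry table size matches BCE_RXP_SCRATCH_RSS_TBL_MAX_ENTRIES from the patch.

#include <stdio.h>

#define RSS_TBL_ENTRIES	128	/* BCE_RXP_SCRATCH_RSS_TBL_MAX_ENTRIES */

int
main(void)
{
	int rx_ring_cnt2 = 4;			/* N RSS rings; hypothetical */
	int rx_ring_cnt = rx_ring_cnt2 + 1;	/* plus the masked-hash-0 ring */
	int i;

	for (i = 0; i < RSS_TBL_ENTRIES; i++) {
		int qid = i % rx_ring_cnt2;

		if (qid > 0)
			--qid;			/* queue ID = ring index - 1 */
		else
			qid = rx_ring_cnt - 2;	/* masked hash 0 -> last ring */
		printf("masked hash %3d -> queue ID %d (RX ring %d)\n",
		    i, qid, qid + 1);
	}
	return 0;
}

With these counts, masked hash values 1-3 map to RX rings 1-3 and masked hash 0 maps to RX ring 4 (the extra ring), while RX ring 0 stays reserved for packets whose RSS hash could not be calculated, matching the commit description.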