1 /*
2  * Copyright (c) 2001-2013, Intel Corporation 
3  * All rights reserved.
4  * 
5  * Redistribution and use in source and binary forms, with or without 
6  * modification, are permitted provided that the following conditions are met:
7  * 
8  *  1. Redistributions of source code must retain the above copyright notice, 
9  *     this list of conditions and the following disclaimer.
10  * 
11  *  2. Redistributions in binary form must reproduce the above copyright 
12  *     notice, this list of conditions and the following disclaimer in the 
13  *     documentation and/or other materials provided with the distribution.
14  * 
15  *  3. Neither the name of the Intel Corporation nor the names of its 
16  *     contributors may be used to endorse or promote products derived from 
17  *     this software without specific prior written permission.
18  * 
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
22  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
23  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31
32 #include "opt_ifpoll.h"
33 #include "opt_igb.h"
34
35 #include <sys/param.h>
36 #include <sys/bus.h>
37 #include <sys/endian.h>
38 #include <sys/interrupt.h>
39 #include <sys/kernel.h>
40 #include <sys/malloc.h>
41 #include <sys/mbuf.h>
42 #include <sys/proc.h>
43 #include <sys/rman.h>
44 #include <sys/serialize.h>
45 #include <sys/serialize2.h>
46 #include <sys/socket.h>
47 #include <sys/sockio.h>
48 #include <sys/sysctl.h>
49 #include <sys/systm.h>
50
51 #include <net/bpf.h>
52 #include <net/ethernet.h>
53 #include <net/if.h>
54 #include <net/if_arp.h>
55 #include <net/if_dl.h>
56 #include <net/if_media.h>
57 #include <net/ifq_var.h>
58 #include <net/toeplitz.h>
59 #include <net/toeplitz2.h>
60 #include <net/vlan/if_vlan_var.h>
61 #include <net/vlan/if_vlan_ether.h>
62 #include <net/if_poll.h>
63
64 #include <netinet/in_systm.h>
65 #include <netinet/in.h>
66 #include <netinet/ip.h>
67
68 #include <bus/pci/pcivar.h>
69 #include <bus/pci/pcireg.h>
70
71 #include <dev/netif/ig_hal/e1000_api.h>
72 #include <dev/netif/ig_hal/e1000_82575.h>
73 #include <dev/netif/ig_hal/e1000_dragonfly.h>
74 #include <dev/netif/igb/if_igb.h>
75
76 #ifdef IGB_RSS_DEBUG
77 #define IGB_RSS_DPRINTF(sc, lvl, fmt, ...) \
78 do { \
79         if (sc->rss_debug >= lvl) \
80                 if_printf(&sc->arpcom.ac_if, fmt, __VA_ARGS__); \
81 } while (0)
82 #else   /* !IGB_RSS_DEBUG */
83 #define IGB_RSS_DPRINTF(sc, lvl, fmt, ...)      ((void)0)
84 #endif  /* IGB_RSS_DEBUG */
85
86 #define IGB_NAME        "Intel(R) PRO/1000 "
87 #define IGB_DEVICE(id)  \
88         { IGB_VENDOR_ID, E1000_DEV_ID_##id, IGB_NAME #id }
89 #define IGB_DEVICE_NULL { 0, 0, NULL }
90
91 static struct igb_device {
92         uint16_t        vid;
93         uint16_t        did;
94         const char      *desc;
95 } igb_devices[] = {
96         IGB_DEVICE(82575EB_COPPER),
97         IGB_DEVICE(82575EB_FIBER_SERDES),
98         IGB_DEVICE(82575GB_QUAD_COPPER),
99         IGB_DEVICE(82576),
100         IGB_DEVICE(82576_NS),
101         IGB_DEVICE(82576_NS_SERDES),
102         IGB_DEVICE(82576_FIBER),
103         IGB_DEVICE(82576_SERDES),
104         IGB_DEVICE(82576_SERDES_QUAD),
105         IGB_DEVICE(82576_QUAD_COPPER),
106         IGB_DEVICE(82576_QUAD_COPPER_ET2),
107         IGB_DEVICE(82576_VF),
108         IGB_DEVICE(82580_COPPER),
109         IGB_DEVICE(82580_FIBER),
110         IGB_DEVICE(82580_SERDES),
111         IGB_DEVICE(82580_SGMII),
112         IGB_DEVICE(82580_COPPER_DUAL),
113         IGB_DEVICE(82580_QUAD_FIBER),
114         IGB_DEVICE(DH89XXCC_SERDES),
115         IGB_DEVICE(DH89XXCC_SGMII),
116         IGB_DEVICE(DH89XXCC_SFP),
117         IGB_DEVICE(DH89XXCC_BACKPLANE),
118         IGB_DEVICE(I350_COPPER),
119         IGB_DEVICE(I350_FIBER),
120         IGB_DEVICE(I350_SERDES),
121         IGB_DEVICE(I350_SGMII),
122         IGB_DEVICE(I350_VF),
123         IGB_DEVICE(I210_COPPER),
124         IGB_DEVICE(I210_COPPER_IT),
125         IGB_DEVICE(I210_COPPER_OEM1),
126         IGB_DEVICE(I210_COPPER_FLASHLESS),
127         IGB_DEVICE(I210_SERDES_FLASHLESS),
128         IGB_DEVICE(I210_FIBER),
129         IGB_DEVICE(I210_SERDES),
130         IGB_DEVICE(I210_SGMII),
131         IGB_DEVICE(I211_COPPER),
132         IGB_DEVICE(I354_BACKPLANE_1GBPS),
133         IGB_DEVICE(I354_BACKPLANE_2_5GBPS),
134         IGB_DEVICE(I354_SGMII),
135
136         /* required last entry */
137         IGB_DEVICE_NULL
138 };
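
/*
 * For reference, each IGB_DEVICE(id) entry in the table above expands,
 * via the macros defined earlier, to
 *
 *   { IGB_VENDOR_ID, E1000_DEV_ID_<id>, "Intel(R) PRO/1000 <id>" }
 *
 * which is what igb_probe() matches against the PCI vendor/device IDs.
 */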
139
140 static int      igb_probe(device_t);
141 static int      igb_attach(device_t);
142 static int      igb_detach(device_t);
143 static int      igb_shutdown(device_t);
144 static int      igb_suspend(device_t);
145 static int      igb_resume(device_t);
146
147 static boolean_t igb_is_valid_ether_addr(const uint8_t *);
148 static void     igb_setup_ifp(struct igb_softc *);
149 static boolean_t igb_txcsum_ctx(struct igb_tx_ring *, struct mbuf *);
150 static int      igb_tso_pullup(struct igb_tx_ring *, struct mbuf **);
151 static void     igb_tso_ctx(struct igb_tx_ring *, struct mbuf *, uint32_t *);
152 static void     igb_add_sysctl(struct igb_softc *);
153 static void     igb_add_intr_rate_sysctl(struct igb_softc *, int,
154                     const char *, const char *);
155 static int      igb_sysctl_intr_rate(SYSCTL_HANDLER_ARGS);
156 static int      igb_sysctl_tx_intr_nsegs(SYSCTL_HANDLER_ARGS);
157 static int      igb_sysctl_tx_wreg_nsegs(SYSCTL_HANDLER_ARGS);
158 static int      igb_sysctl_rx_wreg_nsegs(SYSCTL_HANDLER_ARGS);
159 static void     igb_set_ring_inuse(struct igb_softc *, boolean_t);
160 static int      igb_get_rxring_inuse(const struct igb_softc *, boolean_t);
161 static int      igb_get_txring_inuse(const struct igb_softc *, boolean_t);
162 static void     igb_set_timer_cpuid(struct igb_softc *, boolean_t);
163 #ifdef IFPOLL_ENABLE
164 static int      igb_sysctl_npoll_rxoff(SYSCTL_HANDLER_ARGS);
165 static int      igb_sysctl_npoll_txoff(SYSCTL_HANDLER_ARGS);
166 #endif
167
168 static void     igb_vf_init_stats(struct igb_softc *);
169 static void     igb_reset(struct igb_softc *, boolean_t);
170 static void     igb_update_stats_counters(struct igb_softc *);
171 static void     igb_update_vf_stats_counters(struct igb_softc *);
172 static void     igb_update_link_status(struct igb_softc *);
173 static void     igb_init_tx_unit(struct igb_softc *);
174 static void     igb_init_rx_unit(struct igb_softc *);
175 static void     igb_init_dmac(struct igb_softc *, uint32_t);
176
177 static void     igb_set_vlan(struct igb_softc *);
178 static void     igb_set_multi(struct igb_softc *);
179 static void     igb_set_promisc(struct igb_softc *);
180 static void     igb_disable_promisc(struct igb_softc *);
181
182 static int      igb_alloc_rings(struct igb_softc *);
183 static void     igb_free_rings(struct igb_softc *);
184 static int      igb_create_tx_ring(struct igb_tx_ring *);
185 static int      igb_create_rx_ring(struct igb_rx_ring *);
186 static void     igb_free_tx_ring(struct igb_tx_ring *);
187 static void     igb_free_rx_ring(struct igb_rx_ring *);
188 static void     igb_destroy_tx_ring(struct igb_tx_ring *, int);
189 static void     igb_destroy_rx_ring(struct igb_rx_ring *, int);
190 static void     igb_init_tx_ring(struct igb_tx_ring *);
191 static int      igb_init_rx_ring(struct igb_rx_ring *);
192 static int      igb_newbuf(struct igb_rx_ring *, int, boolean_t);
193 static int      igb_encap(struct igb_tx_ring *, struct mbuf **, int *, int *);
194 static void     igb_rx_refresh(struct igb_rx_ring *, int);
195 static void     igb_setup_serialize(struct igb_softc *);
196
197 static void     igb_stop(struct igb_softc *);
198 static void     igb_init(void *);
199 static int      igb_ioctl(struct ifnet *, u_long, caddr_t, struct ucred *);
200 static void     igb_media_status(struct ifnet *, struct ifmediareq *);
201 static int      igb_media_change(struct ifnet *);
202 static void     igb_timer(void *);
203 static void     igb_watchdog(struct ifaltq_subque *);
204 static void     igb_start(struct ifnet *, struct ifaltq_subque *);
205 #ifdef IFPOLL_ENABLE
206 static void     igb_npoll(struct ifnet *, struct ifpoll_info *);
207 static void     igb_npoll_rx(struct ifnet *, void *, int);
208 static void     igb_npoll_tx(struct ifnet *, void *, int);
209 static void     igb_npoll_status(struct ifnet *);
210 #endif
211 static void     igb_serialize(struct ifnet *, enum ifnet_serialize);
212 static void     igb_deserialize(struct ifnet *, enum ifnet_serialize);
213 static int      igb_tryserialize(struct ifnet *, enum ifnet_serialize);
214 #ifdef INVARIANTS
215 static void     igb_serialize_assert(struct ifnet *, enum ifnet_serialize,
216                     boolean_t);
217 #endif
218
219 static void     igb_intr(void *);
220 static void     igb_intr_shared(void *);
221 static void     igb_rxeof(struct igb_rx_ring *, int);
222 static void     igb_txeof(struct igb_tx_ring *, int);
223 static void     igb_set_eitr(struct igb_softc *, int, int);
224 static void     igb_enable_intr(struct igb_softc *);
225 static void     igb_disable_intr(struct igb_softc *);
226 static void     igb_init_unshared_intr(struct igb_softc *);
227 static void     igb_init_intr(struct igb_softc *);
228 static int      igb_setup_intr(struct igb_softc *);
229 static void     igb_set_txintr_mask(struct igb_tx_ring *, int *, int);
230 static void     igb_set_rxintr_mask(struct igb_rx_ring *, int *, int);
231 static void     igb_set_intr_mask(struct igb_softc *);
232 static int      igb_alloc_intr(struct igb_softc *);
233 static void     igb_free_intr(struct igb_softc *);
234 static void     igb_teardown_intr(struct igb_softc *, int);
235 static void     igb_alloc_msix(struct igb_softc *);
236 static void     igb_free_msix(struct igb_softc *, boolean_t);
237 static void     igb_msix_rx_conf(struct igb_softc *, int, int *, int);
238 static void     igb_msix_tx_conf(struct igb_softc *, int, int *, int);
239 static void     igb_msix_rx(void *);
240 static void     igb_msix_tx(void *);
241 static void     igb_msix_status(void *);
242 static void     igb_msix_rxtx(void *);
243
244 /* Management and WOL Support */
245 static void     igb_get_mgmt(struct igb_softc *);
246 static void     igb_rel_mgmt(struct igb_softc *);
247 static void     igb_get_hw_control(struct igb_softc *);
248 static void     igb_rel_hw_control(struct igb_softc *);
249 static void     igb_enable_wol(device_t);
250
251 static device_method_t igb_methods[] = {
252         /* Device interface */
253         DEVMETHOD(device_probe,         igb_probe),
254         DEVMETHOD(device_attach,        igb_attach),
255         DEVMETHOD(device_detach,        igb_detach),
256         DEVMETHOD(device_shutdown,      igb_shutdown),
257         DEVMETHOD(device_suspend,       igb_suspend),
258         DEVMETHOD(device_resume,        igb_resume),
259         DEVMETHOD_END
260 };
261
262 static driver_t igb_driver = {
263         "igb",
264         igb_methods,
265         sizeof(struct igb_softc),
266 };
267
268 static devclass_t igb_devclass;
269
270 DECLARE_DUMMY_MODULE(if_igb);
271 MODULE_DEPEND(igb, ig_hal, 1, 1, 1);
272 DRIVER_MODULE(if_igb, pci, igb_driver, igb_devclass, NULL, NULL);
273
274 static int      igb_rxd = IGB_DEFAULT_RXD;
275 static int      igb_txd = IGB_DEFAULT_TXD;
276 static int      igb_rxr = 0;
277 static int      igb_txr = 0;
278 static int      igb_msi_enable = 1;
279 static int      igb_msix_enable = 1;
280 static int      igb_msix_agg_rxtx = 1;
281 static int      igb_eee_disabled = 1;   /* Energy Efficient Ethernet */
282
283 static char     igb_flowctrl[IFM_ETH_FC_STRLEN] = IFM_ETH_FC_RXPAUSE;
284
285 /*
286  * DMA Coalescing, only for i350 - defaults to off;
287  * this feature is for power savings.
288  */
289 static int      igb_dma_coalesce = 0;
290
291 TUNABLE_INT("hw.igb.rxd", &igb_rxd);
292 TUNABLE_INT("hw.igb.txd", &igb_txd);
293 TUNABLE_INT("hw.igb.rxr", &igb_rxr);
294 TUNABLE_INT("hw.igb.txr", &igb_txr);
295 TUNABLE_INT("hw.igb.msi.enable", &igb_msi_enable);
296 TUNABLE_INT("hw.igb.msix.enable", &igb_msix_enable);
297 TUNABLE_INT("hw.igb.msix.agg_rxtx", &igb_msix_agg_rxtx);
298 TUNABLE_STR("hw.igb.flow_ctrl", igb_flowctrl, sizeof(igb_flowctrl));
299
300 /* i350 specific */
301 TUNABLE_INT("hw.igb.eee_disabled", &igb_eee_disabled);
302 TUNABLE_INT("hw.igb.dma_coalesce", &igb_dma_coalesce);
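
/*
 * Illustrative /boot/loader.conf settings for the integer tunables
 * registered above (example values only; the compiled-in defaults are
 * the igb_* variables near the top of this file):
 *
 *   hw.igb.rxd=2048
 *   hw.igb.txd=2048
 *   hw.igb.msix.enable=1
 *   hw.igb.eee_disabled=1
 */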
303
304 static __inline void
305 igb_rxcsum(uint32_t staterr, struct mbuf *mp)
306 {
307         /* The Ignore Checksum Indication bit is set; skip checksum results */
308         if (staterr & E1000_RXD_STAT_IXSM)
309                 return;
310
311         if ((staterr & (E1000_RXD_STAT_IPCS | E1000_RXDEXT_STATERR_IPE)) ==
312             E1000_RXD_STAT_IPCS)
313                 mp->m_pkthdr.csum_flags |= CSUM_IP_CHECKED | CSUM_IP_VALID;
314
315         if (staterr & (E1000_RXD_STAT_TCPCS | E1000_RXD_STAT_UDPCS)) {
316                 if ((staterr & E1000_RXDEXT_STATERR_TCPE) == 0) {
317                         mp->m_pkthdr.csum_flags |= CSUM_DATA_VALID |
318                             CSUM_PSEUDO_HDR | CSUM_FRAG_NOT_CHECKED;
319                         mp->m_pkthdr.csum_data = htons(0xffff);
320                 }
321         }
322 }
323
324 static __inline struct pktinfo *
325 igb_rssinfo(struct mbuf *m, struct pktinfo *pi,
326     uint32_t hash, uint32_t hashtype, uint32_t staterr)
327 {
328         switch (hashtype) {
329         case E1000_RXDADV_RSSTYPE_IPV4_TCP:
330                 pi->pi_netisr = NETISR_IP;
331                 pi->pi_flags = 0;
332                 pi->pi_l3proto = IPPROTO_TCP;
333                 break;
334
335         case E1000_RXDADV_RSSTYPE_IPV4:
336                 if (staterr & E1000_RXD_STAT_IXSM)
337                         return NULL;
338
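                /*
                 * Note (clarifying comment): when the L4 checksum was
                 * validated for a packet hashed as plain RSSTYPE_IPV4,
                 * it is presumably UDP; TCP packets would have been
                 * reported as RSSTYPE_IPV4_TCP and handled above, which
                 * is why pi_l3proto is set to IPPROTO_UDP below.
                 */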
339                 if ((staterr &
340                      (E1000_RXD_STAT_TCPCS | E1000_RXDEXT_STATERR_TCPE)) ==
341                     E1000_RXD_STAT_TCPCS) {
342                         pi->pi_netisr = NETISR_IP;
343                         pi->pi_flags = 0;
344                         pi->pi_l3proto = IPPROTO_UDP;
345                         break;
346                 }
347                 /* FALL THROUGH */
348         default:
349                 return NULL;
350         }
351
352         m->m_flags |= M_HASH;
353         m->m_pkthdr.hash = toeplitz_hash(hash);
354         return pi;
355 }
356
357 static int
358 igb_probe(device_t dev)
359 {
360         const struct igb_device *d;
361         uint16_t vid, did;
362
363         vid = pci_get_vendor(dev);
364         did = pci_get_device(dev);
365
366         for (d = igb_devices; d->desc != NULL; ++d) {
367                 if (vid == d->vid && did == d->did) {
368                         device_set_desc(dev, d->desc);
369                         return 0;
370                 }
371         }
372         return ENXIO;
373 }
374
375 static int
376 igb_attach(device_t dev)
377 {
378         struct igb_softc *sc = device_get_softc(dev);
379         uint16_t eeprom_data;
380         int error = 0, ring_max;
381         char flowctrl[IFM_ETH_FC_STRLEN];
382 #ifdef IFPOLL_ENABLE
383         int offset, offset_def;
384 #endif
385
386 #ifdef notyet
387         /* SYSCTL stuff */
388         SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev),
389             SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
390             OID_AUTO, "nvm", CTLTYPE_INT|CTLFLAG_RW, adapter, 0,
391             igb_sysctl_nvm_info, "I", "NVM Information");
392 #endif
393
394         ifmedia_init(&sc->media, IFM_IMASK | IFM_ETH_FCMASK,
395             igb_media_change, igb_media_status);
396         callout_init_mp(&sc->timer);
397         lwkt_serialize_init(&sc->main_serialize);
398
399         if_initname(&sc->arpcom.ac_if, device_get_name(dev),
400             device_get_unit(dev));
401         sc->dev = sc->osdep.dev = dev;
402
403         /* Enable bus mastering */
404         pci_enable_busmaster(dev);
405
406         /*
407          * Determine hardware and mac type
408          */
409         sc->hw.vendor_id = pci_get_vendor(dev);
410         sc->hw.device_id = pci_get_device(dev);
411         sc->hw.revision_id = pci_read_config(dev, PCIR_REVID, 1);
412         sc->hw.subsystem_vendor_id = pci_read_config(dev, PCIR_SUBVEND_0, 2);
413         sc->hw.subsystem_device_id = pci_read_config(dev, PCIR_SUBDEV_0, 2);
414
415         if (e1000_set_mac_type(&sc->hw))
416                 return ENXIO;
417
418         /* Are we a VF device? */
419         if (sc->hw.mac.type == e1000_vfadapt ||
420             sc->hw.mac.type == e1000_vfadapt_i350)
421                 sc->vf_ifp = 1;
422         else
423                 sc->vf_ifp = 0;
424
425         /*
426          * Configure total supported RX/TX ring count
427          */
428         switch (sc->hw.mac.type) {
429         case e1000_82575:
430                 ring_max = IGB_MAX_RING_82575;
431                 break;
432
433         case e1000_82576:
434                 ring_max = IGB_MAX_RING_82576;
435                 break;
436
437         case e1000_82580:
438                 ring_max = IGB_MAX_RING_82580;
439                 break;
440
441         case e1000_i350:
442                 ring_max = IGB_MAX_RING_I350;
443                 break;
444
445         case e1000_i354:
446                 ring_max = IGB_MAX_RING_I354;
447                 break;
448
449         case e1000_i210:
450                 ring_max = IGB_MAX_RING_I210;
451                 break;
452
453         case e1000_i211:
454                 ring_max = IGB_MAX_RING_I211;
455                 break;
456
457         default:
458                 ring_max = IGB_MIN_RING;
459                 break;
460         }
461
462         sc->rx_ring_cnt = device_getenv_int(dev, "rxr", igb_rxr);
463         sc->rx_ring_cnt = if_ring_count2(sc->rx_ring_cnt, ring_max);
464 #ifdef IGB_RSS_DEBUG
465         sc->rx_ring_cnt = device_getenv_int(dev, "rxr_debug", sc->rx_ring_cnt);
466 #endif
467         sc->rx_ring_inuse = sc->rx_ring_cnt;
468
469         sc->tx_ring_cnt = device_getenv_int(dev, "txr", igb_txr);
470         sc->tx_ring_cnt = if_ring_count2(sc->tx_ring_cnt, ring_max);
471 #ifdef IGB_TSS_DEBUG
472         sc->tx_ring_cnt = device_getenv_int(dev, "txr_debug", sc->tx_ring_cnt);
473 #endif
474         sc->tx_ring_inuse = sc->tx_ring_cnt;
475
476         /* Setup flow control. */
477         device_getenv_string(dev, "flow_ctrl", flowctrl, sizeof(flowctrl),
478             igb_flowctrl);
479         sc->ifm_flowctrl = ifmedia_str2ethfc(flowctrl);
480
481         /*
482          * Allocate IO memory
483          */
484         sc->mem_rid = PCIR_BAR(0);
485         sc->mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &sc->mem_rid,
486             RF_ACTIVE);
487         if (sc->mem_res == NULL) {
488                 device_printf(dev, "Unable to allocate bus resource: memory\n");
489                 error = ENXIO;
490                 goto failed;
491         }
492         sc->osdep.mem_bus_space_tag = rman_get_bustag(sc->mem_res);
493         sc->osdep.mem_bus_space_handle = rman_get_bushandle(sc->mem_res);
494
495         sc->hw.hw_addr = (uint8_t *)&sc->osdep.mem_bus_space_handle;
496
497         /* Save PCI command register for Shared Code */
498         sc->hw.bus.pci_cmd_word = pci_read_config(dev, PCIR_COMMAND, 2);
499         sc->hw.back = &sc->osdep;
500
501         /* Do Shared Code initialization */
502         if (e1000_setup_init_funcs(&sc->hw, TRUE)) {
503                 device_printf(dev, "Setup of Shared code failed\n");
504                 error = ENXIO;
505                 goto failed;
506         }
507
508         e1000_get_bus_info(&sc->hw);
509
510         sc->hw.mac.autoneg = DO_AUTO_NEG;
511         sc->hw.phy.autoneg_wait_to_complete = FALSE;
512         sc->hw.phy.autoneg_advertised = AUTONEG_ADV_DEFAULT;
513
514         /* Copper options */
515         if (sc->hw.phy.media_type == e1000_media_type_copper) {
516                 sc->hw.phy.mdix = AUTO_ALL_MODES;
517                 sc->hw.phy.disable_polarity_correction = FALSE;
518                 sc->hw.phy.ms_type = IGB_MASTER_SLAVE;
519         }
520
521         /* Set the frame limits assuming standard Ethernet-sized frames. */
522         sc->max_frame_size = ETHERMTU + ETHER_HDR_LEN + ETHER_CRC_LEN;
523
524         /* Allocate RX/TX rings */
525         error = igb_alloc_rings(sc);
526         if (error)
527                 goto failed;
528
529 #ifdef IFPOLL_ENABLE
530         /*
531          * NPOLLING RX CPU offset
532          */
533         if (sc->rx_ring_cnt == ncpus2) {
534                 offset = 0;
535         } else {
536                 offset_def = (sc->rx_ring_cnt * device_get_unit(dev)) % ncpus2;
537                 offset = device_getenv_int(dev, "npoll.rxoff", offset_def);
538                 if (offset >= ncpus2 ||
539                     offset % sc->rx_ring_cnt != 0) {
540                         device_printf(dev, "invalid npoll.rxoff %d, use %d\n",
541                             offset, offset_def);
542                         offset = offset_def;
543                 }
544         }
545         sc->rx_npoll_off = offset;
546
547         /*
548          * NPOLLING TX CPU offset
549          */
550         if (sc->tx_ring_cnt == ncpus2) {
551                 offset = 0;
552         } else {
553                 offset_def = (sc->tx_ring_cnt * device_get_unit(dev)) % ncpus2;
554                 offset = device_getenv_int(dev, "npoll.txoff", offset_def);
555                 if (offset >= ncpus2 ||
556                     offset % sc->tx_ring_cnt != 0) {
557                         device_printf(dev, "invalid npoll.txoff %d, use %d\n",
558                             offset, offset_def);
559                         offset = offset_def;
560                 }
561         }
562         sc->tx_npoll_off = offset;
563 #endif
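
        /*
         * Worked example of the npoll offset selection above
         * (illustrative numbers): with ncpus2 == 4, rx_ring_cnt == 2
         * and device unit 1, offset_def = (2 * 1) % 4 = 2.  A tunable
         * npoll.rxoff value is accepted only if it is below ncpus2 and
         * a multiple of the ring count, so 2 would be kept while 3
         * would fall back to the default.  The same rule applies to
         * npoll.txoff.
         */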
564
565         /* Allocate interrupt */
566         error = igb_alloc_intr(sc);
567         if (error)
568                 goto failed;
569
570         /* Setup serializes */
571         igb_setup_serialize(sc);
572
573         /* Allocate the appropriate stats memory */
574         if (sc->vf_ifp) {
575                 sc->stats = kmalloc(sizeof(struct e1000_vf_stats), M_DEVBUF,
576                     M_WAITOK | M_ZERO);
577                 igb_vf_init_stats(sc);
578         } else {
579                 sc->stats = kmalloc(sizeof(struct e1000_hw_stats), M_DEVBUF,
580                     M_WAITOK | M_ZERO);
581         }
582
583         /* Allocate multicast array memory. */
584         sc->mta = kmalloc(ETHER_ADDR_LEN * MAX_NUM_MULTICAST_ADDRESSES,
585             M_DEVBUF, M_WAITOK);
586
587         /* Some adapter-specific advanced features */
588         if (sc->hw.mac.type >= e1000_i350) {
589 #ifdef notyet
590                 igb_set_sysctl_value(adapter, "dma_coalesce",
591                     "configure dma coalesce",
592                     &adapter->dma_coalesce, igb_dma_coalesce);
593                 igb_set_sysctl_value(adapter, "eee_disabled",
594                     "enable Energy Efficient Ethernet",
595                     &adapter->hw.dev_spec._82575.eee_disable,
596                     igb_eee_disabled);
597 #else
598                 sc->dma_coalesce = igb_dma_coalesce;
599                 sc->hw.dev_spec._82575.eee_disable = igb_eee_disabled;
600 #endif
601                 if (sc->hw.phy.media_type == e1000_media_type_copper) {
602                         if (sc->hw.mac.type == e1000_i354)
603                                 e1000_set_eee_i354(&sc->hw, TRUE, TRUE);
604                         else
605                                 e1000_set_eee_i350(&sc->hw, TRUE, TRUE);
606                 }
607         }
608
609         /*
610          * Start from a known state; this is important for reading the NVM
611          * and MAC address from the hardware.
612          */
613         e1000_reset_hw(&sc->hw);
614
615         /* Make sure we have a good EEPROM before we read from it */
616         if (sc->hw.mac.type != e1000_i210 && sc->hw.mac.type != e1000_i211 &&
617             e1000_validate_nvm_checksum(&sc->hw) < 0) {
618                 /*
619                  * Some PCI-E parts fail the first check due to
620                  * the link being in a sleep state; call it again.
621                  * If it fails a second time, it is a real issue.
622                  */
623                 if (e1000_validate_nvm_checksum(&sc->hw) < 0) {
624                         device_printf(dev,
625                             "The EEPROM Checksum Is Not Valid\n");
626                         error = EIO;
627                         goto failed;
628                 }
629         }
630
631         /* Copy the permanent MAC address out of the EEPROM */
632         if (e1000_read_mac_addr(&sc->hw) < 0) {
633                 device_printf(dev, "EEPROM read error while reading MAC"
634                     " address\n");
635                 error = EIO;
636                 goto failed;
637         }
638         if (!igb_is_valid_ether_addr(sc->hw.mac.addr)) {
639                 device_printf(dev, "Invalid MAC address\n");
640                 error = EIO;
641                 goto failed;
642         }
643
644         /* Setup OS specific network interface */
645         igb_setup_ifp(sc);
646
647         /* Add sysctl tree; must be called after igb_setup_ifp() */
648         igb_add_sysctl(sc);
649
650         /* Now get a good starting state */
651         igb_reset(sc, FALSE);
652
653         /* Initialize statistics */
654         igb_update_stats_counters(sc);
655
656         sc->hw.mac.get_link_status = 1;
657         igb_update_link_status(sc);
658
659         /* Indicate SOL/IDER usage */
660         if (e1000_check_reset_block(&sc->hw)) {
661                 device_printf(dev,
662                     "PHY reset is blocked due to SOL/IDER session.\n");
663         }
664
665         /* Determine if we have to control management hardware */
666         if (e1000_enable_mng_pass_thru(&sc->hw))
667                 sc->flags |= IGB_FLAG_HAS_MGMT;
668
669         /*
670          * Setup Wake-on-Lan
671          */
672         /* APME bit in EEPROM is mapped to WUC.APME */
673         eeprom_data = E1000_READ_REG(&sc->hw, E1000_WUC) & E1000_WUC_APME;
674         if (eeprom_data)
675                 sc->wol = E1000_WUFC_MAG;
676         /* XXX disable WOL */
677         sc->wol = 0; 
678
679 #ifdef notyet
680         /* Register for VLAN events */
681         adapter->vlan_attach = EVENTHANDLER_REGISTER(vlan_config,
682              igb_register_vlan, adapter, EVENTHANDLER_PRI_FIRST);
683         adapter->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig,
684              igb_unregister_vlan, adapter, EVENTHANDLER_PRI_FIRST);
685 #endif
686
687 #ifdef notyet
688         igb_add_hw_stats(adapter);
689 #endif
690
691         /*
692          * Disable interrupts to prevent spurious interrupts (line-based,
693          * MSI or even MSI-X), which have been observed on several types
694          * of LOMs, from being handled.
695          */
696         igb_disable_intr(sc);
697
698         error = igb_setup_intr(sc);
699         if (error) {
700                 ether_ifdetach(&sc->arpcom.ac_if);
701                 goto failed;
702         }
703         return 0;
704
705 failed:
706         igb_detach(dev);
707         return error;
708 }
709
710 static int
711 igb_detach(device_t dev)
712 {
713         struct igb_softc *sc = device_get_softc(dev);
714
715         if (device_is_attached(dev)) {
716                 struct ifnet *ifp = &sc->arpcom.ac_if;
717
718                 ifnet_serialize_all(ifp);
719
720                 igb_stop(sc);
721
722                 e1000_phy_hw_reset(&sc->hw);
723
724                 /* Give control back to firmware */
725                 igb_rel_mgmt(sc);
726                 igb_rel_hw_control(sc);
727
728                 if (sc->wol) {
729                         E1000_WRITE_REG(&sc->hw, E1000_WUC, E1000_WUC_PME_EN);
730                         E1000_WRITE_REG(&sc->hw, E1000_WUFC, sc->wol);
731                         igb_enable_wol(dev);
732                 }
733
734                 igb_teardown_intr(sc, sc->intr_cnt);
735
736                 ifnet_deserialize_all(ifp);
737
738                 ether_ifdetach(ifp);
739         } else if (sc->mem_res != NULL) {
740                 igb_rel_hw_control(sc);
741         }
742
743         ifmedia_removeall(&sc->media);
744         bus_generic_detach(dev);
745
746         igb_free_intr(sc);
747
748         if (sc->msix_mem_res != NULL) {
749                 bus_release_resource(dev, SYS_RES_MEMORY, sc->msix_mem_rid,
750                     sc->msix_mem_res);
751         }
752         if (sc->mem_res != NULL) {
753                 bus_release_resource(dev, SYS_RES_MEMORY, sc->mem_rid,
754                     sc->mem_res);
755         }
756
757         igb_free_rings(sc);
758
759         if (sc->mta != NULL)
760                 kfree(sc->mta, M_DEVBUF);
761         if (sc->stats != NULL)
762                 kfree(sc->stats, M_DEVBUF);
763         if (sc->serializes != NULL)
764                 kfree(sc->serializes, M_DEVBUF);
765
766         return 0;
767 }
768
769 static int
770 igb_shutdown(device_t dev)
771 {
772         return igb_suspend(dev);
773 }
774
775 static int
776 igb_suspend(device_t dev)
777 {
778         struct igb_softc *sc = device_get_softc(dev);
779         struct ifnet *ifp = &sc->arpcom.ac_if;
780
781         ifnet_serialize_all(ifp);
782
783         igb_stop(sc);
784
785         igb_rel_mgmt(sc);
786         igb_rel_hw_control(sc);
787
788         if (sc->wol) {
789                 E1000_WRITE_REG(&sc->hw, E1000_WUC, E1000_WUC_PME_EN);
790                 E1000_WRITE_REG(&sc->hw, E1000_WUFC, sc->wol);
791                 igb_enable_wol(dev);
792         }
793
794         ifnet_deserialize_all(ifp);
795
796         return bus_generic_suspend(dev);
797 }
798
799 static int
800 igb_resume(device_t dev)
801 {
802         struct igb_softc *sc = device_get_softc(dev);
803         struct ifnet *ifp = &sc->arpcom.ac_if;
804         int i;
805
806         ifnet_serialize_all(ifp);
807
808         igb_init(sc);
809         igb_get_mgmt(sc);
810
811         for (i = 0; i < sc->tx_ring_inuse; ++i)
812                 ifsq_devstart_sched(sc->tx_rings[i].ifsq);
813
814         ifnet_deserialize_all(ifp);
815
816         return bus_generic_resume(dev);
817 }
818
819 static int
820 igb_ioctl(struct ifnet *ifp, u_long command, caddr_t data, struct ucred *cr)
821 {
822         struct igb_softc *sc = ifp->if_softc;
823         struct ifreq *ifr = (struct ifreq *)data;
824         int max_frame_size, mask, reinit;
825         int error = 0;
826
827         ASSERT_IFNET_SERIALIZED_ALL(ifp);
828
829         switch (command) {
830         case SIOCSIFMTU:
831                 max_frame_size = 9234;
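                /*
                 * 9234 bytes leaves room for a 9216-byte jumbo MTU
                 * plus ETHER_HDR_LEN (14) and ETHER_CRC_LEN (4),
                 * matching the MTU check below.
                 */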
832                 if (ifr->ifr_mtu > max_frame_size - ETHER_HDR_LEN -
833                     ETHER_CRC_LEN) {
834                         error = EINVAL;
835                         break;
836                 }
837
838                 ifp->if_mtu = ifr->ifr_mtu;
839                 sc->max_frame_size = ifp->if_mtu + ETHER_HDR_LEN +
840                     ETHER_CRC_LEN;
841
842                 if (ifp->if_flags & IFF_RUNNING)
843                         igb_init(sc);
844                 break;
845
846         case SIOCSIFFLAGS:
847                 if (ifp->if_flags & IFF_UP) {
848                         if (ifp->if_flags & IFF_RUNNING) {
849                                 if ((ifp->if_flags ^ sc->if_flags) &
850                                     (IFF_PROMISC | IFF_ALLMULTI)) {
851                                         igb_disable_promisc(sc);
852                                         igb_set_promisc(sc);
853                                 }
854                         } else {
855                                 igb_init(sc);
856                         }
857                 } else if (ifp->if_flags & IFF_RUNNING) {
858                         igb_stop(sc);
859                 }
860                 sc->if_flags = ifp->if_flags;
861                 break;
862
863         case SIOCADDMULTI:
864         case SIOCDELMULTI:
865                 if (ifp->if_flags & IFF_RUNNING) {
866                         igb_disable_intr(sc);
867                         igb_set_multi(sc);
868 #ifdef IFPOLL_ENABLE
869                         if (!(ifp->if_flags & IFF_NPOLLING))
870 #endif
871                                 igb_enable_intr(sc);
872                 }
873                 break;
874
875         case SIOCSIFMEDIA:
876                 /* Check SOL/IDER usage */
877                 if (e1000_check_reset_block(&sc->hw)) {
878                         if_printf(ifp, "Media change is "
879                             "blocked due to SOL/IDER session.\n");
880                         break;
881                 }
882                 /* FALL THROUGH */
883
884         case SIOCGIFMEDIA:
885                 error = ifmedia_ioctl(ifp, ifr, &sc->media, command);
886                 break;
887
888         case SIOCSIFCAP:
889                 reinit = 0;
890                 mask = ifr->ifr_reqcap ^ ifp->if_capenable;
891                 if (mask & IFCAP_RXCSUM) {
892                         ifp->if_capenable ^= IFCAP_RXCSUM;
893                         reinit = 1;
894                 }
895                 if (mask & IFCAP_VLAN_HWTAGGING) {
896                         ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
897                         reinit = 1;
898                 }
899                 if (mask & IFCAP_TXCSUM) {
900                         ifp->if_capenable ^= IFCAP_TXCSUM;
901                         if (ifp->if_capenable & IFCAP_TXCSUM)
902                                 ifp->if_hwassist |= IGB_CSUM_FEATURES;
903                         else
904                                 ifp->if_hwassist &= ~IGB_CSUM_FEATURES;
905                 }
906                 if (mask & IFCAP_TSO) {
907                         ifp->if_capenable ^= IFCAP_TSO;
908                         if (ifp->if_capenable & IFCAP_TSO)
909                                 ifp->if_hwassist |= CSUM_TSO;
910                         else
911                                 ifp->if_hwassist &= ~CSUM_TSO;
912                 }
913                 if (mask & IFCAP_RSS)
914                         ifp->if_capenable ^= IFCAP_RSS;
915                 if (reinit && (ifp->if_flags & IFF_RUNNING))
916                         igb_init(sc);
917                 break;
918
919         default:
920                 error = ether_ioctl(ifp, command, data);
921                 break;
922         }
923         return error;
924 }
925
926 static void
927 igb_init(void *xsc)
928 {
929         struct igb_softc *sc = xsc;
930         struct ifnet *ifp = &sc->arpcom.ac_if;
931         boolean_t polling;
932         int i;
933
934         ASSERT_IFNET_SERIALIZED_ALL(ifp);
935
936         igb_stop(sc);
937
938         /* Get the latest MAC address; the user can use a LAA */
939         bcopy(IF_LLADDR(ifp), sc->hw.mac.addr, ETHER_ADDR_LEN);
940
941         /* Put the address into the Receive Address Array */
942         e1000_rar_set(&sc->hw, sc->hw.mac.addr, 0);
943
944         igb_reset(sc, FALSE);
945         igb_update_link_status(sc);
946
947         E1000_WRITE_REG(&sc->hw, E1000_VET, ETHERTYPE_VLAN);
948
949         /* Configure for OS presence */
950         igb_get_mgmt(sc);
951
952         polling = FALSE;
953 #ifdef IFPOLL_ENABLE
954         if (ifp->if_flags & IFF_NPOLLING)
955                 polling = TRUE;
956 #endif
957
958         /* Configure the RX/TX rings to be used */
959         igb_set_ring_inuse(sc, polling);
960         ifq_set_subq_mask(&ifp->if_snd, sc->tx_ring_inuse - 1);
961
962         /* Initialize interrupt */
963         igb_init_intr(sc);
964
965         /* Prepare transmit descriptors and buffers */
966         for (i = 0; i < sc->tx_ring_inuse; ++i)
967                 igb_init_tx_ring(&sc->tx_rings[i]);
968         igb_init_tx_unit(sc);
969
970         /* Setup Multicast table */
971         igb_set_multi(sc);
972
973 #if 0
974         /*
975          * Figure out the desired mbuf pool
976          * for doing jumbo/packetsplit
977          */
978         if (adapter->max_frame_size <= 2048)
979                 adapter->rx_mbuf_sz = MCLBYTES;
980         else if (adapter->max_frame_size <= 4096)
981                 adapter->rx_mbuf_sz = MJUMPAGESIZE;
982         else
983                 adapter->rx_mbuf_sz = MJUM9BYTES;
984 #endif
985
986         /* Prepare receive descriptors and buffers */
987         for (i = 0; i < sc->rx_ring_inuse; ++i) {
988                 int error;
989
990                 error = igb_init_rx_ring(&sc->rx_rings[i]);
991                 if (error) {
992                         if_printf(ifp, "Could not setup receive structures\n");
993                         igb_stop(sc);
994                         return;
995                 }
996         }
997         igb_init_rx_unit(sc);
998
999         /* Enable VLAN support */
1000         if (ifp->if_capenable & IFCAP_VLAN_HWTAGGING)
1001                 igb_set_vlan(sc);
1002
1003         /* Don't lose promiscuous settings */
1004         igb_set_promisc(sc);
1005
1006         ifp->if_flags |= IFF_RUNNING;
1007         for (i = 0; i < sc->tx_ring_inuse; ++i) {
1008                 ifsq_clr_oactive(sc->tx_rings[i].ifsq);
1009                 ifsq_watchdog_start(&sc->tx_rings[i].tx_watchdog);
1010         }
1011
1012         igb_set_timer_cpuid(sc, polling);
1013         callout_reset_bycpu(&sc->timer, hz, igb_timer, sc, sc->timer_cpuid);
1014         e1000_clear_hw_cntrs_base_generic(&sc->hw);
1015
1016         /* This clears any pending interrupts */
1017         E1000_READ_REG(&sc->hw, E1000_ICR);
1018
1019         /*
1020          * Only enable interrupts if we are not polling; make sure
1021          * they are off otherwise.
1022          */
1023         if (polling) {
1024                 igb_disable_intr(sc);
1025         } else {
1026                 igb_enable_intr(sc);
1027                 E1000_WRITE_REG(&sc->hw, E1000_ICS, E1000_ICS_LSC);
1028         }
1029
1030         /* Set Energy Efficient Ethernet */
1031         if (sc->hw.phy.media_type == e1000_media_type_copper) {
1032                 if (sc->hw.mac.type == e1000_i354)
1033                         e1000_set_eee_i354(&sc->hw, TRUE, TRUE);
1034                 else
1035                         e1000_set_eee_i350(&sc->hw, TRUE, TRUE);
1036         }
1037 }
1038
1039 static void
1040 igb_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
1041 {
1042         struct igb_softc *sc = ifp->if_softc;
1043
1044         ASSERT_IFNET_SERIALIZED_ALL(ifp);
1045
1046         if ((ifp->if_flags & IFF_RUNNING) == 0)
1047                 sc->hw.mac.get_link_status = 1;
1048         igb_update_link_status(sc);
1049
1050         ifmr->ifm_status = IFM_AVALID;
1051         ifmr->ifm_active = IFM_ETHER;
1052
1053         if (!sc->link_active) {
1054                 if (sc->hw.mac.autoneg)
1055                         ifmr->ifm_active |= IFM_NONE;
1056                 else
1057                         ifmr->ifm_active |= sc->media.ifm_media;
1058                 return;
1059         }
1060
1061         ifmr->ifm_status |= IFM_ACTIVE;
1062         if (sc->ifm_flowctrl & IFM_ETH_FORCEPAUSE)
1063                 ifmr->ifm_active |= sc->ifm_flowctrl;
1064
1065         switch (sc->link_speed) {
1066         case 10:
1067                 ifmr->ifm_active |= IFM_10_T;
1068                 break;
1069
1070         case 100:
1071                 /*
1072                  * Support for 100Mb SFP - these are Fiber 
1073                  * but the media type appears as serdes
1074                  */
1075                 if (sc->hw.phy.media_type == e1000_media_type_fiber ||
1076                     sc->hw.phy.media_type == e1000_media_type_internal_serdes)
1077                         ifmr->ifm_active |= IFM_100_FX;
1078                 else
1079                         ifmr->ifm_active |= IFM_100_TX;
1080                 break;
1081
1082         case 1000:
1083                 if (sc->hw.phy.media_type == e1000_media_type_fiber ||
1084                     sc->hw.phy.media_type == e1000_media_type_internal_serdes)
1085                         ifmr->ifm_active |= IFM_1000_SX;
1086                 else
1087                         ifmr->ifm_active |= IFM_1000_T;
1088                 break;
1089
1090         case 2500:
1091                 ifmr->ifm_active |= IFM_2500_SX;
1092                 break;
1093         }
1094
1095         if (sc->link_duplex == FULL_DUPLEX)
1096                 ifmr->ifm_active |= IFM_FDX;
1097         else
1098                 ifmr->ifm_active |= IFM_HDX;
1099
1100         if (sc->link_duplex == FULL_DUPLEX)
1101                 ifmr->ifm_active |= e1000_fc2ifmedia(sc->hw.fc.current_mode);
1102 }
1103
1104 static int
1105 igb_media_change(struct ifnet *ifp)
1106 {
1107         struct igb_softc *sc = ifp->if_softc;
1108         struct ifmedia *ifm = &sc->media;
1109
1110         ASSERT_IFNET_SERIALIZED_ALL(ifp);
1111
1112         if (IFM_TYPE(ifm->ifm_media) != IFM_ETHER)
1113                 return EINVAL;
1114
1115         switch (IFM_SUBTYPE(ifm->ifm_media)) {
1116         case IFM_AUTO:
1117                 sc->hw.mac.autoneg = DO_AUTO_NEG;
1118                 sc->hw.phy.autoneg_advertised = AUTONEG_ADV_DEFAULT;
1119                 break;
1120
1121         case IFM_1000_SX:
1122         case IFM_1000_T:
1123                 sc->hw.mac.autoneg = DO_AUTO_NEG;
1124                 sc->hw.phy.autoneg_advertised = ADVERTISE_1000_FULL;
1125                 break;
1126
1127         case IFM_100_TX:
1128                 if (IFM_OPTIONS(ifm->ifm_media) & IFM_FDX) {
1129                         sc->hw.mac.forced_speed_duplex = ADVERTISE_100_FULL;
1130                 } else {
1131                         if (IFM_OPTIONS(ifm->ifm_media) &
1132                             (IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE)) {
1133                                 if (bootverbose) {
1134                                         if_printf(ifp, "Flow control is not "
1135                                             "allowed for half-duplex\n");
1136                                 }
1137                                 return EINVAL;
1138                         }
1139                         sc->hw.mac.forced_speed_duplex = ADVERTISE_100_HALF;
1140                 }
1141                 sc->hw.mac.autoneg = FALSE;
1142                 sc->hw.phy.autoneg_advertised = 0;
1143                 break;
1144
1145         case IFM_10_T:
1146                 if (IFM_OPTIONS(ifm->ifm_media) & IFM_FDX) {
1147                         sc->hw.mac.forced_speed_duplex = ADVERTISE_10_FULL;
1148                 } else {
1149                         if (IFM_OPTIONS(ifm->ifm_media) &
1150                             (IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE)) {
1151                                 if (bootverbose) {
1152                                         if_printf(ifp, "Flow control is not "
1153                                             "allowed for half-duplex\n");
1154                                 }
1155                                 return EINVAL;
1156                         }
1157                         sc->hw.mac.forced_speed_duplex = ADVERTISE_10_HALF;
1158                 }
1159                 sc->hw.mac.autoneg = FALSE;
1160                 sc->hw.phy.autoneg_advertised = 0;
1161                 break;
1162
1163         default:
1164                 if (bootverbose) {
1165                         if_printf(ifp, "Unsupported media type %d\n",
1166                             IFM_SUBTYPE(ifm->ifm_media));
1167                 }
1168                 return EINVAL;
1169         }
1170         sc->ifm_flowctrl = ifm->ifm_media & IFM_ETH_FCMASK;
1171
1172         if (ifp->if_flags & IFF_RUNNING)
1173                 igb_init(sc);
1174
1175         return 0;
1176 }
1177
1178 static void
1179 igb_set_promisc(struct igb_softc *sc)
1180 {
1181         struct ifnet *ifp = &sc->arpcom.ac_if;
1182         struct e1000_hw *hw = &sc->hw;
1183         uint32_t reg;
1184
1185         if (sc->vf_ifp) {
1186                 e1000_promisc_set_vf(hw, e1000_promisc_enabled);
1187                 return;
1188         }
1189
1190         reg = E1000_READ_REG(hw, E1000_RCTL);
1191         if (ifp->if_flags & IFF_PROMISC) {
1192                 reg |= (E1000_RCTL_UPE | E1000_RCTL_MPE);
1193                 E1000_WRITE_REG(hw, E1000_RCTL, reg);
1194         } else if (ifp->if_flags & IFF_ALLMULTI) {
1195                 reg |= E1000_RCTL_MPE;
1196                 reg &= ~E1000_RCTL_UPE;
1197                 E1000_WRITE_REG(hw, E1000_RCTL, reg);
1198         }
1199 }
1200
1201 static void
1202 igb_disable_promisc(struct igb_softc *sc)
1203 {
1204         struct e1000_hw *hw = &sc->hw;
1205         struct ifnet *ifp = &sc->arpcom.ac_if;
1206         uint32_t reg;
1207         int mcnt = 0;
1208
1209         if (sc->vf_ifp) {
1210                 e1000_promisc_set_vf(hw, e1000_promisc_disabled);
1211                 return;
1212         }
1213         reg = E1000_READ_REG(hw, E1000_RCTL);
1214         reg &= ~E1000_RCTL_UPE;
1215         if (ifp->if_flags & IFF_ALLMULTI) {
1216                 mcnt = MAX_NUM_MULTICAST_ADDRESSES;
1217         } else {
1218                 struct  ifmultiaddr *ifma;
1219                 TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1220                         if (ifma->ifma_addr->sa_family != AF_LINK)
1221                                 continue;
1222                         if (mcnt == MAX_NUM_MULTICAST_ADDRESSES)
1223                                 break;
1224                         mcnt++;
1225                 }
1226         }
1227         /* Don't disable if in MAX groups */
1228         if (mcnt < MAX_NUM_MULTICAST_ADDRESSES)
1229                 reg &= ~E1000_RCTL_MPE;
1230         E1000_WRITE_REG(hw, E1000_RCTL, reg);
1231 }
1232
1233 static void
1234 igb_set_multi(struct igb_softc *sc)
1235 {
1236         struct ifnet *ifp = &sc->arpcom.ac_if;
1237         struct ifmultiaddr *ifma;
1238         uint32_t reg_rctl = 0;
1239         uint8_t *mta;
1240         int mcnt = 0;
1241
1242         mta = sc->mta;
1243         bzero(mta, ETH_ADDR_LEN * MAX_NUM_MULTICAST_ADDRESSES);
1244
1245         TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1246                 if (ifma->ifma_addr->sa_family != AF_LINK)
1247                         continue;
1248
1249                 if (mcnt == MAX_NUM_MULTICAST_ADDRESSES)
1250                         break;
1251
1252                 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1253                     &mta[mcnt * ETH_ADDR_LEN], ETH_ADDR_LEN);
1254                 mcnt++;
1255         }
1256
1257         if (mcnt >= MAX_NUM_MULTICAST_ADDRESSES) {
1258                 reg_rctl = E1000_READ_REG(&sc->hw, E1000_RCTL);
1259                 reg_rctl |= E1000_RCTL_MPE;
1260                 E1000_WRITE_REG(&sc->hw, E1000_RCTL, reg_rctl);
1261         } else {
1262                 e1000_update_mc_addr_list(&sc->hw, mta, mcnt);
1263         }
1264 }
1265
1266 static void
1267 igb_timer(void *xsc)
1268 {
1269         struct igb_softc *sc = xsc;
1270
1271         lwkt_serialize_enter(&sc->main_serialize);
1272
1273         igb_update_link_status(sc);
1274         igb_update_stats_counters(sc);
1275
1276         callout_reset_bycpu(&sc->timer, hz, igb_timer, sc, sc->timer_cpuid);
1277
1278         lwkt_serialize_exit(&sc->main_serialize);
1279 }
1280
1281 static void
1282 igb_update_link_status(struct igb_softc *sc)
1283 {
1284         struct ifnet *ifp = &sc->arpcom.ac_if;
1285         struct e1000_hw *hw = &sc->hw;
1286         uint32_t link_check, thstat, ctrl;
1287
1288         link_check = thstat = ctrl = 0;
1289
1290         /* Get the cached link value or read for real */
1291         switch (hw->phy.media_type) {
1292         case e1000_media_type_copper:
1293                 if (hw->mac.get_link_status) {
1294                         /* Do the work to read phy */
1295                         e1000_check_for_link(hw);
1296                         link_check = !hw->mac.get_link_status;
1297                 } else {
1298                         link_check = TRUE;
1299                 }
1300                 break;
1301
1302         case e1000_media_type_fiber:
1303                 e1000_check_for_link(hw);
1304                 link_check = E1000_READ_REG(hw, E1000_STATUS) & E1000_STATUS_LU;
1305                 break;
1306
1307         case e1000_media_type_internal_serdes:
1308                 e1000_check_for_link(hw);
1309                 link_check = hw->mac.serdes_has_link;
1310                 break;
1311
1312         /* VF device is type_unknown */
1313         case e1000_media_type_unknown:
1314                 e1000_check_for_link(hw);
1315                 link_check = !hw->mac.get_link_status;
1316                 /* Fall thru */
1317         default:
1318                 break;
1319         }
1320
1321         /* Check for thermal downshift or shutdown */
1322         if (hw->mac.type == e1000_i350) {
1323                 thstat = E1000_READ_REG(hw, E1000_THSTAT);
1324                 ctrl = E1000_READ_REG(hw, E1000_CTRL_EXT);
1325         }
1326
1327         /* Now we check if a transition has happened */
1328         if (link_check && sc->link_active == 0) {
1329                 e1000_get_speed_and_duplex(hw, 
1330                     &sc->link_speed, &sc->link_duplex);
1331                 if (bootverbose) {
1332                         char flowctrl[IFM_ETH_FC_STRLEN];
1333
1334                         /* Get the flow control for display */
1335                         e1000_fc2str(hw->fc.current_mode, flowctrl,
1336                             sizeof(flowctrl));
1337
1338                         if_printf(ifp, "Link is up %d Mbps %s, "
1339                             "Flow control: %s\n",
1340                             sc->link_speed,
1341                             sc->link_duplex == FULL_DUPLEX ?
1342                             "Full Duplex" : "Half Duplex",
1343                             flowctrl);
1344                 }
1345                 if (sc->ifm_flowctrl & IFM_ETH_FORCEPAUSE)
1346                         e1000_force_flowctrl(hw, sc->ifm_flowctrl);
1347                 sc->link_active = 1;
1348
1349                 ifp->if_baudrate = sc->link_speed * 1000000;
1350                 if ((ctrl & E1000_CTRL_EXT_LINK_MODE_GMII) &&
1351                     (thstat & E1000_THSTAT_LINK_THROTTLE))
1352                         if_printf(ifp, "Link: thermal downshift\n");
1353                 /* Delay Link Up for Phy update */
1354                 if ((hw->mac.type == e1000_i210 ||
1355                      hw->mac.type == e1000_i211) &&
1356                     hw->phy.id == I210_I_PHY_ID)
1357                         msec_delay(IGB_I210_LINK_DELAY);
1358                 /*
1359                  * Reset if the media type changed.
1360                  * Support AutoMediaDetect for Marvell M88 PHY in i354.
1361                  */
1362                 if (hw->dev_spec._82575.media_changed) {
1363                         hw->dev_spec._82575.media_changed = FALSE;
1364                         igb_reset(sc, TRUE);
1365                 }
1366                 /* This can sleep */
1367                 ifp->if_link_state = LINK_STATE_UP;
1368                 if_link_state_change(ifp);
1369         } else if (!link_check && sc->link_active == 1) {
1370                 ifp->if_baudrate = sc->link_speed = 0;
1371                 sc->link_duplex = 0;
1372                 if (bootverbose)
1373                         if_printf(ifp, "Link is Down\n");
1374                 if ((ctrl & E1000_CTRL_EXT_LINK_MODE_GMII) &&
1375                     (thstat & E1000_THSTAT_PWR_DOWN))
1376                         if_printf(ifp, "Link: thermal shutdown\n");
1377                 sc->link_active = 0;
1378                 /* This can sleep */
1379                 ifp->if_link_state = LINK_STATE_DOWN;
1380                 if_link_state_change(ifp);
1381         }
1382 }
1383
1384 static void
1385 igb_stop(struct igb_softc *sc)
1386 {
1387         struct ifnet *ifp = &sc->arpcom.ac_if;
1388         int i;
1389
1390         ASSERT_IFNET_SERIALIZED_ALL(ifp);
1391
1392         igb_disable_intr(sc);
1393
1394         callout_stop(&sc->timer);
1395
1396         ifp->if_flags &= ~IFF_RUNNING;
1397         for (i = 0; i < sc->tx_ring_cnt; ++i) {
1398                 ifsq_clr_oactive(sc->tx_rings[i].ifsq);
1399                 ifsq_watchdog_stop(&sc->tx_rings[i].tx_watchdog);
1400                 sc->tx_rings[i].tx_flags &= ~IGB_TXFLAG_ENABLED;
1401         }
1402
1403         e1000_reset_hw(&sc->hw);
1404         E1000_WRITE_REG(&sc->hw, E1000_WUC, 0);
1405
1406         e1000_led_off(&sc->hw);
1407         e1000_cleanup_led(&sc->hw);
1408
1409         for (i = 0; i < sc->tx_ring_cnt; ++i)
1410                 igb_free_tx_ring(&sc->tx_rings[i]);
1411         for (i = 0; i < sc->rx_ring_cnt; ++i)
1412                 igb_free_rx_ring(&sc->rx_rings[i]);
1413 }
1414
1415 static void
1416 igb_reset(struct igb_softc *sc, boolean_t media_reset)
1417 {
1418         struct ifnet *ifp = &sc->arpcom.ac_if;
1419         struct e1000_hw *hw = &sc->hw;
1420         struct e1000_fc_info *fc = &hw->fc;
1421         uint32_t pba = 0;
1422         uint16_t hwm;
1423
1424         /* Let the firmware know the OS is in control */
1425         igb_get_hw_control(sc);
1426
1427         /*
1428          * Packet Buffer Allocation (PBA)
1429          * Writing PBA sets the receive portion of the buffer;
1430          * the remainder is used for the transmit buffer.
1431          */
1432         switch (hw->mac.type) {
1433         case e1000_82575:
1434                 pba = E1000_PBA_32K;
1435                 break;
1436
1437         case e1000_82576:
1438         case e1000_vfadapt:
1439                 pba = E1000_READ_REG(hw, E1000_RXPBS);
1440                 pba &= E1000_RXPBS_SIZE_MASK_82576;
1441                 break;
1442
1443         case e1000_82580:
1444         case e1000_i350:
1445         case e1000_i354:
1446         case e1000_vfadapt_i350:
1447                 pba = E1000_READ_REG(hw, E1000_RXPBS);
1448                 pba = e1000_rxpbs_adjust_82580(pba);
1449                 break;
1450
1451         case e1000_i210:
1452         case e1000_i211:
1453                 pba = E1000_PBA_34K;
1454                 break;
1455
1456         default:
1457                 break;
1458         }
1459
1460         /* Special needs in case of Jumbo frames */
1461         if (hw->mac.type == e1000_82575 && ifp->if_mtu > ETHERMTU) {
1462                 uint32_t tx_space, min_tx, min_rx;
1463
1464                 pba = E1000_READ_REG(hw, E1000_PBA);
1465                 tx_space = pba >> 16;
1466                 pba &= 0xffff;
1467
1468                 min_tx = (sc->max_frame_size +
1469                     sizeof(struct e1000_tx_desc) - ETHER_CRC_LEN) * 2;
1470                 min_tx = roundup2(min_tx, 1024);
1471                 min_tx >>= 10;
1472                 min_rx = sc->max_frame_size;
1473                 min_rx = roundup2(min_rx, 1024);
1474                 min_rx >>= 10;
1475                 if (tx_space < min_tx && (min_tx - tx_space) < pba) {
1476                         pba = pba - (min_tx - tx_space);
1477                         /*
1478                          * if short on rx space, rx wins
1479                          * and must trump tx adjustment
1480                          */
1481                         if (pba < min_rx)
1482                                 pba = min_rx;
1483                 }
1484                 E1000_WRITE_REG(hw, E1000_PBA, pba);
1485         }
1486
1487         /*
1488          * These parameters control the automatic generation (Tx) and
1489          * response (Rx) to Ethernet PAUSE frames.
1490          * - High water mark should allow for at least two frames to be
1491          *   received after sending an XOFF.
1492          * - Low water mark works best when it is very near the high water mark.
1493          *   This allows the receiver to restart by sending XON when it has
1494          *   drained a bit.
1495          */
1496         hwm = min(((pba << 10) * 9 / 10),
1497             ((pba << 10) - 2 * sc->max_frame_size));
1498
1499         if (hw->mac.type < e1000_82576) {
1500                 fc->high_water = hwm & 0xFFF8; /* 8-byte granularity */
1501                 fc->low_water = fc->high_water - 8;
1502         } else {
1503                 fc->high_water = hwm & 0xFFF0; /* 16-byte granularity */
1504                 fc->low_water = fc->high_water - 16;
1505         }
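             /*
              * Worked example (added; assumes E1000_PBA_34K == 34 and a
              * standard 1518-byte max frame): pba << 10 = 34816, so
              * hwm = min(34816 * 9 / 10, 34816 - 2 * 1518) = 31334.  On
              * 82576 and later parts this becomes high_water = 31334 & 0xFFF0
              * = 31328 and low_water = 31312.
              */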
1506         fc->pause_time = IGB_FC_PAUSE_TIME;
1507         fc->send_xon = TRUE;
1508         fc->requested_mode = e1000_ifmedia2fc(sc->ifm_flowctrl);
1509
1510         /* Issue a global reset */
1511         e1000_reset_hw(hw);
1512         E1000_WRITE_REG(hw, E1000_WUC, 0);
1513
1514         /* Reset for AutoMediaDetect */
1515         if (media_reset) {
1516                 e1000_setup_init_funcs(hw, TRUE);
1517                 e1000_get_bus_info(hw);
1518         }
1519
1520         if (e1000_init_hw(hw) < 0)
1521                 if_printf(ifp, "Hardware Initialization Failed\n");
1522
1523         /* Setup DMA Coalescing */
1524         igb_init_dmac(sc, pba);
1525
1526         E1000_WRITE_REG(&sc->hw, E1000_VET, ETHERTYPE_VLAN);
1527         e1000_get_phy_info(hw);
1528         e1000_check_for_link(hw);
1529 }
1530
1531 static void
1532 igb_setup_ifp(struct igb_softc *sc)
1533 {
1534         struct ifnet *ifp = &sc->arpcom.ac_if;
1535         int i;
1536
1537         ifp->if_softc = sc;
1538         ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
1539         ifp->if_init = igb_init;
1540         ifp->if_ioctl = igb_ioctl;
1541         ifp->if_start = igb_start;
1542         ifp->if_serialize = igb_serialize;
1543         ifp->if_deserialize = igb_deserialize;
1544         ifp->if_tryserialize = igb_tryserialize;
1545 #ifdef INVARIANTS
1546         ifp->if_serialize_assert = igb_serialize_assert;
1547 #endif
1548 #ifdef IFPOLL_ENABLE
1549         ifp->if_npoll = igb_npoll;
1550 #endif
1551
1552         ifp->if_nmbclusters = sc->rx_ring_cnt * sc->rx_rings[0].num_rx_desc;
1553
1554         ifq_set_maxlen(&ifp->if_snd, sc->tx_rings[0].num_tx_desc - 1);
1555         ifq_set_ready(&ifp->if_snd);
1556         ifq_set_subq_cnt(&ifp->if_snd, sc->tx_ring_cnt);
1557
1558         ifp->if_mapsubq = ifq_mapsubq_mask;
1559         ifq_set_subq_mask(&ifp->if_snd, 0);
1560
1561         ether_ifattach(ifp, sc->hw.mac.addr, NULL);
1562
1563         ifp->if_capabilities =
1564             IFCAP_HWCSUM | IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_TSO;
1565         if (IGB_ENABLE_HWRSS(sc))
1566                 ifp->if_capabilities |= IFCAP_RSS;
1567         ifp->if_capenable = ifp->if_capabilities;
1568         ifp->if_hwassist = IGB_CSUM_FEATURES | CSUM_TSO;
1569
1570         /*
1571          * Tell the upper layer(s) we support long frames
1572          */
1573         ifp->if_data.ifi_hdrlen = sizeof(struct ether_vlan_header);
1574
1575         /* Setup TX rings and subqueues */
1576         for (i = 0; i < sc->tx_ring_cnt; ++i) {
1577                 struct ifaltq_subque *ifsq = ifq_get_subq(&ifp->if_snd, i);
1578                 struct igb_tx_ring *txr = &sc->tx_rings[i];
1579
1580                 ifsq_set_cpuid(ifsq, txr->tx_intr_cpuid);
1581                 ifsq_set_priv(ifsq, txr);
1582                 ifsq_set_hw_serialize(ifsq, &txr->tx_serialize);
1583                 txr->ifsq = ifsq;
1584
1585                 ifsq_watchdog_init(&txr->tx_watchdog, ifsq, igb_watchdog);
1586         }
1587
1588         /*
1589          * Specify the media types supported by this adapter and register
1590          * callbacks to update media and link information
1591          */
1592         if (sc->hw.phy.media_type == e1000_media_type_fiber ||
1593             sc->hw.phy.media_type == e1000_media_type_internal_serdes) {
1594                 ifmedia_add(&sc->media, IFM_ETHER | IFM_1000_SX | IFM_FDX,
1595                     0, NULL);
1596         } else {
1597                 ifmedia_add(&sc->media, IFM_ETHER | IFM_10_T, 0, NULL);
1598                 ifmedia_add(&sc->media, IFM_ETHER | IFM_10_T | IFM_FDX,
1599                     0, NULL);
1600                 ifmedia_add(&sc->media, IFM_ETHER | IFM_100_TX, 0, NULL);
1601                 ifmedia_add(&sc->media, IFM_ETHER | IFM_100_TX | IFM_FDX,
1602                     0, NULL);
1603                 if (sc->hw.phy.type != e1000_phy_ife) {
1604                         ifmedia_add(&sc->media,
1605                             IFM_ETHER | IFM_1000_T | IFM_FDX, 0, NULL);
1606                 }
1607         }
1608         ifmedia_add(&sc->media, IFM_ETHER | IFM_AUTO, 0, NULL);
1609         ifmedia_set(&sc->media, IFM_ETHER | IFM_AUTO | sc->ifm_flowctrl);
1610 }
1611
1612 static void
1613 igb_add_sysctl(struct igb_softc *sc)
1614 {
1615         struct sysctl_ctx_list *ctx;
1616         struct sysctl_oid *tree;
1617 #if defined(IGB_RSS_DEBUG) || defined(IGB_TSS_DEBUG)
1618         char node[32];
1619         int i;
1620 #endif
1621
1622         ctx = device_get_sysctl_ctx(sc->dev);
1623         tree = device_get_sysctl_tree(sc->dev);
1624         SYSCTL_ADD_INT(ctx, SYSCTL_CHILDREN(tree),
1625             OID_AUTO, "rxr", CTLFLAG_RD, &sc->rx_ring_cnt, 0, "# of RX rings");
1626         SYSCTL_ADD_INT(ctx, SYSCTL_CHILDREN(tree),
1627             OID_AUTO, "rxr_inuse", CTLFLAG_RD, &sc->rx_ring_inuse, 0,
1628             "# of RX rings used");
1629         SYSCTL_ADD_INT(ctx, SYSCTL_CHILDREN(tree),
1630             OID_AUTO, "txr", CTLFLAG_RD, &sc->tx_ring_cnt, 0, "# of TX rings");
1631         SYSCTL_ADD_INT(ctx, SYSCTL_CHILDREN(tree),
1632             OID_AUTO, "txr_inuse", CTLFLAG_RD, &sc->tx_ring_inuse, 0,
1633             "# of TX rings used");
1634         SYSCTL_ADD_INT(ctx, SYSCTL_CHILDREN(tree),
1635             OID_AUTO, "rxd", CTLFLAG_RD, &sc->rx_rings[0].num_rx_desc, 0,
1636             "# of RX descs");
1637         SYSCTL_ADD_INT(ctx, SYSCTL_CHILDREN(tree),
1638             OID_AUTO, "txd", CTLFLAG_RD, &sc->tx_rings[0].num_tx_desc, 0,
1639             "# of TX descs");
1640
1641 #define IGB_ADD_INTR_RATE_SYSCTL(sc, use, name) \
1642 do { \
1643         igb_add_intr_rate_sysctl(sc, IGB_INTR_USE_##use, #name "_intr_rate", \
1644             #use " interrupt rate"); \
1645 } while (0)
1646
1647         IGB_ADD_INTR_RATE_SYSCTL(sc, RXTX, rxtx);
1648         IGB_ADD_INTR_RATE_SYSCTL(sc, RX, rx);
1649         IGB_ADD_INTR_RATE_SYSCTL(sc, TX, tx);
1650         IGB_ADD_INTR_RATE_SYSCTL(sc, STATUS, sts);
1651
1652 #undef IGB_ADD_INTR_RATE_SYSCTL
1653
1654         SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(tree),
1655             OID_AUTO, "tx_intr_nsegs", CTLTYPE_INT | CTLFLAG_RW,
1656             sc, 0, igb_sysctl_tx_intr_nsegs, "I",
1657             "# of segments per TX interrupt");
1658
1659         SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(tree),
1660             OID_AUTO, "tx_wreg_nsegs", CTLTYPE_INT | CTLFLAG_RW,
1661             sc, 0, igb_sysctl_tx_wreg_nsegs, "I",
1662             "# of segments sent before write to hardware register");
1663
1664         SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(tree),
1665             OID_AUTO, "rx_wreg_nsegs", CTLTYPE_INT | CTLFLAG_RW,
1666             sc, 0, igb_sysctl_rx_wreg_nsegs, "I",
1667             "# of segments received before write to hardware register");
1668
1669 #ifdef IFPOLL_ENABLE
1670         SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(tree),
1671             OID_AUTO, "npoll_rxoff", CTLTYPE_INT|CTLFLAG_RW,
1672             sc, 0, igb_sysctl_npoll_rxoff, "I", "NPOLLING RX cpu offset");
1673         SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(tree),
1674             OID_AUTO, "npoll_txoff", CTLTYPE_INT|CTLFLAG_RW,
1675             sc, 0, igb_sysctl_npoll_txoff, "I", "NPOLLING TX cpu offset");
1676 #endif
1677
1678 #ifdef IGB_RSS_DEBUG
1679         SYSCTL_ADD_INT(ctx, SYSCTL_CHILDREN(tree),
1680             OID_AUTO, "rss_debug", CTLFLAG_RW, &sc->rss_debug, 0,
1681             "RSS debug level");
1682         for (i = 0; i < sc->rx_ring_cnt; ++i) {
1683                 ksnprintf(node, sizeof(node), "rx%d_pkt", i);
1684                 SYSCTL_ADD_ULONG(ctx,
1685                     SYSCTL_CHILDREN(tree), OID_AUTO, node,
1686                     CTLFLAG_RW, &sc->rx_rings[i].rx_packets, "RXed packets");
1687         }
1688 #endif
1689 #ifdef IGB_TSS_DEBUG
1690         for  (i = 0; i < sc->tx_ring_cnt; ++i) {
1691                 ksnprintf(node, sizeof(node), "tx%d_pkt", i);
1692                 SYSCTL_ADD_ULONG(ctx,
1693                     SYSCTL_CHILDREN(tree), OID_AUTO, node,
1694                     CTLFLAG_RW, &sc->tx_rings[i].tx_packets, "TXed packets");
1695         }
1696 #endif
1697 }
1698
1699 static int
1700 igb_alloc_rings(struct igb_softc *sc)
1701 {
1702         int error, i;
1703
1704         /*
1705          * Create top level busdma tag
1706          */
1707         error = bus_dma_tag_create(NULL, 1, 0,
1708             BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL,
1709             BUS_SPACE_MAXSIZE_32BIT, 0, BUS_SPACE_MAXSIZE_32BIT, 0,
1710             &sc->parent_tag);
1711         if (error) {
1712                 device_printf(sc->dev, "could not create top level DMA tag\n");
1713                 return error;
1714         }
1715
1716         /*
1717          * Allocate TX descriptor rings and buffers
1718          */
1719         sc->tx_rings = kmalloc_cachealign(
1720             sizeof(struct igb_tx_ring) * sc->tx_ring_cnt,
1721             M_DEVBUF, M_WAITOK | M_ZERO);
1722         for (i = 0; i < sc->tx_ring_cnt; ++i) {
1723                 struct igb_tx_ring *txr = &sc->tx_rings[i];
1724
1725                 /* Set up some basics */
1726                 txr->sc = sc;
1727                 txr->me = i;
1728                 lwkt_serialize_init(&txr->tx_serialize);
1729
1730                 error = igb_create_tx_ring(txr);
1731                 if (error)
1732                         return error;
1733         }
1734
1735         /*
1736          * Allocate RX descriptor rings and buffers
1737          */ 
1738         sc->rx_rings = kmalloc_cachealign(
1739             sizeof(struct igb_rx_ring) * sc->rx_ring_cnt,
1740             M_DEVBUF, M_WAITOK | M_ZERO);
1741         for (i = 0; i < sc->rx_ring_cnt; ++i) {
1742                 struct igb_rx_ring *rxr = &sc->rx_rings[i];
1743
1744                 /* Set up some basics */
1745                 rxr->sc = sc;
1746                 rxr->me = i;
1747                 lwkt_serialize_init(&rxr->rx_serialize);
1748
1749                 error = igb_create_rx_ring(rxr);
1750                 if (error)
1751                         return error;
1752         }
1753
1754         return 0;
1755 }
1756
1757 static void
1758 igb_free_rings(struct igb_softc *sc)
1759 {
1760         int i;
1761
1762         if (sc->tx_rings != NULL) {
1763                 for (i = 0; i < sc->tx_ring_cnt; ++i) {
1764                         struct igb_tx_ring *txr = &sc->tx_rings[i];
1765
1766                         igb_destroy_tx_ring(txr, txr->num_tx_desc);
1767                 }
1768                 kfree(sc->tx_rings, M_DEVBUF);
1769         }
1770
1771         if (sc->rx_rings != NULL) {
1772                 for (i = 0; i < sc->rx_ring_cnt; ++i) {
1773                         struct igb_rx_ring *rxr = &sc->rx_rings[i];
1774
1775                         igb_destroy_rx_ring(rxr, rxr->num_rx_desc);
1776                 }
1777                 kfree(sc->rx_rings, M_DEVBUF);
1778         }
1779 }
1780
1781 static int
1782 igb_create_tx_ring(struct igb_tx_ring *txr)
1783 {
1784         int tsize, error, i, ntxd;
1785
1786         /*
1787          * Validate the number of transmit descriptors.  It must not exceed
1788          * the hardware maximum, and the ring size must be IGB_DBA_ALIGN aligned.
1789          */
1790         ntxd = device_getenv_int(txr->sc->dev, "txd", igb_txd);
1791         if ((ntxd * sizeof(struct e1000_tx_desc)) % IGB_DBA_ALIGN != 0 ||
1792             ntxd > IGB_MAX_TXD || ntxd < IGB_MIN_TXD) {
1793                 device_printf(txr->sc->dev,
1794                     "Using %d TX descriptors instead of %d!\n",
1795                     IGB_DEFAULT_TXD, ntxd);
1796                 txr->num_tx_desc = IGB_DEFAULT_TXD;
1797         } else {
1798                 txr->num_tx_desc = ntxd;
1799         }
1800
1801         /*
1802          * Allocate TX descriptor ring
1803          */
1804         tsize = roundup2(txr->num_tx_desc * sizeof(union e1000_adv_tx_desc),
1805             IGB_DBA_ALIGN);
1806         txr->txdma.dma_vaddr = bus_dmamem_coherent_any(txr->sc->parent_tag,
1807             IGB_DBA_ALIGN, tsize, BUS_DMA_WAITOK,
1808             &txr->txdma.dma_tag, &txr->txdma.dma_map, &txr->txdma.dma_paddr);
1809         if (txr->txdma.dma_vaddr == NULL) {
1810                 device_printf(txr->sc->dev,
1811                     "Unable to allocate TX Descriptor memory\n");
1812                 return ENOMEM;
1813         }
1814         txr->tx_base = txr->txdma.dma_vaddr;
1815         bzero(txr->tx_base, tsize);
1816
1817         tsize = __VM_CACHELINE_ALIGN(
1818             sizeof(struct igb_tx_buf) * txr->num_tx_desc);
1819         txr->tx_buf = kmalloc_cachealign(tsize, M_DEVBUF, M_WAITOK | M_ZERO);
1820
1821         /*
1822          * Allocate TX head write-back buffer
1823          */
1824         txr->tx_hdr = bus_dmamem_coherent_any(txr->sc->parent_tag,
1825             __VM_CACHELINE_SIZE, __VM_CACHELINE_SIZE, BUS_DMA_WAITOK,
1826             &txr->tx_hdr_dtag, &txr->tx_hdr_dmap, &txr->tx_hdr_paddr);
1827         if (txr->tx_hdr == NULL) {
1828                 device_printf(txr->sc->dev,
1829                     "Unable to allocate TX head write-back buffer\n");
1830                 return ENOMEM;
1831         }
1832
1833         /*
1834          * Create DMA tag for TX buffers
1835          */
1836         error = bus_dma_tag_create(txr->sc->parent_tag,
1837             1, 0,               /* alignment, bounds */
1838             BUS_SPACE_MAXADDR,  /* lowaddr */
1839             BUS_SPACE_MAXADDR,  /* highaddr */
1840             NULL, NULL,         /* filter, filterarg */
1841             IGB_TSO_SIZE,       /* maxsize */
1842             IGB_MAX_SCATTER,    /* nsegments */
1843             PAGE_SIZE,          /* maxsegsize */
1844             BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW |
1845             BUS_DMA_ONEBPAGE,   /* flags */
1846             &txr->tx_tag);
1847         if (error) {
1848                 device_printf(txr->sc->dev, "Unable to allocate TX DMA tag\n");
1849                 kfree(txr->tx_buf, M_DEVBUF);
1850                 txr->tx_buf = NULL;
1851                 return error;
1852         }
1853
1854         /*
1855          * Create DMA maps for TX buffers
1856          */
1857         for (i = 0; i < txr->num_tx_desc; ++i) {
1858                 struct igb_tx_buf *txbuf = &txr->tx_buf[i];
1859
1860                 error = bus_dmamap_create(txr->tx_tag,
1861                     BUS_DMA_WAITOK | BUS_DMA_ONEBPAGE, &txbuf->map);
1862                 if (error) {
1863                         device_printf(txr->sc->dev,
1864                             "Unable to create TX DMA map\n");
1865                         igb_destroy_tx_ring(txr, i);
1866                         return error;
1867                 }
1868         }
1869
1870         if (txr->sc->hw.mac.type == e1000_82575)
1871                 txr->tx_flags |= IGB_TXFLAG_TSO_IPLEN0;
1872
1873         /*
1874          * Initialize various watermarks
1875          */
1876         txr->intr_nsegs = txr->num_tx_desc / 16;
1877         txr->wreg_nsegs = IGB_DEF_TXWREG_NSEGS;
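             /*
              * Added note: with, say, 1024 TX descriptors (the exact default
              * depends on the igb_txd tunable), the intr_nsegs value above
              * works out to 64, i.e. the driver asks for TX completion
              * feedback roughly every 64 transmit segments.
              */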
1878
1879         return 0;
1880 }
1881
1882 static void
1883 igb_free_tx_ring(struct igb_tx_ring *txr)
1884 {
1885         int i;
1886
1887         for (i = 0; i < txr->num_tx_desc; ++i) {
1888                 struct igb_tx_buf *txbuf = &txr->tx_buf[i];
1889
1890                 if (txbuf->m_head != NULL) {
1891                         bus_dmamap_unload(txr->tx_tag, txbuf->map);
1892                         m_freem(txbuf->m_head);
1893                         txbuf->m_head = NULL;
1894                 }
1895         }
1896 }
1897
1898 static void
1899 igb_destroy_tx_ring(struct igb_tx_ring *txr, int ndesc)
1900 {
1901         int i;
1902
1903         if (txr->txdma.dma_vaddr != NULL) {
1904                 bus_dmamap_unload(txr->txdma.dma_tag, txr->txdma.dma_map);
1905                 bus_dmamem_free(txr->txdma.dma_tag, txr->txdma.dma_vaddr,
1906                     txr->txdma.dma_map);
1907                 bus_dma_tag_destroy(txr->txdma.dma_tag);
1908                 txr->txdma.dma_vaddr = NULL;
1909         }
1910
1911         if (txr->tx_hdr != NULL) {
1912                 bus_dmamap_unload(txr->tx_hdr_dtag, txr->tx_hdr_dmap);
1913                 bus_dmamem_free(txr->tx_hdr_dtag, txr->tx_hdr,
1914                     txr->tx_hdr_dmap);
1915                 bus_dma_tag_destroy(txr->tx_hdr_dtag);
1916                 txr->tx_hdr = NULL;
1917         }
1918
1919         if (txr->tx_buf == NULL)
1920                 return;
1921
1922         for (i = 0; i < ndesc; ++i) {
1923                 struct igb_tx_buf *txbuf = &txr->tx_buf[i];
1924
1925                 KKASSERT(txbuf->m_head == NULL);
1926                 bus_dmamap_destroy(txr->tx_tag, txbuf->map);
1927         }
1928         bus_dma_tag_destroy(txr->tx_tag);
1929
1930         kfree(txr->tx_buf, M_DEVBUF);
1931         txr->tx_buf = NULL;
1932 }
1933
1934 static void
1935 igb_init_tx_ring(struct igb_tx_ring *txr)
1936 {
1937         /* Clear the old descriptor contents */
1938         bzero(txr->tx_base,
1939             sizeof(union e1000_adv_tx_desc) * txr->num_tx_desc);
1940
1941         /* Clear TX head write-back buffer */
1942         *(txr->tx_hdr) = 0;
1943
1944         /* Reset indices */
1945         txr->next_avail_desc = 0;
1946         txr->next_to_clean = 0;
1947         txr->tx_nsegs = 0;
1948
1949         /* Set number of descriptors available */
1950         txr->tx_avail = txr->num_tx_desc;
1951
1952         /* Enable this TX ring */
1953         txr->tx_flags |= IGB_TXFLAG_ENABLED;
1954 }
1955
1956 static void
1957 igb_init_tx_unit(struct igb_softc *sc)
1958 {
1959         struct e1000_hw *hw = &sc->hw;
1960         uint32_t tctl;
1961         int i;
1962
1963         /* Setup the Tx Descriptor Rings */
1964         for (i = 0; i < sc->tx_ring_inuse; ++i) {
1965                 struct igb_tx_ring *txr = &sc->tx_rings[i];
1966                 uint64_t bus_addr = txr->txdma.dma_paddr;
1967                 uint64_t hdr_paddr = txr->tx_hdr_paddr;
1968                 uint32_t txdctl = 0;
1969                 uint32_t dca_txctrl;
1970
1971                 E1000_WRITE_REG(hw, E1000_TDLEN(i),
1972                     txr->num_tx_desc * sizeof(struct e1000_tx_desc));
1973                 E1000_WRITE_REG(hw, E1000_TDBAH(i),
1974                     (uint32_t)(bus_addr >> 32));
1975                 E1000_WRITE_REG(hw, E1000_TDBAL(i),
1976                     (uint32_t)bus_addr);
1977
1978                 /* Setup the HW Tx Head and Tail descriptor pointers */
1979                 E1000_WRITE_REG(hw, E1000_TDT(i), 0);
1980                 E1000_WRITE_REG(hw, E1000_TDH(i), 0);
1981
1982                 dca_txctrl = E1000_READ_REG(hw, E1000_DCA_TXCTRL(i));
1983                 dca_txctrl &= ~E1000_DCA_TXCTRL_TX_WB_RO_EN;
1984                 E1000_WRITE_REG(hw, E1000_DCA_TXCTRL(i), dca_txctrl);
1985
1986                 /*
1987                  * Don't set WB_on_EITR:
1988                  * - 82575 does not have it
1989                  * - It almost has no effect on 82576, see:
1990                  *   82576 specification update errata #26
1991                  * - It causes unnecessary bus traffic
1992                  */
1993                 E1000_WRITE_REG(hw, E1000_TDWBAH(i),
1994                     (uint32_t)(hdr_paddr >> 32));
1995                 E1000_WRITE_REG(hw, E1000_TDWBAL(i),
1996                     ((uint32_t)hdr_paddr) | E1000_TX_HEAD_WB_ENABLE);
1997
1998                 /*
1999                  * WTHRESH is ignored by the hardware, since header
2000                  * write back mode is used.
2001                  */
2002                 txdctl |= IGB_TX_PTHRESH;
2003                 txdctl |= IGB_TX_HTHRESH << 8;
2004                 txdctl |= IGB_TX_WTHRESH << 16;
2005                 txdctl |= E1000_TXDCTL_QUEUE_ENABLE;
2006                 E1000_WRITE_REG(hw, E1000_TXDCTL(i), txdctl);
2007         }
2008
2009         if (sc->vf_ifp)
2010                 return;
2011
2012         e1000_config_collision_dist(hw);
2013
2014         /* Program the Transmit Control Register */
2015         tctl = E1000_READ_REG(hw, E1000_TCTL);
2016         tctl &= ~E1000_TCTL_CT;
2017         tctl |= (E1000_TCTL_PSP | E1000_TCTL_RTLC | E1000_TCTL_EN |
2018             (E1000_COLLISION_THRESHOLD << E1000_CT_SHIFT));
2019
2020         /* This write will effectively turn on the transmit unit. */
2021         E1000_WRITE_REG(hw, E1000_TCTL, tctl);
2022 }
2023
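     /*
      * Added note (not in the original source): the function below builds a
      * single advanced context descriptor carrying the VLAN tag and the
      * MAC/IP header lengths used for checksum offload; it consumes one slot
      * in the TX ring (see the tx_avail adjustment at its end).
      */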
2024 static boolean_t
2025 igb_txcsum_ctx(struct igb_tx_ring *txr, struct mbuf *mp)
2026 {
2027         struct e1000_adv_tx_context_desc *TXD;
2028         uint32_t vlan_macip_lens, type_tucmd_mlhl, mss_l4len_idx;
2029         int ehdrlen, ctxd, ip_hlen = 0;
2030         boolean_t offload = TRUE;
2031
2032         if ((mp->m_pkthdr.csum_flags & IGB_CSUM_FEATURES) == 0)
2033                 offload = FALSE;
2034
2035         vlan_macip_lens = type_tucmd_mlhl = mss_l4len_idx = 0;
2036
2037         ctxd = txr->next_avail_desc;
2038         TXD = (struct e1000_adv_tx_context_desc *)&txr->tx_base[ctxd];
2039
2040         /*
2041          * In advanced descriptors the VLAN tag must be placed into the
2042          * context descriptor, so we may end up here just to do that
2043          * setup even when no checksum offload is requested.
2044          */
2045         if (mp->m_flags & M_VLANTAG) {
2046                 uint16_t vlantag;
2047
2048                 vlantag = htole16(mp->m_pkthdr.ether_vlantag);
2049                 vlan_macip_lens |= (vlantag << E1000_ADVTXD_VLAN_SHIFT);
2050         } else if (!offload) {
2051                 return FALSE;
2052         }
2053
2054         ehdrlen = mp->m_pkthdr.csum_lhlen;
2055         KASSERT(ehdrlen > 0, ("invalid ether hlen"));
2056
2057         /* Set the ether header length */
2058         vlan_macip_lens |= ehdrlen << E1000_ADVTXD_MACLEN_SHIFT;
2059         if (mp->m_pkthdr.csum_flags & CSUM_IP) {
2060                 type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_IPV4;
2061                 ip_hlen = mp->m_pkthdr.csum_iphlen;
2062                 KASSERT(ip_hlen > 0, ("invalid ip hlen"));
2063         }
2064         vlan_macip_lens |= ip_hlen;
2065
2066         type_tucmd_mlhl |= E1000_ADVTXD_DCMD_DEXT | E1000_ADVTXD_DTYP_CTXT;
2067         if (mp->m_pkthdr.csum_flags & CSUM_TCP)
2068                 type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_TCP;
2069         else if (mp->m_pkthdr.csum_flags & CSUM_UDP)
2070                 type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_UDP;
2071
2072         /*
2073          * 82575 needs the TX context index added; the queue
2074          * index is used as TX context index here.
2075          */
2076         if (txr->sc->hw.mac.type == e1000_82575)
2077                 mss_l4len_idx = txr->me << 4;
2078
2079         /* Now copy bits into descriptor */
2080         TXD->vlan_macip_lens = htole32(vlan_macip_lens);
2081         TXD->type_tucmd_mlhl = htole32(type_tucmd_mlhl);
2082         TXD->seqnum_seed = htole32(0);
2083         TXD->mss_l4len_idx = htole32(mss_l4len_idx);
2084
2085         /* We've consumed the first desc, adjust counters */
2086         if (++ctxd == txr->num_tx_desc)
2087                 ctxd = 0;
2088         txr->next_avail_desc = ctxd;
2089         --txr->tx_avail;
2090
2091         return offload;
2092 }
2093
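     /*
      * Clarifying note (added): 'hdr' is the ring index most recently
      * reported by the hardware through the TX head write-back buffer set up
      * in igb_init_tx_unit(); descriptors from next_to_clean up to, but not
      * including, that index have completed and their mbufs are reclaimed
      * here.
      */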
2094 static void
2095 igb_txeof(struct igb_tx_ring *txr, int hdr)
2096 {
2097         int first, avail;
2098
2099         if (txr->tx_avail == txr->num_tx_desc)
2100                 return;
2101
2102         first = txr->next_to_clean;
2103         if (first == hdr)
2104                 return;
2105
2106         avail = txr->tx_avail;
2107         while (first != hdr) {
2108                 struct igb_tx_buf *txbuf = &txr->tx_buf[first];
2109
2110                 ++avail;
2111                 if (txbuf->m_head) {
2112                         bus_dmamap_unload(txr->tx_tag, txbuf->map);
2113                         m_freem(txbuf->m_head);
2114                         txbuf->m_head = NULL;
2115                 }
2116                 if (++first == txr->num_tx_desc)
2117                         first = 0;
2118         }
2119         txr->next_to_clean = first;
2120         txr->tx_avail = avail;
2121
2122         /*
2123          * If we have the minimum number of descriptors free, clear
2124          * OACTIVE to tell the stack that it is OK to send packets.
2125          */
2126         if (txr->tx_avail > IGB_MAX_SCATTER + IGB_TX_RESERVED) {
2127                 ifsq_clr_oactive(txr->ifsq);
2128
2129                 /*
2130                  * We have enough TX descriptors, so turn off
2131                  * the watchdog.  We allow a small number of
2132                  * packets (roughly intr_nsegs) to remain pending
2133                  * on the transmit ring.
2134                  */
2135                 txr->tx_watchdog.wd_timer = 0;
2136         }
2137 }
2138
2139 static int
2140 igb_create_rx_ring(struct igb_rx_ring *rxr)
2141 {
2142         int rsize, i, error, nrxd;
2143
2144         /*
2145          * Validate the number of receive descriptors.  It must not exceed
2146          * the hardware maximum, and the ring size must be IGB_DBA_ALIGN aligned.
2147          */
2148         nrxd = device_getenv_int(rxr->sc->dev, "rxd", igb_rxd);
2149         if ((nrxd * sizeof(struct e1000_rx_desc)) % IGB_DBA_ALIGN != 0 ||
2150             nrxd > IGB_MAX_RXD || nrxd < IGB_MIN_RXD) {
2151                 device_printf(rxr->sc->dev,
2152                     "Using %d RX descriptors instead of %d!\n",
2153                     IGB_DEFAULT_RXD, nrxd);
2154                 rxr->num_rx_desc = IGB_DEFAULT_RXD;
2155         } else {
2156                 rxr->num_rx_desc = nrxd;
2157         }
2158
2159         /*
2160          * Allocate RX descriptor ring
2161          */
2162         rsize = roundup2(rxr->num_rx_desc * sizeof(union e1000_adv_rx_desc),
2163             IGB_DBA_ALIGN);
2164         rxr->rxdma.dma_vaddr = bus_dmamem_coherent_any(rxr->sc->parent_tag,
2165             IGB_DBA_ALIGN, rsize, BUS_DMA_WAITOK,
2166             &rxr->rxdma.dma_tag, &rxr->rxdma.dma_map,
2167             &rxr->rxdma.dma_paddr);
2168         if (rxr->rxdma.dma_vaddr == NULL) {
2169                 device_printf(rxr->sc->dev,
2170                     "Unable to allocate RX Descriptor memory\n");
2171                 return ENOMEM;
2172         }
2173         rxr->rx_base = rxr->rxdma.dma_vaddr;
2174         bzero(rxr->rx_base, rsize);
2175
2176         rsize = __VM_CACHELINE_ALIGN(
2177             sizeof(struct igb_rx_buf) * rxr->num_rx_desc);
2178         rxr->rx_buf = kmalloc_cachealign(rsize, M_DEVBUF, M_WAITOK | M_ZERO);
2179
2180         /*
2181          * Create DMA tag for RX buffers
2182          */
2183         error = bus_dma_tag_create(rxr->sc->parent_tag,
2184             1, 0,               /* alignment, bounds */
2185             BUS_SPACE_MAXADDR,  /* lowaddr */
2186             BUS_SPACE_MAXADDR,  /* highaddr */
2187             NULL, NULL,         /* filter, filterarg */
2188             MCLBYTES,           /* maxsize */
2189             1,                  /* nsegments */
2190             MCLBYTES,           /* maxsegsize */
2191             BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW, /* flags */
2192             &rxr->rx_tag);
2193         if (error) {
2194                 device_printf(rxr->sc->dev,
2195                     "Unable to create RX payload DMA tag\n");
2196                 kfree(rxr->rx_buf, M_DEVBUF);
2197                 rxr->rx_buf = NULL;
2198                 return error;
2199         }
2200
2201         /*
2202          * Create spare DMA map for RX buffers
2203          */
2204         error = bus_dmamap_create(rxr->rx_tag, BUS_DMA_WAITOK,
2205             &rxr->rx_sparemap);
2206         if (error) {
2207                 device_printf(rxr->sc->dev,
2208                     "Unable to create spare RX DMA maps\n");
2209                 bus_dma_tag_destroy(rxr->rx_tag);
2210                 kfree(rxr->rx_buf, M_DEVBUF);
2211                 rxr->rx_buf = NULL;
2212                 return error;
2213         }
2214
2215         /*
2216          * Create DMA maps for RX buffers
2217          */
2218         for (i = 0; i < rxr->num_rx_desc; i++) {
2219                 struct igb_rx_buf *rxbuf = &rxr->rx_buf[i];
2220
2221                 error = bus_dmamap_create(rxr->rx_tag,
2222                     BUS_DMA_WAITOK, &rxbuf->map);
2223                 if (error) {
2224                         device_printf(rxr->sc->dev,
2225                             "Unable to create RX DMA maps\n");
2226                         igb_destroy_rx_ring(rxr, i);
2227                         return error;
2228                 }
2229         }
2230
2231         /*
2232          * Initialize various watermarks
2233          */
2234         rxr->wreg_nsegs = IGB_DEF_RXWREG_NSEGS;
2235
2236         return 0;
2237 }
2238
2239 static void
2240 igb_free_rx_ring(struct igb_rx_ring *rxr)
2241 {
2242         int i;
2243
2244         for (i = 0; i < rxr->num_rx_desc; ++i) {
2245                 struct igb_rx_buf *rxbuf = &rxr->rx_buf[i];
2246
2247                 if (rxbuf->m_head != NULL) {
2248                         bus_dmamap_unload(rxr->rx_tag, rxbuf->map);
2249                         m_freem(rxbuf->m_head);
2250                         rxbuf->m_head = NULL;
2251                 }
2252         }
2253
2254         if (rxr->fmp != NULL)
2255                 m_freem(rxr->fmp);
2256         rxr->fmp = NULL;
2257         rxr->lmp = NULL;
2258 }
2259
2260 static void
2261 igb_destroy_rx_ring(struct igb_rx_ring *rxr, int ndesc)
2262 {
2263         int i;
2264
2265         if (rxr->rxdma.dma_vaddr != NULL) {
2266                 bus_dmamap_unload(rxr->rxdma.dma_tag, rxr->rxdma.dma_map);
2267                 bus_dmamem_free(rxr->rxdma.dma_tag, rxr->rxdma.dma_vaddr,
2268                     rxr->rxdma.dma_map);
2269                 bus_dma_tag_destroy(rxr->rxdma.dma_tag);
2270                 rxr->rxdma.dma_vaddr = NULL;
2271         }
2272
2273         if (rxr->rx_buf == NULL)
2274                 return;
2275
2276         for (i = 0; i < ndesc; ++i) {
2277                 struct igb_rx_buf *rxbuf = &rxr->rx_buf[i];
2278
2279                 KKASSERT(rxbuf->m_head == NULL);
2280                 bus_dmamap_destroy(rxr->rx_tag, rxbuf->map);
2281         }
2282         bus_dmamap_destroy(rxr->rx_tag, rxr->rx_sparemap);
2283         bus_dma_tag_destroy(rxr->rx_tag);
2284
2285         kfree(rxr->rx_buf, M_DEVBUF);
2286         rxr->rx_buf = NULL;
2287 }
2288
2289 static void
2290 igb_setup_rxdesc(union e1000_adv_rx_desc *rxd, const struct igb_rx_buf *rxbuf)
2291 {
2292         rxd->read.pkt_addr = htole64(rxbuf->paddr);
2293         rxd->wb.upper.status_error = 0;
2294 }
2295
2296 static int
2297 igb_newbuf(struct igb_rx_ring *rxr, int i, boolean_t wait)
2298 {
2299         struct mbuf *m;
2300         bus_dma_segment_t seg;
2301         bus_dmamap_t map;
2302         struct igb_rx_buf *rxbuf;
2303         int error, nseg;
2304
2305         m = m_getcl(wait ? M_WAITOK : M_NOWAIT, MT_DATA, M_PKTHDR);
2306         if (m == NULL) {
2307                 if (wait) {
2308                         if_printf(&rxr->sc->arpcom.ac_if,
2309                             "Unable to allocate RX mbuf\n");
2310                 }
2311                 return ENOBUFS;
2312         }
2313         m->m_len = m->m_pkthdr.len = MCLBYTES;
2314
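             /*
              * Added note: when the maximum frame fits in the cluster, shift
              * the data pointer by ETHER_ALIGN (2 bytes) so the IP header
              * that follows the 14-byte Ethernet header ends up 4-byte
              * aligned.
              */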
2315         if (rxr->sc->max_frame_size <= MCLBYTES - ETHER_ALIGN)
2316                 m_adj(m, ETHER_ALIGN);
2317
2318         error = bus_dmamap_load_mbuf_segment(rxr->rx_tag,
2319             rxr->rx_sparemap, m, &seg, 1, &nseg, BUS_DMA_NOWAIT);
2320         if (error) {
2321                 m_freem(m);
2322                 if (wait) {
2323                         if_printf(&rxr->sc->arpcom.ac_if,
2324                             "Unable to load RX mbuf\n");
2325                 }
2326                 return error;
2327         }
2328
2329         rxbuf = &rxr->rx_buf[i];
2330         if (rxbuf->m_head != NULL)
2331                 bus_dmamap_unload(rxr->rx_tag, rxbuf->map);
2332
2333         map = rxbuf->map;
2334         rxbuf->map = rxr->rx_sparemap;
2335         rxr->rx_sparemap = map;
2336
2337         rxbuf->m_head = m;
2338         rxbuf->paddr = seg.ds_addr;
2339
2340         igb_setup_rxdesc(&rxr->rx_base[i], rxbuf);
2341         return 0;
2342 }
2343
2344 static int
2345 igb_init_rx_ring(struct igb_rx_ring *rxr)
2346 {
2347         int i;
2348
2349         /* Clear the ring contents */
2350         bzero(rxr->rx_base,
2351             rxr->num_rx_desc * sizeof(union e1000_adv_rx_desc));
2352
2353         /* Now replenish the ring mbufs */
2354         for (i = 0; i < rxr->num_rx_desc; ++i) {
2355                 int error;
2356
2357                 error = igb_newbuf(rxr, i, TRUE);
2358                 if (error)
2359                         return error;
2360         }
2361
2362         /* Setup our descriptor indices */
2363         rxr->next_to_check = 0;
2364
2365         rxr->fmp = NULL;
2366         rxr->lmp = NULL;
2367         rxr->discard = FALSE;
2368
2369         return 0;
2370 }
2371
2372 static void
2373 igb_init_rx_unit(struct igb_softc *sc)
2374 {
2375         struct ifnet *ifp = &sc->arpcom.ac_if;
2376         struct e1000_hw *hw = &sc->hw;
2377         uint32_t rctl, rxcsum, srrctl = 0;
2378         int i;
2379
2380         /*
2381          * Make sure receives are disabled while setting
2382          * up the descriptor ring
2383          */
2384         rctl = E1000_READ_REG(hw, E1000_RCTL);
2385         E1000_WRITE_REG(hw, E1000_RCTL, rctl & ~E1000_RCTL_EN);
2386
2387 #if 0
2388         /*
2389         ** Set up for header split
2390         */
2391         if (igb_header_split) {
2392                 /* Use a standard mbuf for the header */
2393                 srrctl |= IGB_HDR_BUF << E1000_SRRCTL_BSIZEHDRSIZE_SHIFT;
2394                 srrctl |= E1000_SRRCTL_DESCTYPE_HDR_SPLIT_ALWAYS;
2395         } else
2396 #endif
2397                 srrctl |= E1000_SRRCTL_DESCTYPE_ADV_ONEBUF;
2398
2399         /*
2400         ** Set up for jumbo frames
2401         */
2402         if (ifp->if_mtu > ETHERMTU) {
2403                 rctl |= E1000_RCTL_LPE;
2404 #if 0
2405                 if (adapter->rx_mbuf_sz == MJUMPAGESIZE) {
2406                         srrctl |= 4096 >> E1000_SRRCTL_BSIZEPKT_SHIFT;
2407                         rctl |= E1000_RCTL_SZ_4096 | E1000_RCTL_BSEX;
2408                 } else if (adapter->rx_mbuf_sz > MJUMPAGESIZE) {
2409                         srrctl |= 8192 >> E1000_SRRCTL_BSIZEPKT_SHIFT;
2410                         rctl |= E1000_RCTL_SZ_8192 | E1000_RCTL_BSEX;
2411                 }
2412                 /* Set maximum packet len */
2413                 psize = adapter->max_frame_size;
2414                 /* are we on a vlan? */
2415                 if (adapter->ifp->if_vlantrunk != NULL)
2416                         psize += VLAN_TAG_SIZE;
2417                 E1000_WRITE_REG(&adapter->hw, E1000_RLPML, psize);
2418 #else
2419                 srrctl |= 2048 >> E1000_SRRCTL_BSIZEPKT_SHIFT;
2420                 rctl |= E1000_RCTL_SZ_2048;
2421 #endif
2422         } else {
2423                 rctl &= ~E1000_RCTL_LPE;
2424                 srrctl |= 2048 >> E1000_SRRCTL_BSIZEPKT_SHIFT;
2425                 rctl |= E1000_RCTL_SZ_2048;
2426         }
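             /*
              * Illustrative note (assumes the usual 1KB granularity of the
              * SRRCTL BSIZEPKT field): 2048 >> E1000_SRRCTL_BSIZEPKT_SHIFT
              * encodes 2KB receive buffers, matching the MCLBYTES-sized
              * clusters that igb_newbuf() loads into the ring.
              */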
2427
2428         /* Setup the Base and Length of the Rx Descriptor Rings */
2429         for (i = 0; i < sc->rx_ring_inuse; ++i) {
2430                 struct igb_rx_ring *rxr = &sc->rx_rings[i];
2431                 uint64_t bus_addr = rxr->rxdma.dma_paddr;
2432                 uint32_t rxdctl;
2433
2434                 E1000_WRITE_REG(hw, E1000_RDLEN(i),
2435                     rxr->num_rx_desc * sizeof(struct e1000_rx_desc));
2436                 E1000_WRITE_REG(hw, E1000_RDBAH(i),
2437                     (uint32_t)(bus_addr >> 32));
2438                 E1000_WRITE_REG(hw, E1000_RDBAL(i),
2439                     (uint32_t)bus_addr);
2440                 E1000_WRITE_REG(hw, E1000_SRRCTL(i), srrctl);
2441                 /* Enable this Queue */
2442                 rxdctl = E1000_READ_REG(hw, E1000_RXDCTL(i));
2443                 rxdctl |= E1000_RXDCTL_QUEUE_ENABLE;
2444                 rxdctl &= 0xFFF00000;
2445                 rxdctl |= IGB_RX_PTHRESH;
2446                 rxdctl |= IGB_RX_HTHRESH << 8;
2447                 /*
2448                  * Don't set WTHRESH to a value above 1 on 82576, see:
2449                  * 82576 specification update errata #26
2450                  */
2451                 rxdctl |= IGB_RX_WTHRESH << 16;
2452                 E1000_WRITE_REG(hw, E1000_RXDCTL(i), rxdctl);
2453         }
2454
2455         rxcsum = E1000_READ_REG(&sc->hw, E1000_RXCSUM);
2456         rxcsum &= ~(E1000_RXCSUM_PCSS_MASK | E1000_RXCSUM_IPPCSE);
2457
2458         /*
2459          * Receive Checksum Offload for TCP and UDP
2460          *
2461          * Checksum offloading is also enabled if multiple receive
2462          * queues are to be supported, since we need it to figure out
2463          * fragments.
2464          */
2465         if ((ifp->if_capenable & IFCAP_RXCSUM) || IGB_ENABLE_HWRSS(sc)) {
2466                 /*
2467                  * NOTE:
2468                  * PCSD must be enabled to enable multiple
2469                  * receive queues.
2470                  */
2471                 rxcsum |= E1000_RXCSUM_IPOFL | E1000_RXCSUM_TUOFL |
2472                     E1000_RXCSUM_PCSD;
2473         } else {
2474                 rxcsum &= ~(E1000_RXCSUM_IPOFL | E1000_RXCSUM_TUOFL |
2475                     E1000_RXCSUM_PCSD);
2476         }
2477         E1000_WRITE_REG(&sc->hw, E1000_RXCSUM, rxcsum);
2478
2479         if (IGB_ENABLE_HWRSS(sc)) {
2480                 uint8_t key[IGB_NRSSRK * IGB_RSSRK_SIZE];
2481                 uint32_t reta_shift;
2482                 int j, r;
2483
2484                 /*
2485                  * NOTE:
2486                  * When we reach here, RSS has already been disabled
2487                  * in igb_stop(), so we can safely configure the RSS key
2488                  * and redirect table.
2489                  */
2490
2491                 /*
2492                  * Configure RSS key
2493                  */
2494                 toeplitz_get_key(key, sizeof(key));
2495                 for (i = 0; i < IGB_NRSSRK; ++i) {
2496                         uint32_t rssrk;
2497
2498                         rssrk = IGB_RSSRK_VAL(key, i);
2499                         IGB_RSS_DPRINTF(sc, 1, "rssrk%d 0x%08x\n", i, rssrk);
2500
2501                         E1000_WRITE_REG(hw, E1000_RSSRK(i), rssrk);
2502                 }
2503
2504                 /*
2505                  * Configure the RSS redirect table in the following fashion:
2506                  * (hash & ring_cnt_mask) == rdr_table[(hash & rdr_table_mask)]
2507                  */
2508                 reta_shift = IGB_RETA_SHIFT;
2509                 if (hw->mac.type == e1000_82575)
2510                         reta_shift = IGB_RETA_SHIFT_82575;
2511
2512                 r = 0;
2513                 for (j = 0; j < IGB_NRETA; ++j) {
2514                         uint32_t reta = 0;
2515
2516                         for (i = 0; i < IGB_RETA_SIZE; ++i) {
2517                                 uint32_t q;
2518
2519                                 q = (r % sc->rx_ring_inuse) << reta_shift;
2520                                 reta |= q << (8 * i);
2521                                 ++r;
2522                         }
2523                         IGB_RSS_DPRINTF(sc, 1, "reta 0x%08x\n", reta);
2524                         E1000_WRITE_REG(hw, E1000_RETA(j), reta);
2525                 }
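                     /*
                      * Illustrative example (added): with two RX rings in
                      * use, 'r' alternates 0,1,0,1,..., so the table maps
                      * hash values with an even low bit to ring 0 and an odd
                      * low bit to ring 1, which is exactly the
                      * (hash & ring_cnt_mask) relation described above.
                      */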
2526
2527                 /*
2528                  * Enable multiple receive queues.
2529                  * Enable IPv4 RSS standard hash functions.
2530                  * Disable RSS interrupt on 82575.
2531                  */
2532                 E1000_WRITE_REG(&sc->hw, E1000_MRQC,
2533                                 E1000_MRQC_ENABLE_RSS_4Q |
2534                                 E1000_MRQC_RSS_FIELD_IPV4_TCP |
2535                                 E1000_MRQC_RSS_FIELD_IPV4);
2536         }
2537
2538         /* Setup the Receive Control Register */
2539         rctl &= ~(3 << E1000_RCTL_MO_SHIFT);
2540         rctl |= E1000_RCTL_EN | E1000_RCTL_BAM | E1000_RCTL_LBM_NO |
2541             E1000_RCTL_RDMTS_HALF |
2542             (hw->mac.mc_filter_type << E1000_RCTL_MO_SHIFT);
2543         /* Strip CRC bytes. */
2544         rctl |= E1000_RCTL_SECRC;
2545         /* Make sure VLAN Filters are off */
2546         rctl &= ~E1000_RCTL_VFE;
2547         /* Don't store bad packets */
2548         rctl &= ~E1000_RCTL_SBP;
2549
2550         /* Enable Receives */
2551         E1000_WRITE_REG(hw, E1000_RCTL, rctl);
2552
2553         /*
2554          * Setup the HW Rx Head and Tail Descriptor Pointers
2555          *   - this must be done after the receiver is enabled
2556          */
2557         for (i = 0; i < sc->rx_ring_inuse; ++i) {
2558                 struct igb_rx_ring *rxr = &sc->rx_rings[i];
2559
2560                 E1000_WRITE_REG(hw, E1000_RDH(i), rxr->next_to_check);
2561                 E1000_WRITE_REG(hw, E1000_RDT(i), rxr->num_rx_desc - 1);
2562         }
2563 }
2564
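     /*
      * Added note (a sketch of the intent, not from the original source):
      * writing RDT hands the freshly refilled descriptors back to the
      * hardware; the index written is kept one entry behind the next
      * descriptor software will examine so the hardware never wraps onto
      * the slot the driver is still working on.
      */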
2565 static void
2566 igb_rx_refresh(struct igb_rx_ring *rxr, int i)
2567 {
2568         if (--i < 0)
2569                 i = rxr->num_rx_desc - 1;
2570         E1000_WRITE_REG(&rxr->sc->hw, E1000_RDT(rxr->me), i);
2571 }
2572
2573 static void
2574 igb_rxeof(struct igb_rx_ring *rxr, int count)
2575 {
2576         struct ifnet *ifp = &rxr->sc->arpcom.ac_if;
2577         union e1000_adv_rx_desc *cur;
2578         uint32_t staterr;
2579         int i, ncoll = 0, cpuid = mycpuid;
2580
2581         i = rxr->next_to_check;
2582         cur = &rxr->rx_base[i];
2583         staterr = le32toh(cur->wb.upper.status_error);
2584
2585         if ((staterr & E1000_RXD_STAT_DD) == 0)
2586                 return;
2587
2588         while ((staterr & E1000_RXD_STAT_DD) && count != 0) {
2589                 struct pktinfo *pi = NULL, pi0;
2590                 struct igb_rx_buf *rxbuf = &rxr->rx_buf[i];
2591                 struct mbuf *m = NULL;
2592                 boolean_t eop;
2593
2594                 eop = (staterr & E1000_RXD_STAT_EOP) ? TRUE : FALSE;
2595                 if (eop)
2596                         --count;
2597
2598                 ++ncoll;
2599                 if ((staterr & E1000_RXDEXT_ERR_FRAME_ERR_MASK) == 0 &&
2600                     !rxr->discard) {
2601                         struct mbuf *mp = rxbuf->m_head;
2602                         uint32_t hash, hashtype;
2603                         uint16_t vlan;
2604                         int len;
2605
2606                         len = le16toh(cur->wb.upper.length);
2607                         if ((rxr->sc->hw.mac.type == e1000_i350 ||
2608                              rxr->sc->hw.mac.type == e1000_i354) &&
2609                             (staterr & E1000_RXDEXT_STATERR_LB))
2610                                 vlan = be16toh(cur->wb.upper.vlan);
2611                         else
2612                                 vlan = le16toh(cur->wb.upper.vlan);
2613
2614                         hash = le32toh(cur->wb.lower.hi_dword.rss);
2615                         hashtype = le32toh(cur->wb.lower.lo_dword.data) &
2616                             E1000_RXDADV_RSSTYPE_MASK;
2617
2618                         IGB_RSS_DPRINTF(rxr->sc, 10,
2619                             "ring%d, hash 0x%08x, hashtype %u\n",
2620                             rxr->me, hash, hashtype);
2621
2622                         bus_dmamap_sync(rxr->rx_tag, rxbuf->map,
2623                             BUS_DMASYNC_POSTREAD);
2624
2625                         if (igb_newbuf(rxr, i, FALSE) != 0) {
2626                                 IFNET_STAT_INC(ifp, iqdrops, 1);
2627                                 goto discard;
2628                         }
2629
2630                         mp->m_len = len;
2631                         if (rxr->fmp == NULL) {
2632                                 mp->m_pkthdr.len = len;
2633                                 rxr->fmp = mp;
2634                                 rxr->lmp = mp;
2635                         } else {
2636                                 rxr->lmp->m_next = mp;
2637                                 rxr->lmp = rxr->lmp->m_next;
2638                                 rxr->fmp->m_pkthdr.len += len;
2639                         }
2640
2641                         if (eop) {
2642                                 m = rxr->fmp;
2643                                 rxr->fmp = NULL;
2644                                 rxr->lmp = NULL;
2645
2646                                 m->m_pkthdr.rcvif = ifp;
2647                                 IFNET_STAT_INC(ifp, ipackets, 1);
2648
2649                                 if (ifp->if_capenable & IFCAP_RXCSUM)
2650                                         igb_rxcsum(staterr, m);
2651
2652                                 if (staterr & E1000_RXD_STAT_VP) {
2653                                         m->m_pkthdr.ether_vlantag = vlan;
2654                                         m->m_flags |= M_VLANTAG;
2655                                 }
2656
2657                                 if (ifp->if_capenable & IFCAP_RSS) {
2658                                         pi = igb_rssinfo(m, &pi0,
2659                                             hash, hashtype, staterr);
2660                                 }
2661 #ifdef IGB_RSS_DEBUG
2662                                 rxr->rx_packets++;
2663 #endif
2664                         }
2665                 } else {
2666                         IFNET_STAT_INC(ifp, ierrors, 1);
2667 discard:
2668                         igb_setup_rxdesc(cur, rxbuf);
2669                         if (!eop)
2670                                 rxr->discard = TRUE;
2671                         else
2672                                 rxr->discard = FALSE;
2673                         if (rxr->fmp != NULL) {
2674                                 m_freem(rxr->fmp);
2675                                 rxr->fmp = NULL;
2676                                 rxr->lmp = NULL;
2677                         }
2678                         m = NULL;
2679                 }
2680
2681                 if (m != NULL)
2682                         ifp->if_input(ifp, m, pi, cpuid);
2683
2684                 /* Advance our pointers to the next descriptor. */
2685                 if (++i == rxr->num_rx_desc)
2686                         i = 0;
2687
2688                 if (ncoll >= rxr->wreg_nsegs) {
2689                         igb_rx_refresh(rxr, i);
2690                         ncoll = 0;
2691                 }
2692
2693                 cur = &rxr->rx_base[i];
2694                 staterr = le32toh(cur->wb.upper.status_error);
2695         }
2696         rxr->next_to_check = i;
2697
2698         if (ncoll > 0)
2699                 igb_rx_refresh(rxr, i);
2700 }
2701
2702
2703 static void
2704 igb_set_vlan(struct igb_softc *sc)
2705 {
2706         struct e1000_hw *hw = &sc->hw;
2707         uint32_t reg;
2708 #if 0
2709         struct ifnet *ifp = sc->arpcom.ac_if;
2710 #endif
2711
2712         if (sc->vf_ifp) {
2713                 e1000_rlpml_set_vf(hw, sc->max_frame_size + VLAN_TAG_SIZE);
2714                 return;
2715         }
2716
2717         reg = E1000_READ_REG(hw, E1000_CTRL);
2718         reg |= E1000_CTRL_VME;
2719         E1000_WRITE_REG(hw, E1000_CTRL, reg);
2720
2721 #if 0
2722         /* Enable the Filter Table */
2723         if (ifp->if_capenable & IFCAP_VLAN_HWFILTER) {
2724                 reg = E1000_READ_REG(hw, E1000_RCTL);
2725                 reg &= ~E1000_RCTL_CFIEN;
2726                 reg |= E1000_RCTL_VFE;
2727                 E1000_WRITE_REG(hw, E1000_RCTL, reg);
2728         }
2729 #endif
2730
2731         /* Update the frame size */
2732         E1000_WRITE_REG(&sc->hw, E1000_RLPML,
2733             sc->max_frame_size + VLAN_TAG_SIZE);
2734
2735 #if 0
2736         /* Don't bother with table if no vlans */
2737         if ((adapter->num_vlans == 0) ||
2738             ((ifp->if_capenable & IFCAP_VLAN_HWFILTER) == 0))
2739                 return;
2740         /*
2741         ** A soft reset zeroes out the VFTA, so
2742         ** we need to repopulate it now.
2743         */
2744         for (int i = 0; i < IGB_VFTA_SIZE; i++)
2745                 if (adapter->shadow_vfta[i] != 0) {
2746                         if (adapter->vf_ifp)
2747                                 e1000_vfta_set_vf(hw,
2748                                     adapter->shadow_vfta[i], TRUE);
2749                         else
2750                                 E1000_WRITE_REG_ARRAY(hw, E1000_VFTA,
2751                                  i, adapter->shadow_vfta[i]);
2752                 }
2753 #endif
2754 }
2755
2756 static void
2757 igb_enable_intr(struct igb_softc *sc)
2758 {
2759         int i;
2760
2761         for (i = 0; i < sc->intr_cnt; ++i)
2762                 lwkt_serialize_handler_enable(sc->intr_data[i].intr_serialize);
2763
2764         if ((sc->flags & IGB_FLAG_SHARED_INTR) == 0) {
2765                 if (sc->intr_type == PCI_INTR_TYPE_MSIX)
2766                         E1000_WRITE_REG(&sc->hw, E1000_EIAC, sc->intr_mask);
2767                 else
2768                         E1000_WRITE_REG(&sc->hw, E1000_EIAC, 0);
2769                 E1000_WRITE_REG(&sc->hw, E1000_EIAM, sc->intr_mask);
2770                 E1000_WRITE_REG(&sc->hw, E1000_EIMS, sc->intr_mask);
2771                 E1000_WRITE_REG(&sc->hw, E1000_IMS, E1000_IMS_LSC);
2772         } else {
2773                 E1000_WRITE_REG(&sc->hw, E1000_IMS, IMS_ENABLE_MASK);
2774         }
2775         E1000_WRITE_FLUSH(&sc->hw);
2776 }
2777
2778 static void
2779 igb_disable_intr(struct igb_softc *sc)
2780 {
2781         int i;
2782
2783         if ((sc->flags & IGB_FLAG_SHARED_INTR) == 0) {
2784                 E1000_WRITE_REG(&sc->hw, E1000_EIMC, 0xffffffff);
2785                 E1000_WRITE_REG(&sc->hw, E1000_EIAC, 0);
2786         }
2787         E1000_WRITE_REG(&sc->hw, E1000_IMC, 0xffffffff);
2788         E1000_WRITE_FLUSH(&sc->hw);
2789
2790         for (i = 0; i < sc->intr_cnt; ++i)
2791                 lwkt_serialize_handler_disable(sc->intr_data[i].intr_serialize);
2792 }
2793
2794 /*
2795  * Bit of a misnomer: what this really means is
2796  * to enable OS management of the system, i.e.
2797  * to disable special hardware management features.
2798  */
2799 static void
2800 igb_get_mgmt(struct igb_softc *sc)
2801 {
2802         if (sc->flags & IGB_FLAG_HAS_MGMT) {
2803                 int manc2h = E1000_READ_REG(&sc->hw, E1000_MANC2H);
2804                 int manc = E1000_READ_REG(&sc->hw, E1000_MANC);
2805
2806                 /* disable hardware interception of ARP */
2807                 manc &= ~E1000_MANC_ARP_EN;
2808
2809                 /* enable receiving management packets to the host */
2810                 manc |= E1000_MANC_EN_MNG2HOST;
2811                 manc2h |= 1 << 5; /* Mng Port 623 */
2812                 manc2h |= 1 << 6; /* Mng Port 664 */
2813                 E1000_WRITE_REG(&sc->hw, E1000_MANC2H, manc2h);
2814                 E1000_WRITE_REG(&sc->hw, E1000_MANC, manc);
2815         }
2816 }
2817
2818 /*
2819  * Give control back to the hardware management controller
2820  * if there is one.
2821  */
2822 static void
2823 igb_rel_mgmt(struct igb_softc *sc)
2824 {
2825         if (sc->flags & IGB_FLAG_HAS_MGMT) {
2826                 int manc = E1000_READ_REG(&sc->hw, E1000_MANC);
2827
2828                 /* Re-enable hardware interception of ARP */
2829                 manc |= E1000_MANC_ARP_EN;
2830                 manc &= ~E1000_MANC_EN_MNG2HOST;
2831
2832                 E1000_WRITE_REG(&sc->hw, E1000_MANC, manc);
2833         }
2834 }
2835
2836 /*
2837  * Sets CTRL_EXT:DRV_LOAD bit.
2838  *
2839  * For ASF and Pass Through versions of f/w this means that
2840  * the driver is loaded. 
2841  */
2842 static void
2843 igb_get_hw_control(struct igb_softc *sc)
2844 {
2845         uint32_t ctrl_ext;
2846
2847         if (sc->vf_ifp)
2848                 return;
2849
2850         /* Let firmware know the driver has taken over */
2851         ctrl_ext = E1000_READ_REG(&sc->hw, E1000_CTRL_EXT);
2852         E1000_WRITE_REG(&sc->hw, E1000_CTRL_EXT,
2853             ctrl_ext | E1000_CTRL_EXT_DRV_LOAD);
2854 }
2855
2856 /*
2857  * Resets CTRL_EXT:DRV_LOAD bit.
2858  *
2859  * For ASF and Pass Through versions of f/w this means that the
2860  * driver is no longer loaded.
2861  */
2862 static void
2863 igb_rel_hw_control(struct igb_softc *sc)
2864 {
2865         uint32_t ctrl_ext;
2866
2867         if (sc->vf_ifp)
2868                 return;
2869
2870         /* Let firmware take over control of h/w */
2871         ctrl_ext = E1000_READ_REG(&sc->hw, E1000_CTRL_EXT);
2872         E1000_WRITE_REG(&sc->hw, E1000_CTRL_EXT,
2873             ctrl_ext & ~E1000_CTRL_EXT_DRV_LOAD);
2874 }
2875
2876 static boolean_t
2877 igb_is_valid_ether_addr(const uint8_t *addr)
2878 {
2879         uint8_t zero_addr[ETHER_ADDR_LEN] = { 0, 0, 0, 0, 0, 0 };
2880
2881         if ((addr[0] & 1) || !bcmp(addr, zero_addr, ETHER_ADDR_LEN))
2882                 return FALSE;
2883         return TRUE;
2884 }
2885
2886 /*
2887  * Enable PCI Wake On Lan capability
2888  */
2889 static void
2890 igb_enable_wol(device_t dev)
2891 {
2892         uint16_t cap, status;
2893         uint8_t id;
2894
2895         /* First find the capabilities pointer */
2896         cap = pci_read_config(dev, PCIR_CAP_PTR, 2);
2897
2898         /* Read the PM Capabilities */
2899         id = pci_read_config(dev, cap, 1);
2900         if (id != PCIY_PMG)     /* Something wrong */
2901                 return;
2902
2903         /*
2904          * OK, we have the power capabilities,
2905          * so now get the status register
2906          */
2907         cap += PCIR_POWER_STATUS;
2908         status = pci_read_config(dev, cap, 2);
2909         status |= PCIM_PSTAT_PME | PCIM_PSTAT_PMEENABLE;
2910         pci_write_config(dev, cap, status, 2);
2911 }
2912
2913 static void
2914 igb_update_stats_counters(struct igb_softc *sc)
2915 {
2916         struct e1000_hw *hw = &sc->hw;
2917         struct e1000_hw_stats *stats;
2918         struct ifnet *ifp = &sc->arpcom.ac_if;
2919
2920         /* 
2921          * The virtual function adapter has only a
2922          * small, controlled set of stats, so update only
2923          * those and return.
2924          */
2925         if (sc->vf_ifp) {
2926                 igb_update_vf_stats_counters(sc);
2927                 return;
2928         }
2929         stats = sc->stats;
2930
2931         if (sc->hw.phy.media_type == e1000_media_type_copper ||
2932             (E1000_READ_REG(hw, E1000_STATUS) & E1000_STATUS_LU)) {
2933                 stats->symerrs +=
2934                     E1000_READ_REG(hw, E1000_SYMERRS);
2935                 stats->sec += E1000_READ_REG(hw, E1000_SEC);
2936         }
2937
2938         stats->crcerrs += E1000_READ_REG(hw, E1000_CRCERRS);
2939         stats->mpc += E1000_READ_REG(hw, E1000_MPC);
2940         stats->scc += E1000_READ_REG(hw, E1000_SCC);
2941         stats->ecol += E1000_READ_REG(hw, E1000_ECOL);
2942
2943         stats->mcc += E1000_READ_REG(hw, E1000_MCC);
2944         stats->latecol += E1000_READ_REG(hw, E1000_LATECOL);
2945         stats->colc += E1000_READ_REG(hw, E1000_COLC);
2946         stats->dc += E1000_READ_REG(hw, E1000_DC);
2947         stats->rlec += E1000_READ_REG(hw, E1000_RLEC);
2948         stats->xonrxc += E1000_READ_REG(hw, E1000_XONRXC);
2949         stats->xontxc += E1000_READ_REG(hw, E1000_XONTXC);
2950
2951         /*
2952          * For watchdog management we need to know if we have been
2953          * paused during the last interval, so capture that here.
2954          */ 
2955         sc->pause_frames = E1000_READ_REG(hw, E1000_XOFFRXC);
2956         stats->xoffrxc += sc->pause_frames;
2957         stats->xofftxc += E1000_READ_REG(hw, E1000_XOFFTXC);
2958         stats->fcruc += E1000_READ_REG(hw, E1000_FCRUC);
2959         stats->prc64 += E1000_READ_REG(hw, E1000_PRC64);
2960         stats->prc127 += E1000_READ_REG(hw, E1000_PRC127);
2961         stats->prc255 += E1000_READ_REG(hw, E1000_PRC255);
2962         stats->prc511 += E1000_READ_REG(hw, E1000_PRC511);
2963         stats->prc1023 += E1000_READ_REG(hw, E1000_PRC1023);
2964         stats->prc1522 += E1000_READ_REG(hw, E1000_PRC1522);
2965         stats->gprc += E1000_READ_REG(hw, E1000_GPRC);
2966         stats->bprc += E1000_READ_REG(hw, E1000_BPRC);
2967         stats->mprc += E1000_READ_REG(hw, E1000_MPRC);
2968         stats->gptc += E1000_READ_REG(hw, E1000_GPTC);
2969
2970         /* For the 64-bit byte counters the low dword must be read first. */
2971         /* Both registers clear on the read of the high dword */
2972
2973         stats->gorc += E1000_READ_REG(hw, E1000_GORCL) +
2974             ((uint64_t)E1000_READ_REG(hw, E1000_GORCH) << 32);
2975         stats->gotc += E1000_READ_REG(hw, E1000_GOTCL) +
2976             ((uint64_t)E1000_READ_REG(hw, E1000_GOTCH) << 32);
2977
2978         stats->rnbc += E1000_READ_REG(hw, E1000_RNBC);
2979         stats->ruc += E1000_READ_REG(hw, E1000_RUC);
2980         stats->rfc += E1000_READ_REG(hw, E1000_RFC);
2981         stats->roc += E1000_READ_REG(hw, E1000_ROC);
2982         stats->rjc += E1000_READ_REG(hw, E1000_RJC);
2983
2984         stats->tor += E1000_READ_REG(hw, E1000_TORH);
2985         stats->tot += E1000_READ_REG(hw, E1000_TOTH);
2986
2987         stats->tpr += E1000_READ_REG(hw, E1000_TPR);
2988         stats->tpt += E1000_READ_REG(hw, E1000_TPT);
2989         stats->ptc64 += E1000_READ_REG(hw, E1000_PTC64);
2990         stats->ptc127 += E1000_READ_REG(hw, E1000_PTC127);
2991         stats->ptc255 += E1000_READ_REG(hw, E1000_PTC255);
2992         stats->ptc511 += E1000_READ_REG(hw, E1000_PTC511);
2993         stats->ptc1023 += E1000_READ_REG(hw, E1000_PTC1023);
2994         stats->ptc1522 += E1000_READ_REG(hw, E1000_PTC1522);
2995         stats->mptc += E1000_READ_REG(hw, E1000_MPTC);
2996         stats->bptc += E1000_READ_REG(hw, E1000_BPTC);
2997
2998         /* Interrupt Counts */
2999
3000         stats->iac += E1000_READ_REG(hw, E1000_IAC);
3001         stats->icrxptc += E1000_READ_REG(hw, E1000_ICRXPTC);
3002         stats->icrxatc += E1000_READ_REG(hw, E1000_ICRXATC);
3003         stats->ictxptc += E1000_READ_REG(hw, E1000_ICTXPTC);
3004         stats->ictxatc += E1000_READ_REG(hw, E1000_ICTXATC);
3005         stats->ictxqec += E1000_READ_REG(hw, E1000_ICTXQEC);
3006         stats->ictxqmtc += E1000_READ_REG(hw, E1000_ICTXQMTC);
3007         stats->icrxdmtc += E1000_READ_REG(hw, E1000_ICRXDMTC);
3008         stats->icrxoc += E1000_READ_REG(hw, E1000_ICRXOC);
3009
3010         /* Host to Card Statistics */
3011
3012         stats->cbtmpc += E1000_READ_REG(hw, E1000_CBTMPC);
3013         stats->htdpmc += E1000_READ_REG(hw, E1000_HTDPMC);
3014         stats->cbrdpc += E1000_READ_REG(hw, E1000_CBRDPC);
3015         stats->cbrmpc += E1000_READ_REG(hw, E1000_CBRMPC);
3016         stats->rpthc += E1000_READ_REG(hw, E1000_RPTHC);
3017         stats->hgptc += E1000_READ_REG(hw, E1000_HGPTC);
3018         stats->htcbdpc += E1000_READ_REG(hw, E1000_HTCBDPC);
3019         stats->hgorc += (E1000_READ_REG(hw, E1000_HGORCL) +
3020             ((uint64_t)E1000_READ_REG(hw, E1000_HGORCH) << 32));
3021         stats->hgotc += (E1000_READ_REG(hw, E1000_HGOTCL) +
3022             ((uint64_t)E1000_READ_REG(hw, E1000_HGOTCH) << 32));
3023         stats->lenerrs += E1000_READ_REG(hw, E1000_LENERRS);
3024         stats->scvpc += E1000_READ_REG(hw, E1000_SCVPC);
3025         stats->hrmpc += E1000_READ_REG(hw, E1000_HRMPC);
3026
3027         stats->algnerrc += E1000_READ_REG(hw, E1000_ALGNERRC);
3028         stats->rxerrc += E1000_READ_REG(hw, E1000_RXERRC);
3029         stats->tncrs += E1000_READ_REG(hw, E1000_TNCRS);
3030         stats->cexterr += E1000_READ_REG(hw, E1000_CEXTERR);
3031         stats->tsctc += E1000_READ_REG(hw, E1000_TSCTC);
3032         stats->tsctfc += E1000_READ_REG(hw, E1000_TSCTFC);
3033
3034         IFNET_STAT_SET(ifp, collisions, stats->colc);
3035
3036         /* Rx Errors */
3037         IFNET_STAT_SET(ifp, ierrors,
3038             stats->rxerrc + stats->crcerrs + stats->algnerrc +
3039             stats->ruc + stats->roc + stats->mpc + stats->cexterr);
3040
3041         /* Tx Errors */
3042         IFNET_STAT_SET(ifp, oerrors,
3043             stats->ecol + stats->latecol + sc->watchdog_events);
3044
3045         /* Driver specific counters */
3046         sc->device_control = E1000_READ_REG(hw, E1000_CTRL);
3047         sc->rx_control = E1000_READ_REG(hw, E1000_RCTL);
3048         sc->int_mask = E1000_READ_REG(hw, E1000_IMS);
3049         sc->eint_mask = E1000_READ_REG(hw, E1000_EIMS);
3050         sc->packet_buf_alloc_tx =
3051             ((E1000_READ_REG(hw, E1000_PBA) & 0xffff0000) >> 16);
3052         sc->packet_buf_alloc_rx =
3053             (E1000_READ_REG(hw, E1000_PBA) & 0xffff);
3054 }
3055
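/*
 * Take the initial snapshot of the VF statistics registers; the update
 * routine accumulates deltas against these last recorded values.
 */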
3056 static void
3057 igb_vf_init_stats(struct igb_softc *sc)
3058 {
3059         struct e1000_hw *hw = &sc->hw;
3060         struct e1000_vf_stats *stats;
3061
3062         stats = sc->stats;
3063         stats->last_gprc = E1000_READ_REG(hw, E1000_VFGPRC);
3064         stats->last_gorc = E1000_READ_REG(hw, E1000_VFGORC);
3065         stats->last_gptc = E1000_READ_REG(hw, E1000_VFGPTC);
3066         stats->last_gotc = E1000_READ_REG(hw, E1000_VFGOTC);
3067         stats->last_mprc = E1000_READ_REG(hw, E1000_VFMPRC);
3068 }
3069  
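/*
 * Accumulate the VF statistics registers via UPDATE_VF_REG(), using the
 * last recorded values as the baseline.  Skipped while the link is down.
 */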
3070 static void
3071 igb_update_vf_stats_counters(struct igb_softc *sc)
3072 {
3073         struct e1000_hw *hw = &sc->hw;
3074         struct e1000_vf_stats *stats;
3075
3076         if (sc->link_speed == 0)
3077                 return;
3078
3079         stats = sc->stats;
3080         UPDATE_VF_REG(E1000_VFGPRC, stats->last_gprc, stats->gprc);
3081         UPDATE_VF_REG(E1000_VFGORC, stats->last_gorc, stats->gorc);
3082         UPDATE_VF_REG(E1000_VFGPTC, stats->last_gptc, stats->gptc);
3083         UPDATE_VF_REG(E1000_VFGOTC, stats->last_gotc, stats->gotc);
3084         UPDATE_VF_REG(E1000_VFMPRC, stats->last_mprc, stats->mprc);
3085 }
3086
3087 #ifdef IFPOLL_ENABLE
3088
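/*
 * if_poll status callback: check ICR for link state changes.
 */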
3089 static void
3090 igb_npoll_status(struct ifnet *ifp)
3091 {
3092         struct igb_softc *sc = ifp->if_softc;
3093         uint32_t reg_icr;
3094
3095         ASSERT_SERIALIZED(&sc->main_serialize);
3096
3097         reg_icr = E1000_READ_REG(&sc->hw, E1000_ICR);
3098         if (reg_icr & (E1000_ICR_RXSEQ | E1000_ICR_LSC)) {
3099                 sc->hw.mac.get_link_status = 1;
3100                 igb_update_link_status(sc);
3101         }
3102 }
3103
3104 static void
3105 igb_npoll_tx(struct ifnet *ifp, void *arg, int cycle __unused)
3106 {
3107         struct igb_tx_ring *txr = arg;
3108
3109         ASSERT_SERIALIZED(&txr->tx_serialize);
3110
3111         igb_txeof(txr, *(txr->tx_hdr));
3112         if (!ifsq_is_empty(txr->ifsq))
3113                 ifsq_devstart(txr->ifsq);
3114 }
3115
3116 static void
3117 igb_npoll_rx(struct ifnet *ifp __unused, void *arg, int cycle)
3118 {
3119         struct igb_rx_ring *rxr = arg;
3120
3121         ASSERT_SERIALIZED(&rxr->rx_serialize);
3122
3123         igb_rxeof(rxr, cycle);
3124 }
3125
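/*
 * Register (info != NULL) or deregister (info == NULL) the if_poll
 * handlers for this interface, rebinding the TX subqueues and switching
 * between polling and interrupt operation as needed.
 */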
3126 static void
3127 igb_npoll(struct ifnet *ifp, struct ifpoll_info *info)
3128 {
3129         struct igb_softc *sc = ifp->if_softc;
3130         int i, txr_cnt, rxr_cnt;
3131
3132         ASSERT_IFNET_SERIALIZED_ALL(ifp);
3133
3134         if (info) {
3135                 int off;
3136
3137                 info->ifpi_status.status_func = igb_npoll_status;
3138                 info->ifpi_status.serializer = &sc->main_serialize;
3139
3140                 txr_cnt = igb_get_txring_inuse(sc, TRUE);
3141                 off = sc->tx_npoll_off;
3142                 for (i = 0; i < txr_cnt; ++i) {
3143                         struct igb_tx_ring *txr = &sc->tx_rings[i];
3144                         int idx = i + off;
3145
3146                         KKASSERT(idx < ncpus2);
3147                         info->ifpi_tx[idx].poll_func = igb_npoll_tx;
3148                         info->ifpi_tx[idx].arg = txr;
3149                         info->ifpi_tx[idx].serializer = &txr->tx_serialize;
3150                         ifsq_set_cpuid(txr->ifsq, idx);
3151                 }
3152
3153                 rxr_cnt = igb_get_rxring_inuse(sc, TRUE);
3154                 off = sc->rx_npoll_off;
3155                 for (i = 0; i < rxr_cnt; ++i) {
3156                         struct igb_rx_ring *rxr = &sc->rx_rings[i];
3157                         int idx = i + off;
3158
3159                         KKASSERT(idx < ncpus2);
3160                         info->ifpi_rx[idx].poll_func = igb_npoll_rx;
3161                         info->ifpi_rx[idx].arg = rxr;
3162                         info->ifpi_rx[idx].serializer = &rxr->rx_serialize;
3163                 }
3164
3165                 if (ifp->if_flags & IFF_RUNNING) {
3166                         if (rxr_cnt == sc->rx_ring_inuse &&
3167                             txr_cnt == sc->tx_ring_inuse) {
3168                                 igb_set_timer_cpuid(sc, TRUE);
3169                                 igb_disable_intr(sc);
3170                         } else {
3171                                 igb_init(sc);
3172                         }
3173                 }
3174         } else {
3175                 for (i = 0; i < sc->tx_ring_cnt; ++i) {
3176                         struct igb_tx_ring *txr = &sc->tx_rings[i];
3177
3178                         ifsq_set_cpuid(txr->ifsq, txr->tx_intr_cpuid);
3179                 }
3180
3181                 if (ifp->if_flags & IFF_RUNNING) {
3182                         txr_cnt = igb_get_txring_inuse(sc, FALSE);
3183                         rxr_cnt = igb_get_rxring_inuse(sc, FALSE);
3184
3185                         if (rxr_cnt == sc->rx_ring_inuse &&
3186                             txr_cnt == sc->tx_ring_inuse) {
3187                                 igb_set_timer_cpuid(sc, FALSE);
3188                                 igb_enable_intr(sc);
3189                         } else {
3190                                 igb_init(sc);
3191                         }
3192                 }
3193         }
3194 }
3195
3196 #endif /* IFPOLL_ENABLE */
3197
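/*
 * Interrupt handler used for MSI and unshared legacy interrupts.
 */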
3198 static void
3199 igb_intr(void *xsc)
3200 {
3201         struct igb_softc *sc = xsc;
3202         struct ifnet *ifp = &sc->arpcom.ac_if;
3203         uint32_t eicr;
3204
3205         ASSERT_SERIALIZED(&sc->main_serialize);
3206
3207         eicr = E1000_READ_REG(&sc->hw, E1000_EICR);
3208
3209         if (eicr == 0)
3210                 return;
3211
3212         if (ifp->if_flags & IFF_RUNNING) {
3213                 struct igb_tx_ring *txr = &sc->tx_rings[0];
3214                 int i;
3215
3216                 for (i = 0; i < sc->rx_ring_inuse; ++i) {
3217                         struct igb_rx_ring *rxr = &sc->rx_rings[i];
3218
3219                         if (eicr & rxr->rx_intr_mask) {
3220                                 lwkt_serialize_enter(&rxr->rx_serialize);
3221                                 igb_rxeof(rxr, -1);
3222                                 lwkt_serialize_exit(&rxr->rx_serialize);
3223                         }
3224                 }
3225
3226                 if (eicr & txr->tx_intr_mask) {
3227                         lwkt_serialize_enter(&txr->tx_serialize);
3228                         igb_txeof(txr, *(txr->tx_hdr));
3229                         if (!ifsq_is_empty(txr->ifsq))
3230                                 ifsq_devstart(txr->ifsq);
3231                         lwkt_serialize_exit(&txr->tx_serialize);
3232                 }
3233         }
3234
3235         if (eicr & E1000_EICR_OTHER) {
3236                 uint32_t icr = E1000_READ_REG(&sc->hw, E1000_ICR);
3237
3238                 /* Link status change */
3239                 if (icr & E1000_ICR_LSC) {
3240                         sc->hw.mac.get_link_status = 1;
3241                         igb_update_link_status(sc);
3242                 }
3243         }
3244
3245         /*
3246          * Reading EICR has the side effect of clearing the interrupt mask,
3247          * so all interrupts need to be re-enabled here.
3248          */
3249         E1000_WRITE_REG(&sc->hw, E1000_EIMS, sc->intr_mask);
3250 }
3251
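/*
 * Interrupt handler used when the legacy interrupt line is shared;
 * ICR must be consulted to determine whether the interrupt is ours.
 */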
3252 static void
3253 igb_intr_shared(void *xsc)
3254 {
3255         struct igb_softc *sc = xsc;
3256         struct ifnet *ifp = &sc->arpcom.ac_if;
3257         uint32_t reg_icr;
3258
3259         ASSERT_SERIALIZED(&sc->main_serialize);
3260
3261         reg_icr = E1000_READ_REG(&sc->hw, E1000_ICR);
3262
3263         /* Hot eject?  */
3264         if (reg_icr == 0xffffffff)
3265                 return;
3266
3267         /* Definitely not our interrupt.  */
3268         if (reg_icr == 0x0)
3269                 return;
3270
3271         if ((reg_icr & E1000_ICR_INT_ASSERTED) == 0)
3272                 return;
3273
3274         if (ifp->if_flags & IFF_RUNNING) {
3275                 if (reg_icr &
3276                     (E1000_ICR_RXT0 | E1000_ICR_RXDMT0 | E1000_ICR_RXO)) {
3277                         int i;
3278
3279                         for (i = 0; i < sc->rx_ring_inuse; ++i) {
3280                                 struct igb_rx_ring *rxr = &sc->rx_rings[i];
3281
3282                                 lwkt_serialize_enter(&rxr->rx_serialize);
3283                                 igb_rxeof(rxr, -1);
3284                                 lwkt_serialize_exit(&rxr->rx_serialize);
3285                         }
3286                 }
3287
3288                 if (reg_icr & E1000_ICR_TXDW) {
3289                         struct igb_tx_ring *txr = &sc->tx_rings[0];
3290
3291                         lwkt_serialize_enter(&txr->tx_serialize);
3292                         igb_txeof(txr, *(txr->tx_hdr));
3293                         if (!ifsq_is_empty(txr->ifsq))
3294                                 ifsq_devstart(txr->ifsq);
3295                         lwkt_serialize_exit(&txr->tx_serialize);
3296                 }
3297         }
3298
3299         /* Link status change */
3300         if (reg_icr & (E1000_ICR_RXSEQ | E1000_ICR_LSC)) {
3301                 sc->hw.mac.get_link_status = 1;
3302                 igb_update_link_status(sc);
3303         }
3304
3305         if (reg_icr & E1000_ICR_RXO)
3306                 sc->rx_overruns++;
3307 }
3308
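/*
 * Map an mbuf chain into the TX ring: set up an optional context
 * descriptor for checksum/VLAN/TSO offload, then one data descriptor
 * per DMA segment.  The TDT write itself is deferred to the caller.
 */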
3309 static int
3310 igb_encap(struct igb_tx_ring *txr, struct mbuf **m_headp,
3311     int *segs_used, int *idx)
3312 {
3313         bus_dma_segment_t segs[IGB_MAX_SCATTER];
3314         bus_dmamap_t map;
3315         struct igb_tx_buf *tx_buf, *tx_buf_mapped;
3316         union e1000_adv_tx_desc *txd = NULL;
3317         struct mbuf *m_head = *m_headp;
3318         uint32_t olinfo_status = 0, cmd_type_len = 0, cmd_rs = 0;
3319         int maxsegs, nsegs, i, j, error;
3320         uint32_t hdrlen = 0;
3321
3322         if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3323                 error = igb_tso_pullup(txr, m_headp);
3324                 if (error)
3325                         return error;
3326                 m_head = *m_headp;
3327         }
3328
3329         /* Set basic descriptor constants */
3330         cmd_type_len |= E1000_ADVTXD_DTYP_DATA;
3331         cmd_type_len |= E1000_ADVTXD_DCMD_IFCS | E1000_ADVTXD_DCMD_DEXT;
3332         if (m_head->m_flags & M_VLANTAG)
3333                 cmd_type_len |= E1000_ADVTXD_DCMD_VLE;
3334
3335         /*
3336          * Map the packet for DMA.
3337          */
3338         tx_buf = &txr->tx_buf[txr->next_avail_desc];
3339         tx_buf_mapped = tx_buf;
3340         map = tx_buf->map;
3341
3342         maxsegs = txr->tx_avail - IGB_TX_RESERVED;
3343         if (maxsegs > IGB_MAX_SCATTER)
3344                 maxsegs = IGB_MAX_SCATTER;
3345
3346         error = bus_dmamap_load_mbuf_defrag(txr->tx_tag, map, m_headp,
3347             segs, maxsegs, &nsegs, BUS_DMA_NOWAIT);
3348         if (error) {
3349                 if (error == ENOBUFS)
3350                         txr->sc->mbuf_defrag_failed++;
3351                 else
3352                         txr->sc->no_tx_dma_setup++;
3353
3354                 m_freem(*m_headp);
3355                 *m_headp = NULL;
3356                 return error;
3357         }
3358         bus_dmamap_sync(txr->tx_tag, map, BUS_DMASYNC_PREWRITE);
3359
3360         m_head = *m_headp;
3361
3362         /*
3363          * Set up the TX context descriptor, if any hardware offloading is
3364          * needed.  This includes CSUM, VLAN, and TSO.  It will consume one
3365          * TX descriptor.
3366          *
3367          * Unlike these chips' predecessors (em/emx), a TX context descriptor
3368          * will _not_ interfere with TX data fetch pipelining.
3369          */
3370         if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3371                 igb_tso_ctx(txr, m_head, &hdrlen);
3372                 cmd_type_len |= E1000_ADVTXD_DCMD_TSE;
3373                 olinfo_status |= E1000_TXD_POPTS_IXSM << 8;
3374                 olinfo_status |= E1000_TXD_POPTS_TXSM << 8;
3375                 txr->tx_nsegs++;
3376                 (*segs_used)++;
3377         } else if (igb_txcsum_ctx(txr, m_head)) {
3378                 if (m_head->m_pkthdr.csum_flags & CSUM_IP)
3379                         olinfo_status |= (E1000_TXD_POPTS_IXSM << 8);
3380                 if (m_head->m_pkthdr.csum_flags & (CSUM_UDP | CSUM_TCP))
3381                         olinfo_status |= (E1000_TXD_POPTS_TXSM << 8);
3382                 txr->tx_nsegs++;
3383                 (*segs_used)++;
3384         }
3385
3386         *segs_used += nsegs;
3387         txr->tx_nsegs += nsegs;
3388         if (txr->tx_nsegs >= txr->intr_nsegs) {
3389                 /*
3390                  * Report Status (RS) is turned on every intr_nsegs
3391                  * descriptors (roughly).
3392                  */
3393                 txr->tx_nsegs = 0;
3394                 cmd_rs = E1000_ADVTXD_DCMD_RS;
3395         }
3396
3397         /* Calculate payload length */
3398         olinfo_status |= ((m_head->m_pkthdr.len - hdrlen)
3399             << E1000_ADVTXD_PAYLEN_SHIFT);
3400
3401         /*
3402          * 82575 needs the TX context index added; the queue
3403          * index is used as TX context index here.
3404          */
3405         if (txr->sc->hw.mac.type == e1000_82575)
3406                 olinfo_status |= txr->me << 4;
3407
3408         /* Set up our transmit descriptors */
3409         i = txr->next_avail_desc;
3410         for (j = 0; j < nsegs; j++) {
3411                 bus_size_t seg_len;
3412                 bus_addr_t seg_addr;
3413
3414                 tx_buf = &txr->tx_buf[i];
3415                 txd = (union e1000_adv_tx_desc *)&txr->tx_base[i];
3416                 seg_addr = segs[j].ds_addr;
3417                 seg_len = segs[j].ds_len;
3418
3419                 txd->read.buffer_addr = htole64(seg_addr);
3420                 txd->read.cmd_type_len = htole32(cmd_type_len | seg_len);
3421                 txd->read.olinfo_status = htole32(olinfo_status);
3422                 if (++i == txr->num_tx_desc)
3423                         i = 0;
3424                 tx_buf->m_head = NULL;
3425         }
3426
3427         KASSERT(txr->tx_avail > nsegs, ("invalid avail TX desc\n"));
3428         txr->next_avail_desc = i;
3429         txr->tx_avail -= nsegs;
3430
3431         tx_buf->m_head = m_head;
3432         tx_buf_mapped->map = tx_buf->map;
3433         tx_buf->map = map;
3434
3435         /*
3436          * Last Descriptor of Packet needs End Of Packet (EOP)
3437          */
3438         txd->read.cmd_type_len |= htole32(E1000_ADVTXD_DCMD_EOP | cmd_rs);
3439
3440         /*
3441          * Defer TDT updating until enough descriptors are set up.
3442          */
3443         *idx = i;
3444 #ifdef IGB_TSS_DEBUG
3445         ++txr->tx_packets;
3446 #endif
3447
3448         return 0;
3449 }
3450
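/*
 * Transmit start routine for one subqueue: dequeue and encapsulate
 * frames until the ring runs low, batching TDT updates every
 * wreg_nsegs segments.
 */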
3451 static void
3452 igb_start(struct ifnet *ifp, struct ifaltq_subque *ifsq)
3453 {
3454         struct igb_softc *sc = ifp->if_softc;
3455         struct igb_tx_ring *txr = ifsq_get_priv(ifsq);
3456         struct mbuf *m_head;
3457         int idx = -1, nsegs = 0;
3458
3459         KKASSERT(txr->ifsq == ifsq);
3460         ASSERT_SERIALIZED(&txr->tx_serialize);
3461
3462         if ((ifp->if_flags & IFF_RUNNING) == 0 || ifsq_is_oactive(ifsq))
3463                 return;
3464
3465         if (!sc->link_active || (txr->tx_flags & IGB_TXFLAG_ENABLED) == 0) {
3466                 ifsq_purge(ifsq);
3467                 return;
3468         }
3469
3470         while (!ifsq_is_empty(ifsq)) {
3471                 if (txr->tx_avail <= IGB_MAX_SCATTER + IGB_TX_RESERVED) {
3472                         ifsq_set_oactive(ifsq);
3473                         /* Set watchdog on */
3474                         txr->tx_watchdog.wd_timer = 5;
3475                         break;
3476                 }
3477
3478                 m_head = ifsq_dequeue(ifsq);
3479                 if (m_head == NULL)
3480                         break;
3481
3482                 if (igb_encap(txr, &m_head, &nsegs, &idx)) {
3483                         IFNET_STAT_INC(ifp, oerrors, 1);
3484                         continue;
3485                 }
3486
3487                 /*
3488                  * TX interrupts are aggressively aggregated, so increasing
3489                  * opackets at TX interrupt time would make the opackets
3490                  * statistics vastly inaccurate; do the opackets increment
3491                  * here instead.
3492                  */
3493                 IFNET_STAT_INC(ifp, opackets, 1);
3494
3495                 if (nsegs >= txr->wreg_nsegs) {
3496                         E1000_WRITE_REG(&txr->sc->hw, E1000_TDT(txr->me), idx);
3497                         idx = -1;
3498                         nsegs = 0;
3499                 }
3500
3501                 /* Send a copy of the frame to the BPF listener */
3502                 ETHER_BPF_MTAP(ifp, m_head);
3503         }
3504         if (idx >= 0)
3505                 E1000_WRITE_REG(&txr->sc->hw, E1000_TDT(txr->me), idx);
3506 }
3507
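/*
 * TX watchdog: ignore the timeout if flow control paused us during the
 * last interval, otherwise log the ring state and reinitialize.
 */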
3508 static void
3509 igb_watchdog(struct ifaltq_subque *ifsq)
3510 {
3511         struct igb_tx_ring *txr = ifsq_get_priv(ifsq);
3512         struct ifnet *ifp = ifsq_get_ifp(ifsq);
3513         struct igb_softc *sc = ifp->if_softc;
3514         int i;
3515
3516         KKASSERT(txr->ifsq == ifsq);
3517         ASSERT_IFNET_SERIALIZED_ALL(ifp);
3518
3519         /*
3520          * If flow control has paused us since the last check,
3521          * it invalidates the watchdog timing, so don't run it.
3522          */
3523         if (sc->pause_frames) {
3524                 sc->pause_frames = 0;
3525                 txr->tx_watchdog.wd_timer = 5;
3526                 return;
3527         }
3528
3529         if_printf(ifp, "Watchdog timeout -- resetting\n");
3530         if_printf(ifp, "Queue(%d) tdh = %d, hw tdt = %d\n", txr->me,
3531             E1000_READ_REG(&sc->hw, E1000_TDH(txr->me)),
3532             E1000_READ_REG(&sc->hw, E1000_TDT(txr->me)));
3533         if_printf(ifp, "TX(%d) desc avail = %d, "
3534             "Next TX to Clean = %d\n",
3535             txr->me, txr->tx_avail, txr->next_to_clean);
3536
3537         IFNET_STAT_INC(ifp, oerrors, 1);
3538         sc->watchdog_events++;
3539
3540         igb_init(sc);
3541         for (i = 0; i < sc->tx_ring_inuse; ++i)
3542                 ifsq_devstart_sched(sc->tx_rings[i].ifsq);
3543 }
3544
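/*
 * Program EITR for the given vector from an interrupts/second rate;
 * a rate of 0 leaves interrupt throttling disabled.
 */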
3545 static void
3546 igb_set_eitr(struct igb_softc *sc, int idx, int rate)
3547 {
3548         uint32_t eitr = 0;
3549
3550         if (rate > 0) {
3551                 if (sc->hw.mac.type == e1000_82575) {
3552                         eitr = 1000000000 / 256 / rate;
3553                         /*
3554                          * NOTE:
3555                          * The datasheet is wrong about the 2-bit left shift.
3556                          */
3557                 } else {
3558                         eitr = 1000000 / rate;
3559                         eitr <<= IGB_EITR_INTVL_SHIFT;
3560                 }
3561
3562                 if (eitr == 0) {
3563                         /* Don't disable it */
3564                         eitr = 1 << IGB_EITR_INTVL_SHIFT;
3565                 } else if (eitr > IGB_EITR_INTVL_MASK) {
3566                         /* Don't allow it to be too large */
3567                         eitr = IGB_EITR_INTVL_MASK;
3568                 }
3569         }
3570         if (sc->hw.mac.type == e1000_82575)
3571                 eitr |= eitr << 16;
3572         else
3573                 eitr |= E1000_EITR_CNT_IGNR;
3574         E1000_WRITE_REG(&sc->hw, E1000_EITR(idx), eitr);
3575 }
3576
3577 static void
3578 igb_add_intr_rate_sysctl(struct igb_softc *sc, int use,
3579     const char *name, const char *desc)
3580 {
3581         int i;
3582
3583         for (i = 0; i < sc->intr_cnt; ++i) {
3584                 if (sc->intr_data[i].intr_use == use) {
3585                         SYSCTL_ADD_PROC(device_get_sysctl_ctx(sc->dev),
3586                             SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev)),
3587                             OID_AUTO, name, CTLTYPE_INT | CTLFLAG_RW,
3588                             sc, use, igb_sysctl_intr_rate, "I", desc);
3589                         break;
3590                 }
3591         }
3592 }
3593
3594 static int
3595 igb_sysctl_intr_rate(SYSCTL_HANDLER_ARGS)
3596 {
3597         struct igb_softc *sc = (void *)arg1;
3598         int use = arg2;
3599         struct ifnet *ifp = &sc->arpcom.ac_if;
3600         int error, rate, i;
3601         struct igb_intr_data *intr;
3602
3603         rate = 0;
3604         for (i = 0; i < sc->intr_cnt; ++i) {
3605                 intr = &sc->intr_data[i];
3606                 if (intr->intr_use == use) {
3607                         rate = intr->intr_rate;
3608                         break;
3609                 }
3610         }
3611
3612         error = sysctl_handle_int(oidp, &rate, 0, req);
3613         if (error || req->newptr == NULL)
3614                 return error;
3615         if (rate <= 0)
3616                 return EINVAL;
3617
3618         ifnet_serialize_all(ifp);
3619
3620         for (i = 0; i < sc->intr_cnt; ++i) {
3621                 intr = &sc->intr_data[i];
3622                 if (intr->intr_use == use && intr->intr_rate != rate) {
3623                         intr->intr_rate = rate;
3624                         if (ifp->if_flags & IFF_RUNNING)
3625                                 igb_set_eitr(sc, i, rate);
3626                 }
3627         }
3628
3629         ifnet_deserialize_all(ifp);
3630
3631         return error;
3632 }
3633
3634 static int
3635 igb_sysctl_tx_intr_nsegs(SYSCTL_HANDLER_ARGS)
3636 {
3637         struct igb_softc *sc = (void *)arg1;
3638         struct ifnet *ifp = &sc->arpcom.ac_if;
3639         struct igb_tx_ring *txr = &sc->tx_rings[0];
3640         int error, nsegs;
3641
3642         nsegs = txr->intr_nsegs;
3643         error = sysctl_handle_int(oidp, &nsegs, 0, req);
3644         if (error || req->newptr == NULL)
3645                 return error;
3646         if (nsegs <= 0)
3647                 return EINVAL;
3648
3649         ifnet_serialize_all(ifp);
3650
3651         if (nsegs >= txr->num_tx_desc - IGB_MAX_SCATTER - IGB_TX_RESERVED) {
3652                 error = EINVAL;
3653         } else {
3654                 int i;
3655
3656                 error = 0;
3657                 for (i = 0; i < sc->tx_ring_cnt; ++i)
3658                         sc->tx_rings[i].intr_nsegs = nsegs;
3659         }
3660
3661         ifnet_deserialize_all(ifp);
3662
3663         return error;
3664 }
3665
3666 static int
3667 igb_sysctl_rx_wreg_nsegs(SYSCTL_HANDLER_ARGS)
3668 {
3669         struct igb_softc *sc = (void *)arg1;
3670         struct ifnet *ifp = &sc->arpcom.ac_if;
3671         int error, nsegs, i;
3672
3673         nsegs = sc->rx_rings[0].wreg_nsegs;
3674         error = sysctl_handle_int(oidp, &nsegs, 0, req);
3675         if (error || req->newptr == NULL)
3676                 return error;
3677
3678         ifnet_serialize_all(ifp);
3679         for (i = 0; i < sc->rx_ring_cnt; ++i)
3680                 sc->rx_rings[i].wreg_nsegs = nsegs;
3681         ifnet_deserialize_all(ifp);
3682
3683         return 0;
3684 }
3685
3686 static int
3687 igb_sysctl_tx_wreg_nsegs(SYSCTL_HANDLER_ARGS)
3688 {
3689         struct igb_softc *sc = (void *)arg1;
3690         struct ifnet *ifp = &sc->arpcom.ac_if;
3691         int error, nsegs, i;
3692
3693         nsegs = sc->tx_rings[0].wreg_nsegs;
3694         error = sysctl_handle_int(oidp, &nsegs, 0, req);
3695         if (error || req->newptr == NULL)
3696                 return error;
3697
3698         ifnet_serialize_all(ifp);
3699         for (i = 0; i < sc->tx_ring_cnt; ++i)
3700                 sc->tx_rings[i].wreg_nsegs = nsegs;
3701         ifnet_deserialize_all(ifp);
3702
3703         return 0;
3704 }
3705
3706 #ifdef IFPOLL_ENABLE
3707
3708 static int
3709 igb_sysctl_npoll_rxoff(SYSCTL_HANDLER_ARGS)
3710 {
3711         struct igb_softc *sc = (void *)arg1;
3712         struct ifnet *ifp = &sc->arpcom.ac_if;
3713         int error, off;
3714
3715         off = sc->rx_npoll_off;
3716         error = sysctl_handle_int(oidp, &off, 0, req);
3717         if (error || req->newptr == NULL)
3718                 return error;
3719         if (off < 0)
3720                 return EINVAL;
3721
3722         ifnet_serialize_all(ifp);
3723         if (off >= ncpus2 || off % sc->rx_ring_cnt != 0) {
3724                 error = EINVAL;
3725         } else {
3726                 error = 0;
3727                 sc->rx_npoll_off = off;
3728         }
3729         ifnet_deserialize_all(ifp);
3730
3731         return error;
3732 }
3733
3734 static int
3735 igb_sysctl_npoll_txoff(SYSCTL_HANDLER_ARGS)
3736 {
3737         struct igb_softc *sc = (void *)arg1;
3738         struct ifnet *ifp = &sc->arpcom.ac_if;
3739         int error, off;
3740
3741         off = sc->tx_npoll_off;
3742         error = sysctl_handle_int(oidp, &off, 0, req);
3743         if (error || req->newptr == NULL)
3744                 return error;
3745         if (off < 0)
3746                 return EINVAL;
3747
3748         ifnet_serialize_all(ifp);
3749         if (off >= ncpus2 || off % sc->tx_ring_cnt != 0) {
3750                 error = EINVAL;
3751         } else {
3752                 error = 0;
3753                 sc->tx_npoll_off = off;
3754         }
3755         ifnet_deserialize_all(ifp);
3756
3757         return error;
3758 }
3759
3760 #endif  /* IFPOLL_ENABLE */
3761
3762 static void
3763 igb_init_intr(struct igb_softc *sc)
3764 {
3765         int i;
3766
3767         igb_set_intr_mask(sc);
3768
3769         if ((sc->flags & IGB_FLAG_SHARED_INTR) == 0)
3770                 igb_init_unshared_intr(sc);
3771
3772         for (i = 0; i < sc->intr_cnt; ++i)
3773                 igb_set_eitr(sc, i, sc->intr_data[i].intr_rate);
3774 }
3775
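/*
 * Program GPIE and map the RX/TX queues and the status interrupt to
 * EICR bits through the IVAR registers (82576 and later); on 82575
 * only the queue interrupt bits in MSIXBM(0) need to be enabled.
 */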
3776 static void
3777 igb_init_unshared_intr(struct igb_softc *sc)
3778 {
3779         struct e1000_hw *hw = &sc->hw;
3780         const struct igb_rx_ring *rxr;
3781         const struct igb_tx_ring *txr;
3782         uint32_t ivar, index;
3783         int i;
3784
3785         /*
3786          * Enable extended mode
3787          */
3788         if (sc->hw.mac.type != e1000_82575) {
3789                 uint32_t gpie;
3790                 int ivar_max;
3791
3792                 gpie = E1000_GPIE_NSICR;
3793                 if (sc->intr_type == PCI_INTR_TYPE_MSIX) {
3794                         gpie |= E1000_GPIE_MSIX_MODE |
3795                             E1000_GPIE_EIAME |
3796                             E1000_GPIE_PBA;
3797                 }
3798                 E1000_WRITE_REG(hw, E1000_GPIE, gpie);
3799
3800                 /*
3801                  * Clear IVARs
3802                  */
3803                 switch (sc->hw.mac.type) {
3804                 case e1000_82576:
3805                         ivar_max = IGB_MAX_IVAR_82576;
3806                         break;
3807
3808                 case e1000_82580:
3809                         ivar_max = IGB_MAX_IVAR_82580;
3810                         break;
3811
3812                 case e1000_i350:
3813                         ivar_max = IGB_MAX_IVAR_I350;
3814                         break;
3815
3816                 case e1000_i354:
3817                         ivar_max = IGB_MAX_IVAR_I354;
3818                         break;
3819
3820                 case e1000_vfadapt:
3821                 case e1000_vfadapt_i350:
3822                         ivar_max = IGB_MAX_IVAR_VF;
3823                         break;
3824
3825                 case e1000_i210:
3826                         ivar_max = IGB_MAX_IVAR_I210;
3827                         break;
3828
3829                 case e1000_i211:
3830                         ivar_max = IGB_MAX_IVAR_I211;
3831                         break;
3832
3833                 default:
3834                         panic("unknown mac type %d\n", sc->hw.mac.type);
3835                 }
3836                 for (i = 0; i < ivar_max; ++i)
3837                         E1000_WRITE_REG_ARRAY(hw, E1000_IVAR0, i, 0);
3838                 E1000_WRITE_REG(hw, E1000_IVAR_MISC, 0);
3839         } else {
3840                 uint32_t tmp;
3841
3842                 KASSERT(sc->intr_type != PCI_INTR_TYPE_MSIX,
3843                     ("82575 w/ MSI-X"));
3844                 tmp = E1000_READ_REG(hw, E1000_CTRL_EXT);
3845                 tmp |= E1000_CTRL_EXT_IRCA;
3846                 E1000_WRITE_REG(hw, E1000_CTRL_EXT, tmp);
3847         }
3848
3849         /*
3850          * Map TX/RX interrupts to EICR
3851          */
3852         switch (sc->hw.mac.type) {
3853         case e1000_82580:
3854         case e1000_i350:
3855         case e1000_i354:
3856         case e1000_vfadapt:
3857         case e1000_vfadapt_i350:
3858         case e1000_i210:
3859         case e1000_i211:
3860                 /* RX entries */
3861                 for (i = 0; i < sc->rx_ring_inuse; ++i) {
3862                         rxr = &sc->rx_rings[i];
3863
3864                         index = i >> 1;
3865                         ivar = E1000_READ_REG_ARRAY(hw, E1000_IVAR0, index);
3866
3867                         if (i & 1) {
3868                                 ivar &= 0xff00ffff;
3869                                 ivar |=
3870                                 (rxr->rx_intr_vec | E1000_IVAR_VALID) << 16;
3871                         } else {
3872                                 ivar &= 0xffffff00;
3873                                 ivar |=
3874                                 (rxr->rx_intr_vec | E1000_IVAR_VALID);
3875                         }
3876                         E1000_WRITE_REG_ARRAY(hw, E1000_IVAR0, index, ivar);
3877                 }
3878                 /* TX entries */
3879                 for (i = 0; i < sc->tx_ring_inuse; ++i) {
3880                         txr = &sc->tx_rings[i];
3881
3882                         index = i >> 1;
3883                         ivar = E1000_READ_REG_ARRAY(hw, E1000_IVAR0, index);
3884
3885                         if (i & 1) {
3886                                 ivar &= 0x00ffffff;
3887                                 ivar |=
3888                                 (txr->tx_intr_vec | E1000_IVAR_VALID) << 24;
3889                         } else {
3890                                 ivar &= 0xffff00ff;
3891                                 ivar |=
3892                                 (txr->tx_intr_vec | E1000_IVAR_VALID) << 8;
3893                         }
3894                         E1000_WRITE_REG_ARRAY(hw, E1000_IVAR0, index, ivar);
3895                 }
3896                 if (sc->intr_type == PCI_INTR_TYPE_MSIX) {
3897                         ivar = (sc->sts_msix_vec | E1000_IVAR_VALID) << 8;
3898                         E1000_WRITE_REG(hw, E1000_IVAR_MISC, ivar);
3899                 }
3900                 break;
3901
3902         case e1000_82576:
3903                 /* RX entries */
3904                 for (i = 0; i < sc->rx_ring_inuse; ++i) {
3905                         rxr = &sc->rx_rings[i];
3906
3907                         index = i & 0x7; /* Each IVAR has two entries */
3908                         ivar = E1000_READ_REG_ARRAY(hw, E1000_IVAR0, index);
3909
3910                         if (i < 8) {
3911                                 ivar &= 0xffffff00;
3912                                 ivar |=
3913                                 (rxr->rx_intr_vec | E1000_IVAR_VALID);
3914                         } else {
3915                                 ivar &= 0xff00ffff;
3916                                 ivar |=
3917                                 (rxr->rx_intr_vec | E1000_IVAR_VALID) << 16;
3918                         }
3919                         E1000_WRITE_REG_ARRAY(hw, E1000_IVAR0, index, ivar);
3920                 }
3921                 /* TX entries */
3922                 for (i = 0; i < sc->tx_ring_inuse; ++i) {
3923                         txr = &sc->tx_rings[i];
3924
3925                         index = i & 0x7; /* Each IVAR has two entries */
3926                         ivar = E1000_READ_REG_ARRAY(hw, E1000_IVAR0, index);
3927
3928                         if (i < 8) {
3929                                 ivar &= 0xffff00ff;
3930                                 ivar |=
3931                                 (txr->tx_intr_vec | E1000_IVAR_VALID) << 8;
3932                         } else {
3933                                 ivar &= 0x00ffffff;
3934                                 ivar |=
3935                                 (txr->tx_intr_vec | E1000_IVAR_VALID) << 24;
3936                         }
3937                         E1000_WRITE_REG_ARRAY(hw, E1000_IVAR0, index, ivar);
3938                 }
3939                 if (sc->intr_type == PCI_INTR_TYPE_MSIX) {
3940                         ivar = (sc->sts_msix_vec | E1000_IVAR_VALID) << 8;
3941                         E1000_WRITE_REG(hw, E1000_IVAR_MISC, ivar);
3942                 }
3943                 break;
3944
3945         case e1000_82575:
3946                 /*
3947                  * Enable necessary interrupt bits.
3948                  *
3949                  * The name of the register is confusing; in addition to
3950                  * configuring the first vector of MSI-X, it also configures
3951                  * which bits of EICR could be set by the hardware even when
3952                  * MSI or line interrupt is used; it thus controls interrupt
3953                  * generation.  It MUST be configured explicitly; the default
3954                  * value mentioned in the datasheet is wrong: RX queue0 and
3955                  * TX queue0 are NOT enabled by default.
3956                  */
3957                 E1000_WRITE_REG(&sc->hw, E1000_MSIXBM(0), sc->intr_mask);
3958                 break;
3959
3960         default:
3961                 panic("unknown mac type %d\n", sc->hw.mac.type);
3962         }
3963 }
3964
3965 static int
3966 igb_setup_intr(struct igb_softc *sc)
3967 {
3968         int i;
3969
3970         for (i = 0; i < sc->intr_cnt; ++i) {
3971                 struct igb_intr_data *intr = &sc->intr_data[i];
3972                 int error;
3973
3974                 error = bus_setup_intr_descr(sc->dev, intr->intr_res,
3975                     INTR_MPSAFE, intr->intr_func, intr->intr_funcarg,
3976                     &intr->intr_hand, intr->intr_serialize, intr->intr_desc);
3977                 if (error) {
3978                         device_printf(sc->dev, "can't setup %dth intr\n", i);
3979                         igb_teardown_intr(sc, i);
3980                         return error;
3981                 }
3982         }
3983         return 0;
3984 }
3985
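/*
 * Assign the TX ring's interrupt vector and EICR mask bit.  The 82575
 * uses fixed per-queue EICR bits; later chips derive the mask from the
 * assigned vector.
 */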
3986 static void
3987 igb_set_txintr_mask(struct igb_tx_ring *txr, int *intr_vec0, int intr_vecmax)
3988 {
3989         if (txr->sc->hw.mac.type == e1000_82575) {
3990                 txr->tx_intr_vec = 0;   /* unused */
3991                 switch (txr->me) {
3992                 case 0:
3993                         txr->tx_intr_mask = E1000_EICR_TX_QUEUE0;
3994                         break;
3995                 case 1:
3996                         txr->tx_intr_mask = E1000_EICR_TX_QUEUE1;
3997                         break;
3998                 case 2:
3999                         txr->tx_intr_mask = E1000_EICR_TX_QUEUE2;
4000                         break;
4001                 case 3:
4002                         txr->tx_intr_mask = E1000_EICR_TX_QUEUE3;
4003                         break;
4004                 default:
4005                         panic("unsupported TX ring index %d\n", txr->me);
4006                 }
4007         } else {
4008                 int intr_vec = *intr_vec0;
4009
4010                 txr->tx_intr_vec = intr_vec % intr_vecmax;
4011                 txr->tx_intr_mask = 1 << txr->tx_intr_vec;
4012
4013                 *intr_vec0 = intr_vec + 1;
4014         }
4015 }
4016
4017 static void
4018 igb_set_rxintr_mask(struct igb_rx_ring *rxr, int *intr_vec0, int intr_vecmax)
4019 {
4020         if (rxr->sc->hw.mac.type == e1000_82575) {
4021                 rxr->rx_intr_vec = 0;   /* unused */
4022                 switch (rxr->me) {
4023                 case 0:
4024                         rxr->rx_intr_mask = E1000_EICR_RX_QUEUE0;
4025                         break;
4026                 case 1:
4027                         rxr->rx_intr_mask = E1000_EICR_RX_QUEUE1;
4028                         break;
4029                 case 2:
4030                         rxr->rx_intr_mask = E1000_EICR_RX_QUEUE2;
4031                         break;
4032                 case 3:
4033                         rxr->rx_intr_mask = E1000_EICR_RX_QUEUE3;
4034                         break;
4035                 default:
4036                         panic("unsupported RX ring index %d\n", rxr->me);
4037                 }
4038         } else {
4039                 int intr_vec = *intr_vec0;
4040
4041                 rxr->rx_intr_vec = intr_vec % intr_vecmax;
4042                 rxr->rx_intr_mask = 1 << rxr->rx_intr_vec;
4043
4044                 *intr_vec0 = intr_vec + 1;
4045         }
4046 }
4047
4048 static void
4049 igb_serialize(struct ifnet *ifp, enum ifnet_serialize slz)
4050 {
4051         struct igb_softc *sc = ifp->if_softc;
4052
4053         ifnet_serialize_array_enter(sc->serializes, sc->serialize_cnt, slz);
4054 }
4055
4056 static void
4057 igb_deserialize(struct ifnet *ifp, enum ifnet_serialize slz)
4058 {
4059         struct igb_softc *sc = ifp->if_softc;
4060
4061         ifnet_serialize_array_exit(sc->serializes, sc->serialize_cnt, slz);
4062 }
4063
4064 static int
4065 igb_tryserialize(struct ifnet *ifp, enum ifnet_serialize slz)
4066 {
4067         struct igb_softc *sc = ifp->if_softc;
4068
4069         return ifnet_serialize_array_try(sc->serializes, sc->serialize_cnt,
4070             slz);
4071 }
4072
4073 #ifdef INVARIANTS
4074
4075 static void
4076 igb_serialize_assert(struct ifnet *ifp, enum ifnet_serialize slz,
4077     boolean_t serialized)
4078 {
4079         struct igb_softc *sc = ifp->if_softc;
4080
4081         ifnet_serialize_array_assert(sc->serializes, sc->serialize_cnt,
4082             slz, serialized);
4083 }
4084
4085 #endif  /* INVARIANTS */
4086
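/*
 * Build the interrupt mask from the status interrupt plus all in-use
 * RX and TX ring interrupt bits.
 */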
4087 static void
4088 igb_set_intr_mask(struct igb_softc *sc)
4089 {
4090         int i;
4091
4092         sc->intr_mask = sc->sts_intr_mask;
4093         for (i = 0; i < sc->rx_ring_inuse; ++i)
4094                 sc->intr_mask |= sc->rx_rings[i].rx_intr_mask;
4095         for (i = 0; i < sc->tx_ring_inuse; ++i)
4096                 sc->intr_mask |= sc->tx_rings[i].tx_intr_mask;
4097         if (bootverbose) {
4098                 if_printf(&sc->arpcom.ac_if, "intr mask 0x%08x\n",
4099                     sc->intr_mask);
4100         }
4101 }
4102
4103 static int
4104 igb_alloc_intr(struct igb_softc *sc)
4105 {
4106         struct igb_intr_data *intr;
4107         int i, intr_vec, intr_vecmax;
4108         u_int intr_flags;
4109
4110         igb_alloc_msix(sc);
4111         if (sc->intr_type == PCI_INTR_TYPE_MSIX)
4112                 goto done;
4113
4114         if (sc->intr_data != NULL)
4115                 kfree(sc->intr_data, M_DEVBUF);
4116
4117         sc->intr_cnt = 1;
4118         sc->intr_data = kmalloc(sizeof(struct igb_intr_data), M_DEVBUF,
4119             M_WAITOK | M_ZERO);
4120         intr = &sc->intr_data[0];
4121
4122         /*
4123          * Allocate MSI/legacy interrupt resource
4124          */
4125         sc->intr_type = pci_alloc_1intr(sc->dev, igb_msi_enable,
4126             &intr->intr_rid, &intr_flags);
4127
4128         if (sc->intr_type == PCI_INTR_TYPE_LEGACY) {
4129                 int unshared;
4130
4131                 unshared = device_getenv_int(sc->dev, "irq.unshared", 0);
4132                 if (!unshared) {
4133                         sc->flags |= IGB_FLAG_SHARED_INTR;
4134                         if (bootverbose)
4135                                 device_printf(sc->dev, "IRQ shared\n");
4136                 } else {
4137                         intr_flags &= ~RF_SHAREABLE;
4138                         if (bootverbose)
4139                                 device_printf(sc->dev, "IRQ unshared\n");
4140                 }
4141         }
4142
4143         intr->intr_res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ,
4144             &intr->intr_rid, intr_flags);
4145         if (intr->intr_res == NULL) {
4146                 device_printf(sc->dev, "Unable to allocate bus resource: "
4147                     "interrupt\n");
4148                 return ENXIO;
4149         }
4150
4151         intr->intr_serialize = &sc->main_serialize;
4152         intr->intr_cpuid = rman_get_cpuid(intr->intr_res);
4153         intr->intr_func = (sc->flags & IGB_FLAG_SHARED_INTR) ?
4154             igb_intr_shared : igb_intr;
4155         intr->intr_funcarg = sc;
4156         intr->intr_rate = IGB_INTR_RATE;
4157         intr->intr_use = IGB_INTR_USE_RXTX;
4158
4159         for (i = 0; i < sc->tx_ring_cnt; ++i)
4160                 sc->tx_rings[i].tx_intr_cpuid = intr->intr_cpuid;
4161
4162         /*
4163          * Setup MSI/legacy interrupt mask
4164          */
4165         switch (sc->hw.mac.type) {
4166         case e1000_82575:
4167                 intr_vecmax = IGB_MAX_TXRXINT_82575;
4168                 break;
4169
4170         case e1000_82576:
4171                 intr_vecmax = IGB_MAX_TXRXINT_82576;
4172                 break;
4173
4174         case e1000_82580:
4175                 intr_vecmax = IGB_MAX_TXRXINT_82580;
4176                 break;
4177
4178         case e1000_i350:
4179                 intr_vecmax = IGB_MAX_TXRXINT_I350;
4180                 break;
4181
4182         case e1000_i354:
4183                 intr_vecmax = IGB_MAX_TXRXINT_I354;
4184                 break;
4185
4186         case e1000_i210:
4187                 intr_vecmax = IGB_MAX_TXRXINT_I210;
4188                 break;
4189
4190         case e1000_i211:
4191                 intr_vecmax = IGB_MAX_TXRXINT_I211;
4192                 break;
4193
4194         default:
4195                 intr_vecmax = IGB_MIN_TXRXINT;
4196                 break;
4197         }
4198         intr_vec = 0;
4199         for (i = 0; i < sc->tx_ring_cnt; ++i)
4200                 igb_set_txintr_mask(&sc->tx_rings[i], &intr_vec, intr_vecmax);
4201         for (i = 0; i < sc->rx_ring_cnt; ++i)
4202                 igb_set_rxintr_mask(&sc->rx_rings[i], &intr_vec, intr_vecmax);
4203         sc->sts_intr_mask = E1000_EICR_OTHER;
4204 done:
4205         igb_set_ring_inuse(sc, FALSE);
4206         igb_set_intr_mask(sc);
4207         return 0;
4208 }
4209
4210 static void
4211 igb_free_intr(struct igb_softc *sc)
4212 {
4213         if (sc->intr_data == NULL)
4214                 return;
4215
4216         if (sc->intr_type != PCI_INTR_TYPE_MSIX) {
4217                 struct igb_intr_data *intr = &sc->intr_data[0];
4218
4219                 KKASSERT(sc->intr_cnt == 1);
4220                 if (intr->intr_res != NULL) {
4221                         bus_release_resource(sc->dev, SYS_RES_IRQ,
4222                             intr->intr_rid, intr->intr_res);
4223                 }
4224                 if (sc->intr_type == PCI_INTR_TYPE_MSI)
4225                         pci_release_msi(sc->dev);
4226
4227                 kfree(sc->intr_data, M_DEVBUF);
4228         } else {
4229                 igb_free_msix(sc, TRUE);
4230         }
4231 }
4232
4233 static void
4234 igb_teardown_intr(struct igb_softc *sc, int intr_cnt)
4235 {
4236         int i;
4237
4238         if (sc->intr_data == NULL)
4239                 return;
4240
4241         for (i = 0; i < intr_cnt; ++i) {
4242                 struct igb_intr_data *intr = &sc->intr_data[i];
4243
4244                 bus_teardown_intr(sc->dev, intr->intr_res, intr->intr_hand);
4245         }
4246 }
4247
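/*
 * Try to allocate MSI-X vectors: one per RX/TX ring (or one per RX/TX
 * ring pair when vectors are scarce) plus one for link status, spread
 * across CPUs.  On failure nothing is committed, so the caller falls
 * back to MSI/legacy allocation.
 */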
4248 static void
4249 igb_alloc_msix(struct igb_softc *sc)
4250 {
4251         int msix_enable, msix_cnt, msix_cnt2, alloc_cnt;
4252         int i, x, error;
4253         int offset, offset_def, agg_rxtx, ring_max;
4254         struct igb_intr_data *intr;
4255         boolean_t aggregate, setup = FALSE;
4256
4257         /*
4258          * Don't enable MSI-X on 82575, see:
4259          * 82575 specification update errata #25
4260          */
4261         if (sc->hw.mac.type == e1000_82575)
4262                 return;
4263
4264         /* Don't enable MSI-X on VF */
4265         if (sc->vf_ifp)
4266                 return;
4267
4268         msix_enable = device_getenv_int(sc->dev, "msix.enable",
4269             igb_msix_enable);
4270         if (!msix_enable)
4271                 return;
4272
4273         msix_cnt = pci_msix_count(sc->dev);
4274 #ifdef IGB_MSIX_DEBUG
4275         msix_cnt = device_getenv_int(sc->dev, "msix.count", msix_cnt);
4276 #endif
4277         if (msix_cnt <= 1) {
4278                 /* One MSI-X model does not make sense */
4279                 return;
4280         }
4281
4282         i = 0;
4283         while ((1 << (i + 1)) <= msix_cnt)
4284                 ++i;
4285         msix_cnt2 = 1 << i;
4286
4287         if (bootverbose) {
4288                 device_printf(sc->dev, "MSI-X count %d/%d\n",
4289                     msix_cnt2, msix_cnt);
4290         }
4291
4292         KKASSERT(msix_cnt2 <= msix_cnt);
4293         if (msix_cnt == msix_cnt2) {
4294                 /* We need at least one MSI-X for link status */
4295                 msix_cnt2 >>= 1;
4296                 if (msix_cnt2 <= 1) {
4297                         /* One MSI-X for RX/TX does not make sense */
4298                         device_printf(sc->dev, "not enough MSI-X for TX/RX, "
4299                             "MSI-X count %d/%d\n", msix_cnt2, msix_cnt);
4300                         return;
4301                 }
4302                 KKASSERT(msix_cnt > msix_cnt2);
4303
4304                 if (bootverbose) {
4305                         device_printf(sc->dev, "MSI-X count fixup %d/%d\n",
4306                             msix_cnt2, msix_cnt);
4307                 }
4308         }
4309
4310         sc->rx_ring_msix = sc->rx_ring_cnt;
4311         if (sc->rx_ring_msix > msix_cnt2)
4312                 sc->rx_ring_msix = msix_cnt2;
4313
4314         sc->tx_ring_msix = sc->tx_ring_cnt;
4315         if (sc->tx_ring_msix > msix_cnt2)
4316                 sc->tx_ring_msix = msix_cnt2;
4317
4318         ring_max = sc->rx_ring_msix;
4319         if (ring_max < sc->tx_ring_msix)
4320                 ring_max = sc->tx_ring_msix;
4321
4322         /* Allow user to force independent RX/TX MSI-X handling */
4323         agg_rxtx = device_getenv_int(sc->dev, "msix.agg_rxtx",
4324             igb_msix_agg_rxtx);
4325
4326         if (!agg_rxtx && msix_cnt >= sc->tx_ring_msix + sc->rx_ring_msix + 1) {
4327                 /*
4328                  * Independent TX/RX MSI-X
4329                  */
4330                 aggregate = FALSE;
4331                 if (bootverbose)
4332                         device_printf(sc->dev, "independent TX/RX MSI-X\n");
4333                 alloc_cnt = sc->tx_ring_msix + sc->rx_ring_msix;
4334         } else {
4335                 /*
4336                  * Aggregate TX/RX MSI-X
4337                  */
4338                 aggregate = TRUE;
4339                 if (bootverbose)
4340                         device_printf(sc->dev, "aggregate TX/RX MSI-X\n");
4341                 alloc_cnt = msix_cnt2;
4342                 if (alloc_cnt > ring_max)
4343                         alloc_cnt = ring_max;
4344                 KKASSERT(alloc_cnt >= sc->rx_ring_msix &&
4345                     alloc_cnt >= sc->tx_ring_msix);
4346         }
4347         ++alloc_cnt;    /* For link status */
4348
4349         if (bootverbose) {
4350                 device_printf(sc->dev, "MSI-X alloc %d, "
4351                     "RX ring %d, TX ring %d\n", alloc_cnt,
4352                     sc->rx_ring_msix, sc->tx_ring_msix);
4353         }
4354
4355         sc->msix_mem_rid = PCIR_BAR(IGB_MSIX_BAR);
4356         sc->msix_mem_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4357             &sc->msix_mem_rid, RF_ACTIVE);
4358         if (sc->msix_mem_res == NULL) {
4359                 sc->msix_mem_rid = PCIR_BAR(IGB_MSIX_BAR_ALT);
4360                 sc->msix_mem_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4361                     &sc->msix_mem_rid, RF_ACTIVE);
4362                 if (sc->msix_mem_res == NULL) {
4363                         device_printf(sc->dev, "Unable to map MSI-X table\n");
4364                         return;
4365                 }
4366         }
4367
4368         sc->intr_cnt = alloc_cnt;
4369         sc->intr_data = kmalloc(sizeof(struct igb_intr_data) * sc->intr_cnt,
4370             M_DEVBUF, M_WAITOK | M_ZERO);
4371         for (x = 0; x < sc->intr_cnt; ++x) {
4372                 intr = &sc->intr_data[x];
4373                 intr->intr_rid = -1;
4374                 intr->intr_rate = IGB_INTR_RATE;
4375         }
4376
4377         x = 0;
4378         if (!aggregate) {
4379                 /*
4380                  * RX rings
4381                  */
4382                 if (sc->rx_ring_msix == ncpus2) {
4383                         offset = 0;
4384                 } else {
4385                         offset_def = (sc->rx_ring_msix *
4386                             device_get_unit(sc->dev)) % ncpus2;
4387
4388                         offset = device_getenv_int(sc->dev,
4389                             "msix.rxoff", offset_def);
4390                         if (offset >= ncpus2 ||
4391                             offset % sc->rx_ring_msix != 0) {
4392                                 device_printf(sc->dev,
4393                                     "invalid msix.rxoff %d, use %d\n",
4394                                     offset, offset_def);
4395                                 offset = offset_def;
4396                         }
4397                 }
4398                 igb_msix_rx_conf(sc, 0, &x, offset);
4399
4400                 /*
4401                  * TX rings
4402                  */
4403                 if (sc->tx_ring_msix == ncpus2) {
4404                         offset = 0;
4405                 } else {
4406                         offset_def = (sc->tx_ring_msix *
4407                             device_get_unit(sc->dev)) % ncpus2;
4408
4409                         offset = device_getenv_int(sc->dev,
4410                             "msix.txoff", offset_def);
4411                         if (offset >= ncpus2 ||
4412                             offset % sc->tx_ring_msix != 0) {
4413                                 device_printf(sc->dev,
4414                                     "invalid msix.txoff %d, use %d\n",
4415                                     offset, offset_def);
4416                                 offset = offset_def;
4417                         }
4418                 }
4419                 igb_msix_tx_conf(sc, 0, &x, offset);
4420         } else {
4421                 int ring_agg;
4422
4423                 ring_agg = sc->rx_ring_msix;
4424                 if (ring_agg > sc->tx_ring_msix)
4425                         ring_agg = sc->tx_ring_msix;
4426
4427                 if (ring_max == ncpus2) {
4428                         offset = 0;
4429                 } else {
4430                         offset_def = (ring_max * device_get_unit(sc->dev)) %
4431                             ncpus2;
4432
4433                         offset = device_getenv_int(sc->dev, "msix.off",
4434                             offset_def);
4435                         if (offset >= ncpus2 || offset % ring_max != 0) {
4436                                 device_printf(sc->dev,
4437                                     "invalid msix.off %d, use %d\n",
4438                                     offset, offset_def);
4439                                 offset = offset_def;
4440                         }
4441                 }
4442
4443                 for (i = 0; i < ring_agg; ++i) {
4444                         struct igb_tx_ring *txr = &sc->tx_rings[i];
4445                         struct igb_rx_ring *rxr = &sc->rx_rings[i];
4446
4447                         KKASSERT(x < sc->intr_cnt);
4448                         rxr->rx_intr_vec = x;
4449                         rxr->rx_intr_mask = 1 << rxr->rx_intr_vec;
4450                         rxr->rx_txr = txr;
4451                         txr->tx_intr_vec = rxr->rx_intr_vec;
4452                         txr->tx_intr_mask = rxr->rx_intr_mask;
4453
4454                         intr = &sc->intr_data[x++];
4455
4456                         intr->intr_serialize = &rxr->rx_serialize;
4457                         intr->intr_func = igb_msix_rxtx;
4458                         intr->intr_funcarg = rxr;
4459                         intr->intr_use = IGB_INTR_USE_RXTX;
4460
4461                         intr->intr_cpuid = i + offset;
4462                         KKASSERT(intr->intr_cpuid < ncpus2);
4463                         txr->tx_intr_cpuid = intr->intr_cpuid;
4464
4465                         ksnprintf(intr->intr_desc0, sizeof(intr->intr_desc0),
4466                             "%s rxtx%d", device_get_nameunit(sc->dev), i);
4467                         intr->intr_desc = intr->intr_desc0;
4468                 }
4469
4470                 if (ring_agg != ring_max) {
4471                         if (ring_max == sc->tx_ring_msix)
4472                                 igb_msix_tx_conf(sc, i, &x, offset);
4473                         else
4474                                 igb_msix_rx_conf(sc, i, &x, offset);
4475                 }
4476         }
4477
4478         /*
4479          * Link status
4480          */
4481         KKASSERT(x < sc->intr_cnt);
4482         sc->sts_msix_vec = x;
4483         sc->sts_intr_mask = 1 << sc->sts_msix_vec;
4484
4485         intr = &sc->intr_data[x++];
4486
4487         intr->intr_serialize = &sc->main_serialize;
4488         intr->intr_func = igb_msix_status;
4489         intr->intr_funcarg = sc;
4490         intr->intr_cpuid = 0;
4491         intr->intr_use = IGB_INTR_USE_STATUS;
4492
4493         ksnprintf(intr->intr_desc0, sizeof(intr->intr_desc0), "%s sts",
4494             device_get_nameunit(sc->dev));
4495         intr->intr_desc = intr->intr_desc0;
4496
4497         KKASSERT(x == sc->intr_cnt);
4498
4499         error = pci_setup_msix(sc->dev);
4500         if (error) {
4501                 device_printf(sc->dev, "Setup MSI-X failed\n");
4502                 goto back;
4503         }
4504         setup = TRUE;
4505
4506         for (i = 0; i < sc->intr_cnt; ++i) {
4507                 intr = &sc->intr_data[i];
4508
4509                 error = pci_alloc_msix_vector(sc->dev, i, &intr->intr_rid,
4510                     intr->intr_cpuid);
4511                 if (error) {
4512                         device_printf(sc->dev,
4513                             "Unable to allocate MSI-X %d on cpu%d\n", i,
4514                             intr->intr_cpuid);
4515                         goto back;
4516                 }
4517
4518                 intr->intr_res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ,
4519                     &intr->intr_rid, RF_ACTIVE);
4520                 if (intr->intr_res == NULL) {
4521                         device_printf(sc->dev,
4522                             "Unable to allocate MSI-X %d resource\n", i);
4523                         error = ENOMEM;
4524                         goto back;
4525                 }
4526         }
4527
4528         pci_enable_msix(sc->dev);
4529         sc->intr_type = PCI_INTR_TYPE_MSIX;
4530 back:
4531         if (error)
4532                 igb_free_msix(sc, setup);
4533 }
4534
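     /*
      * Release everything igb_alloc_msix() set up: the per-vector IRQ
      * resources and MSI-X vectors, the MSI-X infrastructure itself if
      * "setup" indicates pci_setup_msix() succeeded, and the interrupt
      * data array.
      */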
4535 static void
4536 igb_free_msix(struct igb_softc *sc, boolean_t setup)
4537 {
4538         int i;
4539
4540         KKASSERT(sc->intr_cnt > 1);
4541
4542         for (i = 0; i < sc->intr_cnt; ++i) {
4543                 struct igb_intr_data *intr = &sc->intr_data[i];
4544
4545                 if (intr->intr_res != NULL) {
4546                         bus_release_resource(sc->dev, SYS_RES_IRQ,
4547                             intr->intr_rid, intr->intr_res);
4548                 }
4549                 if (intr->intr_rid >= 0)
4550                         pci_release_msix_vector(sc->dev, intr->intr_rid);
4551         }
4552         if (setup)
4553                 pci_teardown_msix(sc->dev);
4554
4555         sc->intr_cnt = 0;
4556         kfree(sc->intr_data, M_DEVBUF);
4557         sc->intr_data = NULL;
4558 }
4559
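     /*
      * MSI-X handler for an RX-only vector: drain the RX ring and
      * re-enable this vector's interrupt through EIMS.
      */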
4560 static void
4561 igb_msix_rx(void *arg)
4562 {
4563         struct igb_rx_ring *rxr = arg;
4564
4565         ASSERT_SERIALIZED(&rxr->rx_serialize);
4566         igb_rxeof(rxr, -1);
4567
4568         E1000_WRITE_REG(&rxr->sc->hw, E1000_EIMS, rxr->rx_intr_mask);
4569 }
4570
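     /*
      * MSI-X handler for a TX-only vector: reclaim descriptors up to
      * the TX head write-back index, restart the sending queue if it
      * has work queued, and re-enable the vector through EIMS.
      */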
4571 static void
4572 igb_msix_tx(void *arg)
4573 {
4574         struct igb_tx_ring *txr = arg;
4575
4576         ASSERT_SERIALIZED(&txr->tx_serialize);
4577
4578         igb_txeof(txr, *(txr->tx_hdr));
4579         if (!ifsq_is_empty(txr->ifsq))
4580                 ifsq_devstart(txr->ifsq);
4581
4582         E1000_WRITE_REG(&txr->sc->hw, E1000_EIMS, txr->tx_intr_mask);
4583 }
4584
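     /*
      * MSI-X handler for the link status vector: refresh the link
      * state on a link status change interrupt, then re-enable the
      * vector through EIMS.
      */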
4585 static void
4586 igb_msix_status(void *arg)
4587 {
4588         struct igb_softc *sc = arg;
4589         uint32_t icr;
4590
4591         ASSERT_SERIALIZED(&sc->main_serialize);
4592
4593         icr = E1000_READ_REG(&sc->hw, E1000_ICR);
4594         if (icr & E1000_ICR_LSC) {
4595                 sc->hw.mac.get_link_status = 1;
4596                 igb_update_link_status(sc);
4597         }
4598
4599         E1000_WRITE_REG(&sc->hw, E1000_EIMS, sc->sts_intr_mask);
4600 }
4601
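     /*
      * Record how many RX and TX rings are actually used for the
      * current interrupt or polling mode.
      */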
4602 static void
4603 igb_set_ring_inuse(struct igb_softc *sc, boolean_t polling)
4604 {
4605         sc->rx_ring_inuse = igb_get_rxring_inuse(sc, polling);
4606         sc->tx_ring_inuse = igb_get_txring_inuse(sc, polling);
4607         if (bootverbose) {
4608                 if_printf(&sc->arpcom.ac_if, "RX rings %d/%d, TX rings %d/%d\n",
4609                     sc->rx_ring_inuse, sc->rx_ring_cnt,
4610                     sc->tx_ring_inuse, sc->tx_ring_cnt);
4611         }
4612 }
4613
4614 static int
4615 igb_get_rxring_inuse(const struct igb_softc *sc, boolean_t polling)
4616 {
4617         if (!IGB_ENABLE_HWRSS(sc))
4618                 return 1;
4619
4620         if (polling)
4621                 return sc->rx_ring_cnt;
4622         else if (sc->intr_type != PCI_INTR_TYPE_MSIX)
4623                 return IGB_MIN_RING_RSS;
4624         else
4625                 return sc->rx_ring_msix;
4626 }
4627
4628 static int
4629 igb_get_txring_inuse(const struct igb_softc *sc, boolean_t polling)
4630 {
4631         if (!IGB_ENABLE_HWTSS(sc))
4632                 return 1;
4633
4634         if (polling)
4635                 return sc->tx_ring_cnt;
4636         else if (sc->intr_type != PCI_INTR_TYPE_MSIX)
4637                 return IGB_MIN_RING;
4638         else
4639                 return sc->tx_ring_msix;
4640 }
4641
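     /*
      * Make sure the Ethernet, IP and TCP headers of a TSO frame are
      * contiguous in the first mbuf, pulling them up if necessary.
      * When IGB_TXFLAG_TSO_IPLEN0 is set, ip_len is zeroed as well.
      */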
4642 static int
4643 igb_tso_pullup(struct igb_tx_ring *txr, struct mbuf **mp)
4644 {
4645         int hoff, iphlen, thoff;
4646         struct mbuf *m;
4647
4648         m = *mp;
4649         KASSERT(M_WRITABLE(m), ("TSO mbuf not writable"));
4650
4651         iphlen = m->m_pkthdr.csum_iphlen;
4652         thoff = m->m_pkthdr.csum_thlen;
4653         hoff = m->m_pkthdr.csum_lhlen;
4654
4655         KASSERT(iphlen > 0, ("invalid ip hlen"));
4656         KASSERT(thoff > 0, ("invalid tcp hlen"));
4657         KASSERT(hoff > 0, ("invalid ether hlen"));
4658
4659         if (__predict_false(m->m_len < hoff + iphlen + thoff)) {
4660                 m = m_pullup(m, hoff + iphlen + thoff);
4661                 if (m == NULL) {
4662                         *mp = NULL;
4663                         return ENOBUFS;
4664                 }
4665                 *mp = m;
4666         }
4667         if (txr->tx_flags & IGB_TXFLAG_TSO_IPLEN0) {
4668                 struct ip *ip;
4669
4670                 ip = mtodoff(m, struct ip *, hoff);
4671                 ip->ip_len = 0;
4672         }
4673
4674         return 0;
4675 }
4676
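     /*
      * Fill in an advanced TX context descriptor with the header
      * lengths, MSS and VLAN tag for a TSO packet, and return the
      * total (L2 + L3 + L4) header length through *hlen.
      */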
4677 static void
4678 igb_tso_ctx(struct igb_tx_ring *txr, struct mbuf *m, uint32_t *hlen)
4679 {
4680         struct e1000_adv_tx_context_desc *TXD;
4681         uint32_t vlan_macip_lens, type_tucmd_mlhl, mss_l4len_idx;
4682         int hoff, ctxd, iphlen, thoff;
4683
4684         iphlen = m->m_pkthdr.csum_iphlen;
4685         thoff = m->m_pkthdr.csum_thlen;
4686         hoff = m->m_pkthdr.csum_lhlen;
4687
4688         vlan_macip_lens = type_tucmd_mlhl = mss_l4len_idx = 0;
4689
4690         ctxd = txr->next_avail_desc;
4691         TXD = (struct e1000_adv_tx_context_desc *)&txr->tx_base[ctxd];
4692
4693         if (m->m_flags & M_VLANTAG) {
4694                 uint16_t vlantag;
4695
4696                 vlantag = htole16(m->m_pkthdr.ether_vlantag);
4697                 vlan_macip_lens |= (vlantag << E1000_ADVTXD_VLAN_SHIFT);
4698         }
4699
4700         vlan_macip_lens |= (hoff << E1000_ADVTXD_MACLEN_SHIFT);
4701         vlan_macip_lens |= iphlen;
4702
4703         type_tucmd_mlhl |= E1000_ADVTXD_DCMD_DEXT | E1000_ADVTXD_DTYP_CTXT;
4704         type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_TCP;
4705         type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_IPV4;
4706
4707         mss_l4len_idx |= (m->m_pkthdr.tso_segsz << E1000_ADVTXD_MSS_SHIFT);
4708         mss_l4len_idx |= (thoff << E1000_ADVTXD_L4LEN_SHIFT);
4709
4710         /*
4711          * 82575 needs the TX context index added; the queue
4712          * index is used as TX context index here.
4713          */
4714         if (txr->sc->hw.mac.type == e1000_82575)
4715                 mss_l4len_idx |= txr->me << 4;
4716
4717         TXD->vlan_macip_lens = htole32(vlan_macip_lens);
4718         TXD->type_tucmd_mlhl = htole32(type_tucmd_mlhl);
4719         TXD->seqnum_seed = htole32(0);
4720         TXD->mss_l4len_idx = htole32(mss_l4len_idx);
4721
4722         /* We've consumed the first desc, adjust counters */
4723         if (++ctxd == txr->num_tx_desc)
4724                 ctxd = 0;
4725         txr->next_avail_desc = ctxd;
4726         --txr->tx_avail;
4727
4728         *hlen = hoff + iphlen + thoff;
4729 }
4730
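     /*
      * Collect the main serializer and all RX/TX ring serializers
      * into one array; the order set up here is relied upon, so it
      * must not be changed.
      */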
4731 static void
4732 igb_setup_serialize(struct igb_softc *sc)
4733 {
4734         int i = 0, j;
4735
4736         /* Main + RX + TX */
4737         sc->serialize_cnt = 1 + sc->rx_ring_cnt + sc->tx_ring_cnt;
4738         sc->serializes =
4739             kmalloc(sc->serialize_cnt * sizeof(struct lwkt_serialize *),
4740                 M_DEVBUF, M_WAITOK | M_ZERO);
4741
4742         /*
4743          * Setup serializes
4744          *
4745          * NOTE: Order is critical
4746          */
4747
4748         KKASSERT(i < sc->serialize_cnt);
4749         sc->serializes[i++] = &sc->main_serialize;
4750
4751         for (j = 0; j < sc->rx_ring_cnt; ++j) {
4752                 KKASSERT(i < sc->serialize_cnt);
4753                 sc->serializes[i++] = &sc->rx_rings[j].rx_serialize;
4754         }
4755
4756         for (j = 0; j < sc->tx_ring_cnt; ++j) {
4757                 KKASSERT(i < sc->serialize_cnt);
4758                 sc->serializes[i++] = &sc->tx_rings[j].tx_serialize;
4759         }
4760
4761         KKASSERT(i == sc->serialize_cnt);
4762 }
4763
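     /*
      * Assign an MSI-X vector to each RX ring, starting at ring i and
      * vector *x0, and bind each vector to CPU (ring index + offset).
      */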
4764 static void
4765 igb_msix_rx_conf(struct igb_softc *sc, int i, int *x0, int offset)
4766 {
4767         int x = *x0;
4768
4769         for (; i < sc->rx_ring_msix; ++i) {
4770                 struct igb_rx_ring *rxr = &sc->rx_rings[i];
4771                 struct igb_intr_data *intr;
4772
4773                 KKASSERT(x < sc->intr_cnt);
4774                 rxr->rx_intr_vec = x;
4775                 rxr->rx_intr_mask = 1 << rxr->rx_intr_vec;
4776
4777                 intr = &sc->intr_data[x++];
4778
4779                 intr->intr_serialize = &rxr->rx_serialize;
4780                 intr->intr_func = igb_msix_rx;
4781                 intr->intr_funcarg = rxr;
4782                 intr->intr_rate = IGB_MSIX_RX_RATE;
4783                 intr->intr_use = IGB_INTR_USE_RX;
4784
4785                 intr->intr_cpuid = i + offset;
4786                 KKASSERT(intr->intr_cpuid < ncpus2);
4787
4788                 ksnprintf(intr->intr_desc0, sizeof(intr->intr_desc0), "%s rx%d",
4789                     device_get_nameunit(sc->dev), i);
4790                 intr->intr_desc = intr->intr_desc0;
4791         }
4792         *x0 = x;
4793 }
4794
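     /*
      * Assign an MSI-X vector to each TX ring, starting at ring i and
      * vector *x0, and bind each vector to CPU (ring index + offset).
      */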
4795 static void
4796 igb_msix_tx_conf(struct igb_softc *sc, int i, int *x0, int offset)
4797 {
4798         int x = *x0;
4799
4800         for (; i < sc->tx_ring_msix; ++i) {
4801                 struct igb_tx_ring *txr = &sc->tx_rings[i];
4802                 struct igb_intr_data *intr;
4803
4804                 KKASSERT(x < sc->intr_cnt);
4805                 txr->tx_intr_vec = x;
4806                 txr->tx_intr_mask = 1 << txr->tx_intr_vec;
4807
4808                 intr = &sc->intr_data[x++];
4809
4810                 intr->intr_serialize = &txr->tx_serialize;
4811                 intr->intr_func = igb_msix_tx;
4812                 intr->intr_funcarg = txr;
4813                 intr->intr_rate = IGB_MSIX_TX_RATE;
4814                 intr->intr_use = IGB_INTR_USE_TX;
4815
4816                 intr->intr_cpuid = i + offset;
4817                 KKASSERT(intr->intr_cpuid < ncpus2);
4818                 txr->tx_intr_cpuid = intr->intr_cpuid;
4819
4820                 ksnprintf(intr->intr_desc0, sizeof(intr->intr_desc0), "%s tx%d",
4821                     device_get_nameunit(sc->dev), i);
4822                 intr->intr_desc = intr->intr_desc0;
4823         }
4824         *x0 = x;
4825 }
4826
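     /*
      * MSI-X handler for an aggregated RX/TX vector: always service
      * the RX ring, and service the paired TX ring only if its head
      * write-back index shows completed descriptors.
      */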
4827 static void
4828 igb_msix_rxtx(void *arg)
4829 {
4830         struct igb_rx_ring *rxr = arg;
4831         struct igb_tx_ring *txr;
4832         int hdr;
4833
4834         ASSERT_SERIALIZED(&rxr->rx_serialize);
4835
4836         igb_rxeof(rxr, -1);
4837
4838         /*
4839          * NOTE:
4840          * Since next_to_clean is only changed by igb_txeof(),
4841          * which is only called from interrupt handlers, checking
4842          * it without holding the TX serializer is MPSAFE.
4843          */
4844         txr = rxr->rx_txr;
4845         hdr = *(txr->tx_hdr);
4846         if (hdr != txr->next_to_clean) {
4847                 lwkt_serialize_enter(&txr->tx_serialize);
4848                 igb_txeof(txr, hdr);
4849                 if (!ifsq_is_empty(txr->ifsq))
4850                         ifsq_devstart(txr->ifsq);
4851                 lwkt_serialize_exit(&txr->tx_serialize);
4852         }
4853
4854         E1000_WRITE_REG(&rxr->sc->hw, E1000_EIMS, rxr->rx_intr_mask);
4855 }
4856
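     /*
      * Pick the CPU on which the driver's periodic timer runs.
      */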
4857 static void
4858 igb_set_timer_cpuid(struct igb_softc *sc, boolean_t polling)
4859 {
4860         if (polling || sc->intr_type == PCI_INTR_TYPE_MSIX)
4861                 sc->timer_cpuid = 0; /* XXX fixed */
4862         else
4863                 sc->timer_cpuid = rman_get_cpuid(sc->intr_data[0].intr_res);
4864 }
4865
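     /*
      * Configure DMA coalescing according to the dma_coalesce setting
      * on parts that support it (not i211); on 82580 only the PCIe
      * low power decision bit is cleared and DMAC is left disabled.
      */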
4866 static void
4867 igb_init_dmac(struct igb_softc *sc, uint32_t pba)
4868 {
4869         struct e1000_hw *hw = &sc->hw;
4870         uint32_t reg;
4871
4872         if (hw->mac.type == e1000_i211)
4873                 return;
4874
4875         if (hw->mac.type > e1000_82580) {
4876                 uint32_t dmac;
4877                 uint16_t hwm;
4878
4879                 if (sc->dma_coalesce == 0) { /* Disabling it */
4880                         reg = ~E1000_DMACR_DMAC_EN;
4881                         E1000_WRITE_REG(hw, E1000_DMACR, reg);
4882                         return;
4883                 } else {
4884                         if_printf(&sc->arpcom.ac_if,
4885                             "DMA Coalescing enabled\n");
4886                 }
4887
4888                 /* Set starting threshold */
4889                 E1000_WRITE_REG(hw, E1000_DMCTXTH, 0);
4890
4891                 hwm = 64 * pba - sc->max_frame_size / 16;
4892                 if (hwm < 64 * (pba - 6))
4893                         hwm = 64 * (pba - 6);
4894                 reg = E1000_READ_REG(hw, E1000_FCRTC);
4895                 reg &= ~E1000_FCRTC_RTH_COAL_MASK;
4896                 reg |= ((hwm << E1000_FCRTC_RTH_COAL_SHIFT)
4897                     & E1000_FCRTC_RTH_COAL_MASK);
4898                 E1000_WRITE_REG(hw, E1000_FCRTC, reg);
4899
4900                 dmac = pba - sc->max_frame_size / 512;
4901                 if (dmac < pba - 10)
4902                         dmac = pba - 10;
4903                 reg = E1000_READ_REG(hw, E1000_DMACR);
4904                 reg &= ~E1000_DMACR_DMACTHR_MASK;
4905                 reg |= ((dmac << E1000_DMACR_DMACTHR_SHIFT)
4906                     & E1000_DMACR_DMACTHR_MASK);
4907
4908                 /* Transition to L0s or L1 (Lx) if available. */
4909                 reg |= (E1000_DMACR_DMAC_EN | E1000_DMACR_DMAC_LX_MASK);
4910
4911                 /*
4912                  * Check whether this is a 2.5Gb backplane connection
4913                  * before configuring the watchdog timer: the timer is
4914                  * programmed in 12.8usec units on a 2.5Gb connection
4915                  * and in 32usec units otherwise, so dma_coalesce is
4916                  * scaled accordingly below.
4917                  */
4918                 if (hw->mac.type == e1000_i354) {
4919                         int status = E1000_READ_REG(hw, E1000_STATUS);
4920
4921                         if ((status & E1000_STATUS_2P5_SKU) &&
4922                             !(status & E1000_STATUS_2P5_SKU_OVER))
4923                                 reg |= ((sc->dma_coalesce * 5) >> 6);
4924                         else
4925                                 reg |= (sc->dma_coalesce >> 5);
4926                 } else {
4927                         reg |= (sc->dma_coalesce >> 5);
4928                 }
4929
4930                 E1000_WRITE_REG(hw, E1000_DMACR, reg);
4931
4932                 E1000_WRITE_REG(hw, E1000_DMCRTRH, 0);
4933
4934                 /* Set the interval before transition */
4935                 reg = E1000_READ_REG(hw, E1000_DMCTLX);
4936                 if (hw->mac.type == e1000_i350)
4937                         reg |= IGB_DMCTLX_DCFLUSH_DIS;
4938                 /*
4939                  * On a 2.5Gb connection the TTLX unit is 0.4 usec, so the
4940                  * same 4 usec delay needs a value of 0xA (4 / 0.4 = 10).
4941                  */
4942                 if (hw->mac.type == e1000_i354) {
4943                         int status = E1000_READ_REG(hw, E1000_STATUS);
4944
4945                         if ((status & E1000_STATUS_2P5_SKU) &&
4946                             !(status & E1000_STATUS_2P5_SKU_OVER))
4947                                 reg |= 0xA;
4948                         else
4949                                 reg |= 0x4;
4950                 } else {
4951                         reg |= 0x4;
4952                 }
4953                 E1000_WRITE_REG(hw, E1000_DMCTLX, reg);
4954
4955                 /* Set the free space in the TX packet buffer needed to wake from DMA coalescing */
4956                 E1000_WRITE_REG(hw, E1000_DMCTXTH,
4957                     (IGB_TXPBSIZE - (2 * sc->max_frame_size)) >> 6);
4958
4959                 /* Let DMA coalescing control the low power state decision */
4960                 reg = E1000_READ_REG(hw, E1000_PCIEMISC);
4961                 reg &= ~E1000_PCIEMISC_LX_DECISION;
4962                 E1000_WRITE_REG(hw, E1000_PCIEMISC, reg);
4963         } else if (hw->mac.type == e1000_82580) {
4964                 reg = E1000_READ_REG(hw, E1000_PCIEMISC);
4965                 E1000_WRITE_REG(hw, E1000_PCIEMISC,
4966                     reg & ~E1000_PCIEMISC_LX_DECISION);
4967                 E1000_WRITE_REG(hw, E1000_DMACR, 0);
4968         }
4969 }