1 /*
2  * Copyright (c) 2001-2011, Intel Corporation 
3  * All rights reserved.
4  * 
5  * Redistribution and use in source and binary forms, with or without 
6  * modification, are permitted provided that the following conditions are met:
7  * 
8  *  1. Redistributions of source code must retain the above copyright notice, 
9  *     this list of conditions and the following disclaimer.
10  * 
11  *  2. Redistributions in binary form must reproduce the above copyright 
12  *     notice, this list of conditions and the following disclaimer in the 
13  *     documentation and/or other materials provided with the distribution.
14  * 
15  *  3. Neither the name of the Intel Corporation nor the names of its 
16  *     contributors may be used to endorse or promote products derived from 
17  *     this software without specific prior written permission.
18  * 
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
22  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
23  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31
32 #include "opt_ifpoll.h"
33 #include "opt_igb.h"
34
35 #include <sys/param.h>
36 #include <sys/bus.h>
37 #include <sys/endian.h>
38 #include <sys/interrupt.h>
39 #include <sys/kernel.h>
40 #include <sys/malloc.h>
41 #include <sys/mbuf.h>
42 #include <sys/proc.h>
43 #include <sys/rman.h>
44 #include <sys/serialize.h>
45 #include <sys/serialize2.h>
46 #include <sys/socket.h>
47 #include <sys/sockio.h>
48 #include <sys/sysctl.h>
49 #include <sys/systm.h>
50
51 #include <net/bpf.h>
52 #include <net/ethernet.h>
53 #include <net/if.h>
54 #include <net/if_arp.h>
55 #include <net/if_dl.h>
56 #include <net/if_media.h>
57 #include <net/ifq_var.h>
58 #include <net/toeplitz.h>
59 #include <net/toeplitz2.h>
60 #include <net/vlan/if_vlan_var.h>
61 #include <net/vlan/if_vlan_ether.h>
62 #include <net/if_poll.h>
63
64 #include <netinet/in_systm.h>
65 #include <netinet/in.h>
66 #include <netinet/ip.h>
67
68 #include <bus/pci/pcivar.h>
69 #include <bus/pci/pcireg.h>
70
71 #include <dev/netif/ig_hal/e1000_api.h>
72 #include <dev/netif/ig_hal/e1000_82575.h>
73 #include <dev/netif/igb/if_igb.h>
74
75 #define IGB_FLOWCTRL_STRLEN     16
76
77 #ifdef IGB_RSS_DEBUG
78 #define IGB_RSS_DPRINTF(sc, lvl, fmt, ...) \
79 do { \
80         if (sc->rss_debug >= lvl) \
81                 if_printf(&sc->arpcom.ac_if, fmt, __VA_ARGS__); \
82 } while (0)
83 #else   /* !IGB_RSS_DEBUG */
84 #define IGB_RSS_DPRINTF(sc, lvl, fmt, ...)      ((void)0)
85 #endif  /* IGB_RSS_DEBUG */
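/*
 * Example use of the debug macro above (a sketch; `ring' and `hash' are
 * hypothetical locals): the message is only emitted when the driver is
 * built with IGB_RSS_DEBUG and sc->rss_debug is at least the given level.
 *
 *	IGB_RSS_DPRINTF(sc, 2, "ring %d, hash 0x%08x\n", ring, hash);
 */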
86
87 #define IGB_NAME        "Intel(R) PRO/1000 "
88 #define IGB_DEVICE(id)  \
89         { IGB_VENDOR_ID, E1000_DEV_ID_##id, IGB_NAME #id }
90 #define IGB_DEVICE_NULL { 0, 0, NULL }
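/*
 * For reference, an entry such as IGB_DEVICE(82576) in the table below
 * expands to
 *	{ IGB_VENDOR_ID, E1000_DEV_ID_82576, IGB_NAME "82576" }
 * i.e. the description "Intel(R) PRO/1000 82576", since #id stringizes
 * the argument and ## pastes it onto the device-id constant.
 */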
91
92 static struct igb_device {
93         uint16_t        vid;
94         uint16_t        did;
95         const char      *desc;
96 } igb_devices[] = {
97         IGB_DEVICE(82575EB_COPPER),
98         IGB_DEVICE(82575EB_FIBER_SERDES),
99         IGB_DEVICE(82575GB_QUAD_COPPER),
100         IGB_DEVICE(82576),
101         IGB_DEVICE(82576_NS),
102         IGB_DEVICE(82576_NS_SERDES),
103         IGB_DEVICE(82576_FIBER),
104         IGB_DEVICE(82576_SERDES),
105         IGB_DEVICE(82576_SERDES_QUAD),
106         IGB_DEVICE(82576_QUAD_COPPER),
107         IGB_DEVICE(82576_QUAD_COPPER_ET2),
108         IGB_DEVICE(82576_VF),
109         IGB_DEVICE(82580_COPPER),
110         IGB_DEVICE(82580_FIBER),
111         IGB_DEVICE(82580_SERDES),
112         IGB_DEVICE(82580_SGMII),
113         IGB_DEVICE(82580_COPPER_DUAL),
114         IGB_DEVICE(82580_QUAD_FIBER),
115         IGB_DEVICE(DH89XXCC_SERDES),
116         IGB_DEVICE(DH89XXCC_SGMII),
117         IGB_DEVICE(DH89XXCC_SFP),
118         IGB_DEVICE(DH89XXCC_BACKPLANE),
119         IGB_DEVICE(I350_COPPER),
120         IGB_DEVICE(I350_FIBER),
121         IGB_DEVICE(I350_SERDES),
122         IGB_DEVICE(I350_SGMII),
123         IGB_DEVICE(I350_VF),
124         IGB_DEVICE(I210_COPPER),
125         IGB_DEVICE(I210_COPPER_IT),
126         IGB_DEVICE(I210_COPPER_OEM1),
127         IGB_DEVICE(I210_COPPER_FLASHLESS),
128         IGB_DEVICE(I210_SERDES_FLASHLESS),
129         IGB_DEVICE(I210_FIBER),
130         IGB_DEVICE(I210_SERDES),
131         IGB_DEVICE(I210_SGMII),
132         IGB_DEVICE(I211_COPPER),
133         IGB_DEVICE(I354_BACKPLANE_1GBPS),
134         IGB_DEVICE(I354_SGMII),
135
136         /* required last entry */
137         IGB_DEVICE_NULL
138 };
139
140 static int      igb_probe(device_t);
141 static int      igb_attach(device_t);
142 static int      igb_detach(device_t);
143 static int      igb_shutdown(device_t);
144 static int      igb_suspend(device_t);
145 static int      igb_resume(device_t);
146
147 static boolean_t igb_is_valid_ether_addr(const uint8_t *);
148 static void     igb_setup_ifp(struct igb_softc *);
149 static boolean_t igb_txcsum_ctx(struct igb_tx_ring *, struct mbuf *);
150 static int      igb_tso_pullup(struct igb_tx_ring *, struct mbuf **);
151 static void     igb_tso_ctx(struct igb_tx_ring *, struct mbuf *, uint32_t *);
152 static void     igb_add_sysctl(struct igb_softc *);
153 static int      igb_sysctl_intr_rate(SYSCTL_HANDLER_ARGS);
154 static int      igb_sysctl_msix_rate(SYSCTL_HANDLER_ARGS);
155 static int      igb_sysctl_tx_intr_nsegs(SYSCTL_HANDLER_ARGS);
156 static int      igb_sysctl_tx_wreg_nsegs(SYSCTL_HANDLER_ARGS);
157 static int      igb_sysctl_rx_wreg_nsegs(SYSCTL_HANDLER_ARGS);
158 static void     igb_set_ring_inuse(struct igb_softc *, boolean_t);
159 static int      igb_get_rxring_inuse(const struct igb_softc *, boolean_t);
160 static int      igb_get_txring_inuse(const struct igb_softc *, boolean_t);
161 static void     igb_set_timer_cpuid(struct igb_softc *, boolean_t);
162 #ifdef IFPOLL_ENABLE
163 static int      igb_sysctl_npoll_rxoff(SYSCTL_HANDLER_ARGS);
164 static int      igb_sysctl_npoll_txoff(SYSCTL_HANDLER_ARGS);
165 #endif
166 static int      igb_sysctl_flowctrl(SYSCTL_HANDLER_ARGS);
167 static enum e1000_fc_mode igb_str2fc(const char *);
168 static void     igb_fc2str(enum e1000_fc_mode, char *, int);
169
170 static void     igb_vf_init_stats(struct igb_softc *);
171 static void     igb_reset(struct igb_softc *);
172 static void     igb_update_stats_counters(struct igb_softc *);
173 static void     igb_update_vf_stats_counters(struct igb_softc *);
174 static void     igb_update_link_status(struct igb_softc *);
175 static void     igb_init_tx_unit(struct igb_softc *);
176 static void     igb_init_rx_unit(struct igb_softc *);
177
178 static void     igb_set_vlan(struct igb_softc *);
179 static void     igb_set_multi(struct igb_softc *);
180 static void     igb_set_promisc(struct igb_softc *);
181 static void     igb_disable_promisc(struct igb_softc *);
182
183 static int      igb_alloc_rings(struct igb_softc *);
184 static void     igb_free_rings(struct igb_softc *);
185 static int      igb_create_tx_ring(struct igb_tx_ring *);
186 static int      igb_create_rx_ring(struct igb_rx_ring *);
187 static void     igb_free_tx_ring(struct igb_tx_ring *);
188 static void     igb_free_rx_ring(struct igb_rx_ring *);
189 static void     igb_destroy_tx_ring(struct igb_tx_ring *, int);
190 static void     igb_destroy_rx_ring(struct igb_rx_ring *, int);
191 static void     igb_init_tx_ring(struct igb_tx_ring *);
192 static int      igb_init_rx_ring(struct igb_rx_ring *);
193 static int      igb_newbuf(struct igb_rx_ring *, int, boolean_t);
194 static int      igb_encap(struct igb_tx_ring *, struct mbuf **, int *, int *);
195 static void     igb_rx_refresh(struct igb_rx_ring *, int);
196 static void     igb_setup_serializer(struct igb_softc *);
197
198 static void     igb_stop(struct igb_softc *);
199 static void     igb_init(void *);
200 static int      igb_ioctl(struct ifnet *, u_long, caddr_t, struct ucred *);
201 static void     igb_media_status(struct ifnet *, struct ifmediareq *);
202 static int      igb_media_change(struct ifnet *);
203 static void     igb_timer(void *);
204 static void     igb_watchdog(struct ifaltq_subque *);
205 static void     igb_start(struct ifnet *, struct ifaltq_subque *);
206 #ifdef IFPOLL_ENABLE
207 static void     igb_npoll(struct ifnet *, struct ifpoll_info *);
208 static void     igb_npoll_rx(struct ifnet *, void *, int);
209 static void     igb_npoll_tx(struct ifnet *, void *, int);
210 static void     igb_npoll_status(struct ifnet *);
211 #endif
212 static void     igb_serialize(struct ifnet *, enum ifnet_serialize);
213 static void     igb_deserialize(struct ifnet *, enum ifnet_serialize);
214 static int      igb_tryserialize(struct ifnet *, enum ifnet_serialize);
215 #ifdef INVARIANTS
216 static void     igb_serialize_assert(struct ifnet *, enum ifnet_serialize,
217                     boolean_t);
218 #endif
219
220 static void     igb_intr(void *);
221 static void     igb_intr_shared(void *);
222 static void     igb_rxeof(struct igb_rx_ring *, int);
223 static void     igb_txeof(struct igb_tx_ring *);
224 static void     igb_set_eitr(struct igb_softc *, int, int);
225 static void     igb_enable_intr(struct igb_softc *);
226 static void     igb_disable_intr(struct igb_softc *);
227 static void     igb_init_unshared_intr(struct igb_softc *);
228 static void     igb_init_intr(struct igb_softc *);
229 static int      igb_setup_intr(struct igb_softc *);
230 static void     igb_set_txintr_mask(struct igb_tx_ring *, int *, int);
231 static void     igb_set_rxintr_mask(struct igb_rx_ring *, int *, int);
232 static void     igb_set_intr_mask(struct igb_softc *);
233 static int      igb_alloc_intr(struct igb_softc *);
234 static void     igb_free_intr(struct igb_softc *);
235 static void     igb_teardown_intr(struct igb_softc *);
236 static void     igb_msix_try_alloc(struct igb_softc *);
237 static void     igb_msix_rx_conf(struct igb_softc *, int, int *, int);
238 static void     igb_msix_tx_conf(struct igb_softc *, int, int *, int);
239 static void     igb_msix_free(struct igb_softc *, boolean_t);
240 static int      igb_msix_setup(struct igb_softc *);
241 static void     igb_msix_teardown(struct igb_softc *, int);
242 static void     igb_msix_rx(void *);
243 static void     igb_msix_tx(void *);
244 static void     igb_msix_status(void *);
245 static void     igb_msix_rxtx(void *);
246
247 /* Management and WOL Support */
248 static void     igb_get_mgmt(struct igb_softc *);
249 static void     igb_rel_mgmt(struct igb_softc *);
250 static void     igb_get_hw_control(struct igb_softc *);
251 static void     igb_rel_hw_control(struct igb_softc *);
252 static void     igb_enable_wol(device_t);
253
254 static device_method_t igb_methods[] = {
255         /* Device interface */
256         DEVMETHOD(device_probe,         igb_probe),
257         DEVMETHOD(device_attach,        igb_attach),
258         DEVMETHOD(device_detach,        igb_detach),
259         DEVMETHOD(device_shutdown,      igb_shutdown),
260         DEVMETHOD(device_suspend,       igb_suspend),
261         DEVMETHOD(device_resume,        igb_resume),
262         DEVMETHOD_END
263 };
264
265 static driver_t igb_driver = {
266         "igb",
267         igb_methods,
268         sizeof(struct igb_softc),
269 };
270
271 static devclass_t igb_devclass;
272
273 DECLARE_DUMMY_MODULE(if_igb);
274 MODULE_DEPEND(igb, ig_hal, 1, 1, 1);
275 DRIVER_MODULE(if_igb, pci, igb_driver, igb_devclass, NULL, NULL);
276
277 static int      igb_rxd = IGB_DEFAULT_RXD;
278 static int      igb_txd = IGB_DEFAULT_TXD;
279 static int      igb_rxr = 0;
280 static int      igb_txr = 0;
281 static int      igb_msi_enable = 1;
282 static int      igb_msix_enable = 1;
283 static int      igb_eee_disabled = 1;   /* Energy Efficient Ethernet */
284
285 static char     igb_flowctrl[IGB_FLOWCTRL_STRLEN] = "rx_pause";
286
287 /*
288  * DMA Coalescing, only for i350 - default to off.
289  * This feature is for power saving.
290  */
291 static int      igb_dma_coalesce = 0;
292
293 TUNABLE_INT("hw.igb.rxd", &igb_rxd);
294 TUNABLE_INT("hw.igb.txd", &igb_txd);
295 TUNABLE_INT("hw.igb.rxr", &igb_rxr);
296 TUNABLE_INT("hw.igb.txr", &igb_txr);
297 TUNABLE_INT("hw.igb.msi.enable", &igb_msi_enable);
298 TUNABLE_INT("hw.igb.msix.enable", &igb_msix_enable);
299 TUNABLE_STR("hw.igb.flow_ctrl", igb_flowctrl, sizeof(igb_flowctrl));
300
301 /* i350 specific */
302 TUNABLE_INT("hw.igb.eee_disabled", &igb_eee_disabled);
303 TUNABLE_INT("hw.igb.dma_coalesce", &igb_dma_coalesce);
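/*
 * Example /boot/loader.conf settings for the tunables above (a sketch;
 * the values are illustrative, not recommendations):
 *
 *	hw.igb.rxr="4"			# number of RX rings, 0 = driver default
 *	hw.igb.msix.enable="0"		# fall back to MSI / line interrupt
 *	hw.igb.flow_ctrl="rx_pause"	# same as the built-in default
 *	hw.igb.eee_disabled="0"		# allow Energy Efficient Ethernet
 */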
304
305 static __inline void
306 igb_rxcsum(uint32_t staterr, struct mbuf *mp)
307 {
308         /* The Ignore Checksum (IXSM) bit is set; leave the mbuf untouched */
309         if (staterr & E1000_RXD_STAT_IXSM)
310                 return;
311
312         if ((staterr & (E1000_RXD_STAT_IPCS | E1000_RXDEXT_STATERR_IPE)) ==
313             E1000_RXD_STAT_IPCS)
314                 mp->m_pkthdr.csum_flags |= CSUM_IP_CHECKED | CSUM_IP_VALID;
315
316         if (staterr & (E1000_RXD_STAT_TCPCS | E1000_RXD_STAT_UDPCS)) {
317                 if ((staterr & E1000_RXDEXT_STATERR_TCPE) == 0) {
318                         mp->m_pkthdr.csum_flags |= CSUM_DATA_VALID |
319                             CSUM_PSEUDO_HDR | CSUM_FRAG_NOT_CHECKED;
320                         mp->m_pkthdr.csum_data = htons(0xffff);
321                 }
322         }
323 }
324
325 static __inline struct pktinfo *
326 igb_rssinfo(struct mbuf *m, struct pktinfo *pi,
327     uint32_t hash, uint32_t hashtype, uint32_t staterr)
328 {
329         switch (hashtype) {
330         case E1000_RXDADV_RSSTYPE_IPV4_TCP:
331                 pi->pi_netisr = NETISR_IP;
332                 pi->pi_flags = 0;
333                 pi->pi_l3proto = IPPROTO_TCP;
334                 break;
335
336         case E1000_RXDADV_RSSTYPE_IPV4:
337                 if (staterr & E1000_RXD_STAT_IXSM)
338                         return NULL;
339
340                 if ((staterr &
341                      (E1000_RXD_STAT_TCPCS | E1000_RXDEXT_STATERR_TCPE)) ==
342                     E1000_RXD_STAT_TCPCS) {
343                         pi->pi_netisr = NETISR_IP;
344                         pi->pi_flags = 0;
345                         pi->pi_l3proto = IPPROTO_UDP;
346                         break;
347                 }
348                 /* FALL THROUGH */
349         default:
350                 return NULL;
351         }
352
353         m->m_flags |= M_HASH;
354         m->m_pkthdr.hash = toeplitz_hash(hash);
355         return pi;
356 }
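/*
 * A minimal sketch of how the RX completion path consumes the two helpers
 * above (hypothetical local names; the real caller is igb_rxeof() later in
 * this file):
 *
 *	struct pktinfo pi0, *pi;
 *
 *	igb_rxcsum(staterr, m);
 *	pi = igb_rssinfo(m, &pi0, hash, hashtype, staterr);
 *	...hand (m, pi) to the stack via the interface input path...
 *
 * Note that the E1000_RXDADV_RSSTYPE_IPV4 case maps to IPPROTO_UDP when a
 * valid L4 checksum was reported, presumably because UDP RSS hashing is left
 * disabled and UDP datagrams are therefore typed as plain IPv4 by the
 * hardware.
 */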
357
358 static int
359 igb_probe(device_t dev)
360 {
361         const struct igb_device *d;
362         uint16_t vid, did;
363
364         vid = pci_get_vendor(dev);
365         did = pci_get_device(dev);
366
367         for (d = igb_devices; d->desc != NULL; ++d) {
368                 if (vid == d->vid && did == d->did) {
369                         device_set_desc(dev, d->desc);
370                         return 0;
371                 }
372         }
373         return ENXIO;
374 }
375
376 static int
377 igb_attach(device_t dev)
378 {
379         struct igb_softc *sc = device_get_softc(dev);
380         uint16_t eeprom_data;
381         int error = 0, ring_max;
382         char flowctrl[IGB_FLOWCTRL_STRLEN];
383 #ifdef IFPOLL_ENABLE
384         int offset, offset_def;
385 #endif
386
387 #ifdef notyet
388         /* SYSCTL stuff */
389         SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev),
390             SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
391             OID_AUTO, "nvm", CTLTYPE_INT|CTLFLAG_RW, adapter, 0,
392             igb_sysctl_nvm_info, "I", "NVM Information");
393         SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev),
394             SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
395             OID_AUTO, "flow_control", CTLTYPE_INT|CTLFLAG_RW,
396             adapter, 0, igb_set_flowcntl, "I", "Flow Control");
397 #endif
398
399         callout_init_mp(&sc->timer);
400         lwkt_serialize_init(&sc->main_serialize);
401
402         if_initname(&sc->arpcom.ac_if, device_get_name(dev),
403             device_get_unit(dev));
404         sc->dev = sc->osdep.dev = dev;
405
406         /*
407          * Determine hardware and mac type
408          */
409         sc->hw.vendor_id = pci_get_vendor(dev);
410         sc->hw.device_id = pci_get_device(dev);
411         sc->hw.revision_id = pci_read_config(dev, PCIR_REVID, 1);
412         sc->hw.subsystem_vendor_id = pci_read_config(dev, PCIR_SUBVEND_0, 2);
413         sc->hw.subsystem_device_id = pci_read_config(dev, PCIR_SUBDEV_0, 2);
414
415         if (e1000_set_mac_type(&sc->hw))
416                 return ENXIO;
417
418         /* Are we a VF device? */
419         if (sc->hw.mac.type == e1000_vfadapt ||
420             sc->hw.mac.type == e1000_vfadapt_i350)
421                 sc->vf_ifp = 1;
422         else
423                 sc->vf_ifp = 0;
424
425         /*
426          * Configure total supported RX/TX ring count
427          */
428         switch (sc->hw.mac.type) {
429         case e1000_82575:
430                 ring_max = IGB_MAX_RING_82575;
431                 break;
432
433         case e1000_82576:
434                 ring_max = IGB_MAX_RING_82576;
435                 break;
436
437         case e1000_82580:
438                 ring_max = IGB_MAX_RING_82580;
439                 break;
440
441         case e1000_i350:
442                 ring_max = IGB_MAX_RING_I350;
443                 break;
444
445         case e1000_i354:
446                 ring_max = IGB_MAX_RING_I354;
447                 break;
448
449         case e1000_i210:
450                 ring_max = IGB_MAX_RING_I210;
451                 break;
452
453         case e1000_i211:
454                 ring_max = IGB_MAX_RING_I211;
455                 break;
456
457         default:
458                 ring_max = IGB_MIN_RING;
459                 break;
460         }
461
462         sc->rx_ring_cnt = device_getenv_int(dev, "rxr", igb_rxr);
463         sc->rx_ring_cnt = if_ring_count2(sc->rx_ring_cnt, ring_max);
464 #ifdef IGB_RSS_DEBUG
465         sc->rx_ring_cnt = device_getenv_int(dev, "rxr_debug", sc->rx_ring_cnt);
466 #endif
467         sc->rx_ring_inuse = sc->rx_ring_cnt;
468
469         sc->tx_ring_cnt = device_getenv_int(dev, "txr", igb_txr);
470         sc->tx_ring_cnt = if_ring_count2(sc->tx_ring_cnt, ring_max);
471 #ifdef IGB_TSS_DEBUG
472         sc->tx_ring_cnt = device_getenv_int(dev, "txr_debug", sc->tx_ring_cnt);
473 #endif
474         sc->tx_ring_inuse = sc->tx_ring_cnt;
475
476         /* Setup flow control. */
477         device_getenv_string(dev, "flow_ctrl", flowctrl, sizeof(flowctrl),
478             igb_flowctrl);
479         sc->flow_ctrl = igb_str2fc(flowctrl);
480
481         /* Enable bus mastering */
482         pci_enable_busmaster(dev);
483
484         /*
485          * Allocate IO memory
486          */
487         sc->mem_rid = PCIR_BAR(0);
488         sc->mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &sc->mem_rid,
489             RF_ACTIVE);
490         if (sc->mem_res == NULL) {
491                 device_printf(dev, "Unable to allocate bus resource: memory\n");
492                 error = ENXIO;
493                 goto failed;
494         }
495         sc->osdep.mem_bus_space_tag = rman_get_bustag(sc->mem_res);
496         sc->osdep.mem_bus_space_handle = rman_get_bushandle(sc->mem_res);
497
498         sc->hw.hw_addr = (uint8_t *)&sc->osdep.mem_bus_space_handle;
499
500         /* Save PCI command register for Shared Code */
501         sc->hw.bus.pci_cmd_word = pci_read_config(dev, PCIR_COMMAND, 2);
502         sc->hw.back = &sc->osdep;
503
504         /* Do Shared Code initialization */
505         if (e1000_setup_init_funcs(&sc->hw, TRUE)) {
506                 device_printf(dev, "Setup of Shared code failed\n");
507                 error = ENXIO;
508                 goto failed;
509         }
510
511         e1000_get_bus_info(&sc->hw);
512
513         sc->hw.mac.autoneg = DO_AUTO_NEG;
514         sc->hw.phy.autoneg_wait_to_complete = FALSE;
515         sc->hw.phy.autoneg_advertised = AUTONEG_ADV_DEFAULT;
516
517         /* Copper options */
518         if (sc->hw.phy.media_type == e1000_media_type_copper) {
519                 sc->hw.phy.mdix = AUTO_ALL_MODES;
520                 sc->hw.phy.disable_polarity_correction = FALSE;
521                 sc->hw.phy.ms_type = IGB_MASTER_SLAVE;
522         }
523
524         /* Set the frame limits assuming standard Ethernet sized frames. */
525         sc->max_frame_size = ETHERMTU + ETHER_HDR_LEN + ETHER_CRC_LEN;
526
527         /* Allocate RX/TX rings */
528         error = igb_alloc_rings(sc);
529         if (error)
530                 goto failed;
531
532 #ifdef IFPOLL_ENABLE
533         /*
534          * NPOLLING RX CPU offset
535          */
536         if (sc->rx_ring_cnt == ncpus2) {
537                 offset = 0;
538         } else {
539                 offset_def = (sc->rx_ring_cnt * device_get_unit(dev)) % ncpus2;
540                 offset = device_getenv_int(dev, "npoll.rxoff", offset_def);
541                 if (offset >= ncpus2 ||
542                     offset % sc->rx_ring_cnt != 0) {
543                         device_printf(dev, "invalid npoll.rxoff %d, use %d\n",
544                             offset, offset_def);
545                         offset = offset_def;
546                 }
547         }
548         sc->rx_npoll_off = offset;
549
550         /*
551          * NPOLLING TX CPU offset
552          */
553         if (sc->tx_ring_cnt == ncpus2) {
554                 offset = 0;
555         } else {
556                 offset_def = (sc->tx_ring_cnt * device_get_unit(dev)) % ncpus2;
557                 offset = device_getenv_int(dev, "npoll.txoff", offset_def);
558                 if (offset >= ncpus2 ||
559                     offset % sc->tx_ring_cnt != 0) {
560                         device_printf(dev, "invalid npoll.txoff %d, use %d\n",
561                             offset, offset_def);
562                         offset = offset_def;
563                 }
564         }
565         sc->tx_npoll_off = offset;
566 #endif
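        /*
         * Worked example of the offset calculation above (assumed values,
         * for illustration only): with ncpus2 == 8, 4 RX rings and device
         * unit 1, offset_def = (4 * 1) % 8 = 4, so polling for this unit's
         * RX rings starts at CPU 4 unless npoll.rxoff overrides it with
         * another multiple of the ring count below ncpus2.
         */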
567
568         /* Allocate interrupt */
569         error = igb_alloc_intr(sc);
570         if (error)
571                 goto failed;
572
573         /* Setup serializers */
574         igb_setup_serializer(sc);
575
576         /* Allocate the appropriate stats memory */
577         if (sc->vf_ifp) {
578                 sc->stats = kmalloc(sizeof(struct e1000_vf_stats), M_DEVBUF,
579                     M_WAITOK | M_ZERO);
580                 igb_vf_init_stats(sc);
581         } else {
582                 sc->stats = kmalloc(sizeof(struct e1000_hw_stats), M_DEVBUF,
583                     M_WAITOK | M_ZERO);
584         }
585
586         /* Allocate multicast array memory. */
587         sc->mta = kmalloc(ETHER_ADDR_LEN * MAX_NUM_MULTICAST_ADDRESSES,
588             M_DEVBUF, M_WAITOK);
589
590         /* Some adapter-specific advanced features */
591         if (sc->hw.mac.type >= e1000_i350) {
592 #ifdef notyet
593                 igb_set_sysctl_value(adapter, "dma_coalesce",
594                     "configure dma coalesce",
595                     &adapter->dma_coalesce, igb_dma_coalesce);
596                 igb_set_sysctl_value(adapter, "eee_disabled",
597                     "enable Energy Efficient Ethernet",
598                     &adapter->hw.dev_spec._82575.eee_disable,
599                     igb_eee_disabled);
600 #else
601                 sc->dma_coalesce = igb_dma_coalesce;
602                 sc->hw.dev_spec._82575.eee_disable = igb_eee_disabled;
603 #endif
604                 if (sc->hw.phy.media_type == e1000_media_type_copper) {
605                         if (sc->hw.mac.type == e1000_i354)
606                                 e1000_set_eee_i354(&sc->hw);
607                         else
608                                 e1000_set_eee_i350(&sc->hw);
609                 }
610         }
611
612         /*
613          * Start from a known state; this is important when reading
614          * the NVM and MAC address from it.
615          */
616         e1000_reset_hw(&sc->hw);
617
618         /* Make sure we have a good EEPROM before we read from it */
619         if (sc->hw.mac.type != e1000_i210 && sc->hw.mac.type != e1000_i211 &&
620             e1000_validate_nvm_checksum(&sc->hw) < 0) {
621                 /*
622                  * Some PCI-E parts fail the first check due to
623                  * the link being in a sleep state; call it again.
624                  * If it fails a second time, it's a real issue.
625                  */
626                 if (e1000_validate_nvm_checksum(&sc->hw) < 0) {
627                         device_printf(dev,
628                             "The EEPROM Checksum Is Not Valid\n");
629                         error = EIO;
630                         goto failed;
631                 }
632         }
633
634         /* Copy the permanent MAC address out of the EEPROM */
635         if (e1000_read_mac_addr(&sc->hw) < 0) {
636                 device_printf(dev, "EEPROM read error while reading MAC"
637                     " address\n");
638                 error = EIO;
639                 goto failed;
640         }
641         if (!igb_is_valid_ether_addr(sc->hw.mac.addr)) {
642                 device_printf(dev, "Invalid MAC address\n");
643                 error = EIO;
644                 goto failed;
645         }
646
647         /* Setup OS specific network interface */
648         igb_setup_ifp(sc);
649
650         /* Add sysctl tree; must be done after igb_setup_ifp() */
651         igb_add_sysctl(sc);
652
653         /* Now get a good starting state */
654         igb_reset(sc);
655
656         /* Initialize statistics */
657         igb_update_stats_counters(sc);
658
659         sc->hw.mac.get_link_status = 1;
660         igb_update_link_status(sc);
661
662         /* Indicate SOL/IDER usage */
663         if (e1000_check_reset_block(&sc->hw)) {
664                 device_printf(dev,
665                     "PHY reset is blocked due to SOL/IDER session.\n");
666         }
667
668         /* Determine if we have to control management hardware */
669         if (e1000_enable_mng_pass_thru(&sc->hw))
670                 sc->flags |= IGB_FLAG_HAS_MGMT;
671
672         /*
673          * Setup Wake-on-Lan
674          */
675         /* APME bit in EEPROM is mapped to WUC.APME */
676         eeprom_data = E1000_READ_REG(&sc->hw, E1000_WUC) & E1000_WUC_APME;
677         if (eeprom_data)
678                 sc->wol = E1000_WUFC_MAG;
679         /* XXX disable WOL */
680         sc->wol = 0; 
681
682 #ifdef notyet
683         /* Register for VLAN events */
684         adapter->vlan_attach = EVENTHANDLER_REGISTER(vlan_config,
685              igb_register_vlan, adapter, EVENTHANDLER_PRI_FIRST);
686         adapter->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig,
687              igb_unregister_vlan, adapter, EVENTHANDLER_PRI_FIRST);
688 #endif
689
690 #ifdef notyet
691         igb_add_hw_stats(adapter);
692 #endif
693
694         /*
695          * Disable interrupts to prevent spurious interrupts (line-based
696          * interrupt, MSI or even MSI-X), which have been observed on
697          * several types of LOMs, from being handled.
698          */
699         igb_disable_intr(sc);
700
701         error = igb_setup_intr(sc);
702         if (error) {
703                 ether_ifdetach(&sc->arpcom.ac_if);
704                 goto failed;
705         }
706         return 0;
707
708 failed:
709         igb_detach(dev);
710         return error;
711 }
712
713 static int
714 igb_detach(device_t dev)
715 {
716         struct igb_softc *sc = device_get_softc(dev);
717
718         if (device_is_attached(dev)) {
719                 struct ifnet *ifp = &sc->arpcom.ac_if;
720
721                 ifnet_serialize_all(ifp);
722
723                 igb_stop(sc);
724
725                 e1000_phy_hw_reset(&sc->hw);
726
727                 /* Give control back to firmware */
728                 igb_rel_mgmt(sc);
729                 igb_rel_hw_control(sc);
730
731                 if (sc->wol) {
732                         E1000_WRITE_REG(&sc->hw, E1000_WUC, E1000_WUC_PME_EN);
733                         E1000_WRITE_REG(&sc->hw, E1000_WUFC, sc->wol);
734                         igb_enable_wol(dev);
735                 }
736
737                 igb_teardown_intr(sc);
738
739                 ifnet_deserialize_all(ifp);
740
741                 ether_ifdetach(ifp);
742         } else if (sc->mem_res != NULL) {
743                 igb_rel_hw_control(sc);
744         }
745         bus_generic_detach(dev);
746
747         igb_free_intr(sc);
748
749         if (sc->msix_mem_res != NULL) {
750                 bus_release_resource(dev, SYS_RES_MEMORY, sc->msix_mem_rid,
751                     sc->msix_mem_res);
752         }
753         if (sc->mem_res != NULL) {
754                 bus_release_resource(dev, SYS_RES_MEMORY, sc->mem_rid,
755                     sc->mem_res);
756         }
757
758         igb_free_rings(sc);
759
760         if (sc->mta != NULL)
761                 kfree(sc->mta, M_DEVBUF);
762         if (sc->stats != NULL)
763                 kfree(sc->stats, M_DEVBUF);
764         if (sc->serializes != NULL)
765                 kfree(sc->serializes, M_DEVBUF);
766
767         return 0;
768 }
769
770 static int
771 igb_shutdown(device_t dev)
772 {
773         return igb_suspend(dev);
774 }
775
776 static int
777 igb_suspend(device_t dev)
778 {
779         struct igb_softc *sc = device_get_softc(dev);
780         struct ifnet *ifp = &sc->arpcom.ac_if;
781
782         ifnet_serialize_all(ifp);
783
784         igb_stop(sc);
785
786         igb_rel_mgmt(sc);
787         igb_rel_hw_control(sc);
788
789         if (sc->wol) {
790                 E1000_WRITE_REG(&sc->hw, E1000_WUC, E1000_WUC_PME_EN);
791                 E1000_WRITE_REG(&sc->hw, E1000_WUFC, sc->wol);
792                 igb_enable_wol(dev);
793         }
794
795         ifnet_deserialize_all(ifp);
796
797         return bus_generic_suspend(dev);
798 }
799
800 static int
801 igb_resume(device_t dev)
802 {
803         struct igb_softc *sc = device_get_softc(dev);
804         struct ifnet *ifp = &sc->arpcom.ac_if;
805         int i;
806
807         ifnet_serialize_all(ifp);
808
809         igb_init(sc);
810         igb_get_mgmt(sc);
811
812         for (i = 0; i < sc->tx_ring_inuse; ++i)
813                 ifsq_devstart_sched(sc->tx_rings[i].ifsq);
814
815         ifnet_deserialize_all(ifp);
816
817         return bus_generic_resume(dev);
818 }
819
820 static int
821 igb_ioctl(struct ifnet *ifp, u_long command, caddr_t data, struct ucred *cr)
822 {
823         struct igb_softc *sc = ifp->if_softc;
824         struct ifreq *ifr = (struct ifreq *)data;
825         int max_frame_size, mask, reinit;
826         int error = 0;
827
828         ASSERT_IFNET_SERIALIZED_ALL(ifp);
829
830         switch (command) {
831         case SIOCSIFMTU:
832                 max_frame_size = 9234;
833                 if (ifr->ifr_mtu > max_frame_size - ETHER_HDR_LEN -
834                     ETHER_CRC_LEN) {
835                         error = EINVAL;
836                         break;
837                 }
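                /*
                 * With the 9234-byte hard frame limit above, the largest
                 * accepted MTU works out to 9234 - ETHER_HDR_LEN -
                 * ETHER_CRC_LEN = 9234 - 14 - 4 = 9216 bytes.
                 */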
838
839                 ifp->if_mtu = ifr->ifr_mtu;
840                 sc->max_frame_size = ifp->if_mtu + ETHER_HDR_LEN +
841                     ETHER_CRC_LEN;
842
843                 if (ifp->if_flags & IFF_RUNNING)
844                         igb_init(sc);
845                 break;
846
847         case SIOCSIFFLAGS:
848                 if (ifp->if_flags & IFF_UP) {
849                         if (ifp->if_flags & IFF_RUNNING) {
850                                 if ((ifp->if_flags ^ sc->if_flags) &
851                                     (IFF_PROMISC | IFF_ALLMULTI)) {
852                                         igb_disable_promisc(sc);
853                                         igb_set_promisc(sc);
854                                 }
855                         } else {
856                                 igb_init(sc);
857                         }
858                 } else if (ifp->if_flags & IFF_RUNNING) {
859                         igb_stop(sc);
860                 }
861                 sc->if_flags = ifp->if_flags;
862                 break;
863
864         case SIOCADDMULTI:
865         case SIOCDELMULTI:
866                 if (ifp->if_flags & IFF_RUNNING) {
867                         igb_disable_intr(sc);
868                         igb_set_multi(sc);
869 #ifdef IFPOLL_ENABLE
870                         if (!(ifp->if_flags & IFF_NPOLLING))
871 #endif
872                                 igb_enable_intr(sc);
873                 }
874                 break;
875
876         case SIOCSIFMEDIA:
877                 /* Check SOL/IDER usage */
878                 if (e1000_check_reset_block(&sc->hw)) {
879                         if_printf(ifp, "Media change is "
880                             "blocked due to SOL/IDER session.\n");
881                         break;
882                 }
883                 /* FALL THROUGH */
884
885         case SIOCGIFMEDIA:
886                 error = ifmedia_ioctl(ifp, ifr, &sc->media, command);
887                 break;
888
889         case SIOCSIFCAP:
890                 reinit = 0;
891                 mask = ifr->ifr_reqcap ^ ifp->if_capenable;
892                 if (mask & IFCAP_RXCSUM) {
893                         ifp->if_capenable ^= IFCAP_RXCSUM;
894                         reinit = 1;
895                 }
896                 if (mask & IFCAP_VLAN_HWTAGGING) {
897                         ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
898                         reinit = 1;
899                 }
900                 if (mask & IFCAP_TXCSUM) {
901                         ifp->if_capenable ^= IFCAP_TXCSUM;
902                         if (ifp->if_capenable & IFCAP_TXCSUM)
903                                 ifp->if_hwassist |= IGB_CSUM_FEATURES;
904                         else
905                                 ifp->if_hwassist &= ~IGB_CSUM_FEATURES;
906                 }
907                 if (mask & IFCAP_TSO) {
908                         ifp->if_capenable ^= IFCAP_TSO;
909                         if (ifp->if_capenable & IFCAP_TSO)
910                                 ifp->if_hwassist |= CSUM_TSO;
911                         else
912                                 ifp->if_hwassist &= ~CSUM_TSO;
913                 }
914                 if (mask & IFCAP_RSS)
915                         ifp->if_capenable ^= IFCAP_RSS;
916                 if (reinit && (ifp->if_flags & IFF_RUNNING))
917                         igb_init(sc);
918                 break;
919
920         default:
921                 error = ether_ioctl(ifp, command, data);
922                 break;
923         }
924         return error;
925 }
926
927 static void
928 igb_init(void *xsc)
929 {
930         struct igb_softc *sc = xsc;
931         struct ifnet *ifp = &sc->arpcom.ac_if;
932         boolean_t polling;
933         int i;
934
935         ASSERT_IFNET_SERIALIZED_ALL(ifp);
936
937         igb_stop(sc);
938
939         /* Get the latest MAC address; the user may have set an LAA */
940         bcopy(IF_LLADDR(ifp), sc->hw.mac.addr, ETHER_ADDR_LEN);
941
942         /* Put the address into the Receive Address Array */
943         e1000_rar_set(&sc->hw, sc->hw.mac.addr, 0);
944
945         igb_reset(sc);
946         igb_update_link_status(sc);
947
948         E1000_WRITE_REG(&sc->hw, E1000_VET, ETHERTYPE_VLAN);
949
950         /* Configure for OS presence */
951         igb_get_mgmt(sc);
952
953         polling = FALSE;
954 #ifdef IFPOLL_ENABLE
955         if (ifp->if_flags & IFF_NPOLLING)
956                 polling = TRUE;
957 #endif
958
959         /* Configure the RX/TX rings in use */
960         igb_set_ring_inuse(sc, polling);
961         ifq_set_subq_mask(&ifp->if_snd, sc->tx_ring_inuse - 1);
962
963         /* Initialize interrupt */
964         igb_init_intr(sc);
965
966         /* Prepare transmit descriptors and buffers */
967         for (i = 0; i < sc->tx_ring_inuse; ++i)
968                 igb_init_tx_ring(&sc->tx_rings[i]);
969         igb_init_tx_unit(sc);
970
971         /* Setup Multicast table */
972         igb_set_multi(sc);
973
974 #if 0
975         /*
976          * Figure out the desired mbuf pool
977          * for doing jumbo/packetsplit
978          */
979         if (adapter->max_frame_size <= 2048)
980                 adapter->rx_mbuf_sz = MCLBYTES;
981         else if (adapter->max_frame_size <= 4096)
982                 adapter->rx_mbuf_sz = MJUMPAGESIZE;
983         else
984                 adapter->rx_mbuf_sz = MJUM9BYTES;
985 #endif
986
987         /* Prepare receive descriptors and buffers */
988         for (i = 0; i < sc->rx_ring_inuse; ++i) {
989                 int error;
990
991                 error = igb_init_rx_ring(&sc->rx_rings[i]);
992                 if (error) {
993                         if_printf(ifp, "Could not setup receive structures\n");
994                         igb_stop(sc);
995                         return;
996                 }
997         }
998         igb_init_rx_unit(sc);
999
1000         /* Enable VLAN support */
1001         if (ifp->if_capenable & IFCAP_VLAN_HWTAGGING)
1002                 igb_set_vlan(sc);
1003
1004         /* Don't lose promiscuous settings */
1005         igb_set_promisc(sc);
1006
1007         ifp->if_flags |= IFF_RUNNING;
1008         for (i = 0; i < sc->tx_ring_inuse; ++i) {
1009                 ifsq_clr_oactive(sc->tx_rings[i].ifsq);
1010                 ifsq_watchdog_start(&sc->tx_rings[i].tx_watchdog);
1011         }
1012
1013         igb_set_timer_cpuid(sc, polling);
1014         callout_reset_bycpu(&sc->timer, hz, igb_timer, sc, sc->timer_cpuid);
1015         e1000_clear_hw_cntrs_base_generic(&sc->hw);
1016
1017         /* This clears any pending interrupts */
1018         E1000_READ_REG(&sc->hw, E1000_ICR);
1019
1020         /*
1021          * Only enable interrupts if we are not polling; make sure
1022          * they are off otherwise.
1023          */
1024         if (polling) {
1025                 igb_disable_intr(sc);
1026         } else {
1027                 igb_enable_intr(sc);
1028                 E1000_WRITE_REG(&sc->hw, E1000_ICS, E1000_ICS_LSC);
1029         }
1030
1031         /* Set Energy Efficient Ethernet */
1032         if (sc->hw.phy.media_type == e1000_media_type_copper) {
1033                 if (sc->hw.mac.type == e1000_i354)
1034                         e1000_set_eee_i354(&sc->hw);
1035                 else
1036                         e1000_set_eee_i350(&sc->hw);
1037         }
1038 }
1039
1040 static void
1041 igb_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
1042 {
1043         struct igb_softc *sc = ifp->if_softc;
1044
1045         ASSERT_IFNET_SERIALIZED_ALL(ifp);
1046
1047         if ((ifp->if_flags & IFF_RUNNING) == 0)
1048                 sc->hw.mac.get_link_status = 1;
1049         igb_update_link_status(sc);
1050
1051         ifmr->ifm_status = IFM_AVALID;
1052         ifmr->ifm_active = IFM_ETHER;
1053
1054         if (!sc->link_active)
1055                 return;
1056
1057         ifmr->ifm_status |= IFM_ACTIVE;
1058
1059         switch (sc->link_speed) {
1060         case 10:
1061                 ifmr->ifm_active |= IFM_10_T;
1062                 break;
1063
1064         case 100:
1065                 /*
1066                  * Support for 100Mb SFP - these are fiber,
1067                  * but the media type appears as serdes.
1068                  */
1069                 if (sc->hw.phy.media_type == e1000_media_type_internal_serdes)
1070                         ifmr->ifm_active |= IFM_100_FX;
1071                 else
1072                         ifmr->ifm_active |= IFM_100_TX;
1073                 break;
1074
1075         case 1000:
1076                 ifmr->ifm_active |= IFM_1000_T;
1077                 break;
1078         }
1079
1080         if (sc->link_duplex == FULL_DUPLEX)
1081                 ifmr->ifm_active |= IFM_FDX;
1082         else
1083                 ifmr->ifm_active |= IFM_HDX;
1084 }
1085
1086 static int
1087 igb_media_change(struct ifnet *ifp)
1088 {
1089         struct igb_softc *sc = ifp->if_softc;
1090         struct ifmedia *ifm = &sc->media;
1091
1092         ASSERT_IFNET_SERIALIZED_ALL(ifp);
1093
1094         if (IFM_TYPE(ifm->ifm_media) != IFM_ETHER)
1095                 return EINVAL;
1096
1097         switch (IFM_SUBTYPE(ifm->ifm_media)) {
1098         case IFM_AUTO:
1099                 sc->hw.mac.autoneg = DO_AUTO_NEG;
1100                 sc->hw.phy.autoneg_advertised = AUTONEG_ADV_DEFAULT;
1101                 break;
1102
1103         case IFM_1000_LX:
1104         case IFM_1000_SX:
1105         case IFM_1000_T:
1106                 sc->hw.mac.autoneg = DO_AUTO_NEG;
1107                 sc->hw.phy.autoneg_advertised = ADVERTISE_1000_FULL;
1108                 break;
1109
1110         case IFM_100_TX:
1111                 sc->hw.mac.autoneg = FALSE;
1112                 sc->hw.phy.autoneg_advertised = 0;
1113                 if ((ifm->ifm_media & IFM_GMASK) == IFM_FDX)
1114                         sc->hw.mac.forced_speed_duplex = ADVERTISE_100_FULL;
1115                 else
1116                         sc->hw.mac.forced_speed_duplex = ADVERTISE_100_HALF;
1117                 break;
1118
1119         case IFM_10_T:
1120                 sc->hw.mac.autoneg = FALSE;
1121                 sc->hw.phy.autoneg_advertised = 0;
1122                 if ((ifm->ifm_media & IFM_GMASK) == IFM_FDX)
1123                         sc->hw.mac.forced_speed_duplex = ADVERTISE_10_FULL;
1124                 else
1125                         sc->hw.mac.forced_speed_duplex = ADVERTISE_10_HALF;
1126                 break;
1127
1128         default:
1129                 if_printf(ifp, "Unsupported media type\n");
1130                 break;
1131         }
1132
1133         igb_init(sc);
1134
1135         return 0;
1136 }
1137
1138 static void
1139 igb_set_promisc(struct igb_softc *sc)
1140 {
1141         struct ifnet *ifp = &sc->arpcom.ac_if;
1142         struct e1000_hw *hw = &sc->hw;
1143         uint32_t reg;
1144
1145         if (sc->vf_ifp) {
1146                 e1000_promisc_set_vf(hw, e1000_promisc_enabled);
1147                 return;
1148         }
1149
1150         reg = E1000_READ_REG(hw, E1000_RCTL);
1151         if (ifp->if_flags & IFF_PROMISC) {
1152                 reg |= (E1000_RCTL_UPE | E1000_RCTL_MPE);
1153                 E1000_WRITE_REG(hw, E1000_RCTL, reg);
1154         } else if (ifp->if_flags & IFF_ALLMULTI) {
1155                 reg |= E1000_RCTL_MPE;
1156                 reg &= ~E1000_RCTL_UPE;
1157                 E1000_WRITE_REG(hw, E1000_RCTL, reg);
1158         }
1159 }
1160
1161 static void
1162 igb_disable_promisc(struct igb_softc *sc)
1163 {
1164         struct e1000_hw *hw = &sc->hw;
1165         struct ifnet *ifp = &sc->arpcom.ac_if;
1166         uint32_t reg;
1167         int mcnt = 0;
1168
1169         if (sc->vf_ifp) {
1170                 e1000_promisc_set_vf(hw, e1000_promisc_disabled);
1171                 return;
1172         }
1173         reg = E1000_READ_REG(hw, E1000_RCTL);
1174         reg &= ~E1000_RCTL_UPE;
1175         if (ifp->if_flags & IFF_ALLMULTI) {
1176                 mcnt = MAX_NUM_MULTICAST_ADDRESSES;
1177         } else {
1178                 struct  ifmultiaddr *ifma;
1179                 TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1180                         if (ifma->ifma_addr->sa_family != AF_LINK)
1181                                 continue;
1182                         if (mcnt == MAX_NUM_MULTICAST_ADDRESSES)
1183                                 break;
1184                         mcnt++;
1185                 }
1186         }
1187         /* Don't disable if in MAX groups */
1188         if (mcnt < MAX_NUM_MULTICAST_ADDRESSES)
1189                 reg &= ~E1000_RCTL_MPE;
1190         E1000_WRITE_REG(hw, E1000_RCTL, reg);
1191 }
1192
1193 static void
1194 igb_set_multi(struct igb_softc *sc)
1195 {
1196         struct ifnet *ifp = &sc->arpcom.ac_if;
1197         struct ifmultiaddr *ifma;
1198         uint32_t reg_rctl = 0;
1199         uint8_t *mta;
1200         int mcnt = 0;
1201
1202         mta = sc->mta;
1203         bzero(mta, ETH_ADDR_LEN * MAX_NUM_MULTICAST_ADDRESSES);
1204
1205         TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1206                 if (ifma->ifma_addr->sa_family != AF_LINK)
1207                         continue;
1208
1209                 if (mcnt == MAX_NUM_MULTICAST_ADDRESSES)
1210                         break;
1211
1212                 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1213                     &mta[mcnt * ETH_ADDR_LEN], ETH_ADDR_LEN);
1214                 mcnt++;
1215         }
1216
1217         if (mcnt >= MAX_NUM_MULTICAST_ADDRESSES) {
1218                 reg_rctl = E1000_READ_REG(&sc->hw, E1000_RCTL);
1219                 reg_rctl |= E1000_RCTL_MPE;
1220                 E1000_WRITE_REG(&sc->hw, E1000_RCTL, reg_rctl);
1221         } else {
1222                 e1000_update_mc_addr_list(&sc->hw, mta, mcnt);
1223         }
1224 }
1225
1226 static void
1227 igb_timer(void *xsc)
1228 {
1229         struct igb_softc *sc = xsc;
1230
1231         lwkt_serialize_enter(&sc->main_serialize);
1232
1233         igb_update_link_status(sc);
1234         igb_update_stats_counters(sc);
1235
1236         callout_reset_bycpu(&sc->timer, hz, igb_timer, sc, sc->timer_cpuid);
1237
1238         lwkt_serialize_exit(&sc->main_serialize);
1239 }
1240
1241 static void
1242 igb_update_link_status(struct igb_softc *sc)
1243 {
1244         struct ifnet *ifp = &sc->arpcom.ac_if;
1245         struct e1000_hw *hw = &sc->hw;
1246         uint32_t link_check, thstat, ctrl;
1247
1248         link_check = thstat = ctrl = 0;
1249
1250         /* Get the cached link value or read for real */
1251         switch (hw->phy.media_type) {
1252         case e1000_media_type_copper:
1253                 if (hw->mac.get_link_status) {
1254                         /* Do the work to read phy */
1255                         e1000_check_for_link(hw);
1256                         link_check = !hw->mac.get_link_status;
1257                 } else {
1258                         link_check = TRUE;
1259                 }
1260                 break;
1261
1262         case e1000_media_type_fiber:
1263                 e1000_check_for_link(hw);
1264                 link_check = E1000_READ_REG(hw, E1000_STATUS) & E1000_STATUS_LU;
1265                 break;
1266
1267         case e1000_media_type_internal_serdes:
1268                 e1000_check_for_link(hw);
1269                 link_check = hw->mac.serdes_has_link;
1270                 break;
1271
1272         /* VF device is type_unknown */
1273         case e1000_media_type_unknown:
1274                 e1000_check_for_link(hw);
1275                 link_check = !hw->mac.get_link_status;
1276                 /* Fall thru */
1277         default:
1278                 break;
1279         }
1280
1281         /* Check for thermal downshift or shutdown */
1282         if (hw->mac.type == e1000_i350) {
1283                 thstat = E1000_READ_REG(hw, E1000_THSTAT);
1284                 ctrl = E1000_READ_REG(hw, E1000_CTRL_EXT);
1285         }
1286
1287         /* Now we check if a transition has happened */
1288         if (link_check && sc->link_active == 0) {
1289                 e1000_get_speed_and_duplex(hw, 
1290                     &sc->link_speed, &sc->link_duplex);
1291                 if (bootverbose) {
1292                         const char *flowctl;
1293
1294                         /* Get the flow control for display */
1295                         switch (hw->fc.current_mode) {
1296                         case e1000_fc_rx_pause:
1297                                 flowctl = "RX";
1298                                 break;
1299
1300                         case e1000_fc_tx_pause:
1301                                 flowctl = "TX";
1302                                 break;
1303
1304                         case e1000_fc_full:
1305                                 flowctl = "Full";
1306                                 break;
1307
1308                         default:
1309                                 flowctl = "None";
1310                                 break;
1311                         }
1312
1313                         if_printf(ifp, "Link is up %d Mbps %s, "
1314                             "Flow control: %s\n",
1315                             sc->link_speed,
1316                             sc->link_duplex == FULL_DUPLEX ?
1317                             "Full Duplex" : "Half Duplex",
1318                             flowctl);
1319                 }
1320                 sc->link_active = 1;
1321
1322                 ifp->if_baudrate = sc->link_speed * 1000000;
1323                 if ((ctrl & E1000_CTRL_EXT_LINK_MODE_GMII) &&
1324                     (thstat & E1000_THSTAT_LINK_THROTTLE))
1325                         if_printf(ifp, "Link: thermal downshift\n");
1326                 /* Delay Link Up for Phy update */
1327                 if ((hw->mac.type == e1000_i210 ||
1328                      hw->mac.type == e1000_i211) &&
1329                     hw->phy.id == I210_I_PHY_ID)
1330                         msec_delay(IGB_I210_LINK_DELAY);
1331                 /* This can sleep */
1332                 ifp->if_link_state = LINK_STATE_UP;
1333                 if_link_state_change(ifp);
1334         } else if (!link_check && sc->link_active == 1) {
1335                 ifp->if_baudrate = sc->link_speed = 0;
1336                 sc->link_duplex = 0;
1337                 if (bootverbose)
1338                         if_printf(ifp, "Link is Down\n");
1339                 if ((ctrl & E1000_CTRL_EXT_LINK_MODE_GMII) &&
1340                     (thstat & E1000_THSTAT_PWR_DOWN))
1341                         if_printf(ifp, "Link: thermal shutdown\n");
1342                 sc->link_active = 0;
1343                 /* This can sleep */
1344                 ifp->if_link_state = LINK_STATE_DOWN;
1345                 if_link_state_change(ifp);
1346         }
1347 }
1348
1349 static void
1350 igb_stop(struct igb_softc *sc)
1351 {
1352         struct ifnet *ifp = &sc->arpcom.ac_if;
1353         int i;
1354
1355         ASSERT_IFNET_SERIALIZED_ALL(ifp);
1356
1357         igb_disable_intr(sc);
1358
1359         callout_stop(&sc->timer);
1360
1361         ifp->if_flags &= ~IFF_RUNNING;
1362         for (i = 0; i < sc->tx_ring_cnt; ++i) {
1363                 ifsq_clr_oactive(sc->tx_rings[i].ifsq);
1364                 ifsq_watchdog_stop(&sc->tx_rings[i].tx_watchdog);
1365                 sc->tx_rings[i].tx_flags &= ~IGB_TXFLAG_ENABLED;
1366         }
1367
1368         e1000_reset_hw(&sc->hw);
1369         E1000_WRITE_REG(&sc->hw, E1000_WUC, 0);
1370
1371         e1000_led_off(&sc->hw);
1372         e1000_cleanup_led(&sc->hw);
1373
1374         for (i = 0; i < sc->tx_ring_cnt; ++i)
1375                 igb_free_tx_ring(&sc->tx_rings[i]);
1376         for (i = 0; i < sc->rx_ring_cnt; ++i)
1377                 igb_free_rx_ring(&sc->rx_rings[i]);
1378 }
1379
1380 static void
1381 igb_reset(struct igb_softc *sc)
1382 {
1383         struct ifnet *ifp = &sc->arpcom.ac_if;
1384         struct e1000_hw *hw = &sc->hw;
1385         struct e1000_fc_info *fc = &hw->fc;
1386         uint32_t pba = 0;
1387         uint16_t hwm;
1388
1389         /* Let the firmware know the OS is in control */
1390         igb_get_hw_control(sc);
1391
1392         /*
1393          * Packet Buffer Allocation (PBA)
1394          * Writing PBA sets the receive portion of the buffer;
1395          * the remainder is used for the transmit buffer.
1396          */
1397         switch (hw->mac.type) {
1398         case e1000_82575:
1399                 pba = E1000_PBA_32K;
1400                 break;
1401
1402         case e1000_82576:
1403         case e1000_vfadapt:
1404                 pba = E1000_READ_REG(hw, E1000_RXPBS);
1405                 pba &= E1000_RXPBS_SIZE_MASK_82576;
1406                 break;
1407
1408         case e1000_82580:
1409         case e1000_i350:
1410         case e1000_i354:
1411         case e1000_vfadapt_i350:
1412                 pba = E1000_READ_REG(hw, E1000_RXPBS);
1413                 pba = e1000_rxpbs_adjust_82580(pba);
1414                 break;
1415
1416         case e1000_i210:
1417         case e1000_i211:
1418                 pba = E1000_PBA_34K;
1419                 break;
1420
1421         default:
1422                 break;
1423         }
1424
1425         /* Special needs in case of Jumbo frames */
1426         if (hw->mac.type == e1000_82575 && ifp->if_mtu > ETHERMTU) {
1427                 uint32_t tx_space, min_tx, min_rx;
1428
1429                 pba = E1000_READ_REG(hw, E1000_PBA);
1430                 tx_space = pba >> 16;
1431                 pba &= 0xffff;
1432
1433                 min_tx = (sc->max_frame_size +
1434                     sizeof(struct e1000_tx_desc) - ETHER_CRC_LEN) * 2;
1435                 min_tx = roundup2(min_tx, 1024);
1436                 min_tx >>= 10;
1437                 min_rx = sc->max_frame_size;
1438                 min_rx = roundup2(min_rx, 1024);
1439                 min_rx >>= 10;
1440                 if (tx_space < min_tx && (min_tx - tx_space) < pba) {
1441                         pba = pba - (min_tx - tx_space);
1442                         /*
1443                          * if short on rx space, rx wins
1444                          * and must trump tx adjustment
1445                          */
1446                         if (pba < min_rx)
1447                                 pba = min_rx;
1448                 }
1449                 E1000_WRITE_REG(hw, E1000_PBA, pba);
1450         }
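        /*
         * Worked example of the rebalance above (assumed values, for
         * illustration): with a 9000-byte MTU, max_frame_size is 9018, so
         * min_tx = roundup2((9018 + 16 - 4) * 2, 1024) >> 10 = 18 (KB) and
         * min_rx = roundup2(9018, 1024) >> 10 = 9 (KB); the RX portion only
         * gives up space to TX down to min_rx KB.
         */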
1451
1452         /*
1453          * These parameters control the automatic generation (Tx) and
1454          * response (Rx) to Ethernet PAUSE frames.
1455          * - High water mark should allow for at least two frames to be
1456          *   received after sending an XOFF.
1457          * - Low water mark works best when it is very near the high water mark.
1458          *   This allows the receiver to restart by sending XON when it has
1459          *   drained a bit.
1460          */
1461         hwm = min(((pba << 10) * 9 / 10),
1462             ((pba << 10) - 2 * sc->max_frame_size));
1463
1464         if (hw->mac.type < e1000_82576) {
1465                 fc->high_water = hwm & 0xFFF8; /* 8-byte granularity */
1466                 fc->low_water = fc->high_water - 8;
1467         } else {
1468                 fc->high_water = hwm & 0xFFF0; /* 16-byte granularity */
1469                 fc->low_water = fc->high_water - 16;
1470         }
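        /*
         * Worked example (assumed values, for illustration): with a 34KB PBA
         * (i210/i211) and the default 1518-byte max frame,
         * hwm = min(34816 * 9 / 10, 34816 - 2 * 1518) = 31334; masked to
         * 16-byte granularity this gives high_water = 31328 and
         * low_water = 31312.
         */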
1471         fc->pause_time = IGB_FC_PAUSE_TIME;
1472         fc->send_xon = TRUE;
1473         fc->requested_mode = sc->flow_ctrl;
1474
1475         /* Issue a global reset */
1476         e1000_reset_hw(hw);
1477         E1000_WRITE_REG(hw, E1000_WUC, 0);
1478
1479         if (e1000_init_hw(hw) < 0)
1480                 if_printf(ifp, "Hardware Initialization Failed\n");
1481
1482         /* Setup DMA Coalescing */
1483         if (hw->mac.type > e1000_82580 && hw->mac.type != e1000_i211) {
1484                 uint32_t dmac;
1485                 uint32_t reg;
1486
1487                 if (sc->dma_coalesce == 0) {
1488                         /*
1489                          * Disabled
1490                          */
1491                         reg = E1000_READ_REG(hw, E1000_DMACR);
1492                         reg &= ~E1000_DMACR_DMAC_EN;
1493                         E1000_WRITE_REG(hw, E1000_DMACR, reg);
1494                         goto reset_out;
1495                 }
1496
1497                 /* Set starting thresholds */
1498                 E1000_WRITE_REG(hw, E1000_DMCTXTH, 0);
1499                 E1000_WRITE_REG(hw, E1000_DMCRTRH, 0);
1500
1501                 hwm = 64 * pba - sc->max_frame_size / 16;
1502                 if (hwm < 64 * (pba - 6))
1503                         hwm = 64 * (pba - 6);
1504                 reg = E1000_READ_REG(hw, E1000_FCRTC);
1505                 reg &= ~E1000_FCRTC_RTH_COAL_MASK;
1506                 reg |= ((hwm << E1000_FCRTC_RTH_COAL_SHIFT)
1507                     & E1000_FCRTC_RTH_COAL_MASK);
1508                 E1000_WRITE_REG(hw, E1000_FCRTC, reg);
1509
1510                 dmac = pba - sc->max_frame_size / 512;
1511                 if (dmac < pba - 10)
1512                         dmac = pba - 10;
1513                 reg = E1000_READ_REG(hw, E1000_DMACR);
1514                 reg &= ~E1000_DMACR_DMACTHR_MASK;
1515                 reg |= ((dmac << E1000_DMACR_DMACTHR_SHIFT)
1516                     & E1000_DMACR_DMACTHR_MASK);
1517                 /* Transition to L0s or L1 if available. */
1518                 reg |= (E1000_DMACR_DMAC_EN | E1000_DMACR_DMAC_LX_MASK);
1519                 /* timer = value in sc->dma_coalesce in 32usec intervals */
1520                 reg |= (sc->dma_coalesce >> 5);
1521                 E1000_WRITE_REG(hw, E1000_DMACR, reg);
1522
1523                 /* Set the interval before transition */
1524                 reg = E1000_READ_REG(hw, E1000_DMCTLX);
1525                 reg |= 0x80000004;
1526                 E1000_WRITE_REG(hw, E1000_DMCTLX, reg);
1527
1528                 /* Free space in tx packet buffer to wake from DMA coal */
1529                 E1000_WRITE_REG(hw, E1000_DMCTXTH,
1530                     (20480 - (2 * sc->max_frame_size)) >> 6);
1531
1532                 /* Make low power state decision controlled by DMA coal */
1533                 reg = E1000_READ_REG(hw, E1000_PCIEMISC);
1534                 reg &= ~E1000_PCIEMISC_LX_DECISION;
1535                 E1000_WRITE_REG(hw, E1000_PCIEMISC, reg);
1536                 if_printf(ifp, "DMA Coalescing enabled\n");
1537         } else if (hw->mac.type == e1000_82580) {
1538                 uint32_t reg = E1000_READ_REG(hw, E1000_PCIEMISC);
1539
1540                 E1000_WRITE_REG(hw, E1000_DMACR, 0);
1541                 E1000_WRITE_REG(hw, E1000_PCIEMISC,
1542                     reg & ~E1000_PCIEMISC_LX_DECISION);
1543         }
1544
1545 reset_out:
1546         E1000_WRITE_REG(&sc->hw, E1000_VET, ETHERTYPE_VLAN);
1547         e1000_get_phy_info(hw);
1548         e1000_check_for_link(hw);
1549 }
1550
1551 static void
1552 igb_setup_ifp(struct igb_softc *sc)
1553 {
1554         struct ifnet *ifp = &sc->arpcom.ac_if;
1555         int i;
1556
1557         ifp->if_softc = sc;
1558         ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
1559         ifp->if_init = igb_init;
1560         ifp->if_ioctl = igb_ioctl;
1561         ifp->if_start = igb_start;
1562         ifp->if_serialize = igb_serialize;
1563         ifp->if_deserialize = igb_deserialize;
1564         ifp->if_tryserialize = igb_tryserialize;
1565 #ifdef INVARIANTS
1566         ifp->if_serialize_assert = igb_serialize_assert;
1567 #endif
1568 #ifdef IFPOLL_ENABLE
1569         ifp->if_npoll = igb_npoll;
1570 #endif
1571
1572         ifp->if_nmbclusters = sc->rx_ring_cnt * sc->rx_rings[0].num_rx_desc;
1573
1574         ifq_set_maxlen(&ifp->if_snd, sc->tx_rings[0].num_tx_desc - 1);
1575         ifq_set_ready(&ifp->if_snd);
1576         ifq_set_subq_cnt(&ifp->if_snd, sc->tx_ring_cnt);
1577
1578         ifp->if_mapsubq = ifq_mapsubq_mask;
1579         ifq_set_subq_mask(&ifp->if_snd, 0);
1580
1581         ether_ifattach(ifp, sc->hw.mac.addr, NULL);
1582
1583         ifp->if_capabilities =
1584             IFCAP_HWCSUM | IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_TSO;
1585         if (IGB_ENABLE_HWRSS(sc))
1586                 ifp->if_capabilities |= IFCAP_RSS;
1587         ifp->if_capenable = ifp->if_capabilities;
1588         ifp->if_hwassist = IGB_CSUM_FEATURES | CSUM_TSO;
1589
1590         /*
1591          * Tell the upper layer(s) we support long frames
1592          */
1593         ifp->if_data.ifi_hdrlen = sizeof(struct ether_vlan_header);
1594
1595         /* Setup TX rings and subqueues */
1596         for (i = 0; i < sc->tx_ring_cnt; ++i) {
1597                 struct ifaltq_subque *ifsq = ifq_get_subq(&ifp->if_snd, i);
1598                 struct igb_tx_ring *txr = &sc->tx_rings[i];
1599
1600                 ifsq_set_cpuid(ifsq, txr->tx_intr_cpuid);
1601                 ifsq_set_priv(ifsq, txr);
1602                 ifsq_set_hw_serialize(ifsq, &txr->tx_serialize);
1603                 txr->ifsq = ifsq;
1604
1605                 ifsq_watchdog_init(&txr->tx_watchdog, ifsq, igb_watchdog);
1606         }
1607
1608         /*
1609          * Specify the media types supported by this adapter and register
1610          * callbacks to update media and link information
1611          */
1612         ifmedia_init(&sc->media, IFM_IMASK, igb_media_change, igb_media_status);
1613         if (sc->hw.phy.media_type == e1000_media_type_fiber ||
1614             sc->hw.phy.media_type == e1000_media_type_internal_serdes) {
1615                 ifmedia_add(&sc->media, IFM_ETHER | IFM_1000_SX | IFM_FDX,
1616                     0, NULL);
1617                 ifmedia_add(&sc->media, IFM_ETHER | IFM_1000_SX, 0, NULL);
1618         } else {
1619                 ifmedia_add(&sc->media, IFM_ETHER | IFM_10_T, 0, NULL);
1620                 ifmedia_add(&sc->media, IFM_ETHER | IFM_10_T | IFM_FDX,
1621                     0, NULL);
1622                 ifmedia_add(&sc->media, IFM_ETHER | IFM_100_TX, 0, NULL);
1623                 ifmedia_add(&sc->media, IFM_ETHER | IFM_100_TX | IFM_FDX,
1624                     0, NULL);
1625                 if (sc->hw.phy.type != e1000_phy_ife) {
1626                         ifmedia_add(&sc->media,
1627                             IFM_ETHER | IFM_1000_T | IFM_FDX, 0, NULL);
1628                         ifmedia_add(&sc->media,
1629                             IFM_ETHER | IFM_1000_T, 0, NULL);
1630                 }
1631         }
1632         ifmedia_add(&sc->media, IFM_ETHER | IFM_AUTO, 0, NULL);
1633         ifmedia_set(&sc->media, IFM_ETHER | IFM_AUTO);
1634 }
1635
1636 static void
1637 igb_add_sysctl(struct igb_softc *sc)
1638 {
1639         struct sysctl_ctx_list *ctx;
1640         struct sysctl_oid *tree;
1641         char node[32];
1642         int i;
1643
1644         ctx = device_get_sysctl_ctx(sc->dev);
1645         tree = device_get_sysctl_tree(sc->dev);
1646         SYSCTL_ADD_INT(ctx, SYSCTL_CHILDREN(tree),
1647             OID_AUTO, "rxr", CTLFLAG_RD, &sc->rx_ring_cnt, 0, "# of RX rings");
1648         SYSCTL_ADD_INT(ctx, SYSCTL_CHILDREN(tree),
1649             OID_AUTO, "rxr_inuse", CTLFLAG_RD, &sc->rx_ring_inuse, 0,
1650             "# of RX rings used");
1651         SYSCTL_ADD_INT(ctx, SYSCTL_CHILDREN(tree),
1652             OID_AUTO, "txr", CTLFLAG_RD, &sc->tx_ring_cnt, 0, "# of TX rings");
1653         SYSCTL_ADD_INT(ctx, SYSCTL_CHILDREN(tree),
1654             OID_AUTO, "txr_inuse", CTLFLAG_RD, &sc->tx_ring_inuse, 0,
1655             "# of TX rings used");
1656         SYSCTL_ADD_INT(ctx, SYSCTL_CHILDREN(tree),
1657             OID_AUTO, "rxd", CTLFLAG_RD, &sc->rx_rings[0].num_rx_desc, 0,
1658             "# of RX descs");
1659         SYSCTL_ADD_INT(ctx, SYSCTL_CHILDREN(tree),
1660             OID_AUTO, "txd", CTLFLAG_RD, &sc->tx_rings[0].num_tx_desc, 0,
1661             "# of TX descs");
1662
1663         if (sc->intr_type != PCI_INTR_TYPE_MSIX) {
1664                 SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(tree),
1665                     OID_AUTO, "intr_rate", CTLTYPE_INT | CTLFLAG_RW,
1666                     sc, 0, igb_sysctl_intr_rate, "I", "interrupt rate");
1667         } else {
1668                 for (i = 0; i < sc->msix_cnt; ++i) {
1669                         struct igb_msix_data *msix = &sc->msix_data[i];
1670
1671                         ksnprintf(node, sizeof(node), "msix%d_rate", i);
1672                         SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(tree),
1673                             OID_AUTO, node, CTLTYPE_INT | CTLFLAG_RW,
1674                             msix, 0, igb_sysctl_msix_rate, "I",
1675                             msix->msix_rate_desc);
1676                 }
1677         }
1678
1679         SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(tree),
1680             OID_AUTO, "tx_intr_nsegs", CTLTYPE_INT | CTLFLAG_RW,
1681             sc, 0, igb_sysctl_tx_intr_nsegs, "I",
1682             "# of segments per TX interrupt");
1683
1684         SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(tree),
1685             OID_AUTO, "tx_wreg_nsegs", CTLTYPE_INT | CTLFLAG_RW,
1686             sc, 0, igb_sysctl_tx_wreg_nsegs, "I",
1687             "# of segments sent before write to hardware register");
1688
1689         SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(tree),
1690             OID_AUTO, "rx_wreg_nsegs", CTLTYPE_INT | CTLFLAG_RW,
1691             sc, 0, igb_sysctl_rx_wreg_nsegs, "I",
1692             "# of segments received before write to hardware register");
1693
1694         SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(tree),
1695             OID_AUTO, "flow_ctrl", CTLTYPE_STRING|CTLFLAG_RW, sc, 0,
1696             igb_sysctl_flowctrl, "A",
1697             "flow control: full, rx_pause, tx_pause, none");
1698
1699 #ifdef IFPOLL_ENABLE
1700         SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(tree),
1701             OID_AUTO, "npoll_rxoff", CTLTYPE_INT|CTLFLAG_RW,
1702             sc, 0, igb_sysctl_npoll_rxoff, "I", "NPOLLING RX cpu offset");
1703         SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(tree),
1704             OID_AUTO, "npoll_txoff", CTLTYPE_INT|CTLFLAG_RW,
1705             sc, 0, igb_sysctl_npoll_txoff, "I", "NPOLLING TX cpu offset");
1706 #endif
1707
1708 #ifdef IGB_RSS_DEBUG
1709         SYSCTL_ADD_INT(ctx, SYSCTL_CHILDREN(tree),
1710             OID_AUTO, "rss_debug", CTLFLAG_RW, &sc->rss_debug, 0,
1711             "RSS debug level");
1712         for (i = 0; i < sc->rx_ring_cnt; ++i) {
1713                 ksnprintf(node, sizeof(node), "rx%d_pkt", i);
1714                 SYSCTL_ADD_ULONG(ctx,
1715                     SYSCTL_CHILDREN(tree), OID_AUTO, node,
1716                     CTLFLAG_RW, &sc->rx_rings[i].rx_packets, "RXed packets");
1717         }
1718 #endif
1719 #ifdef IGB_TSS_DEBUG
1720         for  (i = 0; i < sc->tx_ring_cnt; ++i) {
1721                 ksnprintf(node, sizeof(node), "tx%d_pkt", i);
1722                 SYSCTL_ADD_ULONG(ctx,
1723                     SYSCTL_CHILDREN(tree), OID_AUTO, node,
1724                     CTLFLAG_RW, &sc->tx_rings[i].tx_packets, "TXed packets");
1725         }
1726 #endif
1727 }
1728
1729 static int
1730 igb_alloc_rings(struct igb_softc *sc)
1731 {
1732         int error, i;
1733
1734         /*
1735          * Create top level busdma tag
1736          */
1737         error = bus_dma_tag_create(NULL, 1, 0,
1738             BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL,
1739             BUS_SPACE_MAXSIZE_32BIT, 0, BUS_SPACE_MAXSIZE_32BIT, 0,
1740             &sc->parent_tag);
1741         if (error) {
1742                 device_printf(sc->dev, "could not create top level DMA tag\n");
1743                 return error;
1744         }
1745
1746         /*
1747          * Allocate TX descriptor rings and buffers
1748          */
1749         sc->tx_rings = kmalloc_cachealign(
1750             sizeof(struct igb_tx_ring) * sc->tx_ring_cnt,
1751             M_DEVBUF, M_WAITOK | M_ZERO);
1752         for (i = 0; i < sc->tx_ring_cnt; ++i) {
1753                 struct igb_tx_ring *txr = &sc->tx_rings[i];
1754
1755                 /* Set up some basics */
1756                 txr->sc = sc;
1757                 txr->me = i;
1758                 lwkt_serialize_init(&txr->tx_serialize);
1759
1760                 error = igb_create_tx_ring(txr);
1761                 if (error)
1762                         return error;
1763         }
1764
1765         /*
1766          * Allocate RX descriptor rings and buffers
1767          */ 
1768         sc->rx_rings = kmalloc_cachealign(
1769             sizeof(struct igb_rx_ring) * sc->rx_ring_cnt,
1770             M_DEVBUF, M_WAITOK | M_ZERO);
1771         for (i = 0; i < sc->rx_ring_cnt; ++i) {
1772                 struct igb_rx_ring *rxr = &sc->rx_rings[i];
1773
1774                 /* Set up some basics */
1775                 rxr->sc = sc;
1776                 rxr->me = i;
1777                 lwkt_serialize_init(&rxr->rx_serialize);
1778
1779                 error = igb_create_rx_ring(rxr);
1780                 if (error)
1781                         return error;
1782         }
1783
1784         return 0;
1785 }
1786
1787 static void
1788 igb_free_rings(struct igb_softc *sc)
1789 {
1790         int i;
1791
1792         if (sc->tx_rings != NULL) {
1793                 for (i = 0; i < sc->tx_ring_cnt; ++i) {
1794                         struct igb_tx_ring *txr = &sc->tx_rings[i];
1795
1796                         igb_destroy_tx_ring(txr, txr->num_tx_desc);
1797                 }
1798                 kfree(sc->tx_rings, M_DEVBUF);
1799         }
1800
1801         if (sc->rx_rings != NULL) {
1802                 for (i = 0; i < sc->rx_ring_cnt; ++i) {
1803                         struct igb_rx_ring *rxr = &sc->rx_rings[i];
1804
1805                         igb_destroy_rx_ring(rxr, rxr->num_rx_desc);
1806                 }
1807                 kfree(sc->rx_rings, M_DEVBUF);
1808         }
1809 }
1810
1811 static int
1812 igb_create_tx_ring(struct igb_tx_ring *txr)
1813 {
1814         int tsize, error, i, ntxd;
1815
1816         /*
1817          * Validate the number of transmit descriptors.  It must not exceed
1818          * the hardware maximum, and the ring size must be a multiple of IGB_DBA_ALIGN.
1819          */
1820         ntxd = device_getenv_int(txr->sc->dev, "txd", igb_txd);
1821         if ((ntxd * sizeof(struct e1000_tx_desc)) % IGB_DBA_ALIGN != 0 ||
1822             ntxd > IGB_MAX_TXD || ntxd < IGB_MIN_TXD) {
1823                 device_printf(txr->sc->dev,
1824                     "Using %d TX descriptors instead of %d!\n",
1825                     IGB_DEFAULT_TXD, ntxd);
1826                 txr->num_tx_desc = IGB_DEFAULT_TXD;
1827         } else {
1828                 txr->num_tx_desc = ntxd;
1829         }
1830
1831         /*
1832          * Allocate TX descriptor ring
1833          */
1834         tsize = roundup2(txr->num_tx_desc * sizeof(union e1000_adv_tx_desc),
1835             IGB_DBA_ALIGN);
1836         txr->txdma.dma_vaddr = bus_dmamem_coherent_any(txr->sc->parent_tag,
1837             IGB_DBA_ALIGN, tsize, BUS_DMA_WAITOK,
1838             &txr->txdma.dma_tag, &txr->txdma.dma_map, &txr->txdma.dma_paddr);
1839         if (txr->txdma.dma_vaddr == NULL) {
1840                 device_printf(txr->sc->dev,
1841                     "Unable to allocate TX Descriptor memory\n");
1842                 return ENOMEM;
1843         }
1844         txr->tx_base = txr->txdma.dma_vaddr;
1845         bzero(txr->tx_base, tsize);
1846
1847         tsize = __VM_CACHELINE_ALIGN(
1848             sizeof(struct igb_tx_buf) * txr->num_tx_desc);
1849         txr->tx_buf = kmalloc_cachealign(tsize, M_DEVBUF, M_WAITOK | M_ZERO);
1850
1851         /*
1852          * Allocate TX head write-back buffer
1853          */
1854         txr->tx_hdr = bus_dmamem_coherent_any(txr->sc->parent_tag,
1855             __VM_CACHELINE_SIZE, __VM_CACHELINE_SIZE, BUS_DMA_WAITOK,
1856             &txr->tx_hdr_dtag, &txr->tx_hdr_dmap, &txr->tx_hdr_paddr);
1857         if (txr->tx_hdr == NULL) {
1858                 device_printf(txr->sc->dev,
1859                     "Unable to allocate TX head write-back buffer\n");
1860                 return ENOMEM;
1861         }
1862
1863         /*
1864          * Create DMA tag for TX buffers
1865          */
1866         error = bus_dma_tag_create(txr->sc->parent_tag,
1867             1, 0,               /* alignment, bounds */
1868             BUS_SPACE_MAXADDR,  /* lowaddr */
1869             BUS_SPACE_MAXADDR,  /* highaddr */
1870             NULL, NULL,         /* filter, filterarg */
1871             IGB_TSO_SIZE,       /* maxsize */
1872             IGB_MAX_SCATTER,    /* nsegments */
1873             PAGE_SIZE,          /* maxsegsize */
1874             BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW |
1875             BUS_DMA_ONEBPAGE,   /* flags */
1876             &txr->tx_tag);
1877         if (error) {
1878                 device_printf(txr->sc->dev, "Unable to allocate TX DMA tag\n");
1879                 kfree(txr->tx_buf, M_DEVBUF);
1880                 txr->tx_buf = NULL;
1881                 return error;
1882         }
1883
1884         /*
1885          * Create DMA maps for TX buffers
1886          */
1887         for (i = 0; i < txr->num_tx_desc; ++i) {
1888                 struct igb_tx_buf *txbuf = &txr->tx_buf[i];
1889
1890                 error = bus_dmamap_create(txr->tx_tag,
1891                     BUS_DMA_WAITOK | BUS_DMA_ONEBPAGE, &txbuf->map);
1892                 if (error) {
1893                         device_printf(txr->sc->dev,
1894                             "Unable to create TX DMA map\n");
1895                         igb_destroy_tx_ring(txr, i);
1896                         return error;
1897                 }
1898         }
1899
1900         if (txr->sc->hw.mac.type == e1000_82575)
1901                 txr->tx_flags |= IGB_TXFLAG_TSO_IPLEN0;
1902
1903         /*
1904          * Initialize various watermarks
1905          */
1906         txr->spare_desc = IGB_TX_SPARE;
1907         txr->intr_nsegs = txr->num_tx_desc / 16;
1908         txr->wreg_nsegs = IGB_DEF_TXWREG_NSEGS;
1909         txr->oact_hi_desc = txr->num_tx_desc / 2;
1910         txr->oact_lo_desc = txr->num_tx_desc / 8;
1911         if (txr->oact_lo_desc > IGB_TX_OACTIVE_MAX)
1912                 txr->oact_lo_desc = IGB_TX_OACTIVE_MAX;
1913         if (txr->oact_lo_desc < txr->spare_desc + IGB_TX_RESERVED)
1914                 txr->oact_lo_desc = txr->spare_desc + IGB_TX_RESERVED;
1915
1916         return 0;
1917 }
1918
1919 static void
1920 igb_free_tx_ring(struct igb_tx_ring *txr)
1921 {
1922         int i;
1923
1924         for (i = 0; i < txr->num_tx_desc; ++i) {
1925                 struct igb_tx_buf *txbuf = &txr->tx_buf[i];
1926
1927                 if (txbuf->m_head != NULL) {
1928                         bus_dmamap_unload(txr->tx_tag, txbuf->map);
1929                         m_freem(txbuf->m_head);
1930                         txbuf->m_head = NULL;
1931                 }
1932         }
1933 }
1934
1935 static void
1936 igb_destroy_tx_ring(struct igb_tx_ring *txr, int ndesc)
1937 {
1938         int i;
1939
1940         if (txr->txdma.dma_vaddr != NULL) {
1941                 bus_dmamap_unload(txr->txdma.dma_tag, txr->txdma.dma_map);
1942                 bus_dmamem_free(txr->txdma.dma_tag, txr->txdma.dma_vaddr,
1943                     txr->txdma.dma_map);
1944                 bus_dma_tag_destroy(txr->txdma.dma_tag);
1945                 txr->txdma.dma_vaddr = NULL;
1946         }
1947
1948         if (txr->tx_hdr != NULL) {
1949                 bus_dmamap_unload(txr->tx_hdr_dtag, txr->tx_hdr_dmap);
1950                 bus_dmamem_free(txr->tx_hdr_dtag, txr->tx_hdr,
1951                     txr->tx_hdr_dmap);
1952                 bus_dma_tag_destroy(txr->tx_hdr_dtag);
1953                 txr->tx_hdr = NULL;
1954         }
1955
1956         if (txr->tx_buf == NULL)
1957                 return;
1958
1959         for (i = 0; i < ndesc; ++i) {
1960                 struct igb_tx_buf *txbuf = &txr->tx_buf[i];
1961
1962                 KKASSERT(txbuf->m_head == NULL);
1963                 bus_dmamap_destroy(txr->tx_tag, txbuf->map);
1964         }
1965         bus_dma_tag_destroy(txr->tx_tag);
1966
1967         kfree(txr->tx_buf, M_DEVBUF);
1968         txr->tx_buf = NULL;
1969 }
1970
1971 static void
1972 igb_init_tx_ring(struct igb_tx_ring *txr)
1973 {
1974         /* Clear the old descriptor contents */
1975         bzero(txr->tx_base,
1976             sizeof(union e1000_adv_tx_desc) * txr->num_tx_desc);
1977
1978         /* Clear TX head write-back buffer */
1979         *(txr->tx_hdr) = 0;
1980
1981         /* Reset indices */
1982         txr->next_avail_desc = 0;
1983         txr->next_to_clean = 0;
1984         txr->tx_nsegs = 0;
1985
1986         /* Set number of descriptors available */
1987         txr->tx_avail = txr->num_tx_desc;
1988
1989         /* Enable this TX ring */
1990         txr->tx_flags |= IGB_TXFLAG_ENABLED;
1991 }
1992
1993 static void
1994 igb_init_tx_unit(struct igb_softc *sc)
1995 {
1996         struct e1000_hw *hw = &sc->hw;
1997         uint32_t tctl;
1998         int i;
1999
2000         /* Setup the Tx Descriptor Rings */
2001         for (i = 0; i < sc->tx_ring_inuse; ++i) {
2002                 struct igb_tx_ring *txr = &sc->tx_rings[i];
2003                 uint64_t bus_addr = txr->txdma.dma_paddr;
2004                 uint64_t hdr_paddr = txr->tx_hdr_paddr;
2005                 uint32_t txdctl = 0;
2006                 uint32_t dca_txctrl;
2007
2008                 E1000_WRITE_REG(hw, E1000_TDLEN(i),
2009                     txr->num_tx_desc * sizeof(struct e1000_tx_desc));
2010                 E1000_WRITE_REG(hw, E1000_TDBAH(i),
2011                     (uint32_t)(bus_addr >> 32));
2012                 E1000_WRITE_REG(hw, E1000_TDBAL(i),
2013                     (uint32_t)bus_addr);
2014
2015                 /* Setup the HW Tx Head and Tail descriptor pointers */
2016                 E1000_WRITE_REG(hw, E1000_TDT(i), 0);
2017                 E1000_WRITE_REG(hw, E1000_TDH(i), 0);
2018
2019                 dca_txctrl = E1000_READ_REG(hw, E1000_DCA_TXCTRL(i));
2020                 dca_txctrl &= ~E1000_DCA_TXCTRL_TX_WB_RO_EN;
2021                 E1000_WRITE_REG(hw, E1000_DCA_TXCTRL(i), dca_txctrl);
2022
2023                 /*
2024                  * Don't set WB_on_EITR:
2025                  * - 82575 does not have it
2026                  * - It almost has no effect on 82576, see:
2027                  *   82576 specification update errata #26
2028                  * - It causes unnecessary bus traffic
2029                  */
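                     /*
                      * Program the TX head write-back address; setting
                      * E1000_TX_HEAD_WB_ENABLE in the low bits of TDWBAL
                      * enables the feature, after which the hardware
                      * reports completed descriptors by writing an index
                      * into tx_hdr, which igb_txeof() consumes instead of
                      * polling the descriptors' DD bits.
                      */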
2030                 E1000_WRITE_REG(hw, E1000_TDWBAH(i),
2031                     (uint32_t)(hdr_paddr >> 32));
2032                 E1000_WRITE_REG(hw, E1000_TDWBAL(i),
2033                     ((uint32_t)hdr_paddr) | E1000_TX_HEAD_WB_ENABLE);
2034
2035                 /*
2036                  * WTHRESH is ignored by the hardware, since header
2037                  * write back mode is used.
2038                  */
2039                 txdctl |= IGB_TX_PTHRESH;
2040                 txdctl |= IGB_TX_HTHRESH << 8;
2041                 txdctl |= IGB_TX_WTHRESH << 16;
2042                 txdctl |= E1000_TXDCTL_QUEUE_ENABLE;
2043                 E1000_WRITE_REG(hw, E1000_TXDCTL(i), txdctl);
2044         }
2045
2046         if (sc->vf_ifp)
2047                 return;
2048
2049         e1000_config_collision_dist(hw);
2050
2051         /* Program the Transmit Control Register */
2052         tctl = E1000_READ_REG(hw, E1000_TCTL);
2053         tctl &= ~E1000_TCTL_CT;
2054         tctl |= (E1000_TCTL_PSP | E1000_TCTL_RTLC | E1000_TCTL_EN |
2055             (E1000_COLLISION_THRESHOLD << E1000_CT_SHIFT));
2056
2057         /* This write will effectively turn on the transmit unit. */
2058         E1000_WRITE_REG(hw, E1000_TCTL, tctl);
2059 }
2060
2061 static boolean_t
2062 igb_txcsum_ctx(struct igb_tx_ring *txr, struct mbuf *mp)
2063 {
2064         struct e1000_adv_tx_context_desc *TXD;
2065         uint32_t vlan_macip_lens, type_tucmd_mlhl, mss_l4len_idx;
2066         int ehdrlen, ctxd, ip_hlen = 0;
2067         boolean_t offload = TRUE;
2068
2069         if ((mp->m_pkthdr.csum_flags & IGB_CSUM_FEATURES) == 0)
2070                 offload = FALSE;
2071
2072         vlan_macip_lens = type_tucmd_mlhl = mss_l4len_idx = 0;
2073
2074         ctxd = txr->next_avail_desc;
2075         TXD = (struct e1000_adv_tx_context_desc *)&txr->tx_base[ctxd];
2076
2077         /*
2078          * With advanced descriptors the VLAN tag must be placed
2079          * in the context descriptor, so we need a context descriptor
2080          * here even when no checksum offload is requested.
2081          */
2082         if (mp->m_flags & M_VLANTAG) {
2083                 uint16_t vlantag;
2084
2085                 vlantag = htole16(mp->m_pkthdr.ether_vlantag);
2086                 vlan_macip_lens |= (vlantag << E1000_ADVTXD_VLAN_SHIFT);
2087         } else if (!offload) {
2088                 return FALSE;
2089         }
2090
2091         ehdrlen = mp->m_pkthdr.csum_lhlen;
2092         KASSERT(ehdrlen > 0, ("invalid ether hlen"));
2093
2094         /* Set the ether header length */
2095         vlan_macip_lens |= ehdrlen << E1000_ADVTXD_MACLEN_SHIFT;
2096         if (mp->m_pkthdr.csum_flags & CSUM_IP) {
2097                 type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_IPV4;
2098                 ip_hlen = mp->m_pkthdr.csum_iphlen;
2099                 KASSERT(ip_hlen > 0, ("invalid ip hlen"));
2100         }
2101         vlan_macip_lens |= ip_hlen;
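             /*
              * vlan_macip_lens now packs, from high to low bits, the
              * VLAN tag, the MAC header length and the IP header length,
              * matching the advanced context descriptor layout.
              */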
2102
2103         type_tucmd_mlhl |= E1000_ADVTXD_DCMD_DEXT | E1000_ADVTXD_DTYP_CTXT;
2104         if (mp->m_pkthdr.csum_flags & CSUM_TCP)
2105                 type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_TCP;
2106         else if (mp->m_pkthdr.csum_flags & CSUM_UDP)
2107                 type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_UDP;
2108
2109         /*
2110          * 82575 needs the TX context index added; the queue
2111          * index is used as TX context index here.
2112          */
2113         if (txr->sc->hw.mac.type == e1000_82575)
2114                 mss_l4len_idx = txr->me << 4;
2115
2116         /* Now copy bits into descriptor */
2117         TXD->vlan_macip_lens = htole32(vlan_macip_lens);
2118         TXD->type_tucmd_mlhl = htole32(type_tucmd_mlhl);
2119         TXD->seqnum_seed = htole32(0);
2120         TXD->mss_l4len_idx = htole32(mss_l4len_idx);
2121
2122         /* We've consumed the first desc, adjust counters */
2123         if (++ctxd == txr->num_tx_desc)
2124                 ctxd = 0;
2125         txr->next_avail_desc = ctxd;
2126         --txr->tx_avail;
2127
2128         return offload;
2129 }
2130
2131 static void
2132 igb_txeof(struct igb_tx_ring *txr)
2133 {
2134         int first, hdr, avail;
2135
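             /*
              * The head write-back buffer (tx_hdr) holds the index the
              * hardware has advanced to, so every descriptor in
              * [next_to_clean, hdr) has completed and its mbuf can be
              * unloaded and freed.
              */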
2136         if (txr->tx_avail == txr->num_tx_desc)
2137                 return;
2138
2139         first = txr->next_to_clean;
2140         hdr = *(txr->tx_hdr);
2141
2142         if (first == hdr)
2143                 return;
2144
2145         avail = txr->tx_avail;
2146         while (first != hdr) {
2147                 struct igb_tx_buf *txbuf = &txr->tx_buf[first];
2148
2149                 ++avail;
2150                 if (txbuf->m_head) {
2151                         bus_dmamap_unload(txr->tx_tag, txbuf->map);
2152                         m_freem(txbuf->m_head);
2153                         txbuf->m_head = NULL;
2154                 }
2155                 if (++first == txr->num_tx_desc)
2156                         first = 0;
2157         }
2158         txr->next_to_clean = first;
2159         txr->tx_avail = avail;
2160
2161         /*
2162          * If we have a minimum free, clear OACTIVE
2163          * to tell the stack that it is OK to send packets.
2164          */
2165         if (IGB_IS_NOT_OACTIVE(txr)) {
2166                 ifsq_clr_oactive(txr->ifsq);
2167
2168                 /*
2169                  * We have enough TX descriptors, so turn off
2170                  * the watchdog.  We allow a small number of
2171                  * packets (roughly intr_nsegs) to remain pending on
2172                  * the transmit ring.
2173                  */
2174                 txr->tx_watchdog.wd_timer = 0;
2175         }
2176 }
2177
2178 static int
2179 igb_create_rx_ring(struct igb_rx_ring *rxr)
2180 {
2181         int rsize, i, error, nrxd;
2182
2183         /*
2184          * Validate the number of receive descriptors.  It must not exceed
2185          * the hardware maximum, and the ring size must be a multiple of IGB_DBA_ALIGN.
2186          */
2187         nrxd = device_getenv_int(rxr->sc->dev, "rxd", igb_rxd);
2188         if ((nrxd * sizeof(struct e1000_rx_desc)) % IGB_DBA_ALIGN != 0 ||
2189             nrxd > IGB_MAX_RXD || nrxd < IGB_MIN_RXD) {
2190                 device_printf(rxr->sc->dev,
2191                     "Using %d RX descriptors instead of %d!\n",
2192                     IGB_DEFAULT_RXD, nrxd);
2193                 rxr->num_rx_desc = IGB_DEFAULT_RXD;
2194         } else {
2195                 rxr->num_rx_desc = nrxd;
2196         }
2197
2198         /*
2199          * Allocate RX descriptor ring
2200          */
2201         rsize = roundup2(rxr->num_rx_desc * sizeof(union e1000_adv_rx_desc),
2202             IGB_DBA_ALIGN);
2203         rxr->rxdma.dma_vaddr = bus_dmamem_coherent_any(rxr->sc->parent_tag,
2204             IGB_DBA_ALIGN, rsize, BUS_DMA_WAITOK,
2205             &rxr->rxdma.dma_tag, &rxr->rxdma.dma_map,
2206             &rxr->rxdma.dma_paddr);
2207         if (rxr->rxdma.dma_vaddr == NULL) {
2208                 device_printf(rxr->sc->dev,
2209                     "Unable to allocate RX Descriptor memory\n");
2210                 return ENOMEM;
2211         }
2212         rxr->rx_base = rxr->rxdma.dma_vaddr;
2213         bzero(rxr->rx_base, rsize);
2214
2215         rsize = __VM_CACHELINE_ALIGN(
2216             sizeof(struct igb_rx_buf) * rxr->num_rx_desc);
2217         rxr->rx_buf = kmalloc_cachealign(rsize, M_DEVBUF, M_WAITOK | M_ZERO);
2218
2219         /*
2220          * Create DMA tag for RX buffers
2221          */
2222         error = bus_dma_tag_create(rxr->sc->parent_tag,
2223             1, 0,               /* alignment, bounds */
2224             BUS_SPACE_MAXADDR,  /* lowaddr */
2225             BUS_SPACE_MAXADDR,  /* highaddr */
2226             NULL, NULL,         /* filter, filterarg */
2227             MCLBYTES,           /* maxsize */
2228             1,                  /* nsegments */
2229             MCLBYTES,           /* maxsegsize */
2230             BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW, /* flags */
2231             &rxr->rx_tag);
2232         if (error) {
2233                 device_printf(rxr->sc->dev,
2234                     "Unable to create RX payload DMA tag\n");
2235                 kfree(rxr->rx_buf, M_DEVBUF);
2236                 rxr->rx_buf = NULL;
2237                 return error;
2238         }
2239
2240         /*
2241          * Create spare DMA map for RX buffers
2242          */
2243         error = bus_dmamap_create(rxr->rx_tag, BUS_DMA_WAITOK,
2244             &rxr->rx_sparemap);
2245         if (error) {
2246                 device_printf(rxr->sc->dev,
2247                     "Unable to create spare RX DMA maps\n");
2248                 bus_dma_tag_destroy(rxr->rx_tag);
2249                 kfree(rxr->rx_buf, M_DEVBUF);
2250                 rxr->rx_buf = NULL;
2251                 return error;
2252         }
2253
2254         /*
2255          * Create DMA maps for RX buffers
2256          */
2257         for (i = 0; i < rxr->num_rx_desc; i++) {
2258                 struct igb_rx_buf *rxbuf = &rxr->rx_buf[i];
2259
2260                 error = bus_dmamap_create(rxr->rx_tag,
2261                     BUS_DMA_WAITOK, &rxbuf->map);
2262                 if (error) {
2263                         device_printf(rxr->sc->dev,
2264                             "Unable to create RX DMA maps\n");
2265                         igb_destroy_rx_ring(rxr, i);
2266                         return error;
2267                 }
2268         }
2269
2270         /*
2271          * Initialize various watermarks
2272          */
2273         rxr->wreg_nsegs = IGB_DEF_RXWREG_NSEGS;
2274
2275         return 0;
2276 }
2277
2278 static void
2279 igb_free_rx_ring(struct igb_rx_ring *rxr)
2280 {
2281         int i;
2282
2283         for (i = 0; i < rxr->num_rx_desc; ++i) {
2284                 struct igb_rx_buf *rxbuf = &rxr->rx_buf[i];
2285
2286                 if (rxbuf->m_head != NULL) {
2287                         bus_dmamap_unload(rxr->rx_tag, rxbuf->map);
2288                         m_freem(rxbuf->m_head);
2289                         rxbuf->m_head = NULL;
2290                 }
2291         }
2292
2293         if (rxr->fmp != NULL)
2294                 m_freem(rxr->fmp);
2295         rxr->fmp = NULL;
2296         rxr->lmp = NULL;
2297 }
2298
2299 static void
2300 igb_destroy_rx_ring(struct igb_rx_ring *rxr, int ndesc)
2301 {
2302         int i;
2303
2304         if (rxr->rxdma.dma_vaddr != NULL) {
2305                 bus_dmamap_unload(rxr->rxdma.dma_tag, rxr->rxdma.dma_map);
2306                 bus_dmamem_free(rxr->rxdma.dma_tag, rxr->rxdma.dma_vaddr,
2307                     rxr->rxdma.dma_map);
2308                 bus_dma_tag_destroy(rxr->rxdma.dma_tag);
2309                 rxr->rxdma.dma_vaddr = NULL;
2310         }
2311
2312         if (rxr->rx_buf == NULL)
2313                 return;
2314
2315         for (i = 0; i < ndesc; ++i) {
2316                 struct igb_rx_buf *rxbuf = &rxr->rx_buf[i];
2317
2318                 KKASSERT(rxbuf->m_head == NULL);
2319                 bus_dmamap_destroy(rxr->rx_tag, rxbuf->map);
2320         }
2321         bus_dmamap_destroy(rxr->rx_tag, rxr->rx_sparemap);
2322         bus_dma_tag_destroy(rxr->rx_tag);
2323
2324         kfree(rxr->rx_buf, M_DEVBUF);
2325         rxr->rx_buf = NULL;
2326 }
2327
2328 static void
2329 igb_setup_rxdesc(union e1000_adv_rx_desc *rxd, const struct igb_rx_buf *rxbuf)
2330 {
2331         rxd->read.pkt_addr = htole64(rxbuf->paddr);
2332         rxd->wb.upper.status_error = 0;
2333 }
2334
2335 static int
2336 igb_newbuf(struct igb_rx_ring *rxr, int i, boolean_t wait)
2337 {
2338         struct mbuf *m;
2339         bus_dma_segment_t seg;
2340         bus_dmamap_t map;
2341         struct igb_rx_buf *rxbuf;
2342         int error, nseg;
2343
2344         m = m_getcl(wait ? M_WAITOK : M_NOWAIT, MT_DATA, M_PKTHDR);
2345         if (m == NULL) {
2346                 if (wait) {
2347                         if_printf(&rxr->sc->arpcom.ac_if,
2348                             "Unable to allocate RX mbuf\n");
2349                 }
2350                 return ENOBUFS;
2351         }
2352         m->m_len = m->m_pkthdr.len = MCLBYTES;
2353
2354         if (rxr->sc->max_frame_size <= MCLBYTES - ETHER_ALIGN)
2355                 m_adj(m, ETHER_ALIGN);
2356
2357         error = bus_dmamap_load_mbuf_segment(rxr->rx_tag,
2358             rxr->rx_sparemap, m, &seg, 1, &nseg, BUS_DMA_NOWAIT);
2359         if (error) {
2360                 m_freem(m);
2361                 if (wait) {
2362                         if_printf(&rxr->sc->arpcom.ac_if,
2363                             "Unable to load RX mbuf\n");
2364                 }
2365                 return error;
2366         }
2367
2368         rxbuf = &rxr->rx_buf[i];
2369         if (rxbuf->m_head != NULL)
2370                 bus_dmamap_unload(rxr->rx_tag, rxbuf->map);
2371
2372         map = rxbuf->map;
2373         rxbuf->map = rxr->rx_sparemap;
2374         rxr->rx_sparemap = map;
2375
2376         rxbuf->m_head = m;
2377         rxbuf->paddr = seg.ds_addr;
2378
2379         igb_setup_rxdesc(&rxr->rx_base[i], rxbuf);
2380         return 0;
2381 }
2382
2383 static int
2384 igb_init_rx_ring(struct igb_rx_ring *rxr)
2385 {
2386         int i;
2387
2388         /* Clear the ring contents */
2389         bzero(rxr->rx_base,
2390             rxr->num_rx_desc * sizeof(union e1000_adv_rx_desc));
2391
2392         /* Now replenish the ring mbufs */
2393         for (i = 0; i < rxr->num_rx_desc; ++i) {
2394                 int error;
2395
2396                 error = igb_newbuf(rxr, i, TRUE);
2397                 if (error)
2398                         return error;
2399         }
2400
2401         /* Setup our descriptor indices */
2402         rxr->next_to_check = 0;
2403
2404         rxr->fmp = NULL;
2405         rxr->lmp = NULL;
2406         rxr->discard = FALSE;
2407
2408         return 0;
2409 }
2410
2411 static void
2412 igb_init_rx_unit(struct igb_softc *sc)
2413 {
2414         struct ifnet *ifp = &sc->arpcom.ac_if;
2415         struct e1000_hw *hw = &sc->hw;
2416         uint32_t rctl, rxcsum, srrctl = 0;
2417         int i;
2418
2419         /*
2420          * Make sure receives are disabled while setting
2421          * up the descriptor ring
2422          */
2423         rctl = E1000_READ_REG(hw, E1000_RCTL);
2424         E1000_WRITE_REG(hw, E1000_RCTL, rctl & ~E1000_RCTL_EN);
2425
2426 #if 0
2427         /*
2428         ** Set up for header split
2429         */
2430         if (igb_header_split) {
2431                 /* Use a standard mbuf for the header */
2432                 srrctl |= IGB_HDR_BUF << E1000_SRRCTL_BSIZEHDRSIZE_SHIFT;
2433                 srrctl |= E1000_SRRCTL_DESCTYPE_HDR_SPLIT_ALWAYS;
2434         } else
2435 #endif
2436                 srrctl |= E1000_SRRCTL_DESCTYPE_ADV_ONEBUF;
2437
2438         /*
2439         ** Set up for jumbo frames
2440         */
2441         if (ifp->if_mtu > ETHERMTU) {
2442                 rctl |= E1000_RCTL_LPE;
2443 #if 0
2444                 if (adapter->rx_mbuf_sz == MJUMPAGESIZE) {
2445                         srrctl |= 4096 >> E1000_SRRCTL_BSIZEPKT_SHIFT;
2446                         rctl |= E1000_RCTL_SZ_4096 | E1000_RCTL_BSEX;
2447                 } else if (adapter->rx_mbuf_sz > MJUMPAGESIZE) {
2448                         srrctl |= 8192 >> E1000_SRRCTL_BSIZEPKT_SHIFT;
2449                         rctl |= E1000_RCTL_SZ_8192 | E1000_RCTL_BSEX;
2450                 }
2451                 /* Set maximum packet len */
2452                 psize = adapter->max_frame_size;
2453                 /* are we on a vlan? */
2454                 if (adapter->ifp->if_vlantrunk != NULL)
2455                         psize += VLAN_TAG_SIZE;
2456                 E1000_WRITE_REG(&adapter->hw, E1000_RLPML, psize);
2457 #else
2458                 srrctl |= 2048 >> E1000_SRRCTL_BSIZEPKT_SHIFT;
2459                 rctl |= E1000_RCTL_SZ_2048;
2460 #endif
2461         } else {
2462                 rctl &= ~E1000_RCTL_LPE;
2463                 srrctl |= 2048 >> E1000_SRRCTL_BSIZEPKT_SHIFT;
2464                 rctl |= E1000_RCTL_SZ_2048;
2465         }
2466
2467         /* Setup the Base and Length of the Rx Descriptor Rings */
2468         for (i = 0; i < sc->rx_ring_inuse; ++i) {
2469                 struct igb_rx_ring *rxr = &sc->rx_rings[i];
2470                 uint64_t bus_addr = rxr->rxdma.dma_paddr;
2471                 uint32_t rxdctl;
2472
2473                 E1000_WRITE_REG(hw, E1000_RDLEN(i),
2474                     rxr->num_rx_desc * sizeof(struct e1000_rx_desc));
2475                 E1000_WRITE_REG(hw, E1000_RDBAH(i),
2476                     (uint32_t)(bus_addr >> 32));
2477                 E1000_WRITE_REG(hw, E1000_RDBAL(i),
2478                     (uint32_t)bus_addr);
2479                 E1000_WRITE_REG(hw, E1000_SRRCTL(i), srrctl);
2480                 /* Enable this Queue */
2481                 rxdctl = E1000_READ_REG(hw, E1000_RXDCTL(i));
2482                 rxdctl |= E1000_RXDCTL_QUEUE_ENABLE;
2483                 rxdctl &= 0xFFF00000;
2484                 rxdctl |= IGB_RX_PTHRESH;
2485                 rxdctl |= IGB_RX_HTHRESH << 8;
2486                 /*
2487                  * Don't set WTHRESH to a value above 1 on 82576, see:
2488                  * 82576 specification update errata #26
2489                  */
2490                 rxdctl |= IGB_RX_WTHRESH << 16;
2491                 E1000_WRITE_REG(hw, E1000_RXDCTL(i), rxdctl);
2492         }
2493
2494         rxcsum = E1000_READ_REG(&sc->hw, E1000_RXCSUM);
2495         rxcsum &= ~(E1000_RXCSUM_PCSS_MASK | E1000_RXCSUM_IPPCSE);
2496
2497         /*
2498          * Receive Checksum Offload for TCP and UDP
2499          *
2500          * Checksum offloading is also enabled if multiple receive
2501          * queues are to be supported, since we need it to figure out
2502          * fragments.
2503          */
2504         if ((ifp->if_capenable & IFCAP_RXCSUM) || IGB_ENABLE_HWRSS(sc)) {
2505                 /*
2506                  * NOTE:
2507                  * PCSD must be enabled to enable multiple
2508                  * receive queues.
2509                  */
2510                 rxcsum |= E1000_RXCSUM_IPOFL | E1000_RXCSUM_TUOFL |
2511                     E1000_RXCSUM_PCSD;
2512         } else {
2513                 rxcsum &= ~(E1000_RXCSUM_IPOFL | E1000_RXCSUM_TUOFL |
2514                     E1000_RXCSUM_PCSD);
2515         }
2516         E1000_WRITE_REG(&sc->hw, E1000_RXCSUM, rxcsum);
2517
2518         if (IGB_ENABLE_HWRSS(sc)) {
2519                 uint8_t key[IGB_NRSSRK * IGB_RSSRK_SIZE];
2520                 uint32_t reta_shift;
2521                 int j, r;
2522
2523                 /*
2524                  * NOTE:
2525                  * When we reach here, RSS has already been disabled
2526                  * in igb_stop(), so we can safely configure the RSS key
2527                  * and redirect table.
2528                  */
2529
2530                 /*
2531                  * Configure RSS key
2532                  */
2533                 toeplitz_get_key(key, sizeof(key));
2534                 for (i = 0; i < IGB_NRSSRK; ++i) {
2535                         uint32_t rssrk;
2536
2537                         rssrk = IGB_RSSRK_VAL(key, i);
2538                         IGB_RSS_DPRINTF(sc, 1, "rssrk%d 0x%08x\n", i, rssrk);
2539
2540                         E1000_WRITE_REG(hw, E1000_RSSRK(i), rssrk);
2541                 }
2542
2543                 /*
2544                  * Configure the RSS redirect table in the following fashion:
2545                  * (hash & ring_cnt_mask) == rdr_table[(hash & rdr_table_mask)]
2546                  */
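                     /*
                      * For example, with 2 RX rings in use the redirect
                      * entries become 0,1,0,1,... so the low bits of the
                      * RSS hash select the ring.
                      */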
2547                 reta_shift = IGB_RETA_SHIFT;
2548                 if (hw->mac.type == e1000_82575)
2549                         reta_shift = IGB_RETA_SHIFT_82575;
2550
2551                 r = 0;
2552                 for (j = 0; j < IGB_NRETA; ++j) {
2553                         uint32_t reta = 0;
2554
2555                         for (i = 0; i < IGB_RETA_SIZE; ++i) {
2556                                 uint32_t q;
2557
2558                                 q = (r % sc->rx_ring_inuse) << reta_shift;
2559                                 reta |= q << (8 * i);
2560                                 ++r;
2561                         }
2562                         IGB_RSS_DPRINTF(sc, 1, "reta 0x%08x\n", reta);
2563                         E1000_WRITE_REG(hw, E1000_RETA(j), reta);
2564                 }
2565
2566                 /*
2567                  * Enable multiple receive queues.
2568                  * Enable IPv4 RSS standard hash functions.
2569                  * Disable RSS interrupt on 82575
2570                  */
2571                 E1000_WRITE_REG(&sc->hw, E1000_MRQC,
2572                                 E1000_MRQC_ENABLE_RSS_4Q |
2573                                 E1000_MRQC_RSS_FIELD_IPV4_TCP |
2574                                 E1000_MRQC_RSS_FIELD_IPV4);
2575         }
2576
2577         /* Setup the Receive Control Register */
2578         rctl &= ~(3 << E1000_RCTL_MO_SHIFT);
2579         rctl |= E1000_RCTL_EN | E1000_RCTL_BAM | E1000_RCTL_LBM_NO |
2580             E1000_RCTL_RDMTS_HALF |
2581             (hw->mac.mc_filter_type << E1000_RCTL_MO_SHIFT);
2582         /* Strip CRC bytes. */
2583         rctl |= E1000_RCTL_SECRC;
2584         /* Make sure VLAN Filters are off */
2585         rctl &= ~E1000_RCTL_VFE;
2586         /* Don't store bad packets */
2587         rctl &= ~E1000_RCTL_SBP;
2588
2589         /* Enable Receives */
2590         E1000_WRITE_REG(hw, E1000_RCTL, rctl);
2591
2592         /*
2593          * Setup the HW Rx Head and Tail Descriptor Pointers
2594          *   - needs to be after enable
2595          */
2596         for (i = 0; i < sc->rx_ring_inuse; ++i) {
2597                 struct igb_rx_ring *rxr = &sc->rx_rings[i];
2598
2599                 E1000_WRITE_REG(hw, E1000_RDH(i), rxr->next_to_check);
2600                 E1000_WRITE_REG(hw, E1000_RDT(i), rxr->num_rx_desc - 1);
2601         }
2602 }
2603
2604 static void
2605 igb_rx_refresh(struct igb_rx_ring *rxr, int i)
2606 {
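             /*
              * 'i' is the index of the next descriptor to be refreshed;
              * RDT must point at the last descriptor that already holds
              * a fresh mbuf, so step back one entry (with wrap-around)
              * before writing the tail register.
              */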
2607         if (--i < 0)
2608                 i = rxr->num_rx_desc - 1;
2609         E1000_WRITE_REG(&rxr->sc->hw, E1000_RDT(rxr->me), i);
2610 }
2611
2612 static void
2613 igb_rxeof(struct igb_rx_ring *rxr, int count)
2614 {
2615         struct ifnet *ifp = &rxr->sc->arpcom.ac_if;
2616         union e1000_adv_rx_desc *cur;
2617         uint32_t staterr;
2618         int i, ncoll = 0, cpuid = mycpuid;
2619
2620         i = rxr->next_to_check;
2621         cur = &rxr->rx_base[i];
2622         staterr = le32toh(cur->wb.upper.status_error);
2623
2624         if ((staterr & E1000_RXD_STAT_DD) == 0)
2625                 return;
2626
2627         while ((staterr & E1000_RXD_STAT_DD) && count != 0) {
2628                 struct pktinfo *pi = NULL, pi0;
2629                 struct igb_rx_buf *rxbuf = &rxr->rx_buf[i];
2630                 struct mbuf *m = NULL;
2631                 boolean_t eop;
2632
2633                 eop = (staterr & E1000_RXD_STAT_EOP) ? TRUE : FALSE;
2634                 if (eop)
2635                         --count;
2636
2637                 ++ncoll;
2638                 if ((staterr & E1000_RXDEXT_ERR_FRAME_ERR_MASK) == 0 &&
2639                     !rxr->discard) {
2640                         struct mbuf *mp = rxbuf->m_head;
2641                         uint32_t hash, hashtype;
2642                         uint16_t vlan;
2643                         int len;
2644
2645                         len = le16toh(cur->wb.upper.length);
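                             /*
                              * On i350/i354 the VLAN field of packets
                              * flagged as loopback is apparently reported
                              * in big-endian byte order, hence the special
                              * case below.
                              */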
2646                         if ((rxr->sc->hw.mac.type == e1000_i350 ||
2647                              rxr->sc->hw.mac.type == e1000_i354) &&
2648                             (staterr & E1000_RXDEXT_STATERR_LB))
2649                                 vlan = be16toh(cur->wb.upper.vlan);
2650                         else
2651                                 vlan = le16toh(cur->wb.upper.vlan);
2652
2653                         hash = le32toh(cur->wb.lower.hi_dword.rss);
2654                         hashtype = le32toh(cur->wb.lower.lo_dword.data) &
2655                             E1000_RXDADV_RSSTYPE_MASK;
2656
2657                         IGB_RSS_DPRINTF(rxr->sc, 10,
2658                             "ring%d, hash 0x%08x, hashtype %u\n",
2659                             rxr->me, hash, hashtype);
2660
2661                         bus_dmamap_sync(rxr->rx_tag, rxbuf->map,
2662                             BUS_DMASYNC_POSTREAD);
2663
2664                         if (igb_newbuf(rxr, i, FALSE) != 0) {
2665                                 IFNET_STAT_INC(ifp, iqdrops, 1);
2666                                 goto discard;
2667                         }
2668
2669                         mp->m_len = len;
2670                         if (rxr->fmp == NULL) {
2671                                 mp->m_pkthdr.len = len;
2672                                 rxr->fmp = mp;
2673                                 rxr->lmp = mp;
2674                         } else {
2675                                 rxr->lmp->m_next = mp;
2676                                 rxr->lmp = rxr->lmp->m_next;
2677                                 rxr->fmp->m_pkthdr.len += len;
2678                         }
2679
2680                         if (eop) {
2681                                 m = rxr->fmp;
2682                                 rxr->fmp = NULL;
2683                                 rxr->lmp = NULL;
2684
2685                                 m->m_pkthdr.rcvif = ifp;
2686                                 IFNET_STAT_INC(ifp, ipackets, 1);
2687
2688                                 if (ifp->if_capenable & IFCAP_RXCSUM)
2689                                         igb_rxcsum(staterr, m);
2690
2691                                 if (staterr & E1000_RXD_STAT_VP) {
2692                                         m->m_pkthdr.ether_vlantag = vlan;
2693                                         m->m_flags |= M_VLANTAG;
2694                                 }
2695
2696                                 if (ifp->if_capenable & IFCAP_RSS) {
2697                                         pi = igb_rssinfo(m, &pi0,
2698                                             hash, hashtype, staterr);
2699                                 }
2700 #ifdef IGB_RSS_DEBUG
2701                                 rxr->rx_packets++;
2702 #endif
2703                         }
2704                 } else {
2705                         IFNET_STAT_INC(ifp, ierrors, 1);
2706 discard:
2707                         igb_setup_rxdesc(cur, rxbuf);
2708                         if (!eop)
2709                                 rxr->discard = TRUE;
2710                         else
2711                                 rxr->discard = FALSE;
2712                         if (rxr->fmp != NULL) {
2713                                 m_freem(rxr->fmp);
2714                                 rxr->fmp = NULL;
2715                                 rxr->lmp = NULL;
2716                         }
2717                         m = NULL;
2718                 }
2719
2720                 if (m != NULL)
2721                         ifp->if_input(ifp, m, pi, cpuid);
2722
2723                 /* Advance our pointers to the next descriptor. */
2724                 if (++i == rxr->num_rx_desc)
2725                         i = 0;
2726
2727                 if (ncoll >= rxr->wreg_nsegs) {
2728                         igb_rx_refresh(rxr, i);
2729                         ncoll = 0;
2730                 }
2731
2732                 cur = &rxr->rx_base[i];
2733                 staterr = le32toh(cur->wb.upper.status_error);
2734         }
2735         rxr->next_to_check = i;
2736
2737         if (ncoll > 0)
2738                 igb_rx_refresh(rxr, i);
2739 }
2740
2741
2742 static void
2743 igb_set_vlan(struct igb_softc *sc)
2744 {
2745         struct e1000_hw *hw = &sc->hw;
2746         uint32_t reg;
2747 #if 0
2748         struct ifnet *ifp = sc->arpcom.ac_if;
2749 #endif
2750
2751         if (sc->vf_ifp) {
2752                 e1000_rlpml_set_vf(hw, sc->max_frame_size + VLAN_TAG_SIZE);
2753                 return;
2754         }
2755
2756         reg = E1000_READ_REG(hw, E1000_CTRL);
2757         reg |= E1000_CTRL_VME;
2758         E1000_WRITE_REG(hw, E1000_CTRL, reg);
2759
2760 #if 0
2761         /* Enable the Filter Table */
2762         if (ifp->if_capenable & IFCAP_VLAN_HWFILTER) {
2763                 reg = E1000_READ_REG(hw, E1000_RCTL);
2764                 reg &= ~E1000_RCTL_CFIEN;
2765                 reg |= E1000_RCTL_VFE;
2766                 E1000_WRITE_REG(hw, E1000_RCTL, reg);
2767         }
2768 #endif
2769
2770         /* Update the frame size */
2771         E1000_WRITE_REG(&sc->hw, E1000_RLPML,
2772             sc->max_frame_size + VLAN_TAG_SIZE);
2773
2774 #if 0
2775         /* Don't bother with table if no vlans */
2776         if ((adapter->num_vlans == 0) ||
2777             ((ifp->if_capenable & IFCAP_VLAN_HWFILTER) == 0))
2778                 return;
2779         /*
2780         ** A soft reset zero's out the VFTA, so
2781         ** we need to repopulate it now.
2782         */
2783         for (int i = 0; i < IGB_VFTA_SIZE; i++)
2784                 if (adapter->shadow_vfta[i] != 0) {
2785                         if (adapter->vf_ifp)
2786                                 e1000_vfta_set_vf(hw,
2787                                     adapter->shadow_vfta[i], TRUE);
2788                         else
2789                                 E1000_WRITE_REG_ARRAY(hw, E1000_VFTA,
2790                                  i, adapter->shadow_vfta[i]);
2791                 }
2792 #endif
2793 }
2794
2795 static void
2796 igb_enable_intr(struct igb_softc *sc)
2797 {
2798         if (sc->intr_type != PCI_INTR_TYPE_MSIX) {
2799                 lwkt_serialize_handler_enable(&sc->main_serialize);
2800         } else {
2801                 int i;
2802
2803                 for (i = 0; i < sc->msix_cnt; ++i) {
2804                         lwkt_serialize_handler_enable(
2805                             sc->msix_data[i].msix_serialize);
2806                 }
2807         }
2808
2809         if ((sc->flags & IGB_FLAG_SHARED_INTR) == 0) {
2810                 if (sc->intr_type == PCI_INTR_TYPE_MSIX)
2811                         E1000_WRITE_REG(&sc->hw, E1000_EIAC, sc->intr_mask);
2812                 else
2813                         E1000_WRITE_REG(&sc->hw, E1000_EIAC, 0);
2814                 E1000_WRITE_REG(&sc->hw, E1000_EIAM, sc->intr_mask);
2815                 E1000_WRITE_REG(&sc->hw, E1000_EIMS, sc->intr_mask);
2816                 E1000_WRITE_REG(&sc->hw, E1000_IMS, E1000_IMS_LSC);
2817         } else {
2818                 E1000_WRITE_REG(&sc->hw, E1000_IMS, IMS_ENABLE_MASK);
2819         }
2820         E1000_WRITE_FLUSH(&sc->hw);
2821 }
2822
2823 static void
2824 igb_disable_intr(struct igb_softc *sc)
2825 {
2826         if ((sc->flags & IGB_FLAG_SHARED_INTR) == 0) {
2827                 E1000_WRITE_REG(&sc->hw, E1000_EIMC, 0xffffffff);
2828                 E1000_WRITE_REG(&sc->hw, E1000_EIAC, 0);
2829         }
2830         E1000_WRITE_REG(&sc->hw, E1000_IMC, 0xffffffff);
2831         E1000_WRITE_FLUSH(&sc->hw);
2832
2833         if (sc->intr_type != PCI_INTR_TYPE_MSIX) {
2834                 lwkt_serialize_handler_disable(&sc->main_serialize);
2835         } else {
2836                 int i;
2837
2838                 for (i = 0; i < sc->msix_cnt; ++i) {
2839                         lwkt_serialize_handler_disable(
2840                             sc->msix_data[i].msix_serialize);
2841                 }
2842         }
2843 }
2844
2845 /*
2846  * Bit of a misnomer: what this really means is
2847  * to enable OS management of the system, i.e.
2848  * to disable special hardware management features.
2849  */
2850 static void
2851 igb_get_mgmt(struct igb_softc *sc)
2852 {
2853         if (sc->flags & IGB_FLAG_HAS_MGMT) {
2854                 int manc2h = E1000_READ_REG(&sc->hw, E1000_MANC2H);
2855                 int manc = E1000_READ_REG(&sc->hw, E1000_MANC);
2856
2857                 /* disable hardware interception of ARP */
2858                 manc &= ~E1000_MANC_ARP_EN;
2859
2860                 /* enable receiving management packets to the host */
2861                 manc |= E1000_MANC_EN_MNG2HOST;
2862                 manc2h |= 1 << 5; /* Mng Port 623 */
2863                 manc2h |= 1 << 6; /* Mng Port 664 */
2864                 E1000_WRITE_REG(&sc->hw, E1000_MANC2H, manc2h);
2865                 E1000_WRITE_REG(&sc->hw, E1000_MANC, manc);
2866         }
2867 }
2868
2869 /*
2870  * Give control back to hardware management controller
2871  * if there is one.
2872  */
2873 static void
2874 igb_rel_mgmt(struct igb_softc *sc)
2875 {
2876         if (sc->flags & IGB_FLAG_HAS_MGMT) {
2877                 int manc = E1000_READ_REG(&sc->hw, E1000_MANC);
2878
2879                 /* Re-enable hardware interception of ARP */
2880                 manc |= E1000_MANC_ARP_EN;
2881                 manc &= ~E1000_MANC_EN_MNG2HOST;
2882
2883                 E1000_WRITE_REG(&sc->hw, E1000_MANC, manc);
2884         }
2885 }
2886
2887 /*
2888  * Sets CTRL_EXT:DRV_LOAD bit.
2889  *
2890  * For ASF and Pass Through versions of f/w this means that
2891  * the driver is loaded. 
2892  */
2893 static void
2894 igb_get_hw_control(struct igb_softc *sc)
2895 {
2896         uint32_t ctrl_ext;
2897
2898         if (sc->vf_ifp)
2899                 return;
2900
2901         /* Let firmware know the driver has taken over */
2902         ctrl_ext = E1000_READ_REG(&sc->hw, E1000_CTRL_EXT);
2903         E1000_WRITE_REG(&sc->hw, E1000_CTRL_EXT,
2904             ctrl_ext | E1000_CTRL_EXT_DRV_LOAD);
2905 }
2906
2907 /*
2908  * Resets CTRL_EXT:DRV_LOAD bit.
2909  *
2910  * For ASF and Pass Through versions of f/w this means that the
2911  * driver is no longer loaded.
2912  */
2913 static void
2914 igb_rel_hw_control(struct igb_softc *sc)
2915 {
2916         uint32_t ctrl_ext;
2917
2918         if (sc->vf_ifp)
2919                 return;
2920
2921         /* Let firmware take over control of h/w */
2922         ctrl_ext = E1000_READ_REG(&sc->hw, E1000_CTRL_EXT);
2923         E1000_WRITE_REG(&sc->hw, E1000_CTRL_EXT,
2924             ctrl_ext & ~E1000_CTRL_EXT_DRV_LOAD);
2925 }
2926
2927 static boolean_t
2928 igb_is_valid_ether_addr(const uint8_t *addr)
2929 {
2930         uint8_t zero_addr[ETHER_ADDR_LEN] = { 0, 0, 0, 0, 0, 0 };
2931
2932         if ((addr[0] & 1) || !bcmp(addr, zero_addr, ETHER_ADDR_LEN))
2933                 return FALSE;
2934         return TRUE;
2935 }
2936
2937 /*
2938  * Enable PCI Wake On LAN capability
2939  */
2940 static void
2941 igb_enable_wol(device_t dev)
2942 {
2943         uint16_t cap, status;
2944         uint8_t id;
2945
2946         /* First find the capabilities pointer */
2947         cap = pci_read_config(dev, PCIR_CAP_PTR, 2);
2948
2949         /* Read the PM Capabilities */
2950         id = pci_read_config(dev, cap, 1);
2951         if (id != PCIY_PMG)     /* Something wrong */
2952                 return;
2953
2954         /*
2955          * OK, we have the power capabilities,
2956          * so now get the status register
2957          */
2958         cap += PCIR_POWER_STATUS;
2959         status = pci_read_config(dev, cap, 2);
2960         status |= PCIM_PSTAT_PME | PCIM_PSTAT_PMEENABLE;
2961         pci_write_config(dev, cap, status, 2);
2962 }
2963
2964 static void
2965 igb_update_stats_counters(struct igb_softc *sc)
2966 {
2967         struct e1000_hw *hw = &sc->hw;
2968         struct e1000_hw_stats *stats;
2969         struct ifnet *ifp = &sc->arpcom.ac_if;
2970
2971         /*
2972          * The virtual function adapter has only a small,
2973          * controlled set of stats, so update only those
2974          * and return.
2975          */
2976         if (sc->vf_ifp) {
2977                 igb_update_vf_stats_counters(sc);
2978                 return;
2979         }
2980         stats = sc->stats;
2981
2982         if (sc->hw.phy.media_type == e1000_media_type_copper ||
2983             (E1000_READ_REG(hw, E1000_STATUS) & E1000_STATUS_LU)) {
2984                 stats->symerrs +=
2985                     E1000_READ_REG(hw,E1000_SYMERRS);
2986                 stats->sec += E1000_READ_REG(hw, E1000_SEC);
2987         }
2988
2989         stats->crcerrs += E1000_READ_REG(hw, E1000_CRCERRS);
2990         stats->mpc += E1000_READ_REG(hw, E1000_MPC);
2991         stats->scc += E1000_READ_REG(hw, E1000_SCC);
2992         stats->ecol += E1000_READ_REG(hw, E1000_ECOL);
2993
2994         stats->mcc += E1000_READ_REG(hw, E1000_MCC);
2995         stats->latecol += E1000_READ_REG(hw, E1000_LATECOL);
2996         stats->colc += E1000_READ_REG(hw, E1000_COLC);
2997         stats->dc += E1000_READ_REG(hw, E1000_DC);
2998         stats->rlec += E1000_READ_REG(hw, E1000_RLEC);
2999         stats->xonrxc += E1000_READ_REG(hw, E1000_XONRXC);
3000         stats->xontxc += E1000_READ_REG(hw, E1000_XONTXC);
3001
3002         /*
3003          * For watchdog management we need to know if we have been
3004          * paused during the last interval, so capture that here.
3005          */ 
3006         sc->pause_frames = E1000_READ_REG(hw, E1000_XOFFRXC);
3007         stats->xoffrxc += sc->pause_frames;
3008         stats->xofftxc += E1000_READ_REG(hw, E1000_XOFFTXC);
3009         stats->fcruc += E1000_READ_REG(hw, E1000_FCRUC);
3010         stats->prc64 += E1000_READ_REG(hw, E1000_PRC64);
3011         stats->prc127 += E1000_READ_REG(hw, E1000_PRC127);
3012         stats->prc255 += E1000_READ_REG(hw, E1000_PRC255);
3013         stats->prc511 += E1000_READ_REG(hw, E1000_PRC511);
3014         stats->prc1023 += E1000_READ_REG(hw, E1000_PRC1023);
3015         stats->prc1522 += E1000_READ_REG(hw, E1000_PRC1522);
3016         stats->gprc += E1000_READ_REG(hw, E1000_GPRC);
3017         stats->bprc += E1000_READ_REG(hw, E1000_BPRC);
3018         stats->mprc += E1000_READ_REG(hw, E1000_MPRC);
3019         stats->gptc += E1000_READ_REG(hw, E1000_GPTC);
3020
3021         /* For the 64-bit byte counters the low dword must be read first. */
3022         /* Both registers clear on the read of the high dword */
3023
3024         stats->gorc += E1000_READ_REG(hw, E1000_GORCL) +
3025             ((uint64_t)E1000_READ_REG(hw, E1000_GORCH) << 32);
3026         stats->gotc += E1000_READ_REG(hw, E1000_GOTCL) +
3027             ((uint64_t)E1000_READ_REG(hw, E1000_GOTCH) << 32);
3028
3029         stats->rnbc += E1000_READ_REG(hw, E1000_RNBC);
3030         stats->ruc += E1000_READ_REG(hw, E1000_RUC);
3031         stats->rfc += E1000_READ_REG(hw, E1000_RFC);
3032         stats->roc += E1000_READ_REG(hw, E1000_ROC);
3033         stats->rjc += E1000_READ_REG(hw, E1000_RJC);
3034
3035         stats->tor += E1000_READ_REG(hw, E1000_TORH);
3036         stats->tot += E1000_READ_REG(hw, E1000_TOTH);
3037
3038         stats->tpr += E1000_READ_REG(hw, E1000_TPR);
3039         stats->tpt += E1000_READ_REG(hw, E1000_TPT);
3040         stats->ptc64 += E1000_READ_REG(hw, E1000_PTC64);
3041         stats->ptc127 += E1000_READ_REG(hw, E1000_PTC127);
3042         stats->ptc255 += E1000_READ_REG(hw, E1000_PTC255);
3043         stats->ptc511 += E1000_READ_REG(hw, E1000_PTC511);
3044         stats->ptc1023 += E1000_READ_REG(hw, E1000_PTC1023);
3045         stats->ptc1522 += E1000_READ_REG(hw, E1000_PTC1522);
3046         stats->mptc += E1000_READ_REG(hw, E1000_MPTC);
3047         stats->bptc += E1000_READ_REG(hw, E1000_BPTC);
3048
3049         /* Interrupt Counts */
3050
3051         stats->iac += E1000_READ_REG(hw, E1000_IAC);
3052         stats->icrxptc += E1000_READ_REG(hw, E1000_ICRXPTC);
3053         stats->icrxatc += E1000_READ_REG(hw, E1000_ICRXATC);
3054         stats->ictxptc += E1000_READ_REG(hw, E1000_ICTXPTC);
3055         stats->ictxatc += E1000_READ_REG(hw, E1000_ICTXATC);
3056         stats->ictxqec += E1000_READ_REG(hw, E1000_ICTXQEC);
3057         stats->ictxqmtc += E1000_READ_REG(hw, E1000_ICTXQMTC);
3058         stats->icrxdmtc += E1000_READ_REG(hw, E1000_ICRXDMTC);
3059         stats->icrxoc += E1000_READ_REG(hw, E1000_ICRXOC);
3060
3061         /* Host to Card Statistics */
3062
3063         stats->cbtmpc += E1000_READ_REG(hw, E1000_CBTMPC);
3064         stats->htdpmc += E1000_READ_REG(hw, E1000_HTDPMC);
3065         stats->cbrdpc += E1000_READ_REG(hw, E1000_CBRDPC);
3066         stats->cbrmpc += E1000_READ_REG(hw, E1000_CBRMPC);
3067         stats->rpthc += E1000_READ_REG(hw, E1000_RPTHC);
3068         stats->hgptc += E1000_READ_REG(hw, E1000_HGPTC);
3069         stats->htcbdpc += E1000_READ_REG(hw, E1000_HTCBDPC);
3070         stats->hgorc += (E1000_READ_REG(hw, E1000_HGORCL) +
3071             ((uint64_t)E1000_READ_REG(hw, E1000_HGORCH) << 32));
3072         stats->hgotc += (E1000_READ_REG(hw, E1000_HGOTCL) +
3073             ((uint64_t)E1000_READ_REG(hw, E1000_HGOTCH) << 32));
3074         stats->lenerrs += E1000_READ_REG(hw, E1000_LENERRS);
3075         stats->scvpc += E1000_READ_REG(hw, E1000_SCVPC);
3076         stats->hrmpc += E1000_READ_REG(hw, E1000_HRMPC);
3077
3078         stats->algnerrc += E1000_READ_REG(hw, E1000_ALGNERRC);
3079         stats->rxerrc += E1000_READ_REG(hw, E1000_RXERRC);
3080         stats->tncrs += E1000_READ_REG(hw, E1000_TNCRS);
3081         stats->cexterr += E1000_READ_REG(hw, E1000_CEXTERR);
3082         stats->tsctc += E1000_READ_REG(hw, E1000_TSCTC);
3083         stats->tsctfc += E1000_READ_REG(hw, E1000_TSCTFC);
3084
3085         IFNET_STAT_SET(ifp, collisions, stats->colc);
3086
3087         /* Rx Errors */
3088         IFNET_STAT_SET(ifp, ierrors,
3089             stats->rxerrc + stats->crcerrs + stats->algnerrc +
3090             stats->ruc + stats->roc + stats->mpc + stats->cexterr);
3091
3092         /* Tx Errors */
3093         IFNET_STAT_SET(ifp, oerrors,
3094             stats->ecol + stats->latecol + sc->watchdog_events);
3095
3096         /* Driver specific counters */
3097         sc->device_control = E1000_READ_REG(hw, E1000_CTRL);
3098         sc->rx_control = E1000_READ_REG(hw, E1000_RCTL);
3099         sc->int_mask = E1000_READ_REG(hw, E1000_IMS);
3100         sc->eint_mask = E1000_READ_REG(hw, E1000_EIMS);
3101         sc->packet_buf_alloc_tx =
3102             ((E1000_READ_REG(hw, E1000_PBA) & 0xffff0000) >> 16);
3103         sc->packet_buf_alloc_rx =
3104             (E1000_READ_REG(hw, E1000_PBA) & 0xffff);
3105 }
3106
3107 static void
3108 igb_vf_init_stats(struct igb_softc *sc)
3109 {
3110         struct e1000_hw *hw = &sc->hw;
3111         struct e1000_vf_stats *stats;
3112
3113         stats = sc->stats;
3114         stats->last_gprc = E1000_READ_REG(hw, E1000_VFGPRC);
3115         stats->last_gorc = E1000_READ_REG(hw, E1000_VFGORC);
3116         stats->last_gptc = E1000_READ_REG(hw, E1000_VFGPTC);
3117         stats->last_gotc = E1000_READ_REG(hw, E1000_VFGOTC);
3118         stats->last_mprc = E1000_READ_REG(hw, E1000_VFMPRC);
3119 }
3120  
3121 static void
3122 igb_update_vf_stats_counters(struct igb_softc *sc)
3123 {
3124         struct e1000_hw *hw = &sc->hw;
3125         struct e1000_vf_stats *stats;
3126
3127         if (sc->link_speed == 0)
3128                 return;
3129
3130         stats = sc->stats;
3131         UPDATE_VF_REG(E1000_VFGPRC, stats->last_gprc, stats->gprc);
3132         UPDATE_VF_REG(E1000_VFGORC, stats->last_gorc, stats->gorc);
3133         UPDATE_VF_REG(E1000_VFGPTC, stats->last_gptc, stats->gptc);
3134         UPDATE_VF_REG(E1000_VFGOTC, stats->last_gotc, stats->gotc);
3135         UPDATE_VF_REG(E1000_VFMPRC, stats->last_mprc, stats->mprc);
3136 }
3137
3138 #ifdef IFPOLL_ENABLE
3139
3140 static void
3141 igb_npoll_status(struct ifnet *ifp)
3142 {
3143         struct igb_softc *sc = ifp->if_softc;
3144         uint32_t reg_icr;
3145
3146         ASSERT_SERIALIZED(&sc->main_serialize);
3147
3148         reg_icr = E1000_READ_REG(&sc->hw, E1000_ICR);
3149         if (reg_icr & (E1000_ICR_RXSEQ | E1000_ICR_LSC)) {
3150                 sc->hw.mac.get_link_status = 1;
3151                 igb_update_link_status(sc);
3152         }
3153 }
3154
3155 static void
3156 igb_npoll_tx(struct ifnet *ifp, void *arg, int cycle __unused)
3157 {
3158         struct igb_tx_ring *txr = arg;
3159
3160         ASSERT_SERIALIZED(&txr->tx_serialize);
3161
3162         igb_txeof(txr);
3163         if (!ifsq_is_empty(txr->ifsq))
3164                 ifsq_devstart(txr->ifsq);
3165 }
3166
3167 static void
3168 igb_npoll_rx(struct ifnet *ifp __unused, void *arg, int cycle)
3169 {
3170         struct igb_rx_ring *rxr = arg;
3171
3172         ASSERT_SERIALIZED(&rxr->rx_serialize);
3173
3174         igb_rxeof(rxr, cycle);
3175 }
3176
3177 static void
3178 igb_npoll(struct ifnet *ifp, struct ifpoll_info *info)
3179 {
3180         struct igb_softc *sc = ifp->if_softc;
3181         int i, txr_cnt, rxr_cnt;
3182
3183         ASSERT_IFNET_SERIALIZED_ALL(ifp);
3184
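             /*
              * A non-NULL info means polling is being enabled: register
              * the status handler and hook each TX/RX ring into its
              * polling CPU slot (ring index + configured offset).  A NULL
              * info means polling is being disabled: restore the interrupt
              * CPU bindings and, if the interface is running, re-enable
              * interrupts (or reinitialize if the in-use ring count
              * changes).
              */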
3185         if (info) {
3186                 int off;
3187
3188                 info->ifpi_status.status_func = igb_npoll_status;
3189                 info->ifpi_status.serializer = &sc->main_serialize;
3190
3191                 txr_cnt = igb_get_txring_inuse(sc, TRUE);
3192                 off = sc->tx_npoll_off;
3193                 for (i = 0; i < txr_cnt; ++i) {
3194                         struct igb_tx_ring *txr = &sc->tx_rings[i];
3195                         int idx = i + off;
3196
3197                         KKASSERT(idx < ncpus2);
3198                         info->ifpi_tx[idx].poll_func = igb_npoll_tx;
3199                         info->ifpi_tx[idx].arg = txr;
3200                         info->ifpi_tx[idx].serializer = &txr->tx_serialize;
3201                         ifsq_set_cpuid(txr->ifsq, idx);
3202                 }
3203
3204                 rxr_cnt = igb_get_rxring_inuse(sc, TRUE);
3205                 off = sc->rx_npoll_off;
3206                 for (i = 0; i < rxr_cnt; ++i) {
3207                         struct igb_rx_ring *rxr = &sc->rx_rings[i];
3208                         int idx = i + off;
3209
3210                         KKASSERT(idx < ncpus2);
3211                         info->ifpi_rx[idx].poll_func = igb_npoll_rx;
3212                         info->ifpi_rx[idx].arg = rxr;
3213                         info->ifpi_rx[idx].serializer = &rxr->rx_serialize;
3214                 }
3215
3216                 if (ifp->if_flags & IFF_RUNNING) {
3217                         if (rxr_cnt == sc->rx_ring_inuse &&
3218                             txr_cnt == sc->tx_ring_inuse) {
3219                                 igb_set_timer_cpuid(sc, TRUE);
3220                                 igb_disable_intr(sc);
3221                         } else {
3222                                 igb_init(sc);
3223                         }
3224                 }
3225         } else {
3226                 for (i = 0; i < sc->tx_ring_cnt; ++i) {
3227                         struct igb_tx_ring *txr = &sc->tx_rings[i];
3228
3229                         ifsq_set_cpuid(txr->ifsq, txr->tx_intr_cpuid);
3230                 }
3231
3232                 if (ifp->if_flags & IFF_RUNNING) {
3233                         txr_cnt = igb_get_txring_inuse(sc, FALSE);
3234                         rxr_cnt = igb_get_rxring_inuse(sc, FALSE);
3235
3236                         if (rxr_cnt == sc->rx_ring_inuse &&
3237                             txr_cnt == sc->tx_ring_inuse) {
3238                                 igb_set_timer_cpuid(sc, FALSE);
3239                                 igb_enable_intr(sc);
3240                         } else {
3241                                 igb_init(sc);
3242                         }
3243                 }
3244         }
3245 }
3246
3247 #endif /* IFPOLL_ENABLE */
3248
3249 static void
3250 igb_intr(void *xsc)
3251 {
3252         struct igb_softc *sc = xsc;
3253         struct ifnet *ifp = &sc->arpcom.ac_if;
3254         uint32_t eicr;
3255
3256         ASSERT_SERIALIZED(&sc->main_serialize);
3257
3258         eicr = E1000_READ_REG(&sc->hw, E1000_EICR);
3259
3260         if (eicr == 0)
3261                 return;
3262
3263         if (ifp->if_flags & IFF_RUNNING) {
3264                 struct igb_tx_ring *txr = &sc->tx_rings[0];
3265                 int i;
3266
3267                 for (i = 0; i < sc->rx_ring_inuse; ++i) {
3268                         struct igb_rx_ring *rxr = &sc->rx_rings[i];
3269
3270                         if (eicr & rxr->rx_intr_mask) {
3271                                 lwkt_serialize_enter(&rxr->rx_serialize);
3272                                 igb_rxeof(rxr, -1);
3273                                 lwkt_serialize_exit(&rxr->rx_serialize);
3274                         }
3275                 }
3276
3277                 if (eicr & txr->tx_intr_mask) {
3278                         lwkt_serialize_enter(&txr->tx_serialize);
3279                         igb_txeof(txr);
3280                         if (!ifsq_is_empty(txr->ifsq))
3281                                 ifsq_devstart(txr->ifsq);
3282                         lwkt_serialize_exit(&txr->tx_serialize);
3283                 }
3284         }
3285
3286         if (eicr & E1000_EICR_OTHER) {
3287                 uint32_t icr = E1000_READ_REG(&sc->hw, E1000_ICR);
3288
3289                 /* Link status change */
3290                 if (icr & E1000_ICR_LSC) {
3291                         sc->hw.mac.get_link_status = 1;
3292                         igb_update_link_status(sc);
3293                 }
3294         }
3295
3296         /*
3297          * Reading EICR has the side effect of clearing the interrupt
3298          * mask, so all interrupts need to be re-enabled here.
3299          */
3300         E1000_WRITE_REG(&sc->hw, E1000_EIMS, sc->intr_mask);
3301 }
3302
3303 static void
3304 igb_intr_shared(void *xsc)
3305 {
3306         struct igb_softc *sc = xsc;
3307         struct ifnet *ifp = &sc->arpcom.ac_if;
3308         uint32_t reg_icr;
3309
3310         ASSERT_SERIALIZED(&sc->main_serialize);
3311
3312         reg_icr = E1000_READ_REG(&sc->hw, E1000_ICR);
3313
3314         /* Hot eject?  */
3315         if (reg_icr == 0xffffffff)
3316                 return;
3317
3318         /* Definitely not our interrupt.  */
3319         if (reg_icr == 0x0)
3320                 return;
3321
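             /*
              * On a shared line interrupt INT_ASSERTED tells us that this
              * device actually raised the interrupt; without it the
              * interrupt belongs to another device sharing the line.
              */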
3322         if ((reg_icr & E1000_ICR_INT_ASSERTED) == 0)
3323                 return;
3324
3325         if (ifp->if_flags & IFF_RUNNING) {
3326                 if (reg_icr &
3327                     (E1000_ICR_RXT0 | E1000_ICR_RXDMT0 | E1000_ICR_RXO)) {
3328                         int i;
3329
3330                         for (i = 0; i < sc->rx_ring_inuse; ++i) {
3331                                 struct igb_rx_ring *rxr = &sc->rx_rings[i];
3332
3333                                 lwkt_serialize_enter(&rxr->rx_serialize);
3334                                 igb_rxeof(rxr, -1);
3335                                 lwkt_serialize_exit(&rxr->rx_serialize);
3336                         }
3337                 }
3338
3339                 if (reg_icr & E1000_ICR_TXDW) {
3340                         struct igb_tx_ring *txr = &sc->tx_rings[0];
3341
3342                         lwkt_serialize_enter(&txr->tx_serialize);
3343                         igb_txeof(txr);
3344                         if (!ifsq_is_empty(txr->ifsq))
3345                                 ifsq_devstart(txr->ifsq);
3346                         lwkt_serialize_exit(&txr->tx_serialize);
3347                 }
3348         }
3349
3350         /* Link status change */
3351         if (reg_icr & (E1000_ICR_RXSEQ | E1000_ICR_LSC)) {
3352                 sc->hw.mac.get_link_status = 1;
3353                 igb_update_link_status(sc);
3354         }
3355
3356         if (reg_icr & E1000_ICR_RXO)
3357                 sc->rx_overruns++;
3358 }
3359
3360 static int
3361 igb_encap(struct igb_tx_ring *txr, struct mbuf **m_headp,
3362     int *segs_used, int *idx)
3363 {
3364         bus_dma_segment_t segs[IGB_MAX_SCATTER];
3365         bus_dmamap_t map;
3366         struct igb_tx_buf *tx_buf, *tx_buf_mapped;
3367         union e1000_adv_tx_desc *txd = NULL;
3368         struct mbuf *m_head = *m_headp;
3369         uint32_t olinfo_status = 0, cmd_type_len = 0, cmd_rs = 0;
3370         int maxsegs, nsegs, i, j, error;
3371         uint32_t hdrlen = 0;
3372
3373         if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3374                 error = igb_tso_pullup(txr, m_headp);
3375                 if (error)
3376                         return error;
3377                 m_head = *m_headp;
3378         }
3379
3380         /* Set basic descriptor constants */
3381         cmd_type_len |= E1000_ADVTXD_DTYP_DATA;
3382         cmd_type_len |= E1000_ADVTXD_DCMD_IFCS | E1000_ADVTXD_DCMD_DEXT;
3383         if (m_head->m_flags & M_VLANTAG)
3384                 cmd_type_len |= E1000_ADVTXD_DCMD_VLE;
3385
3386         /*
3387          * Map the packet for DMA.
3388          */
3389         tx_buf = &txr->tx_buf[txr->next_avail_desc];
3390         tx_buf_mapped = tx_buf;
3391         map = tx_buf->map;
3392
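             /*
              * Limit the scatter/gather list to what the ring can still
              * hold (keeping IGB_TX_RESERVED descriptors back) and to the
              * hardware scatter limit; bus_dmamap_load_mbuf_defrag() will
              * defragment the chain if it does not fit.
              */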
3393         maxsegs = txr->tx_avail - IGB_TX_RESERVED;
3394         KASSERT(maxsegs >= txr->spare_desc, ("not enough spare TX desc\n"));
3395         if (maxsegs > IGB_MAX_SCATTER)
3396                 maxsegs = IGB_MAX_SCATTER;
3397
3398         error = bus_dmamap_load_mbuf_defrag(txr->tx_tag, map, m_headp,
3399             segs, maxsegs, &nsegs, BUS_DMA_NOWAIT);
3400         if (error) {
3401                 if (error == ENOBUFS)
3402                         txr->sc->mbuf_defrag_failed++;
3403                 else
3404                         txr->sc->no_tx_dma_setup++;
3405
3406                 m_freem(*m_headp);
3407                 *m_headp = NULL;
3408                 return error;
3409         }
3410         bus_dmamap_sync(txr->tx_tag, map, BUS_DMASYNC_PREWRITE);
3411
3412         m_head = *m_headp;
3413
3414         /*
3415          * Set up the TX context descriptor, if any hardware offloading is
3416          * needed.  This includes CSUM, VLAN, and TSO.  It will consume one
3417          * TX descriptor.
3418          *
3419          * Unlike these chips' predecessors (em/emx), the TX context
3420          * descriptor will _not_ interfere with TX data fetch pipelining.
3421          */
3422         if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3423                 igb_tso_ctx(txr, m_head, &hdrlen);
3424                 cmd_type_len |= E1000_ADVTXD_DCMD_TSE;
3425                 olinfo_status |= E1000_TXD_POPTS_IXSM << 8;
3426                 olinfo_status |= E1000_TXD_POPTS_TXSM << 8;
3427                 txr->tx_nsegs++;
3428                 (*segs_used)++;
3429         } else if (igb_txcsum_ctx(txr, m_head)) {
3430                 if (m_head->m_pkthdr.csum_flags & CSUM_IP)
3431                         olinfo_status |= (E1000_TXD_POPTS_IXSM << 8);
3432                 if (m_head->m_pkthdr.csum_flags & (CSUM_UDP | CSUM_TCP))
3433                         olinfo_status |= (E1000_TXD_POPTS_TXSM << 8);
3434                 txr->tx_nsegs++;
3435                 (*segs_used)++;
3436         }
3437
3438         *segs_used += nsegs;
3439         txr->tx_nsegs += nsegs;
3440         if (txr->tx_nsegs >= txr->intr_nsegs) {
3441                 /*
3442                  * Report Status (RS) is turned on every intr_nsegs
3443                  * descriptors (roughly).
3444                  */
3445                 txr->tx_nsegs = 0;
3446                 cmd_rs = E1000_ADVTXD_DCMD_RS;
3447         }
3448
3449         /* Calculate payload length */
3450         olinfo_status |= ((m_head->m_pkthdr.len - hdrlen)
3451             << E1000_ADVTXD_PAYLEN_SHIFT);
3452
3453         /*
3454          * 82575 needs the TX context index added; the queue
3455          * index is used as TX context index here.
3456          */
3457         if (txr->sc->hw.mac.type == e1000_82575)
3458                 olinfo_status |= txr->me << 4;
3459
3460         /* Set up our transmit descriptors */
3461         i = txr->next_avail_desc;
3462         for (j = 0; j < nsegs; j++) {
3463                 bus_size_t seg_len;
3464                 bus_addr_t seg_addr;
3465
3466                 tx_buf = &txr->tx_buf[i];
3467                 txd = (union e1000_adv_tx_desc *)&txr->tx_base[i];
3468                 seg_addr = segs[j].ds_addr;
3469                 seg_len = segs[j].ds_len;
3470
3471                 txd->read.buffer_addr = htole64(seg_addr);
3472                 txd->read.cmd_type_len = htole32(cmd_type_len | seg_len);
3473                 txd->read.olinfo_status = htole32(olinfo_status);
3474                 if (++i == txr->num_tx_desc)
3475                         i = 0;
3476                 tx_buf->m_head = NULL;
3477         }
3478
3479         KASSERT(txr->tx_avail > nsegs, ("invalid avail TX desc\n"));
3480         txr->next_avail_desc = i;
3481         txr->tx_avail -= nsegs;
3482
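             /*
              * The DMA map that was loaded belongs to the first buffer
              * slot of this packet; swap maps so the loaded map stays with
              * the last slot (which owns m_head and is unloaded at txeof
              * time) and the last slot's old map is parked in the first.
              */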
3483         tx_buf->m_head = m_head;
3484         tx_buf_mapped->map = tx_buf->map;
3485         tx_buf->map = map;
3486
3487         /*
3488          * Last Descriptor of Packet needs End Of Packet (EOP)
3489          */
3490         txd->read.cmd_type_len |= htole32(E1000_ADVTXD_DCMD_EOP | cmd_rs);
3491
3492         /*
3493          * Defer TDT updating until enough descriptors are set up.
3494          */
3495         *idx = i;
3496 #ifdef IGB_TSS_DEBUG
3497         ++txr->tx_packets;
3498 #endif
3499
3500         return 0;
3501 }
3502
3503 static void
3504 igb_start(struct ifnet *ifp, struct ifaltq_subque *ifsq)
3505 {
3506         struct igb_softc *sc = ifp->if_softc;
3507         struct igb_tx_ring *txr = ifsq_get_priv(ifsq);
3508         struct mbuf *m_head;
3509         int idx = -1, nsegs = 0;
3510
3511         KKASSERT(txr->ifsq == ifsq);
3512         ASSERT_SERIALIZED(&txr->tx_serialize);
3513
3514         if ((ifp->if_flags & IFF_RUNNING) == 0 || ifsq_is_oactive(ifsq))
3515                 return;
3516
3517         if (!sc->link_active || (txr->tx_flags & IGB_TXFLAG_ENABLED) == 0) {
3518                 ifsq_purge(ifsq);
3519                 return;
3520         }
3521
3522         if (!IGB_IS_NOT_OACTIVE(txr))
3523                 igb_txeof(txr);
3524
3525         while (!ifsq_is_empty(ifsq)) {
3526                 if (IGB_IS_OACTIVE(txr)) {
3527                         ifsq_set_oactive(ifsq);
3528                         /* Set watchdog on */
3529                         txr->tx_watchdog.wd_timer = 5;
3530                         break;
3531                 }
3532
3533                 m_head = ifsq_dequeue(ifsq);
3534                 if (m_head == NULL)
3535                         break;
3536
3537                 if (igb_encap(txr, &m_head, &nsegs, &idx)) {
3538                         IFNET_STAT_INC(ifp, oerrors, 1);
3539                         continue;
3540                 }
3541
3542                 /*
3543                  * TX interrupts are aggressively aggregated, so increasing
3544                  * opackets at TX interrupt time will make the opackets
3545                  * statistics vastly inaccurate; we do the opackets increment
3546                  * now.
3547                  */
3548                 IFNET_STAT_INC(ifp, opackets, 1);
3549
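                     /*
                      * Batch TDT (tail) register writes: only kick the
                      * hardware once wreg_nsegs descriptors have been
                      * queued; any remainder is flushed after the loop.
                      */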
3550                 if (nsegs >= txr->wreg_nsegs) {
3551                         E1000_WRITE_REG(&txr->sc->hw, E1000_TDT(txr->me), idx);
3552                         idx = -1;
3553                         nsegs = 0;
3554                 }
3555
3556                 /* Send a copy of the frame to the BPF listener */
3557                 ETHER_BPF_MTAP(ifp, m_head);
3558         }
3559         if (idx >= 0)
3560                 E1000_WRITE_REG(&txr->sc->hw, E1000_TDT(txr->me), idx);
3561 }
3562
3563 static void
3564 igb_watchdog(struct ifaltq_subque *ifsq)
3565 {
3566         struct igb_tx_ring *txr = ifsq_get_priv(ifsq);
3567         struct ifnet *ifp = ifsq_get_ifp(ifsq);
3568         struct igb_softc *sc = ifp->if_softc;
3569         int i;
3570
3571         KKASSERT(txr->ifsq == ifsq);
3572         ASSERT_IFNET_SERIALIZED_ALL(ifp);
3573
3574         /*
3575          * If flow control has paused us since we last checked,
3576          * it invalidates the watchdog timing, so don't run it.
3577          */
3578         if (sc->pause_frames) {
3579                 sc->pause_frames = 0;
3580                 txr->tx_watchdog.wd_timer = 5;
3581                 return;
3582         }
3583
3584         if_printf(ifp, "Watchdog timeout -- resetting\n");
3585         if_printf(ifp, "Queue(%d) tdh = %d, hw tdt = %d\n", txr->me,
3586             E1000_READ_REG(&sc->hw, E1000_TDH(txr->me)),
3587             E1000_READ_REG(&sc->hw, E1000_TDT(txr->me)));
3588         if_printf(ifp, "TX(%d) desc avail = %d, "
3589             "Next TX to Clean = %d\n",
3590             txr->me, txr->tx_avail, txr->next_to_clean);
3591
3592         IFNET_STAT_INC(ifp, oerrors, 1);
3593         sc->watchdog_events++;
3594
3595         igb_init(sc);
3596         for (i = 0; i < sc->tx_ring_inuse; ++i)
3597                 ifsq_devstart_sched(sc->tx_rings[i].ifsq);
3598 }
3599
3600 static void
3601 igb_set_eitr(struct igb_softc *sc, int idx, int rate)
3602 {
3603         uint32_t eitr = 0;
3604
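             /*
              * Convert interrupts/second into an EITR interval: the 82575
              * counts the interval in 256ns units, while later MACs take a
              * microsecond value shifted into the interval field.  A rate
              * of 0 leaves the interval at 0, i.e. moderation disabled.
              */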
3605         if (rate > 0) {
3606                 if (sc->hw.mac.type == e1000_82575) {
3607                         eitr = 1000000000 / 256 / rate;
3608                         /*
3609                          * NOTE:
3610                          * The datasheet is wrong about the 2-bit left shift.
3611                          */
3612                 } else {
3613                         eitr = 1000000 / rate;
3614                         eitr <<= IGB_EITR_INTVL_SHIFT;
3615                 }
3616
3617                 if (eitr == 0) {
3618                         /* Don't disable it */
3619                         eitr = 1 << IGB_EITR_INTVL_SHIFT;
3620                 } else if (eitr > IGB_EITR_INTVL_MASK) {
3621                         /* Don't allow it to be too large */
3622                         eitr = IGB_EITR_INTVL_MASK;
3623                 }
3624         }
3625         if (sc->hw.mac.type == e1000_82575)
3626                 eitr |= eitr << 16;
3627         else
3628                 eitr |= E1000_EITR_CNT_IGNR;
3629         E1000_WRITE_REG(&sc->hw, E1000_EITR(idx), eitr);
3630 }
3631
3632 static int
3633 igb_sysctl_intr_rate(SYSCTL_HANDLER_ARGS)
3634 {
3635         struct igb_softc *sc = (void *)arg1;
3636         struct ifnet *ifp = &sc->arpcom.ac_if;
3637         int error, intr_rate;
3638
3639         intr_rate = sc->intr_rate;
3640         error = sysctl_handle_int(oidp, &intr_rate, 0, req);
3641         if (error || req->newptr == NULL)
3642                 return error;
3643         if (intr_rate < 0)
3644                 return EINVAL;
3645
3646         ifnet_serialize_all(ifp);
3647
3648         sc->intr_rate = intr_rate;
3649         if (ifp->if_flags & IFF_RUNNING)
3650                 igb_set_eitr(sc, 0, sc->intr_rate);
3651
3652         if (bootverbose)
3653                 if_printf(ifp, "interrupt rate set to %d/sec\n", sc->intr_rate);
3654
3655         ifnet_deserialize_all(ifp);
3656
3657         return 0;
3658 }
3659
3660 static int
3661 igb_sysctl_msix_rate(SYSCTL_HANDLER_ARGS)
3662 {
3663         struct igb_msix_data *msix = (void *)arg1;
3664         struct igb_softc *sc = msix->msix_sc;
3665         struct ifnet *ifp = &sc->arpcom.ac_if;
3666         int error, msix_rate;
3667
3668         msix_rate = msix->msix_rate;
3669         error = sysctl_handle_int(oidp, &msix_rate, 0, req);
3670         if (error || req->newptr == NULL)
3671                 return error;
3672         if (msix_rate < 0)
3673                 return EINVAL;
3674
3675         lwkt_serialize_enter(msix->msix_serialize);
3676
3677         msix->msix_rate = msix_rate;
3678         if (ifp->if_flags & IFF_RUNNING)
3679                 igb_set_eitr(sc, msix->msix_vector, msix->msix_rate);
3680
3681         if (bootverbose) {
3682                 if_printf(ifp, "%s set to %d/sec\n", msix->msix_rate_desc,
3683                     msix->msix_rate);
3684         }
3685
3686         lwkt_serialize_exit(msix->msix_serialize);
3687
3688         return 0;
3689 }
3690
3691 static int
3692 igb_sysctl_tx_intr_nsegs(SYSCTL_HANDLER_ARGS)
3693 {
3694         struct igb_softc *sc = (void *)arg1;
3695         struct ifnet *ifp = &sc->arpcom.ac_if;
3696         struct igb_tx_ring *txr = &sc->tx_rings[0];
3697         int error, nsegs;
3698
3699         nsegs = txr->intr_nsegs;
3700         error = sysctl_handle_int(oidp, &nsegs, 0, req);
3701         if (error || req->newptr == NULL)
3702                 return error;
3703         if (nsegs <= 0)
3704                 return EINVAL;
3705
3706         ifnet_serialize_all(ifp);
3707
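             /*
              * Sanity limit: intr_nsegs must stay well below the ring
              * size, roughly so that an RS-tagged descriptor completes
              * (and frees descriptors) before the ring reaches its
              * OACTIVE thresholds.
              */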
3708         if (nsegs >= txr->num_tx_desc - txr->oact_lo_desc ||
3709             nsegs >= txr->oact_hi_desc - IGB_MAX_SCATTER) {
3710                 error = EINVAL;
3711         } else {
3712                 int i;
3713
3714                 error = 0;
3715                 for (i = 0; i < sc->tx_ring_cnt; ++i)
3716                         sc->tx_rings[i].intr_nsegs = nsegs;
3717         }
3718
3719         ifnet_deserialize_all(ifp);
3720
3721         return error;
3722 }
3723
3724 static int
3725 igb_sysctl_rx_wreg_nsegs(SYSCTL_HANDLER_ARGS)
3726 {
3727         struct igb_softc *sc = (void *)arg1;
3728         struct ifnet *ifp = &sc->arpcom.ac_if;
3729         int error, nsegs, i;
3730
3731         nsegs = sc->rx_rings[0].wreg_nsegs;
3732         error = sysctl_handle_int(oidp, &nsegs, 0, req);
3733         if (error || req->newptr == NULL)
3734                 return error;
3735
3736         ifnet_serialize_all(ifp);
3737         for (i = 0; i < sc->rx_ring_cnt; ++i)
3738                 sc->rx_rings[i].wreg_nsegs = nsegs;
3739         ifnet_deserialize_all(ifp);
3740
3741         return 0;
3742 }
3743
3744 static int
3745 igb_sysctl_tx_wreg_nsegs(SYSCTL_HANDLER_ARGS)
3746 {
3747         struct igb_softc *sc = (void *)arg1;
3748         struct ifnet *ifp = &sc->arpcom.ac_if;
3749         int error, nsegs, i;
3750
3751         nsegs = sc->tx_rings[0].wreg_nsegs;
3752         error = sysctl_handle_int(oidp, &nsegs, 0, req);
3753         if (error || req->newptr == NULL)
3754                 return error;
3755
3756         ifnet_serialize_all(ifp);
3757         for (i = 0; i < sc->tx_ring_cnt; ++i)
3758                 sc->tx_rings[i].wreg_nsegs = nsegs;
3759         ifnet_deserialize_all(ifp);
3760
3761         return 0;
3762 }
3763
3764 #ifdef IFPOLL_ENABLE
3765
3766 static int
3767 igb_sysctl_npoll_rxoff(SYSCTL_HANDLER_ARGS)
3768 {
3769         struct igb_softc *sc = (void *)arg1;
3770         struct ifnet *ifp = &sc->arpcom.ac_if;
3771         int error, off;
3772
3773         off = sc->rx_npoll_off;
3774         error = sysctl_handle_int(oidp, &off, 0, req);
3775         if (error || req->newptr == NULL)
3776                 return error;
3777         if (off < 0)
3778                 return EINVAL;
3779
3780         ifnet_serialize_all(ifp);
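             /*
              * The offset must be below ncpus2 and a multiple of the RX
              * ring count so that the rings land on distinct, valid
              * polling CPUs.
              */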
3781         if (off >= ncpus2 || off % sc->rx_ring_cnt != 0) {
3782                 error = EINVAL;
3783         } else {
3784                 error = 0;
3785                 sc->rx_npoll_off = off;
3786         }
3787         ifnet_deserialize_all(ifp);
3788
3789         return error;
3790 }
3791
3792 static int
3793 igb_sysctl_npoll_txoff(SYSCTL_HANDLER_ARGS)
3794 {
3795         struct igb_softc *sc = (void *)arg1;
3796         struct ifnet *ifp = &sc->arpcom.ac_if;
3797         int error, off;
3798
3799         off = sc->tx_npoll_off;
3800         error = sysctl_handle_int(oidp, &off, 0, req);
3801         if (error || req->newptr == NULL)
3802                 return error;
3803         if (off < 0)
3804                 return EINVAL;
3805
3806         ifnet_serialize_all(ifp);
3807         if (off >= ncpus2 || off % sc->tx_ring_cnt != 0) {
3808                 error = EINVAL;
3809         } else {
3810                 error = 0;
3811                 sc->tx_npoll_off = off;
3812         }
3813         ifnet_deserialize_all(ifp);
3814
3815         return error;
3816 }
3817
3818 #endif  /* IFPOLL_ENABLE */
3819
3820 static void
3821 igb_init_intr(struct igb_softc *sc)
3822 {
3823         igb_set_intr_mask(sc);
3824
3825         if ((sc->flags & IGB_FLAG_SHARED_INTR) == 0)
3826                 igb_init_unshared_intr(sc);
3827
3828         if (sc->intr_type != PCI_INTR_TYPE_MSIX) {
3829                 igb_set_eitr(sc, 0, sc->intr_rate);
3830         } else {
3831                 int i;
3832
3833                 for (i = 0; i < sc->msix_cnt; ++i)
3834                         igb_set_eitr(sc, i, sc->msix_data[i].msix_rate);
3835         }
3836 }
3837
3838 static void
3839 igb_init_unshared_intr(struct igb_softc *sc)
3840 {
3841         struct e1000_hw *hw = &sc->hw;
3842         const struct igb_rx_ring *rxr;
3843         const struct igb_tx_ring *txr;
3844         uint32_t ivar, index;
3845         int i;
3846
3847         /*
3848          * Enable extended mode
3849          */
3850         if (sc->hw.mac.type != e1000_82575) {
3851                 uint32_t gpie;
3852                 int ivar_max;
3853
3854                 gpie = E1000_GPIE_NSICR;
3855                 if (sc->intr_type == PCI_INTR_TYPE_MSIX) {
3856                         gpie |= E1000_GPIE_MSIX_MODE |
3857                             E1000_GPIE_EIAME |
3858                             E1000_GPIE_PBA;
3859                 }
3860                 E1000_WRITE_REG(hw, E1000_GPIE, gpie);
3861
3862                 /*
3863                  * Clear IVARs
3864                  */
3865                 switch (sc->hw.mac.type) {
3866                 case e1000_82576:
3867                         ivar_max = IGB_MAX_IVAR_82576;
3868                         break;
3869
3870                 case e1000_82580:
3871                         ivar_max = IGB_MAX_IVAR_82580;
3872                         break;
3873
3874                 case e1000_i350:
3875                         ivar_max = IGB_MAX_IVAR_I350;
3876                         break;
3877
3878                 case e1000_i354:
3879                         ivar_max = IGB_MAX_IVAR_I354;
3880                         break;
3881
3882                 case e1000_vfadapt:
3883                 case e1000_vfadapt_i350:
3884                         ivar_max = IGB_MAX_IVAR_VF;
3885                         break;
3886
3887                 case e1000_i210:
3888                         ivar_max = IGB_MAX_IVAR_I210;
3889                         break;
3890
3891                 case e1000_i211:
3892                         ivar_max = IGB_MAX_IVAR_I211;
3893                         break;
3894
3895                 default:
3896                         panic("unknown mac type %d\n", sc->hw.mac.type);
3897                 }
3898                 for (i = 0; i < ivar_max; ++i)
3899                         E1000_WRITE_REG_ARRAY(hw, E1000_IVAR0, i, 0);
3900                 E1000_WRITE_REG(hw, E1000_IVAR_MISC, 0);
3901         } else {
3902                 uint32_t tmp;
3903
3904                 KASSERT(sc->intr_type != PCI_INTR_TYPE_MSIX,
3905                     ("82575 w/ MSI-X"));
3906                 tmp = E1000_READ_REG(hw, E1000_CTRL_EXT);
3907                 tmp |= E1000_CTRL_EXT_IRCA;
3908                 E1000_WRITE_REG(hw, E1000_CTRL_EXT, tmp);
3909         }
3910
3911         /*
3912          * Map TX/RX interrupts to EICR
3913          */
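             /*
              * On 82580 and later MACs each IVAR register holds the
              * entries for two queues: RX bits live in bytes 0 and 2, TX
              * bits in bytes 1 and 3, hence the halved queue index and the
              * byte masks below.  The 82576 instead indexes its IVARs by
              * the low 3 bits of the queue number.
              */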
3914         switch (sc->hw.mac.type) {
3915         case e1000_82580:
3916         case e1000_i350:
3917         case e1000_i354:
3918         case e1000_vfadapt:
3919         case e1000_vfadapt_i350:
3920         case e1000_i210:
3921         case e1000_i211:
3922                 /* RX entries */
3923                 for (i = 0; i < sc->rx_ring_inuse; ++i) {
3924                         rxr = &sc->rx_rings[i];
3925
3926                         index = i >> 1;
3927                         ivar = E1000_READ_REG_ARRAY(hw, E1000_IVAR0, index);
3928
3929                         if (i & 1) {
3930                                 ivar &= 0xff00ffff;
3931                                 ivar |=
3932                                 (rxr->rx_intr_bit | E1000_IVAR_VALID) << 16;
3933                         } else {
3934                                 ivar &= 0xffffff00;
3935                                 ivar |=
3936                                 (rxr->rx_intr_bit | E1000_IVAR_VALID);
3937                         }
3938                         E1000_WRITE_REG_ARRAY(hw, E1000_IVAR0, index, ivar);
3939                 }
3940                 /* TX entries */
3941                 for (i = 0; i < sc->tx_ring_inuse; ++i) {
3942                         txr = &sc->tx_rings[i];
3943
3944                         index = i >> 1;
3945                         ivar = E1000_READ_REG_ARRAY(hw, E1000_IVAR0, index);
3946
3947                         if (i & 1) {
3948                                 ivar &= 0x00ffffff;
3949                                 ivar |=
3950                                 (txr->tx_intr_bit | E1000_IVAR_VALID) << 24;
3951                         } else {
3952                                 ivar &= 0xffff00ff;
3953                                 ivar |=
3954                                 (txr->tx_intr_bit | E1000_IVAR_VALID) << 8;
3955                         }
3956                         E1000_WRITE_REG_ARRAY(hw, E1000_IVAR0, index, ivar);
3957                 }
3958                 if (sc->intr_type == PCI_INTR_TYPE_MSIX) {
3959                         ivar = (sc->sts_intr_bit | E1000_IVAR_VALID) << 8;
3960                         E1000_WRITE_REG(hw, E1000_IVAR_MISC, ivar);
3961                 }
3962                 break;
3963
3964         case e1000_82576:
3965                 /* RX entries */
3966                 for (i = 0; i < sc->rx_ring_inuse; ++i) {
3967                         rxr = &sc->rx_rings[i];
3968
3969                         index = i & 0x7; /* Each IVAR has two entries */
3970                         ivar = E1000_READ_REG_ARRAY(hw, E1000_IVAR0, index);
3971
3972                         if (i < 8) {
3973                                 ivar &= 0xffffff00;
3974                                 ivar |=
3975                                 (rxr->rx_intr_bit | E1000_IVAR_VALID);
3976                         } else {
3977                                 ivar &= 0xff00ffff;
3978                                 ivar |=
3979                                 (rxr->rx_intr_bit | E1000_IVAR_VALID) << 16;
3980                         }
3981                         E1000_WRITE_REG_ARRAY(hw, E1000_IVAR0, index, ivar);
3982                 }
3983                 /* TX entries */
3984                 for (i = 0; i < sc->tx_ring_inuse; ++i) {
3985                         txr = &sc->tx_rings[i];
3986
3987                         index = i & 0x7; /* Each IVAR has two entries */
3988                         ivar = E1000_READ_REG_ARRAY(hw, E1000_IVAR0, index);
3989
3990                         if (i < 8) {
3991                                 ivar &= 0xffff00ff;
3992                                 ivar |=
3993                                 (txr->tx_intr_bit | E1000_IVAR_VALID) << 8;
3994                         } else {
3995                                 ivar &= 0x00ffffff;
3996                                 ivar |=
3997                                 (txr->tx_intr_bit | E1000_IVAR_VALID) << 24;
3998                         }
3999                         E1000_WRITE_REG_ARRAY(hw, E1000_IVAR0, index, ivar);
4000                 }
4001                 if (sc->intr_type == PCI_INTR_TYPE_MSIX) {
4002                         ivar = (sc->sts_intr_bit | E1000_IVAR_VALID) << 8;
4003                         E1000_WRITE_REG(hw, E1000_IVAR_MISC, ivar);
4004                 }
4005                 break;
4006
4007         case e1000_82575:
4008                 /*
4009                  * Enable necessary interrupt bits.
4010                  *
4011                  * The name of the register is confusing; in addition to
4012                  * configuring the first vector of MSI-X, it also configures
4013                  * which bits of EICR could be set by the hardware even when
4014                  * MSI or line interrupt is used; it thus controls interrupt
4015                  * generation.  It MUST be configured explicitly; the default
4016                  * value mentioned in the datasheet is wrong: RX queue0 and
4017                  * TX queue0 are NOT enabled by default.
4018                  */
4019                 E1000_WRITE_REG(&sc->hw, E1000_MSIXBM(0), sc->intr_mask);
4020                 break;
4021
4022         default:
4023                 panic("unknown mac type %d\n", sc->hw.mac.type);
4024         }
4025 }
4026
4027 static int
4028 igb_setup_intr(struct igb_softc *sc)
4029 {
4030         int error;
4031
4032         if (sc->intr_type == PCI_INTR_TYPE_MSIX)
4033                 return igb_msix_setup(sc);
4034
4035         error = bus_setup_intr(sc->dev, sc->intr_res, INTR_MPSAFE,
4036             (sc->flags & IGB_FLAG_SHARED_INTR) ? igb_intr_shared : igb_intr,
4037             sc, &sc->intr_tag, &sc->main_serialize);
4038         if (error) {
4039                 device_printf(sc->dev, "Failed to register interrupt handler\n");
4040                 return error;
4041         }
4042         return 0;
4043 }
4044
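     /*
      * On the 82575 each TX queue has a fixed EICR bit, so tx_intr_bit is
      * unused; on later MACs the next free bit (modulo intr_bitmax) is
      * handed out here and programmed into the IVARs by
      * igb_init_unshared_intr().
      */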
4045 static void
4046 igb_set_txintr_mask(struct igb_tx_ring *txr, int *intr_bit0, int intr_bitmax)
4047 {
4048         if (txr->sc->hw.mac.type == e1000_82575) {
4049                 txr->tx_intr_bit = 0;   /* unused */
4050                 switch (txr->me) {
4051                 case 0:
4052                         txr->tx_intr_mask = E1000_EICR_TX_QUEUE0;
4053                         break;
4054                 case 1:
4055                         txr->tx_intr_mask = E1000_EICR_TX_QUEUE1;
4056                         break;
4057                 case 2:
4058                         txr->tx_intr_mask = E1000_EICR_TX_QUEUE2;
4059                         break;
4060                 case 3:
4061                         txr->tx_intr_mask = E1000_EICR_TX_QUEUE3;
4062                         break;
4063                 default:
4064                         panic("unsupported # of TX ring, %d\n", txr->me);
4065                 }
4066         } else {
4067                 int intr_bit = *intr_bit0;
4068
4069                 txr->tx_intr_bit = intr_bit % intr_bitmax;
4070                 txr->tx_intr_mask = 1 << txr->tx_intr_bit;
4071
4072                 *intr_bit0 = intr_bit + 1;
4073         }
4074 }
4075
4076 static void
4077 igb_set_rxintr_mask(struct igb_rx_ring *rxr, int *intr_bit0, int intr_bitmax)
4078 {
4079         if (rxr->sc->hw.mac.type == e1000_82575) {
4080                 rxr->rx_intr_bit = 0;   /* unused */
4081                 switch (rxr->me) {
4082                 case 0:
4083                         rxr->rx_intr_mask = E1000_EICR_RX_QUEUE0;
4084                         break;
4085                 case 1:
4086                         rxr->rx_intr_mask = E1000_EICR_RX_QUEUE1;
4087                         break;
4088                 case 2:
4089                         rxr->rx_intr_mask = E1000_EICR_RX_QUEUE2;
4090                         break;
4091                 case 3:
4092                         rxr->rx_intr_mask = E1000_EICR_RX_QUEUE3;
4093                         break;
4094                 default:
4095                         panic("unsupported # of RX ring, %d\n", rxr->me);
4096                 }
4097         } else {
4098                 int intr_bit = *intr_bit0;
4099
4100                 rxr->rx_intr_bit = intr_bit % intr_bitmax;
4101                 rxr->rx_intr_mask = 1 << rxr->rx_intr_bit;
4102
4103                 *intr_bit0 = intr_bit + 1;
4104         }
4105 }
4106
4107 static void
4108 igb_serialize(struct ifnet *ifp, enum ifnet_serialize slz)
4109 {
4110         struct igb_softc *sc = ifp->if_softc;
4111
4112         ifnet_serialize_array_enter(sc->serializes, sc->serialize_cnt, slz);
4113 }
4114
4115 static void
4116 igb_deserialize(struct ifnet *ifp, enum ifnet_serialize slz)
4117 {
4118         struct igb_softc *sc = ifp->if_softc;
4119
4120         ifnet_serialize_array_exit(sc->serializes, sc->serialize_cnt, slz);
4121 }
4122
4123 static int
4124 igb_tryserialize(struct ifnet *ifp, enum ifnet_serialize slz)
4125 {
4126         struct igb_softc *sc = ifp->if_softc;
4127
4128         return ifnet_serialize_array_try(sc->serializes, sc->serialize_cnt,
4129             slz);
4130 }
4131
4132 #ifdef INVARIANTS
4133
4134 static void
4135 igb_serialize_assert(struct ifnet *ifp, enum ifnet_serialize slz,
4136     boolean_t serialized)
4137 {
4138         struct igb_softc *sc = ifp->if_softc;
4139
4140         ifnet_serialize_array_assert(sc->serializes, sc->serialize_cnt,
4141             slz, serialized);
4142 }
4143
4144 #endif  /* INVARIANTS */
4145
4146 static void
4147 igb_set_intr_mask(struct igb_softc *sc)
4148 {
4149         int i;
4150
4151         sc->intr_mask = sc->sts_intr_mask;
4152         for (i = 0; i < sc->rx_ring_inuse; ++i)
4153                 sc->intr_mask |= sc->rx_rings[i].rx_intr_mask;
4154         for (i = 0; i < sc->tx_ring_inuse; ++i)
4155                 sc->intr_mask |= sc->tx_rings[i].tx_intr_mask;
4156         if (bootverbose) {
4157                 if_printf(&sc->arpcom.ac_if, "intr mask 0x%08x\n",
4158                     sc->intr_mask);
4159         }
4160 }
4161
4162 static int
4163 igb_alloc_intr(struct igb_softc *sc)
4164 {
4165         int i, intr_bit, intr_bitmax;
4166         u_int intr_flags;
4167
4168         igb_msix_try_alloc(sc);
4169         if (sc->intr_type == PCI_INTR_TYPE_MSIX)
4170                 goto done;
4171
4172         /*
4173          * Allocate MSI/legacy interrupt resource
4174          */
4175         sc->intr_type = pci_alloc_1intr(sc->dev, igb_msi_enable,
4176             &sc->intr_rid, &intr_flags);
4177
4178         if (sc->intr_type == PCI_INTR_TYPE_LEGACY) {
4179                 int unshared;
4180
4181                 unshared = device_getenv_int(sc->dev, "irq.unshared", 0);
4182                 if (!unshared) {
4183                         sc->flags |= IGB_FLAG_SHARED_INTR;
4184                         if (bootverbose)
4185                                 device_printf(sc->dev, "IRQ shared\n");
4186                 } else {
4187                         intr_flags &= ~RF_SHAREABLE;
4188                         if (bootverbose)
4189                                 device_printf(sc->dev, "IRQ unshared\n");
4190                 }
4191         }
4192
4193         sc->intr_res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ,
4194             &sc->intr_rid, intr_flags);
4195         if (sc->intr_res == NULL) {
4196                 device_printf(sc->dev, "Unable to allocate bus resource: "
4197                     "interrupt\n");
4198                 return ENXIO;
4199         }
4200
4201         for (i = 0; i < sc->tx_ring_cnt; ++i)
4202                 sc->tx_rings[i].tx_intr_cpuid = rman_get_cpuid(sc->intr_res);
4203
4204         /*
4205          * Setup MSI/legacy interrupt mask
4206          */
4207         switch (sc->hw.mac.type) {
4208         case e1000_82575:
4209                 intr_bitmax = IGB_MAX_TXRXINT_82575;
4210                 break;
4211
4212         case e1000_82576:
4213                 intr_bitmax = IGB_MAX_TXRXINT_82576;
4214                 break;
4215
4216         case e1000_82580:
4217                 intr_bitmax = IGB_MAX_TXRXINT_82580;
4218                 break;
4219
4220         case e1000_i350:
4221                 intr_bitmax = IGB_MAX_TXRXINT_I350;
4222                 break;
4223
4224         case e1000_i354:
4225                 intr_bitmax = IGB_MAX_TXRXINT_I354;
4226                 break;
4227
4228         case e1000_i210:
4229                 intr_bitmax = IGB_MAX_TXRXINT_I210;
4230                 break;
4231
4232         case e1000_i211:
4233                 intr_bitmax = IGB_MAX_TXRXINT_I211;
4234                 break;
4235
4236         default:
4237                 intr_bitmax = IGB_MIN_TXRXINT;
4238                 break;
4239         }
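             /*
              * Hand out EICR bits round-robin: TX rings first, then RX
              * rings, wrapping at intr_bitmax.  Link/status events use the
              * separate OTHER bit.
              */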
4240         intr_bit = 0;
4241         for (i = 0; i < sc->tx_ring_cnt; ++i)
4242                 igb_set_txintr_mask(&sc->tx_rings[i], &intr_bit, intr_bitmax);
4243         for (i = 0; i < sc->rx_ring_cnt; ++i)
4244                 igb_set_rxintr_mask(&sc->rx_rings[i], &intr_bit, intr_bitmax);
4245         sc->sts_intr_bit = 0;
4246         sc->sts_intr_mask = E1000_EICR_OTHER;
4247
4248         /* Initialize interrupt rate */
4249         sc->intr_rate = IGB_INTR_RATE;
4250 done:
4251         igb_set_ring_inuse(sc, FALSE);
4252         igb_set_intr_mask(sc);
4253         return 0;
4254 }
4255
4256 static void
4257 igb_free_intr(struct igb_softc *sc)
4258 {
4259         if (sc->intr_type != PCI_INTR_TYPE_MSIX) {
4260                 if (sc->intr_res != NULL) {
4261                         bus_release_resource(sc->dev, SYS_RES_IRQ, sc->intr_rid,
4262                             sc->intr_res);
4263                 }
4264                 if (sc->intr_type == PCI_INTR_TYPE_MSI)
4265                         pci_release_msi(sc->dev);
4266         } else {
4267                 igb_msix_free(sc, TRUE);
4268         }
4269 }
4270
4271 static void
4272 igb_teardown_intr(struct igb_softc *sc)
4273 {
4274         if (sc->intr_type != PCI_INTR_TYPE_MSIX)
4275                 bus_teardown_intr(sc->dev, sc->intr_res, sc->intr_tag);
4276         else
4277                 igb_msix_teardown(sc, sc->msix_cnt);
4278 }
4279
4280 static void
4281 igb_msix_try_alloc(struct igb_softc *sc)
4282 {
4283         int msix_enable, msix_cnt, msix_cnt2, alloc_cnt;
4284         int i, x, error;
4285         int offset, offset_def;
4286         struct igb_msix_data *msix;
4287         boolean_t aggregate, setup = FALSE;
4288
4289         /*
4290          * Don't enable MSI-X on 82575, see:
4291          * 82575 specification update errata #25
4292          */
4293         if (sc->hw.mac.type == e1000_82575)
4294                 return;
4295
4296         /* Don't enable MSI-X on VF */
4297         if (sc->vf_ifp)
4298                 return;
4299
4300         msix_enable = device_getenv_int(sc->dev, "msix.enable",
4301             igb_msix_enable);
4302         if (!msix_enable)
4303                 return;
4304
4305         msix_cnt = pci_msix_count(sc->dev);
4306 #ifdef IGB_MSIX_DEBUG
4307         msix_cnt = device_getenv_int(sc->dev, "msix.count", msix_cnt);
4308 #endif
4309         if (msix_cnt <= 1) {
4310                 /* One MSI-X model does not make sense */
4311                 return;
4312         }
4313
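             /* Round the usable vector count down to a power of two. */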
4314         i = 0;
4315         while ((1 << (i + 1)) <= msix_cnt)
4316                 ++i;
4317         msix_cnt2 = 1 << i;
4318
4319         if (bootverbose) {
4320                 device_printf(sc->dev, "MSI-X count %d/%d\n",
4321                     msix_cnt2, msix_cnt);
4322         }
4323
4324         KKASSERT(msix_cnt2 <= msix_cnt);
4325         if (msix_cnt == msix_cnt2) {
4326                 /* We need at least one MSI-X for link status */
4327                 msix_cnt2 >>= 1;
4328                 if (msix_cnt2 <= 1) {
4329                         /* One MSI-X for RX/TX does not make sense */
4330                         device_printf(sc->dev, "not enough MSI-X for TX/RX, "
4331                             "MSI-X count %d/%d\n", msix_cnt2, msix_cnt);
4332                         return;
4333                 }
4334                 KKASSERT(msix_cnt > msix_cnt2);
4335
4336                 if (bootverbose) {
4337                         device_printf(sc->dev, "MSI-X count fixup %d/%d\n",
4338                             msix_cnt2, msix_cnt);
4339                 }
4340         }
4341
4342         sc->rx_ring_msix = sc->rx_ring_cnt;
4343         if (sc->rx_ring_msix > msix_cnt2)
4344                 sc->rx_ring_msix = msix_cnt2;
4345
4346         sc->tx_ring_msix = sc->tx_ring_cnt;
4347         if (sc->tx_ring_msix > msix_cnt2)
4348                 sc->tx_ring_msix = msix_cnt2;
4349
4350         if (msix_cnt >= sc->tx_ring_msix + sc->rx_ring_msix + 1) {
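             /*
              * If there are enough vectors for one per TX ring, one per RX
              * ring, plus one for link status, give each ring its own
              * vector ("independent").  Otherwise pair each TX ring with
              * an RX ring on a shared vector ("aggregate"), capped at the
              * CPU count.
              */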
4351                 /*
4352                  * Independent TX/RX MSI-X
4353                  */
4354                 aggregate = FALSE;
4355                 if (bootverbose)
4356                         device_printf(sc->dev, "independent TX/RX MSI-X\n");
4357                 alloc_cnt = sc->tx_ring_msix + sc->rx_ring_msix;
4358         } else {
4359                 /*
4360                  * Aggregate TX/RX MSI-X
4361                  */
4362                 aggregate = TRUE;
4363                 if (bootverbose)
4364                         device_printf(sc->dev, "aggregate TX/RX MSI-X\n");
4365                 alloc_cnt = msix_cnt2;
4366                 if (alloc_cnt > ncpus2)
4367                         alloc_cnt = ncpus2;
4368                 if (sc->rx_ring_msix > alloc_cnt)
4369                         sc->rx_ring_msix = alloc_cnt;
4370                 if (sc->tx_ring_msix > alloc_cnt)
4371                         sc->tx_ring_msix = alloc_cnt;
4372         }
4373         ++alloc_cnt;    /* For link status */
4374
4375         if (bootverbose) {
4376                 device_printf(sc->dev, "MSI-X alloc %d, "
4377                     "RX ring %d, TX ring %d\n", alloc_cnt,
4378                     sc->rx_ring_msix, sc->tx_ring_msix);
4379         }
4380
4381         sc->msix_mem_rid = PCIR_BAR(IGB_MSIX_BAR);
4382         sc->msix_mem_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4383             &sc->msix_mem_rid, RF_ACTIVE);
4384         if (sc->msix_mem_res == NULL) {
4385                 sc->msix_mem_rid = PCIR_BAR(IGB_MSIX_BAR_ALT);
4386                 sc->msix_mem_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4387                     &sc->msix_mem_rid, RF_ACTIVE);
4388                 if (sc->msix_mem_res == NULL) {
4389                         device_printf(sc->dev, "Unable to map MSI-X table\n");
4390                         return;
4391                 }
4392         }
4393
4394         sc->msix_cnt = alloc_cnt;
4395         sc->msix_data = kmalloc_cachealign(
4396             sizeof(struct igb_msix_data) * sc->msix_cnt,
4397             M_DEVBUF, M_WAITOK | M_ZERO);
4398         for (x = 0; x < sc->msix_cnt; ++x) {
4399                 msix = &sc->msix_data[x];
4400
4401                 lwkt_serialize_init(&msix->msix_serialize0);
4402                 msix->msix_sc = sc;
4403                 msix->msix_rid = -1;
4404                 msix->msix_vector = x;
4405                 msix->msix_mask = 1 << msix->msix_vector;
4406                 msix->msix_rate = IGB_INTR_RATE;
4407         }
4408
4409         x = 0;
4410         if (!aggregate) {
4411                 /*
4412                  * RX rings
4413                  */
4414                 if (sc->rx_ring_msix == ncpus2) {
4415                         offset = 0;
4416                 } else {
4417                         offset_def = (sc->rx_ring_msix *
4418                             device_get_unit(sc->dev)) % ncpus2;
4419
4420                         offset = device_getenv_int(sc->dev,
4421                             "msix.rxoff", offset_def);
4422                         if (offset >= ncpus2 ||
4423                             offset % sc->rx_ring_msix != 0) {
4424                                 device_printf(sc->dev,
4425                                     "invalid msix.rxoff %d, use %d\n",
4426                                     offset, offset_def);
4427                                 offset = offset_def;
4428                         }
4429                 }
4430                 igb_msix_rx_conf(sc, 0, &x, offset);
4431
4432                 /*
4433                  * TX rings
4434                  */
4435                 if (sc->tx_ring_msix == ncpus2) {
4436                         offset = 0;
4437                 } else {
4438                         offset_def = (sc->tx_ring_msix *
4439                             device_get_unit(sc->dev)) % ncpus2;
4440
4441                         offset = device_getenv_int(sc->dev,
4442                             "msix.txoff", offset_def);
4443                         if (offset >= ncpus2 ||
4444                             offset % sc->tx_ring_msix != 0) {
4445                                 device_printf(sc->dev,
4446                                     "invalid msix.txoff %d, use %d\n",
4447                                     offset, offset_def);
4448                                 offset = offset_def;
4449                         }
4450                 }
4451                 igb_msix_tx_conf(sc, 0, &x, offset);
4452         } else {
4453                 int ring_agg, ring_max;
4454
4455                 ring_agg = sc->rx_ring_msix;
4456                 if (ring_agg > sc->tx_ring_msix)
4457                         ring_agg = sc->tx_ring_msix;
4458
4459                 ring_max = sc->rx_ring_msix;
4460                 if (ring_max < sc->tx_ring_msix)
4461                         ring_max = sc->tx_ring_msix;
4462
4463                 if (ring_max == ncpus2) {
4464                         offset = 0;
4465                 } else {
4466                         offset_def = (ring_max * device_get_unit(sc->dev)) %
4467                             ncpus2;
4468
4469                         offset = device_getenv_int(sc->dev, "msix.off",
4470                             offset_def);
4471                         if (offset >= ncpus2 || offset % ring_max != 0) {
4472                                 device_printf(sc->dev,
4473                                     "invalid msix.off %d, use %d\n",
4474                                     offset, offset_def);
4475                                 offset = offset_def;
4476                         }
4477                 }
4478
4479                 for (i = 0; i < ring_agg; ++i) {
4480                         struct igb_tx_ring *txr = &sc->tx_rings[i];
4481                         struct igb_rx_ring *rxr = &sc->rx_rings[i];
4482
4483                         KKASSERT(x < sc->msix_cnt);
4484                         msix = &sc->msix_data[x++];
4485
4486                         txr->tx_intr_bit = msix->msix_vector;
4487                         txr->tx_intr_mask = msix->msix_mask;
4488                         rxr->rx_intr_bit = msix->msix_vector;
4489                         rxr->rx_intr_mask = msix->msix_mask;
4490
4491                         msix->msix_serialize = &msix->msix_serialize0;
4492                         msix->msix_func = igb_msix_rxtx;
4493                         msix->msix_arg = msix;
4494                         msix->msix_rx = rxr;
4495                         msix->msix_tx = txr;
4496
4497                         msix->msix_cpuid = i + offset;
4498                         KKASSERT(msix->msix_cpuid < ncpus2);
4499                         txr->tx_intr_cpuid = msix->msix_cpuid;
4500
4501                         ksnprintf(msix->msix_desc, sizeof(msix->msix_desc),
4502                             "%s rxtx%d", device_get_nameunit(sc->dev), i);
4503                         msix->msix_rate = IGB_MSIX_RX_RATE;
4504                         ksnprintf(msix->msix_rate_desc,
4505                             sizeof(msix->msix_rate_desc),
4506                             "RXTX%d interrupt rate", i);
4507                 }
4508
4509                 if (ring_agg != ring_max) {
4510                         if (ring_max == sc->tx_ring_msix)
4511                                 igb_msix_tx_conf(sc, i, &x, offset);
4512                         else
4513                                 igb_msix_rx_conf(sc, i, &x, offset);
4514                 }
4515         }
4516
4517         /*
4518          * Link status
4519          */
4520         KKASSERT(x < sc->msix_cnt);
4521         msix = &sc->msix_data[x++];
4522         sc->sts_intr_bit = msix->msix_vector;
4523         sc->sts_intr_mask = msix->msix_mask;
4524
4525         msix->msix_serialize = &sc->main_serialize;
4526         msix->msix_func = igb_msix_status;
4527         msix->msix_arg = sc;
4528         msix->msix_cpuid = 0;
4529         ksnprintf(msix->msix_desc, sizeof(msix->msix_desc), "%s sts",
4530             device_get_nameunit(sc->dev));
4531         ksnprintf(msix->msix_rate_desc, sizeof(msix->msix_rate_desc),
4532             "status interrupt rate");
4533
4534         KKASSERT(x == sc->msix_cnt);
4535
4536         error = pci_setup_msix(sc->dev);
4537         if (error) {
4538                 device_printf(sc->dev, "Setup MSI-X failed\n");
4539                 goto back;
4540         }
4541         setup = TRUE;
4542
4543         for (i = 0; i < sc->msix_cnt; ++i) {
4544                 msix = &sc->msix_data[i];
4545
4546                 error = pci_alloc_msix_vector(sc->dev, msix->msix_vector,
4547                     &msix->msix_rid, msix->msix_cpuid);
4548                 if (error) {
4549                         device_printf(sc->dev,
4550                             "Unable to allocate MSI-X %d on cpu%d\n",
4551                             msix->msix_vector, msix->msix_cpuid);
4552                         goto back;
4553                 }
4554
4555                 msix->msix_res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ,
4556                     &msix->msix_rid, RF_ACTIVE);
4557                 if (msix->msix_res == NULL) {
4558                         device_printf(sc->dev,
4559                             "Unable to allocate MSI-X %d resource\n",
4560                             msix->msix_vector);
4561                         error = ENOMEM;
4562                         goto back;
4563                 }
4564         }
4565
4566         pci_enable_msix(sc->dev);
4567         sc->intr_type = PCI_INTR_TYPE_MSIX;
4568 back:
4569         if (error)
4570                 igb_msix_free(sc, setup);
4571 }
4572
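/*
 * Undo the MSI-X allocation: release the IRQ resources and MSI-X vectors
 * that were obtained, tear down the MSI-X state if it had been set up,
 * and free the per-vector data array.
 */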
4573 static void
4574 igb_msix_free(struct igb_softc *sc, boolean_t setup)
4575 {
4576         int i;
4577
4578         KKASSERT(sc->msix_cnt > 1);
4579
4580         for (i = 0; i < sc->msix_cnt; ++i) {
4581                 struct igb_msix_data *msix = &sc->msix_data[i];
4582
4583                 if (msix->msix_res != NULL) {
4584                         bus_release_resource(sc->dev, SYS_RES_IRQ,
4585                             msix->msix_rid, msix->msix_res);
4586                 }
4587                 if (msix->msix_rid >= 0)
4588                         pci_release_msix_vector(sc->dev, msix->msix_rid);
4589         }
4590         if (setup)
4591                 pci_teardown_msix(sc->dev);
4592
4593         sc->msix_cnt = 0;
4594         kfree(sc->msix_data, M_DEVBUF);
4595         sc->msix_data = NULL;
4596 }
4597
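/*
 * Install an interrupt handler for every allocated MSI-X vector.  If one
 * handler fails to install, the handlers set up so far are torn down.
 */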
4598 static int
4599 igb_msix_setup(struct igb_softc *sc)
4600 {
4601         int i;
4602
4603         for (i = 0; i < sc->msix_cnt; ++i) {
4604                 struct igb_msix_data *msix = &sc->msix_data[i];
4605                 int error;
4606
4607                 error = bus_setup_intr_descr(sc->dev, msix->msix_res,
4608                     INTR_MPSAFE, msix->msix_func, msix->msix_arg,
4609                     &msix->msix_handle, msix->msix_serialize, msix->msix_desc);
4610                 if (error) {
4611                         device_printf(sc->dev, "could not set up %s "
4612                             "interrupt handler.\n", msix->msix_desc);
4613                         igb_msix_teardown(sc, i);
4614                         return error;
4615                 }
4616         }
4617         return 0;
4618 }
4619
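/*
 * Remove the interrupt handlers of the first msix_cnt vectors.
 */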
4620 static void
4621 igb_msix_teardown(struct igb_softc *sc, int msix_cnt)
4622 {
4623         int i;
4624
4625         for (i = 0; i < msix_cnt; ++i) {
4626                 struct igb_msix_data *msix = &sc->msix_data[i];
4627
4628                 bus_teardown_intr(sc->dev, msix->msix_res, msix->msix_handle);
4629         }
4630 }
4631
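/*
 * MSI-X handler for an RX-only vector: drain the ring under its own
 * serializer (held on entry) and re-enable the vector by writing its
 * mask back to EIMS.
 */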
4632 static void
4633 igb_msix_rx(void *arg)
4634 {
4635         struct igb_rx_ring *rxr = arg;
4636
4637         ASSERT_SERIALIZED(&rxr->rx_serialize);
4638         igb_rxeof(rxr, -1);
4639
4640         E1000_WRITE_REG(&rxr->sc->hw, E1000_EIMS, rxr->rx_intr_mask);
4641 }
4642
4643 static void
4644 igb_msix_tx(void *arg)
4645 {
4646         struct igb_tx_ring *txr = arg;
4647
4648         ASSERT_SERIALIZED(&txr->tx_serialize);
4649
4650         igb_txeof(txr);
4651         if (!ifsq_is_empty(txr->ifsq))
4652                 ifsq_devstart(txr->ifsq);
4653
4654         E1000_WRITE_REG(&txr->sc->hw, E1000_EIMS, txr->tx_intr_mask);
4655 }
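/*
 * MSI-X handler for the link status vector: re-check the link state on
 * a Link Status Change interrupt, then re-enable the vector.
 */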
4656
4657 static void
4658 igb_msix_status(void *arg)
4659 {
4660         struct igb_softc *sc = arg;
4661         uint32_t icr;
4662
4663         ASSERT_SERIALIZED(&sc->main_serialize);
4664
4665         icr = E1000_READ_REG(&sc->hw, E1000_ICR);
4666         if (icr & E1000_ICR_LSC) {
4667                 sc->hw.mac.get_link_status = 1;
4668                 igb_update_link_status(sc);
4669         }
4670
4671         E1000_WRITE_REG(&sc->hw, E1000_EIMS, sc->sts_intr_mask);
4672 }
4673
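/*
 * Recompute how many RX/TX rings are actually used: all rings when
 * polling, the MSI-X ring counts when MSI-X interrupts are used, a
 * fixed minimum otherwise, and a single ring when RSS/multi-TX is
 * disabled.
 */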
4674 static void
4675 igb_set_ring_inuse(struct igb_softc *sc, boolean_t polling)
4676 {
4677         sc->rx_ring_inuse = igb_get_rxring_inuse(sc, polling);
4678         sc->tx_ring_inuse = igb_get_txring_inuse(sc, polling);
4679         if (bootverbose) {
4680                 if_printf(&sc->arpcom.ac_if, "RX rings %d/%d, TX rings %d/%d\n",
4681                     sc->rx_ring_inuse, sc->rx_ring_cnt,
4682                     sc->tx_ring_inuse, sc->tx_ring_cnt);
4683         }
4684 }
4685
4686 static int
4687 igb_get_rxring_inuse(const struct igb_softc *sc, boolean_t polling)
4688 {
4689         if (!IGB_ENABLE_HWRSS(sc))
4690                 return 1;
4691
4692         if (polling)
4693                 return sc->rx_ring_cnt;
4694         else if (sc->intr_type != PCI_INTR_TYPE_MSIX)
4695                 return IGB_MIN_RING_RSS;
4696         else
4697                 return sc->rx_ring_msix;
4698 }
4699
4700 static int
4701 igb_get_txring_inuse(const struct igb_softc *sc, boolean_t polling)
4702 {
4703         if (!IGB_ENABLE_HWTSS(sc))
4704                 return 1;
4705
4706         if (polling)
4707                 return sc->tx_ring_cnt;
4708         else if (sc->intr_type != PCI_INTR_TYPE_MSIX)
4709                 return IGB_MIN_RING;
4710         else
4711                 return sc->tx_ring_msix;
4712 }
4713
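/*
 * Make sure the Ethernet/IP/TCP headers of a TSO packet reside in the
 * first mbuf so the TX context descriptor can be built from contiguous
 * data.  Rings flagged with IGB_TXFLAG_TSO_IPLEN0 additionally need
 * ip_len zeroed in the IP header.
 */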
4714 static int
4715 igb_tso_pullup(struct igb_tx_ring *txr, struct mbuf **mp)
4716 {
4717         int hoff, iphlen, thoff;
4718         struct mbuf *m;
4719
4720         m = *mp;
4721         KASSERT(M_WRITABLE(m), ("TSO mbuf not writable"));
4722
4723         iphlen = m->m_pkthdr.csum_iphlen;
4724         thoff = m->m_pkthdr.csum_thlen;
4725         hoff = m->m_pkthdr.csum_lhlen;
4726
4727         KASSERT(iphlen > 0, ("invalid ip hlen"));
4728         KASSERT(thoff > 0, ("invalid tcp hlen"));
4729         KASSERT(hoff > 0, ("invalid ether hlen"));
4730
4731         if (__predict_false(m->m_len < hoff + iphlen + thoff)) {
4732                 m = m_pullup(m, hoff + iphlen + thoff);
4733                 if (m == NULL) {
4734                         *mp = NULL;
4735                         return ENOBUFS;
4736                 }
4737                 *mp = m;
4738         }
4739         if (txr->tx_flags & IGB_TXFLAG_TSO_IPLEN0) {
4740                 struct ip *ip;
4741
4742                 ip = mtodoff(m, struct ip *, hoff);
4743                 ip->ip_len = 0;
4744         }
4745
4746         return 0;
4747 }
4748
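/*
 * Build the advanced TX context descriptor for a TSO packet: VLAN tag,
 * MAC/IP header lengths, TCP header length and MSS.  This consumes one
 * TX descriptor; *hlen returns the total header size preceding the
 * payload.
 */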
4749 static void
4750 igb_tso_ctx(struct igb_tx_ring *txr, struct mbuf *m, uint32_t *hlen)
4751 {
4752         struct e1000_adv_tx_context_desc *TXD;
4753         uint32_t vlan_macip_lens, type_tucmd_mlhl, mss_l4len_idx;
4754         int hoff, ctxd, iphlen, thoff;
4755
4756         iphlen = m->m_pkthdr.csum_iphlen;
4757         thoff = m->m_pkthdr.csum_thlen;
4758         hoff = m->m_pkthdr.csum_lhlen;
4759
4760         vlan_macip_lens = type_tucmd_mlhl = mss_l4len_idx = 0;
4761
4762         ctxd = txr->next_avail_desc;
4763         TXD = (struct e1000_adv_tx_context_desc *)&txr->tx_base[ctxd];
4764
4765         if (m->m_flags & M_VLANTAG) {
4766                 uint16_t vlantag;
4767
4768                 vlantag = htole16(m->m_pkthdr.ether_vlantag);
4769                 vlan_macip_lens |= (vlantag << E1000_ADVTXD_VLAN_SHIFT);
4770         }
4771
4772         vlan_macip_lens |= (hoff << E1000_ADVTXD_MACLEN_SHIFT);
4773         vlan_macip_lens |= iphlen;
4774
4775         type_tucmd_mlhl |= E1000_ADVTXD_DCMD_DEXT | E1000_ADVTXD_DTYP_CTXT;
4776         type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_TCP;
4777         type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_IPV4;
4778
4779         mss_l4len_idx |= (m->m_pkthdr.tso_segsz << E1000_ADVTXD_MSS_SHIFT);
4780         mss_l4len_idx |= (thoff << E1000_ADVTXD_L4LEN_SHIFT);
4781
4782         /*
4783          * 82575 needs the TX context index added; the queue
4784          * index is used as TX context index here.
4785          */
4786         if (txr->sc->hw.mac.type == e1000_82575)
4787                 mss_l4len_idx |= txr->me << 4;
4788
4789         TXD->vlan_macip_lens = htole32(vlan_macip_lens);
4790         TXD->type_tucmd_mlhl = htole32(type_tucmd_mlhl);
4791         TXD->seqnum_seed = htole32(0);
4792         TXD->mss_l4len_idx = htole32(mss_l4len_idx);
4793
4794         /* We've consumed the first desc, adjust counters */
4795         if (++ctxd == txr->num_tx_desc)
4796                 ctxd = 0;
4797         txr->next_avail_desc = ctxd;
4798         --txr->tx_avail;
4799
4800         *hlen = hoff + iphlen + thoff;
4801 }
4802
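/*
 * Build the array of serializers used by the driver's serialize methods:
 * the main serializer, the private serializers of aggregate RX/TX MSI-X
 * vectors, and one serializer per TX and RX ring, in that order.
 */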
4803 static void
4804 igb_setup_serializer(struct igb_softc *sc)
4805 {
4806         const struct igb_msix_data *msix;
4807         int i, j;
4808
4809         /*
4810          * Allocate serializer array
4811          */
4812
4813         /* Main + TX + RX */
4814         sc->serialize_cnt = 1 + sc->tx_ring_cnt + sc->rx_ring_cnt;
4815
4816         /* Aggregate TX/RX MSI-X */
4817         for (i = 0; i < sc->msix_cnt; ++i) {
4818                 msix = &sc->msix_data[i];
4819                 if (msix->msix_serialize == &msix->msix_serialize0)
4820                         sc->serialize_cnt++;
4821         }
4822
4823         sc->serializes =
4824             kmalloc(sc->serialize_cnt * sizeof(struct lwkt_serialize *),
4825                 M_DEVBUF, M_WAITOK | M_ZERO);
4826
4827         /*
4828          * Setup serializers
4829          *
4830          * NOTE: Order is critical: main, aggregate MSI-X, TX rings, RX rings
4831          */
4832
4833         i = 0;
4834
4835         KKASSERT(i < sc->serialize_cnt);
4836         sc->serializes[i++] = &sc->main_serialize;
4837
4838         for (j = 0; j < sc->msix_cnt; ++j) {
4839                 msix = &sc->msix_data[j];
4840                 if (msix->msix_serialize == &msix->msix_serialize0) {
4841                         KKASSERT(i < sc->serialize_cnt);
4842                         sc->serializes[i++] = msix->msix_serialize;
4843                 }
4844         }
4845
4846         for (j = 0; j < sc->tx_ring_cnt; ++j) {
4847                 KKASSERT(i < sc->serialize_cnt);
4848                 sc->serializes[i++] = &sc->tx_rings[j].tx_serialize;
4849         }
4850
4851         for (j = 0; j < sc->rx_ring_cnt; ++j) {
4852                 KKASSERT(i < sc->serialize_cnt);
4853                 sc->serializes[i++] = &sc->rx_rings[j].rx_serialize;
4854         }
4855
4856         KKASSERT(i == sc->serialize_cnt);
4857 }
4858
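/*
 * Bind MSI-X vectors to the RX rings starting at ring i; ring n is
 * serviced on CPU (n + offset).
 */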
4859 static void
4860 igb_msix_rx_conf(struct igb_softc *sc, int i, int *x0, int offset)
4861 {
4862         int x = *x0;
4863
4864         for (; i < sc->rx_ring_msix; ++i) {
4865                 struct igb_rx_ring *rxr = &sc->rx_rings[i];
4866                 struct igb_msix_data *msix;
4867
4868                 KKASSERT(x < sc->msix_cnt);
4869                 msix = &sc->msix_data[x++];
4870
4871                 rxr->rx_intr_bit = msix->msix_vector;
4872                 rxr->rx_intr_mask = msix->msix_mask;
4873
4874                 msix->msix_serialize = &rxr->rx_serialize;
4875                 msix->msix_func = igb_msix_rx;
4876                 msix->msix_arg = rxr;
4877
4878                 msix->msix_cpuid = i + offset;
4879                 KKASSERT(msix->msix_cpuid < ncpus2);
4880
4881                 ksnprintf(msix->msix_desc, sizeof(msix->msix_desc), "%s rx%d",
4882                     device_get_nameunit(sc->dev), i);
4883
4884                 msix->msix_rate = IGB_MSIX_RX_RATE;
4885                 ksnprintf(msix->msix_rate_desc, sizeof(msix->msix_rate_desc),
4886                     "RX%d interrupt rate", i);
4887         }
4888         *x0 = x;
4889 }
4890
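/*
 * TX counterpart of igb_msix_rx_conf(); additionally records the CPU
 * the vector runs on in the ring's tx_intr_cpuid.
 */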
4891 static void
4892 igb_msix_tx_conf(struct igb_softc *sc, int i, int *x0, int offset)
4893 {
4894         int x = *x0;
4895
4896         for (; i < sc->tx_ring_msix; ++i) {
4897                 struct igb_tx_ring *txr = &sc->tx_rings[i];
4898                 struct igb_msix_data *msix;
4899
4900                 KKASSERT(x < sc->msix_cnt);
4901                 msix = &sc->msix_data[x++];
4902
4903                 txr->tx_intr_bit = msix->msix_vector;
4904                 txr->tx_intr_mask = msix->msix_mask;
4905
4906                 msix->msix_serialize = &txr->tx_serialize;
4907                 msix->msix_func = igb_msix_tx;
4908                 msix->msix_arg = txr;
4909
4910                 msix->msix_cpuid = i + offset;
4911                 KKASSERT(msix->msix_cpuid < ncpus2);
4912                 txr->tx_intr_cpuid = msix->msix_cpuid;
4913
4914                 ksnprintf(msix->msix_desc, sizeof(msix->msix_desc), "%s tx%d",
4915                     device_get_nameunit(sc->dev), i);
4916
4917                 msix->msix_rate = IGB_MSIX_TX_RATE;
4918                 ksnprintf(msix->msix_rate_desc, sizeof(msix->msix_rate_desc),
4919                     "TX%d interrupt rate", i);
4920         }
4921         *x0 = x;
4922 }
4923
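/*
 * MSI-X handler for an aggregate RX/TX vector: the RX and TX rings are
 * processed back to back under their own ring serializers, while the
 * vector itself is protected by its private serializer.
 */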
4924 static void
4925 igb_msix_rxtx(void *arg)
4926 {
4927         struct igb_msix_data *msix = arg;
4928         struct igb_rx_ring *rxr = msix->msix_rx;
4929         struct igb_tx_ring *txr = msix->msix_tx;
4930
4931         ASSERT_SERIALIZED(&msix->msix_serialize0);
4932
4933         lwkt_serialize_enter(&rxr->rx_serialize);
4934         igb_rxeof(rxr, -1);
4935         lwkt_serialize_exit(&rxr->rx_serialize);
4936
4937         lwkt_serialize_enter(&txr->tx_serialize);
4938         igb_txeof(txr);
4939         if (!ifsq_is_empty(txr->ifsq))
4940                 ifsq_devstart(txr->ifsq);
4941         lwkt_serialize_exit(&txr->tx_serialize);
4942
4943         E1000_WRITE_REG(&msix->msix_sc->hw, E1000_EIMS, msix->msix_mask);
4944 }
4945
4946 static void
4947 igb_set_timer_cpuid(struct igb_softc *sc, boolean_t polling)
4948 {
4949         if (polling || sc->intr_type == PCI_INTR_TYPE_MSIX)
4950                 sc->timer_cpuid = 0; /* XXX fixed */
4951         else
4952                 sc->timer_cpuid = rman_get_cpuid(sc->intr_res);
4953 }
4954
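/*
 * Translate the flow control sysctl string into an e1000_fc_mode;
 * anything unrecognized falls back to full flow control.
 */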
4955 static enum e1000_fc_mode
4956 igb_str2fc(const char *str)
4957 {
4958         if (strcmp(str, "none") == 0)
4959                 return e1000_fc_none;
4960         else if (strcmp(str, "rx_pause") == 0)
4961                 return e1000_fc_rx_pause;
4962         else if (strcmp(str, "tx_pause") == 0)
4963                 return e1000_fc_tx_pause;
4964         else
4965                 return e1000_fc_full;
4966 }
4967
4968 static void
4969 igb_fc2str(enum e1000_fc_mode fc, char *str, int len)
4970 {
4971         const char *fc_str = "full";
4972
4973         switch (fc) {
4974         case e1000_fc_none:
4975                 fc_str = "none";
4976                 break;
4977
4978         case e1000_fc_rx_pause:
4979                 fc_str = "rx_pause";
4980                 break;
4981
4982         case e1000_fc_tx_pause:
4983                 fc_str = "tx_pause";
4984                 break;
4985
4986         default:
4987                 break;
4988         }
4989         strlcpy(str, fc_str, len);
4990 }
4991
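/*
 * Sysctl handler for the flow control mode: report the current setting
 * and, on a change, force the newly requested mode onto the MAC
 * immediately.
 */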
4992 static int
4993 igb_sysctl_flowctrl(SYSCTL_HANDLER_ARGS)
4994 {
4995         struct igb_softc *sc = arg1;
4996         struct ifnet *ifp = &sc->arpcom.ac_if;
4997         char flowctrl[IGB_FLOWCTRL_STRLEN];
4998         enum e1000_fc_mode fc;
4999         int error;
5000
5001         igb_fc2str(sc->flow_ctrl, flowctrl, sizeof(flowctrl));
5002         error = sysctl_handle_string(oidp, flowctrl, sizeof(flowctrl), req);
5003         if (error != 0 || req->newptr == NULL)
5004                 return error;
5005
5006         fc = igb_str2fc(flowctrl);
5007
5008         ifnet_serialize_all(ifp);
5009         if (fc == sc->flow_ctrl)
5010                 goto done;
5011
5012         sc->flow_ctrl = fc;
5013         sc->hw.fc.requested_mode = sc->flow_ctrl;
5014         sc->hw.fc.current_mode = sc->flow_ctrl;
5015         e1000_force_mac_fc(&sc->hw);
5016 done:
5017         ifnet_deserialize_all(ifp);
5018
5019         return 0;
5020 }