if: Per-cpu ifnet/ifaddr statistics, step 1/3
[dragonfly.git] / sys / dev / netif / igb / if_igb.c
1 /*
2  * Copyright (c) 2001-2011, Intel Corporation 
3  * All rights reserved.
4  * 
5  * Redistribution and use in source and binary forms, with or without 
6  * modification, are permitted provided that the following conditions are met:
7  * 
8  *  1. Redistributions of source code must retain the above copyright notice, 
9  *     this list of conditions and the following disclaimer.
10  * 
11  *  2. Redistributions in binary form must reproduce the above copyright 
12  *     notice, this list of conditions and the following disclaimer in the 
13  *     documentation and/or other materials provided with the distribution.
14  * 
15  *  3. Neither the name of the Intel Corporation nor the names of its 
16  *     contributors may be used to endorse or promote products derived from 
17  *     this software without specific prior written permission.
18  * 
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
22  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
23  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31
32 #include "opt_ifpoll.h"
33 #include "opt_igb.h"
34
35 #include <sys/param.h>
36 #include <sys/bus.h>
37 #include <sys/endian.h>
38 #include <sys/interrupt.h>
39 #include <sys/kernel.h>
40 #include <sys/malloc.h>
41 #include <sys/mbuf.h>
42 #include <sys/proc.h>
43 #include <sys/rman.h>
44 #include <sys/serialize.h>
45 #include <sys/serialize2.h>
46 #include <sys/socket.h>
47 #include <sys/sockio.h>
48 #include <sys/sysctl.h>
49 #include <sys/systm.h>
50
51 #include <net/bpf.h>
52 #include <net/ethernet.h>
53 #include <net/if.h>
54 #include <net/if_arp.h>
55 #include <net/if_dl.h>
56 #include <net/if_media.h>
57 #include <net/ifq_var.h>
58 #include <net/toeplitz.h>
59 #include <net/toeplitz2.h>
60 #include <net/vlan/if_vlan_var.h>
61 #include <net/vlan/if_vlan_ether.h>
62 #include <net/if_poll.h>
63
64 #include <netinet/in_systm.h>
65 #include <netinet/in.h>
66 #include <netinet/ip.h>
67 #include <netinet/tcp.h>
68 #include <netinet/udp.h>
69
70 #include <bus/pci/pcivar.h>
71 #include <bus/pci/pcireg.h>
72
73 #include <dev/netif/ig_hal/e1000_api.h>
74 #include <dev/netif/ig_hal/e1000_82575.h>
75 #include <dev/netif/igb/if_igb.h>
76
77 #ifdef IGB_RSS_DEBUG
78 #define IGB_RSS_DPRINTF(sc, lvl, fmt, ...) \
79 do { \
80         if (sc->rss_debug >= lvl) \
81                 if_printf(&sc->arpcom.ac_if, fmt, __VA_ARGS__); \
82 } while (0)
83 #else   /* !IGB_RSS_DEBUG */
84 #define IGB_RSS_DPRINTF(sc, lvl, fmt, ...)      ((void)0)
85 #endif  /* IGB_RSS_DEBUG */
86
87 #define IGB_NAME        "Intel(R) PRO/1000 "
88 #define IGB_DEVICE(id)  \
89         { IGB_VENDOR_ID, E1000_DEV_ID_##id, IGB_NAME #id }
90 #define IGB_DEVICE_NULL { 0, 0, NULL }
91
92 static struct igb_device {
93         uint16_t        vid;
94         uint16_t        did;
95         const char      *desc;
96 } igb_devices[] = {
97         IGB_DEVICE(82575EB_COPPER),
98         IGB_DEVICE(82575EB_FIBER_SERDES),
99         IGB_DEVICE(82575GB_QUAD_COPPER),
100         IGB_DEVICE(82576),
101         IGB_DEVICE(82576_NS),
102         IGB_DEVICE(82576_NS_SERDES),
103         IGB_DEVICE(82576_FIBER),
104         IGB_DEVICE(82576_SERDES),
105         IGB_DEVICE(82576_SERDES_QUAD),
106         IGB_DEVICE(82576_QUAD_COPPER),
107         IGB_DEVICE(82576_QUAD_COPPER_ET2),
108         IGB_DEVICE(82576_VF),
109         IGB_DEVICE(82580_COPPER),
110         IGB_DEVICE(82580_FIBER),
111         IGB_DEVICE(82580_SERDES),
112         IGB_DEVICE(82580_SGMII),
113         IGB_DEVICE(82580_COPPER_DUAL),
114         IGB_DEVICE(82580_QUAD_FIBER),
115         IGB_DEVICE(DH89XXCC_SERDES),
116         IGB_DEVICE(DH89XXCC_SGMII),
117         IGB_DEVICE(DH89XXCC_SFP),
118         IGB_DEVICE(DH89XXCC_BACKPLANE),
119         IGB_DEVICE(I350_COPPER),
120         IGB_DEVICE(I350_FIBER),
121         IGB_DEVICE(I350_SERDES),
122         IGB_DEVICE(I350_SGMII),
123         IGB_DEVICE(I350_VF),
124         IGB_DEVICE(I210_COPPER),
125         IGB_DEVICE(I210_COPPER_IT),
126         IGB_DEVICE(I210_COPPER_OEM1),
127         IGB_DEVICE(I210_FIBER),
128         IGB_DEVICE(I210_SERDES),
129         IGB_DEVICE(I210_SGMII),
130         IGB_DEVICE(I211_COPPER),
131
132         /* required last entry */
133         IGB_DEVICE_NULL
134 };
135
136 static int      igb_probe(device_t);
137 static int      igb_attach(device_t);
138 static int      igb_detach(device_t);
139 static int      igb_shutdown(device_t);
140 static int      igb_suspend(device_t);
141 static int      igb_resume(device_t);
142
143 static boolean_t igb_is_valid_ether_addr(const uint8_t *);
144 static void     igb_setup_ifp(struct igb_softc *);
145 static boolean_t igb_txcsum_ctx(struct igb_tx_ring *, struct mbuf *);
146 static int      igb_tso_pullup(struct igb_tx_ring *, struct mbuf **);
147 static void     igb_tso_ctx(struct igb_tx_ring *, struct mbuf *, uint32_t *);
148 static void     igb_add_sysctl(struct igb_softc *);
149 static int      igb_sysctl_intr_rate(SYSCTL_HANDLER_ARGS);
150 static int      igb_sysctl_msix_rate(SYSCTL_HANDLER_ARGS);
151 static int      igb_sysctl_tx_intr_nsegs(SYSCTL_HANDLER_ARGS);
152 static int      igb_sysctl_tx_wreg_nsegs(SYSCTL_HANDLER_ARGS);
153 static int      igb_sysctl_rx_wreg_nsegs(SYSCTL_HANDLER_ARGS);
154 static void     igb_set_ring_inuse(struct igb_softc *, boolean_t);
155 static int      igb_get_rxring_inuse(const struct igb_softc *, boolean_t);
156 static int      igb_get_txring_inuse(const struct igb_softc *, boolean_t);
157 #ifdef IFPOLL_ENABLE
158 static int      igb_sysctl_npoll_rxoff(SYSCTL_HANDLER_ARGS);
159 static int      igb_sysctl_npoll_txoff(SYSCTL_HANDLER_ARGS);
160 #endif
161
162 static void     igb_vf_init_stats(struct igb_softc *);
163 static void     igb_reset(struct igb_softc *);
164 static void     igb_update_stats_counters(struct igb_softc *);
165 static void     igb_update_vf_stats_counters(struct igb_softc *);
166 static void     igb_update_link_status(struct igb_softc *);
167 static void     igb_init_tx_unit(struct igb_softc *);
168 static void     igb_init_rx_unit(struct igb_softc *);
169
170 static void     igb_set_vlan(struct igb_softc *);
171 static void     igb_set_multi(struct igb_softc *);
172 static void     igb_set_promisc(struct igb_softc *);
173 static void     igb_disable_promisc(struct igb_softc *);
174
175 static int      igb_alloc_rings(struct igb_softc *);
176 static void     igb_free_rings(struct igb_softc *);
177 static int      igb_create_tx_ring(struct igb_tx_ring *);
178 static int      igb_create_rx_ring(struct igb_rx_ring *);
179 static void     igb_free_tx_ring(struct igb_tx_ring *);
180 static void     igb_free_rx_ring(struct igb_rx_ring *);
181 static void     igb_destroy_tx_ring(struct igb_tx_ring *, int);
182 static void     igb_destroy_rx_ring(struct igb_rx_ring *, int);
183 static void     igb_init_tx_ring(struct igb_tx_ring *);
184 static int      igb_init_rx_ring(struct igb_rx_ring *);
185 static int      igb_newbuf(struct igb_rx_ring *, int, boolean_t);
186 static int      igb_encap(struct igb_tx_ring *, struct mbuf **, int *, int *);
187 static void     igb_rx_refresh(struct igb_rx_ring *, int);
188 static void     igb_setup_serializer(struct igb_softc *);
189
190 static void     igb_stop(struct igb_softc *);
191 static void     igb_init(void *);
192 static int      igb_ioctl(struct ifnet *, u_long, caddr_t, struct ucred *);
193 static void     igb_media_status(struct ifnet *, struct ifmediareq *);
194 static int      igb_media_change(struct ifnet *);
195 static void     igb_timer(void *);
196 static void     igb_watchdog(struct ifaltq_subque *);
197 static void     igb_start(struct ifnet *, struct ifaltq_subque *);
198 #ifdef IFPOLL_ENABLE
199 static void     igb_npoll(struct ifnet *, struct ifpoll_info *);
200 static void     igb_npoll_rx(struct ifnet *, void *, int);
201 static void     igb_npoll_tx(struct ifnet *, void *, int);
202 static void     igb_npoll_status(struct ifnet *);
203 #endif
204 static void     igb_serialize(struct ifnet *, enum ifnet_serialize);
205 static void     igb_deserialize(struct ifnet *, enum ifnet_serialize);
206 static int      igb_tryserialize(struct ifnet *, enum ifnet_serialize);
207 #ifdef INVARIANTS
208 static void     igb_serialize_assert(struct ifnet *, enum ifnet_serialize,
209                     boolean_t);
210 #endif
211
212 static void     igb_intr(void *);
213 static void     igb_intr_shared(void *);
214 static void     igb_rxeof(struct igb_rx_ring *, int);
215 static void     igb_txeof(struct igb_tx_ring *);
216 static void     igb_set_eitr(struct igb_softc *, int, int);
217 static void     igb_enable_intr(struct igb_softc *);
218 static void     igb_disable_intr(struct igb_softc *);
219 static void     igb_init_unshared_intr(struct igb_softc *);
220 static void     igb_init_intr(struct igb_softc *);
221 static int      igb_setup_intr(struct igb_softc *);
222 static void     igb_set_txintr_mask(struct igb_tx_ring *, int *, int);
223 static void     igb_set_rxintr_mask(struct igb_rx_ring *, int *, int);
224 static void     igb_set_intr_mask(struct igb_softc *);
225 static int      igb_alloc_intr(struct igb_softc *);
226 static void     igb_free_intr(struct igb_softc *);
227 static void     igb_teardown_intr(struct igb_softc *);
228 static void     igb_msix_try_alloc(struct igb_softc *);
229 static void     igb_msix_rx_conf(struct igb_softc *, int, int *, int);
230 static void     igb_msix_tx_conf(struct igb_softc *, int, int *, int);
231 static void     igb_msix_free(struct igb_softc *, boolean_t);
232 static int      igb_msix_setup(struct igb_softc *);
233 static void     igb_msix_teardown(struct igb_softc *, int);
234 static void     igb_msix_rx(void *);
235 static void     igb_msix_tx(void *);
236 static void     igb_msix_status(void *);
237 static void     igb_msix_rxtx(void *);
238
239 /* Management and WOL Support */
240 static void     igb_get_mgmt(struct igb_softc *);
241 static void     igb_rel_mgmt(struct igb_softc *);
242 static void     igb_get_hw_control(struct igb_softc *);
243 static void     igb_rel_hw_control(struct igb_softc *);
244 static void     igb_enable_wol(device_t);
245
246 static device_method_t igb_methods[] = {
247         /* Device interface */
248         DEVMETHOD(device_probe,         igb_probe),
249         DEVMETHOD(device_attach,        igb_attach),
250         DEVMETHOD(device_detach,        igb_detach),
251         DEVMETHOD(device_shutdown,      igb_shutdown),
252         DEVMETHOD(device_suspend,       igb_suspend),
253         DEVMETHOD(device_resume,        igb_resume),
254         { 0, 0 }
255 };
256
257 static driver_t igb_driver = {
258         "igb",
259         igb_methods,
260         sizeof(struct igb_softc),
261 };
262
263 static devclass_t igb_devclass;
264
265 DECLARE_DUMMY_MODULE(if_igb);
266 MODULE_DEPEND(igb, ig_hal, 1, 1, 1);
267 DRIVER_MODULE(if_igb, pci, igb_driver, igb_devclass, NULL, NULL);
268
269 static int      igb_rxd = IGB_DEFAULT_RXD;
270 static int      igb_txd = IGB_DEFAULT_TXD;
271 static int      igb_rxr = 0;
272 static int      igb_txr = 0;
273 static int      igb_msi_enable = 1;
274 static int      igb_msix_enable = 1;
275 static int      igb_eee_disabled = 1;   /* Energy Efficient Ethernet */
276 static int      igb_fc_setting = e1000_fc_full;
277
278 /*
279  * DMA Coalescing, only for i350 - default to off,
280  * this feature is for power savings
281  */
282 static int      igb_dma_coalesce = 0;
283
284 TUNABLE_INT("hw.igb.rxd", &igb_rxd);
285 TUNABLE_INT("hw.igb.txd", &igb_txd);
286 TUNABLE_INT("hw.igb.rxr", &igb_rxr);
287 TUNABLE_INT("hw.igb.txr", &igb_txr);
288 TUNABLE_INT("hw.igb.msi.enable", &igb_msi_enable);
289 TUNABLE_INT("hw.igb.msix.enable", &igb_msix_enable);
290 TUNABLE_INT("hw.igb.fc_setting", &igb_fc_setting);
291
292 /* i350 specific */
293 TUNABLE_INT("hw.igb.eee_disabled", &igb_eee_disabled);
294 TUNABLE_INT("hw.igb.dma_coalesce", &igb_dma_coalesce);
295
296 static __inline void
297 igb_rxcsum(uint32_t staterr, struct mbuf *mp)
298 {
299         /* Ignore Checksum bit is set */
300         if (staterr & E1000_RXD_STAT_IXSM)
301                 return;
302
303         if ((staterr & (E1000_RXD_STAT_IPCS | E1000_RXDEXT_STATERR_IPE)) ==
304             E1000_RXD_STAT_IPCS)
305                 mp->m_pkthdr.csum_flags |= CSUM_IP_CHECKED | CSUM_IP_VALID;
306
307         if (staterr & (E1000_RXD_STAT_TCPCS | E1000_RXD_STAT_UDPCS)) {
308                 if ((staterr & E1000_RXDEXT_STATERR_TCPE) == 0) {
309                         mp->m_pkthdr.csum_flags |= CSUM_DATA_VALID |
310                             CSUM_PSEUDO_HDR | CSUM_FRAG_NOT_CHECKED;
311                         mp->m_pkthdr.csum_data = htons(0xffff);
312                 }
313         }
314 }
315
316 static __inline struct pktinfo *
317 igb_rssinfo(struct mbuf *m, struct pktinfo *pi,
318     uint32_t hash, uint32_t hashtype, uint32_t staterr)
319 {
320         switch (hashtype) {
321         case E1000_RXDADV_RSSTYPE_IPV4_TCP:
322                 pi->pi_netisr = NETISR_IP;
323                 pi->pi_flags = 0;
324                 pi->pi_l3proto = IPPROTO_TCP;
325                 break;
326
327         case E1000_RXDADV_RSSTYPE_IPV4:
328                 if (staterr & E1000_RXD_STAT_IXSM)
329                         return NULL;
330
331                 if ((staterr &
332                      (E1000_RXD_STAT_TCPCS | E1000_RXDEXT_STATERR_TCPE)) ==
333                     E1000_RXD_STAT_TCPCS) {
334                         pi->pi_netisr = NETISR_IP;
335                         pi->pi_flags = 0;
336                         pi->pi_l3proto = IPPROTO_UDP;
337                         break;
338                 }
339                 /* FALL THROUGH */
340         default:
341                 return NULL;
342         }
343
344         m->m_flags |= M_HASH;
345         m->m_pkthdr.hash = toeplitz_hash(hash);
346         return pi;
347 }
348
349 static int
350 igb_probe(device_t dev)
351 {
352         const struct igb_device *d;
353         uint16_t vid, did;
354
355         vid = pci_get_vendor(dev);
356         did = pci_get_device(dev);
357
358         for (d = igb_devices; d->desc != NULL; ++d) {
359                 if (vid == d->vid && did == d->did) {
360                         device_set_desc(dev, d->desc);
361                         return 0;
362                 }
363         }
364         return ENXIO;
365 }
366
367 static int
368 igb_attach(device_t dev)
369 {
370         struct igb_softc *sc = device_get_softc(dev);
371         uint16_t eeprom_data;
372         int error = 0, i, ring_max;
373 #ifdef IFPOLL_ENABLE
374         int offset, offset_def;
375 #endif
376
377 #ifdef notyet
378         /* SYSCTL stuff */
379         SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev),
380             SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
381             OID_AUTO, "nvm", CTLTYPE_INT|CTLFLAG_RW, adapter, 0,
382             igb_sysctl_nvm_info, "I", "NVM Information");
383         SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev),
384             SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
385             OID_AUTO, "flow_control", CTLTYPE_INT|CTLFLAG_RW,
386             adapter, 0, igb_set_flowcntl, "I", "Flow Control");
387 #endif
388
389         callout_init_mp(&sc->timer);
390         lwkt_serialize_init(&sc->main_serialize);
391
392         if_initname(&sc->arpcom.ac_if, device_get_name(dev),
393             device_get_unit(dev));
394         sc->dev = sc->osdep.dev = dev;
395
396         /*
397          * Determine hardware and mac type
398          */
399         sc->hw.vendor_id = pci_get_vendor(dev);
400         sc->hw.device_id = pci_get_device(dev);
401         sc->hw.revision_id = pci_read_config(dev, PCIR_REVID, 1);
402         sc->hw.subsystem_vendor_id = pci_read_config(dev, PCIR_SUBVEND_0, 2);
403         sc->hw.subsystem_device_id = pci_read_config(dev, PCIR_SUBDEV_0, 2);
404
405         if (e1000_set_mac_type(&sc->hw))
406                 return ENXIO;
407
408         /* Are we a VF device? */
409         if (sc->hw.mac.type == e1000_vfadapt ||
410             sc->hw.mac.type == e1000_vfadapt_i350)
411                 sc->vf_ifp = 1;
412         else
413                 sc->vf_ifp = 0;
414
415         /*
416          * Configure total supported RX/TX ring count
417          */
418         switch (sc->hw.mac.type) {
419         case e1000_82575:
420                 ring_max = IGB_MAX_RING_82575;
421                 break;
422
423         case e1000_82576:
424                 ring_max = IGB_MAX_RING_82576;
425                 break;
426
427         case e1000_82580:
428                 ring_max = IGB_MAX_RING_82580;
429                 break;
430
431         case e1000_i350:
432                 ring_max = IGB_MAX_RING_I350;
433                 break;
434
435         case e1000_i210:
436                 ring_max = IGB_MAX_RING_I210;
437                 break;
438
439         case e1000_i211:
440                 ring_max = IGB_MAX_RING_I211;
441                 break;
442
443         default:
444                 ring_max = IGB_MIN_RING;
445                 break;
446         }
447
448         sc->rx_ring_cnt = device_getenv_int(dev, "rxr", igb_rxr);
449         sc->rx_ring_cnt = if_ring_count2(sc->rx_ring_cnt, ring_max);
450 #ifdef IGB_RSS_DEBUG
451         sc->rx_ring_cnt = device_getenv_int(dev, "rxr_debug", sc->rx_ring_cnt);
452 #endif
453         sc->rx_ring_inuse = sc->rx_ring_cnt;
454
455         sc->tx_ring_cnt = device_getenv_int(dev, "txr", igb_txr);
456         sc->tx_ring_cnt = if_ring_count2(sc->tx_ring_cnt, ring_max);
457 #ifdef IGB_TSS_DEBUG
458         sc->tx_ring_cnt = device_getenv_int(dev, "txr_debug", sc->tx_ring_cnt);
459 #endif
460         sc->tx_ring_inuse = sc->tx_ring_cnt;
461
462         /* Enable bus mastering */
463         pci_enable_busmaster(dev);
464
465         /*
466          * Allocate IO memory
467          */
468         sc->mem_rid = PCIR_BAR(0);
469         sc->mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &sc->mem_rid,
470             RF_ACTIVE);
471         if (sc->mem_res == NULL) {
472                 device_printf(dev, "Unable to allocate bus resource: memory\n");
473                 error = ENXIO;
474                 goto failed;
475         }
476         sc->osdep.mem_bus_space_tag = rman_get_bustag(sc->mem_res);
477         sc->osdep.mem_bus_space_handle = rman_get_bushandle(sc->mem_res);
478
479         sc->hw.hw_addr = (uint8_t *)&sc->osdep.mem_bus_space_handle;
480
481         /* Save PCI command register for Shared Code */
482         sc->hw.bus.pci_cmd_word = pci_read_config(dev, PCIR_COMMAND, 2);
483         sc->hw.back = &sc->osdep;
484
485         /* Do Shared Code initialization */
486         if (e1000_setup_init_funcs(&sc->hw, TRUE)) {
487                 device_printf(dev, "Setup of Shared code failed\n");
488                 error = ENXIO;
489                 goto failed;
490         }
491
492         e1000_get_bus_info(&sc->hw);
493
494         sc->hw.mac.autoneg = DO_AUTO_NEG;
495         sc->hw.phy.autoneg_wait_to_complete = FALSE;
496         sc->hw.phy.autoneg_advertised = AUTONEG_ADV_DEFAULT;
497
498         /* Copper options */
499         if (sc->hw.phy.media_type == e1000_media_type_copper) {
500                 sc->hw.phy.mdix = AUTO_ALL_MODES;
501                 sc->hw.phy.disable_polarity_correction = FALSE;
502                 sc->hw.phy.ms_type = IGB_MASTER_SLAVE;
503         }
504
505         /* Set the frame limits assuming  standard ethernet sized frames. */
506         sc->max_frame_size = ETHERMTU + ETHER_HDR_LEN + ETHER_CRC_LEN;
507
508         /* Allocate RX/TX rings */
509         error = igb_alloc_rings(sc);
510         if (error)
511                 goto failed;
512
513 #ifdef IFPOLL_ENABLE
514         /*
515          * NPOLLING RX CPU offset
516          */
517         if (sc->rx_ring_cnt == ncpus2) {
518                 offset = 0;
519         } else {
520                 offset_def = (sc->rx_ring_cnt * device_get_unit(dev)) % ncpus2;
521                 offset = device_getenv_int(dev, "npoll.rxoff", offset_def);
522                 if (offset >= ncpus2 ||
523                     offset % sc->rx_ring_cnt != 0) {
524                         device_printf(dev, "invalid npoll.rxoff %d, use %d\n",
525                             offset, offset_def);
526                         offset = offset_def;
527                 }
528         }
529         sc->rx_npoll_off = offset;
530
531         /*
532          * NPOLLING TX CPU offset
533          */
534         if (sc->tx_ring_cnt == ncpus2) {
535                 offset = 0;
536         } else {
537                 offset_def = (sc->tx_ring_cnt * device_get_unit(dev)) % ncpus2;
538                 offset = device_getenv_int(dev, "npoll.txoff", offset_def);
539                 if (offset >= ncpus2 ||
540                     offset % sc->tx_ring_cnt != 0) {
541                         device_printf(dev, "invalid npoll.txoff %d, use %d\n",
542                             offset, offset_def);
543                         offset = offset_def;
544                 }
545         }
546         sc->tx_npoll_off = offset;
547 #endif
548
549         /* Allocate interrupt */
550         error = igb_alloc_intr(sc);
551         if (error)
552                 goto failed;
553
554         /* Setup serializers */
555         igb_setup_serializer(sc);
556
557         /* Allocate the appropriate stats memory */
558         if (sc->vf_ifp) {
559                 sc->stats = kmalloc(sizeof(struct e1000_vf_stats), M_DEVBUF,
560                     M_WAITOK | M_ZERO);
561                 igb_vf_init_stats(sc);
562         } else {
563                 sc->stats = kmalloc(sizeof(struct e1000_hw_stats), M_DEVBUF,
564                     M_WAITOK | M_ZERO);
565         }
566
567         /* Allocate multicast array memory. */
568         sc->mta = kmalloc(ETHER_ADDR_LEN * MAX_NUM_MULTICAST_ADDRESSES,
569             M_DEVBUF, M_WAITOK);
570
571         /* Some adapter-specific advanced features */
572         if (sc->hw.mac.type >= e1000_i350) {
573 #ifdef notyet
574                 igb_set_sysctl_value(adapter, "dma_coalesce",
575                     "configure dma coalesce",
576                     &adapter->dma_coalesce, igb_dma_coalesce);
577                 igb_set_sysctl_value(adapter, "eee_disabled",
578                     "enable Energy Efficient Ethernet",
579                     &adapter->hw.dev_spec._82575.eee_disable,
580                     igb_eee_disabled);
581 #else
582                 sc->dma_coalesce = igb_dma_coalesce;
583                 sc->hw.dev_spec._82575.eee_disable = igb_eee_disabled;
584 #endif
585                 if (sc->hw.phy.media_type == e1000_media_type_copper)
586                         e1000_set_eee_i350(&sc->hw);
587         }
588
589         /*
590          * Start from a known state, this is important in reading the nvm and
591          * mac from that.
592          */
593         e1000_reset_hw(&sc->hw);
594
595         /* Make sure we have a good EEPROM before we read from it */
596         if (sc->hw.mac.type != e1000_i210 && sc->hw.mac.type != e1000_i211 &&
597             e1000_validate_nvm_checksum(&sc->hw) < 0) {
598                 /*
599                  * Some PCI-E parts fail the first check due to
600                  * the link being in sleep state, call it again,
601                  * if it fails a second time its a real issue.
602                  */
603                 if (e1000_validate_nvm_checksum(&sc->hw) < 0) {
604                         device_printf(dev,
605                             "The EEPROM Checksum Is Not Valid\n");
606                         error = EIO;
607                         goto failed;
608                 }
609         }
610
611         /* Copy the permanent MAC address out of the EEPROM */
612         if (e1000_read_mac_addr(&sc->hw) < 0) {
613                 device_printf(dev, "EEPROM read error while reading MAC"
614                     " address\n");
615                 error = EIO;
616                 goto failed;
617         }
618         if (!igb_is_valid_ether_addr(sc->hw.mac.addr)) {
619                 device_printf(dev, "Invalid MAC address\n");
620                 error = EIO;
621                 goto failed;
622         }
623
624         /* Setup OS specific network interface */
625         igb_setup_ifp(sc);
626
627         /* Add sysctl tree, must after igb_setup_ifp() */
628         igb_add_sysctl(sc);
629
630         /* Now get a good starting state */
631         igb_reset(sc);
632
633         /* Initialize statistics */
634         igb_update_stats_counters(sc);
635
636         sc->hw.mac.get_link_status = 1;
637         igb_update_link_status(sc);
638
639         /* Indicate SOL/IDER usage */
640         if (e1000_check_reset_block(&sc->hw)) {
641                 device_printf(dev,
642                     "PHY reset is blocked due to SOL/IDER session.\n");
643         }
644
645         /* Determine if we have to control management hardware */
646         if (e1000_enable_mng_pass_thru(&sc->hw))
647                 sc->flags |= IGB_FLAG_HAS_MGMT;
648
649         /*
650          * Setup Wake-on-Lan
651          */
652         /* APME bit in EEPROM is mapped to WUC.APME */
653         eeprom_data = E1000_READ_REG(&sc->hw, E1000_WUC) & E1000_WUC_APME;
654         if (eeprom_data)
655                 sc->wol = E1000_WUFC_MAG;
656         /* XXX disable WOL */
657         sc->wol = 0; 
658
659 #ifdef notyet
660         /* Register for VLAN events */
661         adapter->vlan_attach = EVENTHANDLER_REGISTER(vlan_config,
662              igb_register_vlan, adapter, EVENTHANDLER_PRI_FIRST);
663         adapter->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig,
664              igb_unregister_vlan, adapter, EVENTHANDLER_PRI_FIRST);
665 #endif
666
667 #ifdef notyet
668         igb_add_hw_stats(adapter);
669 #endif
670
671         error = igb_setup_intr(sc);
672         if (error) {
673                 ether_ifdetach(&sc->arpcom.ac_if);
674                 goto failed;
675         }
676
677         for (i = 0; i < sc->tx_ring_cnt; ++i) {
678                 struct ifaltq_subque *ifsq =
679                     ifq_get_subq(&sc->arpcom.ac_if.if_snd, i);
680                 struct igb_tx_ring *txr = &sc->tx_rings[i];
681
682                 ifsq_set_cpuid(ifsq, txr->tx_intr_cpuid);
683                 ifsq_set_priv(ifsq, txr);
684                 txr->ifsq = ifsq;
685
686                 ifsq_watchdog_init(&txr->tx_watchdog, ifsq, igb_watchdog);
687         }
688
689         return 0;
690
691 failed:
692         igb_detach(dev);
693         return error;
694 }
695
696 static int
697 igb_detach(device_t dev)
698 {
699         struct igb_softc *sc = device_get_softc(dev);
700
701         if (device_is_attached(dev)) {
702                 struct ifnet *ifp = &sc->arpcom.ac_if;
703
704                 ifnet_serialize_all(ifp);
705
706                 igb_stop(sc);
707
708                 e1000_phy_hw_reset(&sc->hw);
709
710                 /* Give control back to firmware */
711                 igb_rel_mgmt(sc);
712                 igb_rel_hw_control(sc);
713
714                 if (sc->wol) {
715                         E1000_WRITE_REG(&sc->hw, E1000_WUC, E1000_WUC_PME_EN);
716                         E1000_WRITE_REG(&sc->hw, E1000_WUFC, sc->wol);
717                         igb_enable_wol(dev);
718                 }
719
720                 igb_teardown_intr(sc);
721
722                 ifnet_deserialize_all(ifp);
723
724                 ether_ifdetach(ifp);
725         } else if (sc->mem_res != NULL) {
726                 igb_rel_hw_control(sc);
727         }
728         bus_generic_detach(dev);
729
730         if (sc->sysctl_tree != NULL)
731                 sysctl_ctx_free(&sc->sysctl_ctx);
732
733         igb_free_intr(sc);
734
735         if (sc->msix_mem_res != NULL) {
736                 bus_release_resource(dev, SYS_RES_MEMORY, sc->msix_mem_rid,
737                     sc->msix_mem_res);
738         }
739         if (sc->mem_res != NULL) {
740                 bus_release_resource(dev, SYS_RES_MEMORY, sc->mem_rid,
741                     sc->mem_res);
742         }
743
744         igb_free_rings(sc);
745
746         if (sc->mta != NULL)
747                 kfree(sc->mta, M_DEVBUF);
748         if (sc->stats != NULL)
749                 kfree(sc->stats, M_DEVBUF);
750         if (sc->serializes != NULL)
751                 kfree(sc->serializes, M_DEVBUF);
752
753         return 0;
754 }
755
756 static int
757 igb_shutdown(device_t dev)
758 {
759         return igb_suspend(dev);
760 }
761
762 static int
763 igb_suspend(device_t dev)
764 {
765         struct igb_softc *sc = device_get_softc(dev);
766         struct ifnet *ifp = &sc->arpcom.ac_if;
767
768         ifnet_serialize_all(ifp);
769
770         igb_stop(sc);
771
772         igb_rel_mgmt(sc);
773         igb_rel_hw_control(sc);
774
775         if (sc->wol) {
776                 E1000_WRITE_REG(&sc->hw, E1000_WUC, E1000_WUC_PME_EN);
777                 E1000_WRITE_REG(&sc->hw, E1000_WUFC, sc->wol);
778                 igb_enable_wol(dev);
779         }
780
781         ifnet_deserialize_all(ifp);
782
783         return bus_generic_suspend(dev);
784 }
785
786 static int
787 igb_resume(device_t dev)
788 {
789         struct igb_softc *sc = device_get_softc(dev);
790         struct ifnet *ifp = &sc->arpcom.ac_if;
791         int i;
792
793         ifnet_serialize_all(ifp);
794
795         igb_init(sc);
796         igb_get_mgmt(sc);
797
798         for (i = 0; i < sc->tx_ring_inuse; ++i)
799                 ifsq_devstart_sched(sc->tx_rings[i].ifsq);
800
801         ifnet_deserialize_all(ifp);
802
803         return bus_generic_resume(dev);
804 }
805
806 static int
807 igb_ioctl(struct ifnet *ifp, u_long command, caddr_t data, struct ucred *cr)
808 {
809         struct igb_softc *sc = ifp->if_softc;
810         struct ifreq *ifr = (struct ifreq *)data;
811         int max_frame_size, mask, reinit;
812         int error = 0;
813
814         ASSERT_IFNET_SERIALIZED_ALL(ifp);
815
816         switch (command) {
817         case SIOCSIFMTU:
818                 max_frame_size = 9234;
819                 if (ifr->ifr_mtu > max_frame_size - ETHER_HDR_LEN -
820                     ETHER_CRC_LEN) {
821                         error = EINVAL;
822                         break;
823                 }
824
825                 ifp->if_mtu = ifr->ifr_mtu;
826                 sc->max_frame_size = ifp->if_mtu + ETHER_HDR_LEN +
827                     ETHER_CRC_LEN;
828
829                 if (ifp->if_flags & IFF_RUNNING)
830                         igb_init(sc);
831                 break;
832
833         case SIOCSIFFLAGS:
834                 if (ifp->if_flags & IFF_UP) {
835                         if (ifp->if_flags & IFF_RUNNING) {
836                                 if ((ifp->if_flags ^ sc->if_flags) &
837                                     (IFF_PROMISC | IFF_ALLMULTI)) {
838                                         igb_disable_promisc(sc);
839                                         igb_set_promisc(sc);
840                                 }
841                         } else {
842                                 igb_init(sc);
843                         }
844                 } else if (ifp->if_flags & IFF_RUNNING) {
845                         igb_stop(sc);
846                 }
847                 sc->if_flags = ifp->if_flags;
848                 break;
849
850         case SIOCADDMULTI:
851         case SIOCDELMULTI:
852                 if (ifp->if_flags & IFF_RUNNING) {
853                         igb_disable_intr(sc);
854                         igb_set_multi(sc);
855 #ifdef IFPOLL_ENABLE
856                         if (!(ifp->if_flags & IFF_NPOLLING))
857 #endif
858                                 igb_enable_intr(sc);
859                 }
860                 break;
861
862         case SIOCSIFMEDIA:
863                 /* Check SOL/IDER usage */
864                 if (e1000_check_reset_block(&sc->hw)) {
865                         if_printf(ifp, "Media change is "
866                             "blocked due to SOL/IDER session.\n");
867                         break;
868                 }
869                 /* FALL THROUGH */
870
871         case SIOCGIFMEDIA:
872                 error = ifmedia_ioctl(ifp, ifr, &sc->media, command);
873                 break;
874
875         case SIOCSIFCAP:
876                 reinit = 0;
877                 mask = ifr->ifr_reqcap ^ ifp->if_capenable;
878                 if (mask & IFCAP_RXCSUM) {
879                         ifp->if_capenable ^= IFCAP_RXCSUM;
880                         reinit = 1;
881                 }
882                 if (mask & IFCAP_VLAN_HWTAGGING) {
883                         ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
884                         reinit = 1;
885                 }
886                 if (mask & IFCAP_TXCSUM) {
887                         ifp->if_capenable ^= IFCAP_TXCSUM;
888                         if (ifp->if_capenable & IFCAP_TXCSUM)
889                                 ifp->if_hwassist |= IGB_CSUM_FEATURES;
890                         else
891                                 ifp->if_hwassist &= ~IGB_CSUM_FEATURES;
892                 }
893                 if (mask & IFCAP_TSO) {
894                         ifp->if_capenable ^= IFCAP_TSO;
895                         if (ifp->if_capenable & IFCAP_TSO)
896                                 ifp->if_hwassist |= CSUM_TSO;
897                         else
898                                 ifp->if_hwassist &= ~CSUM_TSO;
899                 }
900                 if (mask & IFCAP_RSS)
901                         ifp->if_capenable ^= IFCAP_RSS;
902                 if (reinit && (ifp->if_flags & IFF_RUNNING))
903                         igb_init(sc);
904                 break;
905
906         default:
907                 error = ether_ioctl(ifp, command, data);
908                 break;
909         }
910         return error;
911 }
912
913 static void
914 igb_init(void *xsc)
915 {
916         struct igb_softc *sc = xsc;
917         struct ifnet *ifp = &sc->arpcom.ac_if;
918         boolean_t polling;
919         int i;
920
921         ASSERT_IFNET_SERIALIZED_ALL(ifp);
922
923         igb_stop(sc);
924
925         /* Get the latest mac address, User can use a LAA */
926         bcopy(IF_LLADDR(ifp), sc->hw.mac.addr, ETHER_ADDR_LEN);
927
928         /* Put the address into the Receive Address Array */
929         e1000_rar_set(&sc->hw, sc->hw.mac.addr, 0);
930
931         igb_reset(sc);
932         igb_update_link_status(sc);
933
934         E1000_WRITE_REG(&sc->hw, E1000_VET, ETHERTYPE_VLAN);
935
936         /* Configure for OS presence */
937         igb_get_mgmt(sc);
938
939         polling = FALSE;
940 #ifdef IFPOLL_ENABLE
941         if (ifp->if_flags & IFF_NPOLLING)
942                 polling = TRUE;
943 #endif
944
945         /* Configured used RX/TX rings */
946         igb_set_ring_inuse(sc, polling);
947         ifq_set_subq_mask(&ifp->if_snd, sc->tx_ring_inuse - 1);
948
949         /* Initialize interrupt */
950         igb_init_intr(sc);
951
952         /* Prepare transmit descriptors and buffers */
953         for (i = 0; i < sc->tx_ring_inuse; ++i)
954                 igb_init_tx_ring(&sc->tx_rings[i]);
955         igb_init_tx_unit(sc);
956
957         /* Setup Multicast table */
958         igb_set_multi(sc);
959
960 #if 0
961         /*
962          * Figure out the desired mbuf pool
963          * for doing jumbo/packetsplit
964          */
965         if (adapter->max_frame_size <= 2048)
966                 adapter->rx_mbuf_sz = MCLBYTES;
967         else if (adapter->max_frame_size <= 4096)
968                 adapter->rx_mbuf_sz = MJUMPAGESIZE;
969         else
970                 adapter->rx_mbuf_sz = MJUM9BYTES;
971 #endif
972
973         /* Prepare receive descriptors and buffers */
974         for (i = 0; i < sc->rx_ring_inuse; ++i) {
975                 int error;
976
977                 error = igb_init_rx_ring(&sc->rx_rings[i]);
978                 if (error) {
979                         if_printf(ifp, "Could not setup receive structures\n");
980                         igb_stop(sc);
981                         return;
982                 }
983         }
984         igb_init_rx_unit(sc);
985
986         /* Enable VLAN support */
987         if (ifp->if_capenable & IFCAP_VLAN_HWTAGGING)
988                 igb_set_vlan(sc);
989
990         /* Don't lose promiscuous settings */
991         igb_set_promisc(sc);
992
993         ifp->if_flags |= IFF_RUNNING;
994         for (i = 0; i < sc->tx_ring_inuse; ++i) {
995                 ifsq_clr_oactive(sc->tx_rings[i].ifsq);
996                 ifsq_watchdog_start(&sc->tx_rings[i].tx_watchdog);
997         }
998
999         if (polling || sc->intr_type == PCI_INTR_TYPE_MSIX)
1000                 sc->timer_cpuid = 0; /* XXX fixed */
1001         else
1002                 sc->timer_cpuid = rman_get_cpuid(sc->intr_res);
1003         callout_reset_bycpu(&sc->timer, hz, igb_timer, sc, sc->timer_cpuid);
1004         e1000_clear_hw_cntrs_base_generic(&sc->hw);
1005
1006         /* This clears any pending interrupts */
1007         E1000_READ_REG(&sc->hw, E1000_ICR);
1008
1009         /*
1010          * Only enable interrupts if we are not polling, make sure
1011          * they are off otherwise.
1012          */
1013         if (polling) {
1014                 igb_disable_intr(sc);
1015         } else {
1016                 igb_enable_intr(sc);
1017                 E1000_WRITE_REG(&sc->hw, E1000_ICS, E1000_ICS_LSC);
1018         }
1019
1020         /* Set Energy Efficient Ethernet */
1021         if (sc->hw.phy.media_type == e1000_media_type_copper)
1022                 e1000_set_eee_i350(&sc->hw);
1023 }
1024
1025 static void
1026 igb_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
1027 {
1028         struct igb_softc *sc = ifp->if_softc;
1029
1030         ASSERT_IFNET_SERIALIZED_ALL(ifp);
1031
1032         igb_update_link_status(sc);
1033
1034         ifmr->ifm_status = IFM_AVALID;
1035         ifmr->ifm_active = IFM_ETHER;
1036
1037         if (!sc->link_active)
1038                 return;
1039
1040         ifmr->ifm_status |= IFM_ACTIVE;
1041
1042         switch (sc->link_speed) {
1043         case 10:
1044                 ifmr->ifm_active |= IFM_10_T;
1045                 break;
1046
1047         case 100:
1048                 /*
1049                  * Support for 100Mb SFP - these are Fiber 
1050                  * but the media type appears as serdes
1051                  */
1052                 if (sc->hw.phy.media_type == e1000_media_type_internal_serdes)
1053                         ifmr->ifm_active |= IFM_100_FX;
1054                 else
1055                         ifmr->ifm_active |= IFM_100_TX;
1056                 break;
1057
1058         case 1000:
1059                 ifmr->ifm_active |= IFM_1000_T;
1060                 break;
1061         }
1062
1063         if (sc->link_duplex == FULL_DUPLEX)
1064                 ifmr->ifm_active |= IFM_FDX;
1065         else
1066                 ifmr->ifm_active |= IFM_HDX;
1067 }
1068
1069 static int
1070 igb_media_change(struct ifnet *ifp)
1071 {
1072         struct igb_softc *sc = ifp->if_softc;
1073         struct ifmedia *ifm = &sc->media;
1074
1075         ASSERT_IFNET_SERIALIZED_ALL(ifp);
1076
1077         if (IFM_TYPE(ifm->ifm_media) != IFM_ETHER)
1078                 return EINVAL;
1079
1080         switch (IFM_SUBTYPE(ifm->ifm_media)) {
1081         case IFM_AUTO:
1082                 sc->hw.mac.autoneg = DO_AUTO_NEG;
1083                 sc->hw.phy.autoneg_advertised = AUTONEG_ADV_DEFAULT;
1084                 break;
1085
1086         case IFM_1000_LX:
1087         case IFM_1000_SX:
1088         case IFM_1000_T:
1089                 sc->hw.mac.autoneg = DO_AUTO_NEG;
1090                 sc->hw.phy.autoneg_advertised = ADVERTISE_1000_FULL;
1091                 break;
1092
1093         case IFM_100_TX:
1094                 sc->hw.mac.autoneg = FALSE;
1095                 sc->hw.phy.autoneg_advertised = 0;
1096                 if ((ifm->ifm_media & IFM_GMASK) == IFM_FDX)
1097                         sc->hw.mac.forced_speed_duplex = ADVERTISE_100_FULL;
1098                 else
1099                         sc->hw.mac.forced_speed_duplex = ADVERTISE_100_HALF;
1100                 break;
1101
1102         case IFM_10_T:
1103                 sc->hw.mac.autoneg = FALSE;
1104                 sc->hw.phy.autoneg_advertised = 0;
1105                 if ((ifm->ifm_media & IFM_GMASK) == IFM_FDX)
1106                         sc->hw.mac.forced_speed_duplex = ADVERTISE_10_FULL;
1107                 else
1108                         sc->hw.mac.forced_speed_duplex = ADVERTISE_10_HALF;
1109                 break;
1110
1111         default:
1112                 if_printf(ifp, "Unsupported media type\n");
1113                 break;
1114         }
1115
1116         igb_init(sc);
1117
1118         return 0;
1119 }
1120
1121 static void
1122 igb_set_promisc(struct igb_softc *sc)
1123 {
1124         struct ifnet *ifp = &sc->arpcom.ac_if;
1125         struct e1000_hw *hw = &sc->hw;
1126         uint32_t reg;
1127
1128         if (sc->vf_ifp) {
1129                 e1000_promisc_set_vf(hw, e1000_promisc_enabled);
1130                 return;
1131         }
1132
1133         reg = E1000_READ_REG(hw, E1000_RCTL);
1134         if (ifp->if_flags & IFF_PROMISC) {
1135                 reg |= (E1000_RCTL_UPE | E1000_RCTL_MPE);
1136                 E1000_WRITE_REG(hw, E1000_RCTL, reg);
1137         } else if (ifp->if_flags & IFF_ALLMULTI) {
1138                 reg |= E1000_RCTL_MPE;
1139                 reg &= ~E1000_RCTL_UPE;
1140                 E1000_WRITE_REG(hw, E1000_RCTL, reg);
1141         }
1142 }
1143
1144 static void
1145 igb_disable_promisc(struct igb_softc *sc)
1146 {
1147         struct e1000_hw *hw = &sc->hw;
1148         uint32_t reg;
1149
1150         if (sc->vf_ifp) {
1151                 e1000_promisc_set_vf(hw, e1000_promisc_disabled);
1152                 return;
1153         }
1154         reg = E1000_READ_REG(hw, E1000_RCTL);
1155         reg &= ~E1000_RCTL_UPE;
1156         reg &= ~E1000_RCTL_MPE;
1157         E1000_WRITE_REG(hw, E1000_RCTL, reg);
1158 }
1159
1160 static void
1161 igb_set_multi(struct igb_softc *sc)
1162 {
1163         struct ifnet *ifp = &sc->arpcom.ac_if;
1164         struct ifmultiaddr *ifma;
1165         uint32_t reg_rctl = 0;
1166         uint8_t *mta;
1167         int mcnt = 0;
1168
1169         mta = sc->mta;
1170         bzero(mta, ETH_ADDR_LEN * MAX_NUM_MULTICAST_ADDRESSES);
1171
1172         TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1173                 if (ifma->ifma_addr->sa_family != AF_LINK)
1174                         continue;
1175
1176                 if (mcnt == MAX_NUM_MULTICAST_ADDRESSES)
1177                         break;
1178
1179                 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1180                     &mta[mcnt * ETH_ADDR_LEN], ETH_ADDR_LEN);
1181                 mcnt++;
1182         }
1183
1184         if (mcnt >= MAX_NUM_MULTICAST_ADDRESSES) {
1185                 reg_rctl = E1000_READ_REG(&sc->hw, E1000_RCTL);
1186                 reg_rctl |= E1000_RCTL_MPE;
1187                 E1000_WRITE_REG(&sc->hw, E1000_RCTL, reg_rctl);
1188         } else {
1189                 e1000_update_mc_addr_list(&sc->hw, mta, mcnt);
1190         }
1191 }
1192
1193 static void
1194 igb_timer(void *xsc)
1195 {
1196         struct igb_softc *sc = xsc;
1197
1198         lwkt_serialize_enter(&sc->main_serialize);
1199
1200         igb_update_link_status(sc);
1201         igb_update_stats_counters(sc);
1202
1203         callout_reset_bycpu(&sc->timer, hz, igb_timer, sc, sc->timer_cpuid);
1204
1205         lwkt_serialize_exit(&sc->main_serialize);
1206 }
1207
1208 static void
1209 igb_update_link_status(struct igb_softc *sc)
1210 {
1211         struct ifnet *ifp = &sc->arpcom.ac_if;
1212         struct e1000_hw *hw = &sc->hw;
1213         uint32_t link_check, thstat, ctrl;
1214
1215         link_check = thstat = ctrl = 0;
1216
1217         /* Get the cached link value or read for real */
1218         switch (hw->phy.media_type) {
1219         case e1000_media_type_copper:
1220                 if (hw->mac.get_link_status) {
1221                         /* Do the work to read phy */
1222                         e1000_check_for_link(hw);
1223                         link_check = !hw->mac.get_link_status;
1224                 } else {
1225                         link_check = TRUE;
1226                 }
1227                 break;
1228
1229         case e1000_media_type_fiber:
1230                 e1000_check_for_link(hw);
1231                 link_check = E1000_READ_REG(hw, E1000_STATUS) & E1000_STATUS_LU;
1232                 break;
1233
1234         case e1000_media_type_internal_serdes:
1235                 e1000_check_for_link(hw);
1236                 link_check = hw->mac.serdes_has_link;
1237                 break;
1238
1239         /* VF device is type_unknown */
1240         case e1000_media_type_unknown:
1241                 e1000_check_for_link(hw);
1242                 link_check = !hw->mac.get_link_status;
1243                 /* Fall thru */
1244         default:
1245                 break;
1246         }
1247
1248         /* Check for thermal downshift or shutdown */
1249         if (hw->mac.type == e1000_i350) {
1250                 thstat = E1000_READ_REG(hw, E1000_THSTAT);
1251                 ctrl = E1000_READ_REG(hw, E1000_CTRL_EXT);
1252         }
1253
1254         /* Now we check if a transition has happened */
1255         if (link_check && sc->link_active == 0) {
1256                 e1000_get_speed_and_duplex(hw, 
1257                     &sc->link_speed, &sc->link_duplex);
1258                 if (bootverbose) {
1259                         const char *flowctl;
1260
1261                         /* Get the flow control for display */
1262                         switch (hw->fc.current_mode) {
1263                         case e1000_fc_rx_pause:
1264                                 flowctl = "RX";
1265                                 break;
1266
1267                         case e1000_fc_tx_pause:
1268                                 flowctl = "TX";
1269                                 break;
1270
1271                         case e1000_fc_full:
1272                                 flowctl = "Full";
1273                                 break;
1274
1275                         default:
1276                                 flowctl = "None";
1277                                 break;
1278                         }
1279
1280                         if_printf(ifp, "Link is up %d Mbps %s, "
1281                             "Flow control: %s\n",
1282                             sc->link_speed,
1283                             sc->link_duplex == FULL_DUPLEX ?
1284                             "Full Duplex" : "Half Duplex",
1285                             flowctl);
1286                 }
1287                 sc->link_active = 1;
1288
1289                 ifp->if_baudrate = sc->link_speed * 1000000;
1290                 if ((ctrl & E1000_CTRL_EXT_LINK_MODE_GMII) &&
1291                     (thstat & E1000_THSTAT_LINK_THROTTLE))
1292                         if_printf(ifp, "Link: thermal downshift\n");
1293                 /* This can sleep */
1294                 ifp->if_link_state = LINK_STATE_UP;
1295                 if_link_state_change(ifp);
1296         } else if (!link_check && sc->link_active == 1) {
1297                 ifp->if_baudrate = sc->link_speed = 0;
1298                 sc->link_duplex = 0;
1299                 if (bootverbose)
1300                         if_printf(ifp, "Link is Down\n");
1301                 if ((ctrl & E1000_CTRL_EXT_LINK_MODE_GMII) &&
1302                     (thstat & E1000_THSTAT_PWR_DOWN))
1303                         if_printf(ifp, "Link: thermal shutdown\n");
1304                 sc->link_active = 0;
1305                 /* This can sleep */
1306                 ifp->if_link_state = LINK_STATE_DOWN;
1307                 if_link_state_change(ifp);
1308         }
1309 }
1310
1311 static void
1312 igb_stop(struct igb_softc *sc)
1313 {
1314         struct ifnet *ifp = &sc->arpcom.ac_if;
1315         int i;
1316
1317         ASSERT_IFNET_SERIALIZED_ALL(ifp);
1318
1319         igb_disable_intr(sc);
1320
1321         callout_stop(&sc->timer);
1322
1323         ifp->if_flags &= ~IFF_RUNNING;
1324         for (i = 0; i < sc->tx_ring_cnt; ++i) {
1325                 ifsq_clr_oactive(sc->tx_rings[i].ifsq);
1326                 ifsq_watchdog_stop(&sc->tx_rings[i].tx_watchdog);
1327                 sc->tx_rings[i].tx_flags &= ~IGB_TXFLAG_ENABLED;
1328         }
1329
1330         e1000_reset_hw(&sc->hw);
1331         E1000_WRITE_REG(&sc->hw, E1000_WUC, 0);
1332
1333         e1000_led_off(&sc->hw);
1334         e1000_cleanup_led(&sc->hw);
1335
1336         for (i = 0; i < sc->tx_ring_cnt; ++i)
1337                 igb_free_tx_ring(&sc->tx_rings[i]);
1338         for (i = 0; i < sc->rx_ring_cnt; ++i)
1339                 igb_free_rx_ring(&sc->rx_rings[i]);
1340 }
1341
1342 static void
1343 igb_reset(struct igb_softc *sc)
1344 {
1345         struct ifnet *ifp = &sc->arpcom.ac_if;
1346         struct e1000_hw *hw = &sc->hw;
1347         struct e1000_fc_info *fc = &hw->fc;
1348         uint32_t pba = 0;
1349         uint16_t hwm;
1350
1351         /* Let the firmware know the OS is in control */
1352         igb_get_hw_control(sc);
1353
1354         /*
1355          * Packet Buffer Allocation (PBA)
1356          * Writing PBA sets the receive portion of the buffer
1357          * the remainder is used for the transmit buffer.
1358          */
1359         switch (hw->mac.type) {
1360         case e1000_82575:
1361                 pba = E1000_PBA_32K;
1362                 break;
1363
1364         case e1000_82576:
1365         case e1000_vfadapt:
1366                 pba = E1000_READ_REG(hw, E1000_RXPBS);
1367                 pba &= E1000_RXPBS_SIZE_MASK_82576;
1368                 break;
1369
1370         case e1000_82580:
1371         case e1000_i350:
1372         case e1000_vfadapt_i350:
1373                 pba = E1000_READ_REG(hw, E1000_RXPBS);
1374                 pba = e1000_rxpbs_adjust_82580(pba);
1375                 break;
1376
1377         case e1000_i210:
1378         case e1000_i211:
1379                 pba = E1000_PBA_34K;
1380                 break;
1381
1382         default:
1383                 break;
1384         }
1385
1386         /* Special needs in case of Jumbo frames */
1387         if (hw->mac.type == e1000_82575 && ifp->if_mtu > ETHERMTU) {
1388                 uint32_t tx_space, min_tx, min_rx;
1389
1390                 pba = E1000_READ_REG(hw, E1000_PBA);
1391                 tx_space = pba >> 16;
1392                 pba &= 0xffff;
1393
1394                 min_tx = (sc->max_frame_size +
1395                     sizeof(struct e1000_tx_desc) - ETHER_CRC_LEN) * 2;
1396                 min_tx = roundup2(min_tx, 1024);
1397                 min_tx >>= 10;
1398                 min_rx = sc->max_frame_size;
1399                 min_rx = roundup2(min_rx, 1024);
1400                 min_rx >>= 10;
1401                 if (tx_space < min_tx && (min_tx - tx_space) < pba) {
1402                         pba = pba - (min_tx - tx_space);
1403                         /*
1404                          * if short on rx space, rx wins
1405                          * and must trump tx adjustment
1406                          */
1407                         if (pba < min_rx)
1408                                 pba = min_rx;
1409                 }
1410                 E1000_WRITE_REG(hw, E1000_PBA, pba);
1411         }
1412
1413         /*
1414          * These parameters control the automatic generation (Tx) and
1415          * response (Rx) to Ethernet PAUSE frames.
1416          * - High water mark should allow for at least two frames to be
1417          *   received after sending an XOFF.
1418          * - Low water mark works best when it is very near the high water mark.
1419          *   This allows the receiver to restart by sending XON when it has
1420          *   drained a bit.
1421          */
1422         hwm = min(((pba << 10) * 9 / 10),
1423             ((pba << 10) - 2 * sc->max_frame_size));
1424
1425         if (hw->mac.type < e1000_82576) {
1426                 fc->high_water = hwm & 0xFFF8; /* 8-byte granularity */
1427                 fc->low_water = fc->high_water - 8;
1428         } else {
1429                 fc->high_water = hwm & 0xFFF0; /* 16-byte granularity */
1430                 fc->low_water = fc->high_water - 16;
1431         }
1432         fc->pause_time = IGB_FC_PAUSE_TIME;
1433         fc->send_xon = TRUE;
1434         fc->requested_mode = e1000_fc_default;
1435
1436         /* Issue a global reset */
1437         e1000_reset_hw(hw);
1438         E1000_WRITE_REG(hw, E1000_WUC, 0);
1439
1440         if (e1000_init_hw(hw) < 0)
1441                 if_printf(ifp, "Hardware Initialization Failed\n");
1442
1443         /* Setup DMA Coalescing */
1444         if (hw->mac.type > e1000_82580 && hw->mac.type != e1000_i211) {
1445                 uint32_t dmac;
1446                 uint32_t reg;
1447
1448                 if (sc->dma_coalesce == 0) {
1449                         /*
1450                          * Disabled
1451                          */
1452                         reg = E1000_READ_REG(hw, E1000_DMACR);
1453                         reg &= ~E1000_DMACR_DMAC_EN;
1454                         E1000_WRITE_REG(hw, E1000_DMACR, reg);
1455                         goto reset_out;
1456                 }
1457
1458                 /* Set starting thresholds */
1459                 E1000_WRITE_REG(hw, E1000_DMCTXTH, 0);
1460                 E1000_WRITE_REG(hw, E1000_DMCRTRH, 0);
1461
1462                 hwm = 64 * pba - sc->max_frame_size / 16;
1463                 if (hwm < 64 * (pba - 6))
1464                         hwm = 64 * (pba - 6);
1465                 reg = E1000_READ_REG(hw, E1000_FCRTC);
1466                 reg &= ~E1000_FCRTC_RTH_COAL_MASK;
1467                 reg |= ((hwm << E1000_FCRTC_RTH_COAL_SHIFT)
1468                     & E1000_FCRTC_RTH_COAL_MASK);
1469                 E1000_WRITE_REG(hw, E1000_FCRTC, reg);
1470
1471                 dmac = pba - sc->max_frame_size / 512;
1472                 if (dmac < pba - 10)
1473                         dmac = pba - 10;
1474                 reg = E1000_READ_REG(hw, E1000_DMACR);
1475                 reg &= ~E1000_DMACR_DMACTHR_MASK;
1476                 reg = ((dmac << E1000_DMACR_DMACTHR_SHIFT)
1477                     & E1000_DMACR_DMACTHR_MASK);
1478                 /* Transition to L0x or L1 if available.. */
1479                 reg |= (E1000_DMACR_DMAC_EN | E1000_DMACR_DMAC_LX_MASK);
1480                 /* timer = value in sc->dma_coalesce in 32usec intervals */
1481                 reg |= (sc->dma_coalesce >> 5);
1482                 E1000_WRITE_REG(hw, E1000_DMACR, reg);
1483
1484                 /* Set the interval before transition */
1485                 reg = E1000_READ_REG(hw, E1000_DMCTLX);
1486                 reg |= 0x80000004;
1487                 E1000_WRITE_REG(hw, E1000_DMCTLX, reg);
1488
1489                 /* Free space in tx packet buffer to wake from DMA coal */
1490                 E1000_WRITE_REG(hw, E1000_DMCTXTH,
1491                     (20480 - (2 * sc->max_frame_size)) >> 6);
1492
1493                 /* Make low power state decision controlled by DMA coal */
1494                 reg = E1000_READ_REG(hw, E1000_PCIEMISC);
1495                 reg &= ~E1000_PCIEMISC_LX_DECISION;
1496                 E1000_WRITE_REG(hw, E1000_PCIEMISC, reg);
1497                 if_printf(ifp, "DMA Coalescing enabled\n");
1498         } else if (hw->mac.type == e1000_82580) {
1499                 uint32_t reg = E1000_READ_REG(hw, E1000_PCIEMISC);
1500
1501                 E1000_WRITE_REG(hw, E1000_DMACR, 0);
1502                 E1000_WRITE_REG(hw, E1000_PCIEMISC,
1503                     reg & ~E1000_PCIEMISC_LX_DECISION);
1504         }
1505
1506 reset_out:
1507         E1000_WRITE_REG(&sc->hw, E1000_VET, ETHERTYPE_VLAN);
1508         e1000_get_phy_info(hw);
1509         e1000_check_for_link(hw);
1510 }
1511
1512 static void
1513 igb_setup_ifp(struct igb_softc *sc)
1514 {
1515         struct ifnet *ifp = &sc->arpcom.ac_if;
1516
1517         ifp->if_softc = sc;
1518         ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
1519         ifp->if_init = igb_init;
1520         ifp->if_ioctl = igb_ioctl;
1521         ifp->if_start = igb_start;
1522         ifp->if_serialize = igb_serialize;
1523         ifp->if_deserialize = igb_deserialize;
1524         ifp->if_tryserialize = igb_tryserialize;
1525 #ifdef INVARIANTS
1526         ifp->if_serialize_assert = igb_serialize_assert;
1527 #endif
1528 #ifdef IFPOLL_ENABLE
1529         ifp->if_npoll = igb_npoll;
1530 #endif
1531
1532         ifq_set_maxlen(&ifp->if_snd, sc->tx_rings[0].num_tx_desc - 1);
1533         ifq_set_ready(&ifp->if_snd);
1534         ifq_set_subq_cnt(&ifp->if_snd, sc->tx_ring_cnt);
1535
1536         ifp->if_mapsubq = ifq_mapsubq_mask;
1537         ifq_set_subq_mask(&ifp->if_snd, 0);
1538
1539         ether_ifattach(ifp, sc->hw.mac.addr, NULL);
1540
1541         ifp->if_capabilities =
1542             IFCAP_HWCSUM | IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_TSO;
1543         if (IGB_ENABLE_HWRSS(sc))
1544                 ifp->if_capabilities |= IFCAP_RSS;
1545         ifp->if_capenable = ifp->if_capabilities;
1546         ifp->if_hwassist = IGB_CSUM_FEATURES | CSUM_TSO;
1547
1548         /*
1549          * Tell the upper layer(s) we support long frames
1550          */
1551         ifp->if_data.ifi_hdrlen = sizeof(struct ether_vlan_header);
1552
1553         /*
1554          * Specify the media types supported by this adapter and register
1555          * callbacks to update media and link information
1556          */
1557         ifmedia_init(&sc->media, IFM_IMASK, igb_media_change, igb_media_status);
1558         if (sc->hw.phy.media_type == e1000_media_type_fiber ||
1559             sc->hw.phy.media_type == e1000_media_type_internal_serdes) {
1560                 ifmedia_add(&sc->media, IFM_ETHER | IFM_1000_SX | IFM_FDX,
1561                     0, NULL);
1562                 ifmedia_add(&sc->media, IFM_ETHER | IFM_1000_SX, 0, NULL);
1563         } else {
1564                 ifmedia_add(&sc->media, IFM_ETHER | IFM_10_T, 0, NULL);
1565                 ifmedia_add(&sc->media, IFM_ETHER | IFM_10_T | IFM_FDX,
1566                     0, NULL);
1567                 ifmedia_add(&sc->media, IFM_ETHER | IFM_100_TX, 0, NULL);
1568                 ifmedia_add(&sc->media, IFM_ETHER | IFM_100_TX | IFM_FDX,
1569                     0, NULL);
1570                 if (sc->hw.phy.type != e1000_phy_ife) {
1571                         ifmedia_add(&sc->media,
1572                             IFM_ETHER | IFM_1000_T | IFM_FDX, 0, NULL);
1573                         ifmedia_add(&sc->media,
1574                             IFM_ETHER | IFM_1000_T, 0, NULL);
1575                 }
1576         }
1577         ifmedia_add(&sc->media, IFM_ETHER | IFM_AUTO, 0, NULL);
1578         ifmedia_set(&sc->media, IFM_ETHER | IFM_AUTO);
1579 }
1580
1581 static void
1582 igb_add_sysctl(struct igb_softc *sc)
1583 {
1584         char node[32];
1585         int i;
1586
1587         sysctl_ctx_init(&sc->sysctl_ctx);
1588         sc->sysctl_tree = SYSCTL_ADD_NODE(&sc->sysctl_ctx,
1589             SYSCTL_STATIC_CHILDREN(_hw), OID_AUTO,
1590             device_get_nameunit(sc->dev), CTLFLAG_RD, 0, "");
1591         if (sc->sysctl_tree == NULL) {
1592                 device_printf(sc->dev, "can't add sysctl node\n");
1593                 return;
1594         }
1595
1596         SYSCTL_ADD_INT(&sc->sysctl_ctx, SYSCTL_CHILDREN(sc->sysctl_tree),
1597             OID_AUTO, "rxr", CTLFLAG_RD, &sc->rx_ring_cnt, 0, "# of RX rings");
1598         SYSCTL_ADD_INT(&sc->sysctl_ctx, SYSCTL_CHILDREN(sc->sysctl_tree),
1599             OID_AUTO, "rxr_inuse", CTLFLAG_RD, &sc->rx_ring_inuse, 0,
1600             "# of RX rings used");
1601         SYSCTL_ADD_INT(&sc->sysctl_ctx, SYSCTL_CHILDREN(sc->sysctl_tree),
1602             OID_AUTO, "txr", CTLFLAG_RD, &sc->tx_ring_cnt, 0, "# of TX rings");
1603         SYSCTL_ADD_INT(&sc->sysctl_ctx, SYSCTL_CHILDREN(sc->sysctl_tree),
1604             OID_AUTO, "txr_inuse", CTLFLAG_RD, &sc->tx_ring_inuse, 0,
1605             "# of TX rings used");
1606         SYSCTL_ADD_INT(&sc->sysctl_ctx, SYSCTL_CHILDREN(sc->sysctl_tree),
1607             OID_AUTO, "rxd", CTLFLAG_RD, &sc->rx_rings[0].num_rx_desc, 0,
1608             "# of RX descs");
1609         SYSCTL_ADD_INT(&sc->sysctl_ctx, SYSCTL_CHILDREN(sc->sysctl_tree),
1610             OID_AUTO, "txd", CTLFLAG_RD, &sc->tx_rings[0].num_tx_desc, 0,
1611             "# of TX descs");
1612
1613         if (sc->intr_type != PCI_INTR_TYPE_MSIX) {
1614                 SYSCTL_ADD_PROC(&sc->sysctl_ctx,
1615                     SYSCTL_CHILDREN(sc->sysctl_tree),
1616                     OID_AUTO, "intr_rate", CTLTYPE_INT | CTLFLAG_RW,
1617                     sc, 0, igb_sysctl_intr_rate, "I", "interrupt rate");
1618         } else {
1619                 for (i = 0; i < sc->msix_cnt; ++i) {
1620                         struct igb_msix_data *msix = &sc->msix_data[i];
1621
1622                         ksnprintf(node, sizeof(node), "msix%d_rate", i);
1623                         SYSCTL_ADD_PROC(&sc->sysctl_ctx,
1624                             SYSCTL_CHILDREN(sc->sysctl_tree),
1625                             OID_AUTO, node, CTLTYPE_INT | CTLFLAG_RW,
1626                             msix, 0, igb_sysctl_msix_rate, "I",
1627                             msix->msix_rate_desc);
1628                 }
1629         }
1630
1631         SYSCTL_ADD_PROC(&sc->sysctl_ctx, SYSCTL_CHILDREN(sc->sysctl_tree),
1632             OID_AUTO, "tx_intr_nsegs", CTLTYPE_INT | CTLFLAG_RW,
1633             sc, 0, igb_sysctl_tx_intr_nsegs, "I",
1634             "# of segments per TX interrupt");
1635
1636         SYSCTL_ADD_PROC(&sc->sysctl_ctx, SYSCTL_CHILDREN(sc->sysctl_tree),
1637             OID_AUTO, "tx_wreg_nsegs", CTLTYPE_INT | CTLFLAG_RW,
1638             sc, 0, igb_sysctl_tx_wreg_nsegs, "I",
1639             "# of segments sent before write to hardware register");
1640
1641         SYSCTL_ADD_PROC(&sc->sysctl_ctx, SYSCTL_CHILDREN(sc->sysctl_tree),
1642             OID_AUTO, "rx_wreg_nsegs", CTLTYPE_INT | CTLFLAG_RW,
1643             sc, 0, igb_sysctl_rx_wreg_nsegs, "I",
1644             "# of segments received before write to hardware register");
1645
1646 #ifdef IFPOLL_ENABLE
1647         SYSCTL_ADD_PROC(&sc->sysctl_ctx, SYSCTL_CHILDREN(sc->sysctl_tree),
1648             OID_AUTO, "npoll_rxoff", CTLTYPE_INT|CTLFLAG_RW,
1649             sc, 0, igb_sysctl_npoll_rxoff, "I", "NPOLLING RX cpu offset");
1650         SYSCTL_ADD_PROC(&sc->sysctl_ctx, SYSCTL_CHILDREN(sc->sysctl_tree),
1651             OID_AUTO, "npoll_txoff", CTLTYPE_INT|CTLFLAG_RW,
1652             sc, 0, igb_sysctl_npoll_txoff, "I", "NPOLLING TX cpu offset");
1653 #endif
1654
1655 #ifdef IGB_RSS_DEBUG
1656         SYSCTL_ADD_INT(&sc->sysctl_ctx, SYSCTL_CHILDREN(sc->sysctl_tree),
1657             OID_AUTO, "rss_debug", CTLFLAG_RW, &sc->rss_debug, 0,
1658             "RSS debug level");
1659         for (i = 0; i < sc->rx_ring_cnt; ++i) {
1660                 ksnprintf(node, sizeof(node), "rx%d_pkt", i);
1661                 SYSCTL_ADD_ULONG(&sc->sysctl_ctx,
1662                     SYSCTL_CHILDREN(sc->sysctl_tree), OID_AUTO, node,
1663                     CTLFLAG_RW, &sc->rx_rings[i].rx_packets, "RXed packets");
1664         }
1665 #endif
1666 #ifdef IGB_TSS_DEBUG
1667         for  (i = 0; i < sc->tx_ring_cnt; ++i) {
1668                 ksnprintf(node, sizeof(node), "tx%d_pkt", i);
1669                 SYSCTL_ADD_ULONG(&sc->sysctl_ctx,
1670                     SYSCTL_CHILDREN(sc->sysctl_tree), OID_AUTO, node,
1671                     CTLFLAG_RW, &sc->tx_rings[i].tx_packets, "TXed packets");
1672         }
1673 #endif
1674 }
1675
1676 static int
1677 igb_alloc_rings(struct igb_softc *sc)
1678 {
1679         int error, i;
1680
1681         /*
1682          * Create top level busdma tag
1683          */
1684         error = bus_dma_tag_create(NULL, 1, 0,
1685             BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL,
1686             BUS_SPACE_MAXSIZE_32BIT, 0, BUS_SPACE_MAXSIZE_32BIT, 0,
1687             &sc->parent_tag);
1688         if (error) {
1689                 device_printf(sc->dev, "could not create top level DMA tag\n");
1690                 return error;
1691         }
1692
1693         /*
1694          * Allocate TX descriptor rings and buffers
1695          */
1696         sc->tx_rings = kmalloc_cachealign(
1697             sizeof(struct igb_tx_ring) * sc->tx_ring_cnt,
1698             M_DEVBUF, M_WAITOK | M_ZERO);
1699         for (i = 0; i < sc->tx_ring_cnt; ++i) {
1700                 struct igb_tx_ring *txr = &sc->tx_rings[i];
1701
1702                 /* Set up some basics */
1703                 txr->sc = sc;
1704                 txr->me = i;
1705                 lwkt_serialize_init(&txr->tx_serialize);
1706
1707                 error = igb_create_tx_ring(txr);
1708                 if (error)
1709                         return error;
1710         }
1711
1712         /*
1713          * Allocate RX descriptor rings and buffers
1714          */ 
1715         sc->rx_rings = kmalloc_cachealign(
1716             sizeof(struct igb_rx_ring) * sc->rx_ring_cnt,
1717             M_DEVBUF, M_WAITOK | M_ZERO);
1718         for (i = 0; i < sc->rx_ring_cnt; ++i) {
1719                 struct igb_rx_ring *rxr = &sc->rx_rings[i];
1720
1721                 /* Set up some basics */
1722                 rxr->sc = sc;
1723                 rxr->me = i;
1724                 lwkt_serialize_init(&rxr->rx_serialize);
1725
1726                 error = igb_create_rx_ring(rxr);
1727                 if (error)
1728                         return error;
1729         }
1730
1731         return 0;
1732 }
1733
1734 static void
1735 igb_free_rings(struct igb_softc *sc)
1736 {
1737         int i;
1738
1739         if (sc->tx_rings != NULL) {
1740                 for (i = 0; i < sc->tx_ring_cnt; ++i) {
1741                         struct igb_tx_ring *txr = &sc->tx_rings[i];
1742
1743                         igb_destroy_tx_ring(txr, txr->num_tx_desc);
1744                 }
1745                 kfree(sc->tx_rings, M_DEVBUF);
1746         }
1747
1748         if (sc->rx_rings != NULL) {
1749                 for (i = 0; i < sc->rx_ring_cnt; ++i) {
1750                         struct igb_rx_ring *rxr = &sc->rx_rings[i];
1751
1752                         igb_destroy_rx_ring(rxr, rxr->num_rx_desc);
1753                 }
1754                 kfree(sc->rx_rings, M_DEVBUF);
1755         }
1756 }
1757
1758 static int
1759 igb_create_tx_ring(struct igb_tx_ring *txr)
1760 {
1761         int tsize, error, i, ntxd;
1762
1763         /*
1764          * Validate number of transmit descriptors. It must not exceed
1765          * hardware maximum, and must be multiple of IGB_DBA_ALIGN.
1766          */
1767         ntxd = device_getenv_int(txr->sc->dev, "txd", igb_txd);
1768         if ((ntxd * sizeof(struct e1000_tx_desc)) % IGB_DBA_ALIGN != 0 ||
1769             ntxd > IGB_MAX_TXD || ntxd < IGB_MIN_TXD) {
1770                 device_printf(txr->sc->dev,
1771                     "Using %d TX descriptors instead of %d!\n",
1772                     IGB_DEFAULT_TXD, ntxd);
1773                 txr->num_tx_desc = IGB_DEFAULT_TXD;
1774         } else {
1775                 txr->num_tx_desc = ntxd;
1776         }
1777
1778         /*
1779          * Allocate TX descriptor ring
1780          */
1781         tsize = roundup2(txr->num_tx_desc * sizeof(union e1000_adv_tx_desc),
1782             IGB_DBA_ALIGN);
1783         txr->txdma.dma_vaddr = bus_dmamem_coherent_any(txr->sc->parent_tag,
1784             IGB_DBA_ALIGN, tsize, BUS_DMA_WAITOK,
1785             &txr->txdma.dma_tag, &txr->txdma.dma_map, &txr->txdma.dma_paddr);
1786         if (txr->txdma.dma_vaddr == NULL) {
1787                 device_printf(txr->sc->dev,
1788                     "Unable to allocate TX Descriptor memory\n");
1789                 return ENOMEM;
1790         }
1791         txr->tx_base = txr->txdma.dma_vaddr;
1792         bzero(txr->tx_base, tsize);
1793
1794         tsize = __VM_CACHELINE_ALIGN(
1795             sizeof(struct igb_tx_buf) * txr->num_tx_desc);
1796         txr->tx_buf = kmalloc_cachealign(tsize, M_DEVBUF, M_WAITOK | M_ZERO);
1797
1798         /*
1799          * Allocate TX head write-back buffer
1800          */
1801         txr->tx_hdr = bus_dmamem_coherent_any(txr->sc->parent_tag,
1802             __VM_CACHELINE_SIZE, __VM_CACHELINE_SIZE, BUS_DMA_WAITOK,
1803             &txr->tx_hdr_dtag, &txr->tx_hdr_dmap, &txr->tx_hdr_paddr);
1804         if (txr->tx_hdr == NULL) {
1805                 device_printf(txr->sc->dev,
1806                     "Unable to allocate TX head write-back buffer\n");
1807                 return ENOMEM;
1808         }
1809
1810         /*
1811          * Create DMA tag for TX buffers
1812          */
1813         error = bus_dma_tag_create(txr->sc->parent_tag,
1814             1, 0,               /* alignment, bounds */
1815             BUS_SPACE_MAXADDR,  /* lowaddr */
1816             BUS_SPACE_MAXADDR,  /* highaddr */
1817             NULL, NULL,         /* filter, filterarg */
1818             IGB_TSO_SIZE,       /* maxsize */
1819             IGB_MAX_SCATTER,    /* nsegments */
1820             PAGE_SIZE,          /* maxsegsize */
1821             BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW |
1822             BUS_DMA_ONEBPAGE,   /* flags */
1823             &txr->tx_tag);
1824         if (error) {
1825                 device_printf(txr->sc->dev, "Unable to allocate TX DMA tag\n");
1826                 kfree(txr->tx_buf, M_DEVBUF);
1827                 txr->tx_buf = NULL;
1828                 return error;
1829         }
1830
1831         /*
1832          * Create DMA maps for TX buffers
1833          */
1834         for (i = 0; i < txr->num_tx_desc; ++i) {
1835                 struct igb_tx_buf *txbuf = &txr->tx_buf[i];
1836
1837                 error = bus_dmamap_create(txr->tx_tag,
1838                     BUS_DMA_WAITOK | BUS_DMA_ONEBPAGE, &txbuf->map);
1839                 if (error) {
1840                         device_printf(txr->sc->dev,
1841                             "Unable to create TX DMA map\n");
1842                         igb_destroy_tx_ring(txr, i);
1843                         return error;
1844                 }
1845         }
1846
1847         if (txr->sc->hw.mac.type == e1000_82575)
1848                 txr->tx_flags |= IGB_TXFLAG_TSO_IPLEN0;
1849
1850         /*
1851          * Initialize various watermark
1852          */
1853         txr->spare_desc = IGB_TX_SPARE;
1854         txr->intr_nsegs = txr->num_tx_desc / 16;
1855         txr->wreg_nsegs = IGB_DEF_TXWREG_NSEGS;
1856         txr->oact_hi_desc = txr->num_tx_desc / 2;
1857         txr->oact_lo_desc = txr->num_tx_desc / 8;
1858         if (txr->oact_lo_desc > IGB_TX_OACTIVE_MAX)
1859                 txr->oact_lo_desc = IGB_TX_OACTIVE_MAX;
1860         if (txr->oact_lo_desc < txr->spare_desc + IGB_TX_RESERVED)
1861                 txr->oact_lo_desc = txr->spare_desc + IGB_TX_RESERVED;
1862
1863         return 0;
1864 }
1865
1866 static void
1867 igb_free_tx_ring(struct igb_tx_ring *txr)
1868 {
1869         int i;
1870
1871         for (i = 0; i < txr->num_tx_desc; ++i) {
1872                 struct igb_tx_buf *txbuf = &txr->tx_buf[i];
1873
1874                 if (txbuf->m_head != NULL) {
1875                         bus_dmamap_unload(txr->tx_tag, txbuf->map);
1876                         m_freem(txbuf->m_head);
1877                         txbuf->m_head = NULL;
1878                 }
1879         }
1880 }
1881
1882 static void
1883 igb_destroy_tx_ring(struct igb_tx_ring *txr, int ndesc)
1884 {
1885         int i;
1886
1887         if (txr->txdma.dma_vaddr != NULL) {
1888                 bus_dmamap_unload(txr->txdma.dma_tag, txr->txdma.dma_map);
1889                 bus_dmamem_free(txr->txdma.dma_tag, txr->txdma.dma_vaddr,
1890                     txr->txdma.dma_map);
1891                 bus_dma_tag_destroy(txr->txdma.dma_tag);
1892                 txr->txdma.dma_vaddr = NULL;
1893         }
1894
1895         if (txr->tx_hdr != NULL) {
1896                 bus_dmamap_unload(txr->tx_hdr_dtag, txr->tx_hdr_dmap);
1897                 bus_dmamem_free(txr->tx_hdr_dtag, txr->tx_hdr,
1898                     txr->tx_hdr_dmap);
1899                 bus_dma_tag_destroy(txr->tx_hdr_dtag);
1900                 txr->tx_hdr = NULL;
1901         }
1902
1903         if (txr->tx_buf == NULL)
1904                 return;
1905
1906         for (i = 0; i < ndesc; ++i) {
1907                 struct igb_tx_buf *txbuf = &txr->tx_buf[i];
1908
1909                 KKASSERT(txbuf->m_head == NULL);
1910                 bus_dmamap_destroy(txr->tx_tag, txbuf->map);
1911         }
1912         bus_dma_tag_destroy(txr->tx_tag);
1913
1914         kfree(txr->tx_buf, M_DEVBUF);
1915         txr->tx_buf = NULL;
1916 }
1917
1918 static void
1919 igb_init_tx_ring(struct igb_tx_ring *txr)
1920 {
1921         /* Clear the old descriptor contents */
1922         bzero(txr->tx_base,
1923             sizeof(union e1000_adv_tx_desc) * txr->num_tx_desc);
1924
1925         /* Clear TX head write-back buffer */
1926         *(txr->tx_hdr) = 0;
1927
1928         /* Reset indices */
1929         txr->next_avail_desc = 0;
1930         txr->next_to_clean = 0;
1931         txr->tx_nsegs = 0;
1932
1933         /* Set number of descriptors available */
1934         txr->tx_avail = txr->num_tx_desc;
1935
1936         /* Enable this TX ring */
1937         txr->tx_flags |= IGB_TXFLAG_ENABLED;
1938 }
1939
1940 static void
1941 igb_init_tx_unit(struct igb_softc *sc)
1942 {
1943         struct e1000_hw *hw = &sc->hw;
1944         uint32_t tctl;
1945         int i;
1946
1947         /* Setup the Tx Descriptor Rings */
1948         for (i = 0; i < sc->tx_ring_inuse; ++i) {
1949                 struct igb_tx_ring *txr = &sc->tx_rings[i];
1950                 uint64_t bus_addr = txr->txdma.dma_paddr;
1951                 uint64_t hdr_paddr = txr->tx_hdr_paddr;
1952                 uint32_t txdctl = 0;
1953                 uint32_t dca_txctrl;
1954
1955                 E1000_WRITE_REG(hw, E1000_TDLEN(i),
1956                     txr->num_tx_desc * sizeof(struct e1000_tx_desc));
1957                 E1000_WRITE_REG(hw, E1000_TDBAH(i),
1958                     (uint32_t)(bus_addr >> 32));
1959                 E1000_WRITE_REG(hw, E1000_TDBAL(i),
1960                     (uint32_t)bus_addr);
1961
1962                 /* Setup the HW Tx Head and Tail descriptor pointers */
1963                 E1000_WRITE_REG(hw, E1000_TDT(i), 0);
1964                 E1000_WRITE_REG(hw, E1000_TDH(i), 0);
1965
1966                 dca_txctrl = E1000_READ_REG(hw, E1000_DCA_TXCTRL(i));
1967                 dca_txctrl &= ~E1000_DCA_TXCTRL_TX_WB_RO_EN;
1968                 E1000_WRITE_REG(hw, E1000_DCA_TXCTRL(i), dca_txctrl);
1969
1970                 /*
1971                  * Don't set WB_on_EITR:
1972                  * - 82575 does not have it
1973                  * - It almost has no effect on 82576, see:
1974                  *   82576 specification update errata #26
1975                  * - It causes unnecessary bus traffic
1976                  */
1977                 E1000_WRITE_REG(hw, E1000_TDWBAH(i),
1978                     (uint32_t)(hdr_paddr >> 32));
1979                 E1000_WRITE_REG(hw, E1000_TDWBAL(i),
1980                     ((uint32_t)hdr_paddr) | E1000_TX_HEAD_WB_ENABLE);
1981
1982                 /*
1983                  * WTHRESH is ignored by the hardware, since header
1984                  * write back mode is used.
1985                  */
1986                 txdctl |= IGB_TX_PTHRESH;
1987                 txdctl |= IGB_TX_HTHRESH << 8;
1988                 txdctl |= IGB_TX_WTHRESH << 16;
1989                 txdctl |= E1000_TXDCTL_QUEUE_ENABLE;
1990                 E1000_WRITE_REG(hw, E1000_TXDCTL(i), txdctl);
1991         }
1992
1993         if (sc->vf_ifp)
1994                 return;
1995
1996         e1000_config_collision_dist(hw);
1997
1998         /* Program the Transmit Control Register */
1999         tctl = E1000_READ_REG(hw, E1000_TCTL);
2000         tctl &= ~E1000_TCTL_CT;
2001         tctl |= (E1000_TCTL_PSP | E1000_TCTL_RTLC | E1000_TCTL_EN |
2002             (E1000_COLLISION_THRESHOLD << E1000_CT_SHIFT));
2003
2004         /* This write will effectively turn on the transmit unit. */
2005         E1000_WRITE_REG(hw, E1000_TCTL, tctl);
2006 }
2007
2008 static boolean_t
2009 igb_txcsum_ctx(struct igb_tx_ring *txr, struct mbuf *mp)
2010 {
2011         struct e1000_adv_tx_context_desc *TXD;
2012         uint32_t vlan_macip_lens, type_tucmd_mlhl, mss_l4len_idx;
2013         int ehdrlen, ctxd, ip_hlen = 0;
2014         boolean_t offload = TRUE;
2015
2016         if ((mp->m_pkthdr.csum_flags & IGB_CSUM_FEATURES) == 0)
2017                 offload = FALSE;
2018
2019         vlan_macip_lens = type_tucmd_mlhl = mss_l4len_idx = 0;
2020
2021         ctxd = txr->next_avail_desc;
2022         TXD = (struct e1000_adv_tx_context_desc *)&txr->tx_base[ctxd];
2023
2024         /*
2025          * In advanced descriptors the vlan tag must 
2026          * be placed into the context descriptor, thus
2027          * we need to be here just for that setup.
2028          */
2029         if (mp->m_flags & M_VLANTAG) {
2030                 uint16_t vlantag;
2031
2032                 vlantag = htole16(mp->m_pkthdr.ether_vlantag);
2033                 vlan_macip_lens |= (vlantag << E1000_ADVTXD_VLAN_SHIFT);
2034         } else if (!offload) {
2035                 return FALSE;
2036         }
2037
2038         ehdrlen = mp->m_pkthdr.csum_lhlen;
2039         KASSERT(ehdrlen > 0, ("invalid ether hlen"));
2040
2041         /* Set the ether header length */
2042         vlan_macip_lens |= ehdrlen << E1000_ADVTXD_MACLEN_SHIFT;
2043         if (mp->m_pkthdr.csum_flags & CSUM_IP) {
2044                 type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_IPV4;
2045                 ip_hlen = mp->m_pkthdr.csum_iphlen;
2046                 KASSERT(ip_hlen > 0, ("invalid ip hlen"));
2047         }
2048         vlan_macip_lens |= ip_hlen;
2049
2050         type_tucmd_mlhl |= E1000_ADVTXD_DCMD_DEXT | E1000_ADVTXD_DTYP_CTXT;
2051         if (mp->m_pkthdr.csum_flags & CSUM_TCP)
2052                 type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_TCP;
2053         else if (mp->m_pkthdr.csum_flags & CSUM_UDP)
2054                 type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_UDP;
2055
2056         /* 82575 needs the queue index added */
2057         if (txr->sc->hw.mac.type == e1000_82575)
2058                 mss_l4len_idx = txr->me << 4;
2059
2060         /* Now copy bits into descriptor */
2061         TXD->vlan_macip_lens = htole32(vlan_macip_lens);
2062         TXD->type_tucmd_mlhl = htole32(type_tucmd_mlhl);
2063         TXD->seqnum_seed = htole32(0);
2064         TXD->mss_l4len_idx = htole32(mss_l4len_idx);
2065
2066         /* We've consumed the first desc, adjust counters */
2067         if (++ctxd == txr->num_tx_desc)
2068                 ctxd = 0;
2069         txr->next_avail_desc = ctxd;
2070         --txr->tx_avail;
2071
2072         return offload;
2073 }
2074
2075 static void
2076 igb_txeof(struct igb_tx_ring *txr)
2077 {
2078         struct ifnet *ifp = &txr->sc->arpcom.ac_if;
2079         int first, hdr, avail;
2080
2081         if (txr->tx_avail == txr->num_tx_desc)
2082                 return;
2083
2084         first = txr->next_to_clean;
2085         hdr = *(txr->tx_hdr);
2086
2087         if (first == hdr)
2088                 return;
2089
2090         avail = txr->tx_avail;
2091         while (first != hdr) {
2092                 struct igb_tx_buf *txbuf = &txr->tx_buf[first];
2093
2094                 ++avail;
2095                 if (txbuf->m_head) {
2096                         bus_dmamap_unload(txr->tx_tag, txbuf->map);
2097                         m_freem(txbuf->m_head);
2098                         txbuf->m_head = NULL;
2099                         IFNET_STAT_INC(ifp, opackets, 1);
2100                 }
2101                 if (++first == txr->num_tx_desc)
2102                         first = 0;
2103         }
2104         txr->next_to_clean = first;
2105         txr->tx_avail = avail;
2106
2107         /*
2108          * If we have a minimum free, clear OACTIVE
2109          * to tell the stack that it is OK to send packets.
2110          */
2111         if (IGB_IS_NOT_OACTIVE(txr)) {
2112                 ifsq_clr_oactive(txr->ifsq);
2113
2114                 /*
2115                  * We have enough TX descriptors, turn off
2116                  * the watchdog.  We allow small amount of
2117                  * packets (roughly intr_nsegs) pending on
2118                  * the transmit ring.
2119                  */
2120                 txr->tx_watchdog.wd_timer = 0;
2121         }
2122 }
2123
2124 static int
2125 igb_create_rx_ring(struct igb_rx_ring *rxr)
2126 {
2127         int rsize, i, error, nrxd;
2128
2129         /*
2130          * Validate number of receive descriptors. It must not exceed
2131          * hardware maximum, and must be multiple of IGB_DBA_ALIGN.
2132          */
2133         nrxd = device_getenv_int(rxr->sc->dev, "rxd", igb_rxd);
2134         if ((nrxd * sizeof(struct e1000_rx_desc)) % IGB_DBA_ALIGN != 0 ||
2135             nrxd > IGB_MAX_RXD || nrxd < IGB_MIN_RXD) {
2136                 device_printf(rxr->sc->dev,
2137                     "Using %d RX descriptors instead of %d!\n",
2138                     IGB_DEFAULT_RXD, nrxd);
2139                 rxr->num_rx_desc = IGB_DEFAULT_RXD;
2140         } else {
2141                 rxr->num_rx_desc = nrxd;
2142         }
2143
2144         /*
2145          * Allocate RX descriptor ring
2146          */
2147         rsize = roundup2(rxr->num_rx_desc * sizeof(union e1000_adv_rx_desc),
2148             IGB_DBA_ALIGN);
2149         rxr->rxdma.dma_vaddr = bus_dmamem_coherent_any(rxr->sc->parent_tag,
2150             IGB_DBA_ALIGN, rsize, BUS_DMA_WAITOK,
2151             &rxr->rxdma.dma_tag, &rxr->rxdma.dma_map,
2152             &rxr->rxdma.dma_paddr);
2153         if (rxr->rxdma.dma_vaddr == NULL) {
2154                 device_printf(rxr->sc->dev,
2155                     "Unable to allocate RxDescriptor memory\n");
2156                 return ENOMEM;
2157         }
2158         rxr->rx_base = rxr->rxdma.dma_vaddr;
2159         bzero(rxr->rx_base, rsize);
2160
2161         rsize = __VM_CACHELINE_ALIGN(
2162             sizeof(struct igb_rx_buf) * rxr->num_rx_desc);
2163         rxr->rx_buf = kmalloc_cachealign(rsize, M_DEVBUF, M_WAITOK | M_ZERO);
2164
2165         /*
2166          * Create DMA tag for RX buffers
2167          */
2168         error = bus_dma_tag_create(rxr->sc->parent_tag,
2169             1, 0,               /* alignment, bounds */
2170             BUS_SPACE_MAXADDR,  /* lowaddr */
2171             BUS_SPACE_MAXADDR,  /* highaddr */
2172             NULL, NULL,         /* filter, filterarg */
2173             MCLBYTES,           /* maxsize */
2174             1,                  /* nsegments */
2175             MCLBYTES,           /* maxsegsize */
2176             BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW, /* flags */
2177             &rxr->rx_tag);
2178         if (error) {
2179                 device_printf(rxr->sc->dev,
2180                     "Unable to create RX payload DMA tag\n");
2181                 kfree(rxr->rx_buf, M_DEVBUF);
2182                 rxr->rx_buf = NULL;
2183                 return error;
2184         }
2185
2186         /*
2187          * Create spare DMA map for RX buffers
2188          */
2189         error = bus_dmamap_create(rxr->rx_tag, BUS_DMA_WAITOK,
2190             &rxr->rx_sparemap);
2191         if (error) {
2192                 device_printf(rxr->sc->dev,
2193                     "Unable to create spare RX DMA maps\n");
2194                 bus_dma_tag_destroy(rxr->rx_tag);
2195                 kfree(rxr->rx_buf, M_DEVBUF);
2196                 rxr->rx_buf = NULL;
2197                 return error;
2198         }
2199
2200         /*
2201          * Create DMA maps for RX buffers
2202          */
2203         for (i = 0; i < rxr->num_rx_desc; i++) {
2204                 struct igb_rx_buf *rxbuf = &rxr->rx_buf[i];
2205
2206                 error = bus_dmamap_create(rxr->rx_tag,
2207                     BUS_DMA_WAITOK, &rxbuf->map);
2208                 if (error) {
2209                         device_printf(rxr->sc->dev,
2210                             "Unable to create RX DMA maps\n");
2211                         igb_destroy_rx_ring(rxr, i);
2212                         return error;
2213                 }
2214         }
2215
2216         /*
2217          * Initialize various watermark
2218          */
2219         rxr->wreg_nsegs = IGB_DEF_RXWREG_NSEGS;
2220
2221         return 0;
2222 }
2223
2224 static void
2225 igb_free_rx_ring(struct igb_rx_ring *rxr)
2226 {
2227         int i;
2228
2229         for (i = 0; i < rxr->num_rx_desc; ++i) {
2230                 struct igb_rx_buf *rxbuf = &rxr->rx_buf[i];
2231
2232                 if (rxbuf->m_head != NULL) {
2233                         bus_dmamap_unload(rxr->rx_tag, rxbuf->map);
2234                         m_freem(rxbuf->m_head);
2235                         rxbuf->m_head = NULL;
2236                 }
2237         }
2238
2239         if (rxr->fmp != NULL)
2240                 m_freem(rxr->fmp);
2241         rxr->fmp = NULL;
2242         rxr->lmp = NULL;
2243 }
2244
2245 static void
2246 igb_destroy_rx_ring(struct igb_rx_ring *rxr, int ndesc)
2247 {
2248         int i;
2249
2250         if (rxr->rxdma.dma_vaddr != NULL) {
2251                 bus_dmamap_unload(rxr->rxdma.dma_tag, rxr->rxdma.dma_map);
2252                 bus_dmamem_free(rxr->rxdma.dma_tag, rxr->rxdma.dma_vaddr,
2253                     rxr->rxdma.dma_map);
2254                 bus_dma_tag_destroy(rxr->rxdma.dma_tag);
2255                 rxr->rxdma.dma_vaddr = NULL;
2256         }
2257
2258         if (rxr->rx_buf == NULL)
2259                 return;
2260
2261         for (i = 0; i < ndesc; ++i) {
2262                 struct igb_rx_buf *rxbuf = &rxr->rx_buf[i];
2263
2264                 KKASSERT(rxbuf->m_head == NULL);
2265                 bus_dmamap_destroy(rxr->rx_tag, rxbuf->map);
2266         }
2267         bus_dmamap_destroy(rxr->rx_tag, rxr->rx_sparemap);
2268         bus_dma_tag_destroy(rxr->rx_tag);
2269
2270         kfree(rxr->rx_buf, M_DEVBUF);
2271         rxr->rx_buf = NULL;
2272 }
2273
2274 static void
2275 igb_setup_rxdesc(union e1000_adv_rx_desc *rxd, const struct igb_rx_buf *rxbuf)
2276 {
2277         rxd->read.pkt_addr = htole64(rxbuf->paddr);
2278         rxd->wb.upper.status_error = 0;
2279 }
2280
2281 static int
2282 igb_newbuf(struct igb_rx_ring *rxr, int i, boolean_t wait)
2283 {
2284         struct mbuf *m;
2285         bus_dma_segment_t seg;
2286         bus_dmamap_t map;
2287         struct igb_rx_buf *rxbuf;
2288         int error, nseg;
2289
2290         m = m_getcl(wait ? MB_WAIT : MB_DONTWAIT, MT_DATA, M_PKTHDR);
2291         if (m == NULL) {
2292                 if (wait) {
2293                         if_printf(&rxr->sc->arpcom.ac_if,
2294                             "Unable to allocate RX mbuf\n");
2295                 }
2296                 return ENOBUFS;
2297         }
2298         m->m_len = m->m_pkthdr.len = MCLBYTES;
2299
2300         if (rxr->sc->max_frame_size <= MCLBYTES - ETHER_ALIGN)
2301                 m_adj(m, ETHER_ALIGN);
2302
2303         error = bus_dmamap_load_mbuf_segment(rxr->rx_tag,
2304             rxr->rx_sparemap, m, &seg, 1, &nseg, BUS_DMA_NOWAIT);
2305         if (error) {
2306                 m_freem(m);
2307                 if (wait) {
2308                         if_printf(&rxr->sc->arpcom.ac_if,
2309                             "Unable to load RX mbuf\n");
2310                 }
2311                 return error;
2312         }
2313
2314         rxbuf = &rxr->rx_buf[i];
2315         if (rxbuf->m_head != NULL)
2316                 bus_dmamap_unload(rxr->rx_tag, rxbuf->map);
2317
2318         map = rxbuf->map;
2319         rxbuf->map = rxr->rx_sparemap;
2320         rxr->rx_sparemap = map;
2321
2322         rxbuf->m_head = m;
2323         rxbuf->paddr = seg.ds_addr;
2324
2325         igb_setup_rxdesc(&rxr->rx_base[i], rxbuf);
2326         return 0;
2327 }
2328
2329 static int
2330 igb_init_rx_ring(struct igb_rx_ring *rxr)
2331 {
2332         int i;
2333
2334         /* Clear the ring contents */
2335         bzero(rxr->rx_base,
2336             rxr->num_rx_desc * sizeof(union e1000_adv_rx_desc));
2337
2338         /* Now replenish the ring mbufs */
2339         for (i = 0; i < rxr->num_rx_desc; ++i) {
2340                 int error;
2341
2342                 error = igb_newbuf(rxr, i, TRUE);
2343                 if (error)
2344                         return error;
2345         }
2346
2347         /* Setup our descriptor indices */
2348         rxr->next_to_check = 0;
2349
2350         rxr->fmp = NULL;
2351         rxr->lmp = NULL;
2352         rxr->discard = FALSE;
2353
2354         return 0;
2355 }
2356
2357 static void
2358 igb_init_rx_unit(struct igb_softc *sc)
2359 {
2360         struct ifnet *ifp = &sc->arpcom.ac_if;
2361         struct e1000_hw *hw = &sc->hw;
2362         uint32_t rctl, rxcsum, srrctl = 0;
2363         int i;
2364
2365         /*
2366          * Make sure receives are disabled while setting
2367          * up the descriptor ring
2368          */
2369         rctl = E1000_READ_REG(hw, E1000_RCTL);
2370         E1000_WRITE_REG(hw, E1000_RCTL, rctl & ~E1000_RCTL_EN);
2371
2372 #if 0
2373         /*
2374         ** Set up for header split
2375         */
2376         if (igb_header_split) {
2377                 /* Use a standard mbuf for the header */
2378                 srrctl |= IGB_HDR_BUF << E1000_SRRCTL_BSIZEHDRSIZE_SHIFT;
2379                 srrctl |= E1000_SRRCTL_DESCTYPE_HDR_SPLIT_ALWAYS;
2380         } else
2381 #endif
2382                 srrctl |= E1000_SRRCTL_DESCTYPE_ADV_ONEBUF;
2383
2384         /*
2385         ** Set up for jumbo frames
2386         */
2387         if (ifp->if_mtu > ETHERMTU) {
2388                 rctl |= E1000_RCTL_LPE;
2389 #if 0
2390                 if (adapter->rx_mbuf_sz == MJUMPAGESIZE) {
2391                         srrctl |= 4096 >> E1000_SRRCTL_BSIZEPKT_SHIFT;
2392                         rctl |= E1000_RCTL_SZ_4096 | E1000_RCTL_BSEX;
2393                 } else if (adapter->rx_mbuf_sz > MJUMPAGESIZE) {
2394                         srrctl |= 8192 >> E1000_SRRCTL_BSIZEPKT_SHIFT;
2395                         rctl |= E1000_RCTL_SZ_8192 | E1000_RCTL_BSEX;
2396                 }
2397                 /* Set maximum packet len */
2398                 psize = adapter->max_frame_size;
2399                 /* are we on a vlan? */
2400                 if (adapter->ifp->if_vlantrunk != NULL)
2401                         psize += VLAN_TAG_SIZE;
2402                 E1000_WRITE_REG(&adapter->hw, E1000_RLPML, psize);
2403 #else
2404                 srrctl |= 2048 >> E1000_SRRCTL_BSIZEPKT_SHIFT;
2405                 rctl |= E1000_RCTL_SZ_2048;
2406 #endif
2407         } else {
2408                 rctl &= ~E1000_RCTL_LPE;
2409                 srrctl |= 2048 >> E1000_SRRCTL_BSIZEPKT_SHIFT;
2410                 rctl |= E1000_RCTL_SZ_2048;
2411         }
2412
2413         /* Setup the Base and Length of the Rx Descriptor Rings */
2414         for (i = 0; i < sc->rx_ring_inuse; ++i) {
2415                 struct igb_rx_ring *rxr = &sc->rx_rings[i];
2416                 uint64_t bus_addr = rxr->rxdma.dma_paddr;
2417                 uint32_t rxdctl;
2418
2419                 E1000_WRITE_REG(hw, E1000_RDLEN(i),
2420                     rxr->num_rx_desc * sizeof(struct e1000_rx_desc));
2421                 E1000_WRITE_REG(hw, E1000_RDBAH(i),
2422                     (uint32_t)(bus_addr >> 32));
2423                 E1000_WRITE_REG(hw, E1000_RDBAL(i),
2424                     (uint32_t)bus_addr);
2425                 E1000_WRITE_REG(hw, E1000_SRRCTL(i), srrctl);
2426                 /* Enable this Queue */
2427                 rxdctl = E1000_READ_REG(hw, E1000_RXDCTL(i));
2428                 rxdctl |= E1000_RXDCTL_QUEUE_ENABLE;
2429                 rxdctl &= 0xFFF00000;
2430                 rxdctl |= IGB_RX_PTHRESH;
2431                 rxdctl |= IGB_RX_HTHRESH << 8;
2432                 /*
2433                  * Don't set WTHRESH to a value above 1 on 82576, see:
2434                  * 82576 specification update errata #26
2435                  */
2436                 rxdctl |= IGB_RX_WTHRESH << 16;
2437                 E1000_WRITE_REG(hw, E1000_RXDCTL(i), rxdctl);
2438         }
2439
2440         rxcsum = E1000_READ_REG(&sc->hw, E1000_RXCSUM);
2441         rxcsum &= ~(E1000_RXCSUM_PCSS_MASK | E1000_RXCSUM_IPPCSE);
2442
2443         /*
2444          * Receive Checksum Offload for TCP and UDP
2445          *
2446          * Checksum offloading is also enabled if multiple receive
2447          * queue is to be supported, since we need it to figure out
2448          * fragments.
2449          */
2450         if ((ifp->if_capenable & IFCAP_RXCSUM) || IGB_ENABLE_HWRSS(sc)) {
2451                 /*
2452                  * NOTE:
2453                  * PCSD must be enabled to enable multiple
2454                  * receive queues.
2455                  */
2456                 rxcsum |= E1000_RXCSUM_IPOFL | E1000_RXCSUM_TUOFL |
2457                     E1000_RXCSUM_PCSD;
2458         } else {
2459                 rxcsum &= ~(E1000_RXCSUM_IPOFL | E1000_RXCSUM_TUOFL |
2460                     E1000_RXCSUM_PCSD);
2461         }
2462         E1000_WRITE_REG(&sc->hw, E1000_RXCSUM, rxcsum);
2463
2464         if (IGB_ENABLE_HWRSS(sc)) {
2465                 uint8_t key[IGB_NRSSRK * IGB_RSSRK_SIZE];
2466                 uint32_t reta_shift;
2467                 int j, r;
2468
2469                 /*
2470                  * NOTE:
2471                  * When we reach here, RSS has already been disabled
2472                  * in igb_stop(), so we could safely configure RSS key
2473                  * and redirect table.
2474                  */
2475
2476                 /*
2477                  * Configure RSS key
2478                  */
2479                 toeplitz_get_key(key, sizeof(key));
2480                 for (i = 0; i < IGB_NRSSRK; ++i) {
2481                         uint32_t rssrk;
2482
2483                         rssrk = IGB_RSSRK_VAL(key, i);
2484                         IGB_RSS_DPRINTF(sc, 1, "rssrk%d 0x%08x\n", i, rssrk);
2485
2486                         E1000_WRITE_REG(hw, E1000_RSSRK(i), rssrk);
2487                 }
2488
2489                 /*
2490                  * Configure RSS redirect table in following fashion:
2491                  * (hash & ring_cnt_mask) == rdr_table[(hash & rdr_table_mask)]
2492                  */
2493                 reta_shift = IGB_RETA_SHIFT;
2494                 if (hw->mac.type == e1000_82575)
2495                         reta_shift = IGB_RETA_SHIFT_82575;
2496
2497                 r = 0;
2498                 for (j = 0; j < IGB_NRETA; ++j) {
2499                         uint32_t reta = 0;
2500
2501                         for (i = 0; i < IGB_RETA_SIZE; ++i) {
2502                                 uint32_t q;
2503
2504                                 q = (r % sc->rx_ring_inuse) << reta_shift;
2505                                 reta |= q << (8 * i);
2506                                 ++r;
2507                         }
2508                         IGB_RSS_DPRINTF(sc, 1, "reta 0x%08x\n", reta);
2509                         E1000_WRITE_REG(hw, E1000_RETA(j), reta);
2510                 }
2511
2512                 /*
2513                  * Enable multiple receive queues.
2514                  * Enable IPv4 RSS standard hash functions.
2515                  * Disable RSS interrupt on 82575
2516                  */
2517                 E1000_WRITE_REG(&sc->hw, E1000_MRQC,
2518                                 E1000_MRQC_ENABLE_RSS_4Q |
2519                                 E1000_MRQC_RSS_FIELD_IPV4_TCP |
2520                                 E1000_MRQC_RSS_FIELD_IPV4);
2521         }
2522
2523         /* Setup the Receive Control Register */
2524         rctl &= ~(3 << E1000_RCTL_MO_SHIFT);
2525         rctl |= E1000_RCTL_EN | E1000_RCTL_BAM | E1000_RCTL_LBM_NO |
2526             E1000_RCTL_RDMTS_HALF |
2527             (hw->mac.mc_filter_type << E1000_RCTL_MO_SHIFT);
2528         /* Strip CRC bytes. */
2529         rctl |= E1000_RCTL_SECRC;
2530         /* Make sure VLAN Filters are off */
2531         rctl &= ~E1000_RCTL_VFE;
2532         /* Don't store bad packets */
2533         rctl &= ~E1000_RCTL_SBP;
2534
2535         /* Enable Receives */
2536         E1000_WRITE_REG(hw, E1000_RCTL, rctl);
2537
2538         /*
2539          * Setup the HW Rx Head and Tail Descriptor Pointers
2540          *   - needs to be after enable
2541          */
2542         for (i = 0; i < sc->rx_ring_inuse; ++i) {
2543                 struct igb_rx_ring *rxr = &sc->rx_rings[i];
2544
2545                 E1000_WRITE_REG(hw, E1000_RDH(i), rxr->next_to_check);
2546                 E1000_WRITE_REG(hw, E1000_RDT(i), rxr->num_rx_desc - 1);
2547         }
2548 }
2549
2550 static void
2551 igb_rx_refresh(struct igb_rx_ring *rxr, int i)
2552 {
2553         if (--i < 0)
2554                 i = rxr->num_rx_desc - 1;
2555         E1000_WRITE_REG(&rxr->sc->hw, E1000_RDT(rxr->me), i);
2556 }
2557
2558 static void
2559 igb_rxeof(struct igb_rx_ring *rxr, int count)
2560 {
2561         struct ifnet *ifp = &rxr->sc->arpcom.ac_if;
2562         union e1000_adv_rx_desc *cur;
2563         uint32_t staterr;
2564         int i, ncoll = 0;
2565
2566         i = rxr->next_to_check;
2567         cur = &rxr->rx_base[i];
2568         staterr = le32toh(cur->wb.upper.status_error);
2569
2570         if ((staterr & E1000_RXD_STAT_DD) == 0)
2571                 return;
2572
2573         while ((staterr & E1000_RXD_STAT_DD) && count != 0) {
2574                 struct pktinfo *pi = NULL, pi0;
2575                 struct igb_rx_buf *rxbuf = &rxr->rx_buf[i];
2576                 struct mbuf *m = NULL;
2577                 boolean_t eop;
2578
2579                 eop = (staterr & E1000_RXD_STAT_EOP) ? TRUE : FALSE;
2580                 if (eop)
2581                         --count;
2582
2583                 ++ncoll;
2584                 if ((staterr & E1000_RXDEXT_ERR_FRAME_ERR_MASK) == 0 &&
2585                     !rxr->discard) {
2586                         struct mbuf *mp = rxbuf->m_head;
2587                         uint32_t hash, hashtype;
2588                         uint16_t vlan;
2589                         int len;
2590
2591                         len = le16toh(cur->wb.upper.length);
2592                         if (rxr->sc->hw.mac.type == e1000_i350 &&
2593                             (staterr & E1000_RXDEXT_STATERR_LB))
2594                                 vlan = be16toh(cur->wb.upper.vlan);
2595                         else
2596                                 vlan = le16toh(cur->wb.upper.vlan);
2597
2598                         hash = le32toh(cur->wb.lower.hi_dword.rss);
2599                         hashtype = le32toh(cur->wb.lower.lo_dword.data) &
2600                             E1000_RXDADV_RSSTYPE_MASK;
2601
2602                         IGB_RSS_DPRINTF(rxr->sc, 10,
2603                             "ring%d, hash 0x%08x, hashtype %u\n",
2604                             rxr->me, hash, hashtype);
2605
2606                         bus_dmamap_sync(rxr->rx_tag, rxbuf->map,
2607                             BUS_DMASYNC_POSTREAD);
2608
2609                         if (igb_newbuf(rxr, i, FALSE) != 0) {
2610                                 IFNET_STAT_INC(ifp, iqdrops, 1);
2611                                 goto discard;
2612                         }
2613
2614                         mp->m_len = len;
2615                         if (rxr->fmp == NULL) {
2616                                 mp->m_pkthdr.len = len;
2617                                 rxr->fmp = mp;
2618                                 rxr->lmp = mp;
2619                         } else {
2620                                 rxr->lmp->m_next = mp;
2621                                 rxr->lmp = rxr->lmp->m_next;
2622                                 rxr->fmp->m_pkthdr.len += len;
2623                         }
2624
2625                         if (eop) {
2626                                 m = rxr->fmp;
2627                                 rxr->fmp = NULL;
2628                                 rxr->lmp = NULL;
2629
2630                                 m->m_pkthdr.rcvif = ifp;
2631                                 IFNET_STAT_INC(ifp, ipackets, 1);
2632
2633                                 if (ifp->if_capenable & IFCAP_RXCSUM)
2634                                         igb_rxcsum(staterr, m);
2635
2636                                 if (staterr & E1000_RXD_STAT_VP) {
2637                                         m->m_pkthdr.ether_vlantag = vlan;
2638                                         m->m_flags |= M_VLANTAG;
2639                                 }
2640
2641                                 if (ifp->if_capenable & IFCAP_RSS) {
2642                                         pi = igb_rssinfo(m, &pi0,
2643                                             hash, hashtype, staterr);
2644                                 }
2645 #ifdef IGB_RSS_DEBUG
2646                                 rxr->rx_packets++;
2647 #endif
2648                         }
2649                 } else {
2650                         IFNET_STAT_INC(ifp, ierrors, 1);
2651 discard:
2652                         igb_setup_rxdesc(cur, rxbuf);
2653                         if (!eop)
2654                                 rxr->discard = TRUE;
2655                         else
2656                                 rxr->discard = FALSE;
2657                         if (rxr->fmp != NULL) {
2658                                 m_freem(rxr->fmp);
2659                                 rxr->fmp = NULL;
2660                                 rxr->lmp = NULL;
2661                         }
2662                         m = NULL;
2663                 }
2664
2665                 if (m != NULL)
2666                         ether_input_pkt(ifp, m, pi);
2667
2668                 /* Advance our pointers to the next descriptor. */
2669                 if (++i == rxr->num_rx_desc)
2670                         i = 0;
2671
2672                 if (ncoll >= rxr->wreg_nsegs) {
2673                         igb_rx_refresh(rxr, i);
2674                         ncoll = 0;
2675                 }
2676
2677                 cur = &rxr->rx_base[i];
2678                 staterr = le32toh(cur->wb.upper.status_error);
2679         }
2680         rxr->next_to_check = i;
2681
2682         if (ncoll > 0)
2683                 igb_rx_refresh(rxr, i);
2684 }
2685
2686
2687 static void
2688 igb_set_vlan(struct igb_softc *sc)
2689 {
2690         struct e1000_hw *hw = &sc->hw;
2691         uint32_t reg;
2692 #if 0
2693         struct ifnet *ifp = sc->arpcom.ac_if;
2694 #endif
2695
2696         if (sc->vf_ifp) {
2697                 e1000_rlpml_set_vf(hw, sc->max_frame_size + VLAN_TAG_SIZE);
2698                 return;
2699         }
2700
2701         reg = E1000_READ_REG(hw, E1000_CTRL);
2702         reg |= E1000_CTRL_VME;
2703         E1000_WRITE_REG(hw, E1000_CTRL, reg);
2704
2705 #if 0
2706         /* Enable the Filter Table */
2707         if (ifp->if_capenable & IFCAP_VLAN_HWFILTER) {
2708                 reg = E1000_READ_REG(hw, E1000_RCTL);
2709                 reg &= ~E1000_RCTL_CFIEN;
2710                 reg |= E1000_RCTL_VFE;
2711                 E1000_WRITE_REG(hw, E1000_RCTL, reg);
2712         }
2713 #endif
2714
2715         /* Update the frame size */
2716         E1000_WRITE_REG(&sc->hw, E1000_RLPML,
2717             sc->max_frame_size + VLAN_TAG_SIZE);
2718
2719 #if 0
2720         /* Don't bother with table if no vlans */
2721         if ((adapter->num_vlans == 0) ||
2722             ((ifp->if_capenable & IFCAP_VLAN_HWFILTER) == 0))
2723                 return;
2724         /*
2725         ** A soft reset zero's out the VFTA, so
2726         ** we need to repopulate it now.
2727         */
2728         for (int i = 0; i < IGB_VFTA_SIZE; i++)
2729                 if (adapter->shadow_vfta[i] != 0) {
2730                         if (adapter->vf_ifp)
2731                                 e1000_vfta_set_vf(hw,
2732                                     adapter->shadow_vfta[i], TRUE);
2733                         else
2734                                 E1000_WRITE_REG_ARRAY(hw, E1000_VFTA,
2735                                  i, adapter->shadow_vfta[i]);
2736                 }
2737 #endif
2738 }
2739
2740 static void
2741 igb_enable_intr(struct igb_softc *sc)
2742 {
2743         if (sc->intr_type != PCI_INTR_TYPE_MSIX) {
2744                 lwkt_serialize_handler_enable(&sc->main_serialize);
2745         } else {
2746                 int i;
2747
2748                 for (i = 0; i < sc->msix_cnt; ++i) {
2749                         lwkt_serialize_handler_enable(
2750                             sc->msix_data[i].msix_serialize);
2751                 }
2752         }
2753
2754         if ((sc->flags & IGB_FLAG_SHARED_INTR) == 0) {
2755                 if (sc->intr_type == PCI_INTR_TYPE_MSIX)
2756                         E1000_WRITE_REG(&sc->hw, E1000_EIAC, sc->intr_mask);
2757                 else
2758                         E1000_WRITE_REG(&sc->hw, E1000_EIAC, 0);
2759                 E1000_WRITE_REG(&sc->hw, E1000_EIAM, sc->intr_mask);
2760                 E1000_WRITE_REG(&sc->hw, E1000_EIMS, sc->intr_mask);
2761                 E1000_WRITE_REG(&sc->hw, E1000_IMS, E1000_IMS_LSC);
2762         } else {
2763                 E1000_WRITE_REG(&sc->hw, E1000_IMS, IMS_ENABLE_MASK);
2764         }
2765         E1000_WRITE_FLUSH(&sc->hw);
2766 }
2767
2768 static void
2769 igb_disable_intr(struct igb_softc *sc)
2770 {
2771         if ((sc->flags & IGB_FLAG_SHARED_INTR) == 0) {
2772                 E1000_WRITE_REG(&sc->hw, E1000_EIMC, 0xffffffff);
2773                 E1000_WRITE_REG(&sc->hw, E1000_EIAC, 0);
2774         }
2775         E1000_WRITE_REG(&sc->hw, E1000_IMC, 0xffffffff);
2776         E1000_WRITE_FLUSH(&sc->hw);
2777
2778         if (sc->intr_type != PCI_INTR_TYPE_MSIX) {
2779                 lwkt_serialize_handler_disable(&sc->main_serialize);
2780         } else {
2781                 int i;
2782
2783                 for (i = 0; i < sc->msix_cnt; ++i) {
2784                         lwkt_serialize_handler_disable(
2785                             sc->msix_data[i].msix_serialize);
2786                 }
2787         }
2788 }
2789
2790 /*
2791  * Bit of a misnomer, what this really means is
2792  * to enable OS management of the system... aka
2793  * to disable special hardware management features 
2794  */
2795 static void
2796 igb_get_mgmt(struct igb_softc *sc)
2797 {
2798         if (sc->flags & IGB_FLAG_HAS_MGMT) {
2799                 int manc2h = E1000_READ_REG(&sc->hw, E1000_MANC2H);
2800                 int manc = E1000_READ_REG(&sc->hw, E1000_MANC);
2801
2802                 /* disable hardware interception of ARP */
2803                 manc &= ~E1000_MANC_ARP_EN;
2804
2805                 /* enable receiving management packets to the host */
2806                 manc |= E1000_MANC_EN_MNG2HOST;
2807                 manc2h |= 1 << 5; /* Mng Port 623 */
2808                 manc2h |= 1 << 6; /* Mng Port 664 */
2809                 E1000_WRITE_REG(&sc->hw, E1000_MANC2H, manc2h);
2810                 E1000_WRITE_REG(&sc->hw, E1000_MANC, manc);
2811         }
2812 }
2813
2814 /*
2815  * Give control back to hardware management controller
2816  * if there is one.
2817  */
2818 static void
2819 igb_rel_mgmt(struct igb_softc *sc)
2820 {
2821         if (sc->flags & IGB_FLAG_HAS_MGMT) {
2822                 int manc = E1000_READ_REG(&sc->hw, E1000_MANC);
2823
2824                 /* Re-enable hardware interception of ARP */
2825                 manc |= E1000_MANC_ARP_EN;
2826                 manc &= ~E1000_MANC_EN_MNG2HOST;
2827
2828                 E1000_WRITE_REG(&sc->hw, E1000_MANC, manc);
2829         }
2830 }
2831
2832 /*
2833  * Sets CTRL_EXT:DRV_LOAD bit.
2834  *
2835  * For ASF and Pass Through versions of f/w this means that
2836  * the driver is loaded. 
2837  */
2838 static void
2839 igb_get_hw_control(struct igb_softc *sc)
2840 {
2841         uint32_t ctrl_ext;
2842
2843         if (sc->vf_ifp)
2844                 return;
2845
2846         /* Let firmware know the driver has taken over */
2847         ctrl_ext = E1000_READ_REG(&sc->hw, E1000_CTRL_EXT);
2848         E1000_WRITE_REG(&sc->hw, E1000_CTRL_EXT,
2849             ctrl_ext | E1000_CTRL_EXT_DRV_LOAD);
2850 }
2851
2852 /*
2853  * Resets CTRL_EXT:DRV_LOAD bit.
2854  *
2855  * For ASF and Pass Through versions of f/w this means that the
2856  * driver is no longer loaded.
2857  */
2858 static void
2859 igb_rel_hw_control(struct igb_softc *sc)
2860 {
2861         uint32_t ctrl_ext;
2862
2863         if (sc->vf_ifp)
2864                 return;
2865
2866         /* Let firmware taken over control of h/w */
2867         ctrl_ext = E1000_READ_REG(&sc->hw, E1000_CTRL_EXT);
2868         E1000_WRITE_REG(&sc->hw, E1000_CTRL_EXT,
2869             ctrl_ext & ~E1000_CTRL_EXT_DRV_LOAD);
2870 }
2871
2872 static int
2873 igb_is_valid_ether_addr(const uint8_t *addr)
2874 {
2875         uint8_t zero_addr[ETHER_ADDR_LEN] = { 0, 0, 0, 0, 0, 0 };
2876
2877         if ((addr[0] & 1) || !bcmp(addr, zero_addr, ETHER_ADDR_LEN))
2878                 return FALSE;
2879         return TRUE;
2880 }
2881
2882 /*
2883  * Enable PCI Wake On Lan capability
2884  */
2885 static void
2886 igb_enable_wol(device_t dev)
2887 {
2888         uint16_t cap, status;
2889         uint8_t id;
2890
2891         /* First find the capabilities pointer*/
2892         cap = pci_read_config(dev, PCIR_CAP_PTR, 2);
2893
2894         /* Read the PM Capabilities */
2895         id = pci_read_config(dev, cap, 1);
2896         if (id != PCIY_PMG)     /* Something wrong */
2897                 return;
2898
2899         /*
2900          * OK, we have the power capabilities,
2901          * so now get the status register
2902          */
2903         cap += PCIR_POWER_STATUS;
2904         status = pci_read_config(dev, cap, 2);
2905         status |= PCIM_PSTAT_PME | PCIM_PSTAT_PMEENABLE;
2906         pci_write_config(dev, cap, status, 2);
2907 }
2908
2909 static void
2910 igb_update_stats_counters(struct igb_softc *sc)
2911 {
2912         struct e1000_hw *hw = &sc->hw;
2913         struct e1000_hw_stats *stats;
2914         struct ifnet *ifp = &sc->arpcom.ac_if;
2915
2916         /* 
2917          * The virtual function adapter has only a
2918          * small controlled set of stats, do only 
2919          * those and return.
2920          */
2921         if (sc->vf_ifp) {
2922                 igb_update_vf_stats_counters(sc);
2923                 return;
2924         }
2925         stats = sc->stats;
2926
2927         if (sc->hw.phy.media_type == e1000_media_type_copper ||
2928             (E1000_READ_REG(hw, E1000_STATUS) & E1000_STATUS_LU)) {
2929                 stats->symerrs +=
2930                     E1000_READ_REG(hw,E1000_SYMERRS);
2931                 stats->sec += E1000_READ_REG(hw, E1000_SEC);
2932         }
2933
2934         stats->crcerrs += E1000_READ_REG(hw, E1000_CRCERRS);
2935         stats->mpc += E1000_READ_REG(hw, E1000_MPC);
2936         stats->scc += E1000_READ_REG(hw, E1000_SCC);
2937         stats->ecol += E1000_READ_REG(hw, E1000_ECOL);
2938
2939         stats->mcc += E1000_READ_REG(hw, E1000_MCC);
2940         stats->latecol += E1000_READ_REG(hw, E1000_LATECOL);
2941         stats->colc += E1000_READ_REG(hw, E1000_COLC);
2942         stats->dc += E1000_READ_REG(hw, E1000_DC);
2943         stats->rlec += E1000_READ_REG(hw, E1000_RLEC);
2944         stats->xonrxc += E1000_READ_REG(hw, E1000_XONRXC);
2945         stats->xontxc += E1000_READ_REG(hw, E1000_XONTXC);
2946
2947         /*
2948          * For watchdog management we need to know if we have been
2949          * paused during the last interval, so capture that here.
2950          */ 
2951         sc->pause_frames = E1000_READ_REG(hw, E1000_XOFFRXC);
2952         stats->xoffrxc += sc->pause_frames;
2953         stats->xofftxc += E1000_READ_REG(hw, E1000_XOFFTXC);
2954         stats->fcruc += E1000_READ_REG(hw, E1000_FCRUC);
2955         stats->prc64 += E1000_READ_REG(hw, E1000_PRC64);
2956         stats->prc127 += E1000_READ_REG(hw, E1000_PRC127);
2957         stats->prc255 += E1000_READ_REG(hw, E1000_PRC255);
2958         stats->prc511 += E1000_READ_REG(hw, E1000_PRC511);
2959         stats->prc1023 += E1000_READ_REG(hw, E1000_PRC1023);
2960         stats->prc1522 += E1000_READ_REG(hw, E1000_PRC1522);
2961         stats->gprc += E1000_READ_REG(hw, E1000_GPRC);
2962         stats->bprc += E1000_READ_REG(hw, E1000_BPRC);
2963         stats->mprc += E1000_READ_REG(hw, E1000_MPRC);
2964         stats->gptc += E1000_READ_REG(hw, E1000_GPTC);
2965
2966         /* For the 64-bit byte counters the low dword must be read first. */
2967         /* Both registers clear on the read of the high dword */
2968
2969         stats->gorc += E1000_READ_REG(hw, E1000_GORCL) +
2970             ((uint64_t)E1000_READ_REG(hw, E1000_GORCH) << 32);
2971         stats->gotc += E1000_READ_REG(hw, E1000_GOTCL) +
2972             ((uint64_t)E1000_READ_REG(hw, E1000_GOTCH) << 32);
2973
2974         stats->rnbc += E1000_READ_REG(hw, E1000_RNBC);
2975         stats->ruc += E1000_READ_REG(hw, E1000_RUC);
2976         stats->rfc += E1000_READ_REG(hw, E1000_RFC);
2977         stats->roc += E1000_READ_REG(hw, E1000_ROC);
2978         stats->rjc += E1000_READ_REG(hw, E1000_RJC);
2979
2980         stats->tor += E1000_READ_REG(hw, E1000_TORH);
2981         stats->tot += E1000_READ_REG(hw, E1000_TOTH);
2982
2983         stats->tpr += E1000_READ_REG(hw, E1000_TPR);
2984         stats->tpt += E1000_READ_REG(hw, E1000_TPT);
2985         stats->ptc64 += E1000_READ_REG(hw, E1000_PTC64);
2986         stats->ptc127 += E1000_READ_REG(hw, E1000_PTC127);
2987         stats->ptc255 += E1000_READ_REG(hw, E1000_PTC255);
2988         stats->ptc511 += E1000_READ_REG(hw, E1000_PTC511);
2989         stats->ptc1023 += E1000_READ_REG(hw, E1000_PTC1023);
2990         stats->ptc1522 += E1000_READ_REG(hw, E1000_PTC1522);
2991         stats->mptc += E1000_READ_REG(hw, E1000_MPTC);
2992         stats->bptc += E1000_READ_REG(hw, E1000_BPTC);
2993
2994         /* Interrupt Counts */
2995
2996         stats->iac += E1000_READ_REG(hw, E1000_IAC);
2997         stats->icrxptc += E1000_READ_REG(hw, E1000_ICRXPTC);
2998         stats->icrxatc += E1000_READ_REG(hw, E1000_ICRXATC);
2999         stats->ictxptc += E1000_READ_REG(hw, E1000_ICTXPTC);
3000         stats->ictxatc += E1000_READ_REG(hw, E1000_ICTXATC);
3001         stats->ictxqec += E1000_READ_REG(hw, E1000_ICTXQEC);
3002         stats->ictxqmtc += E1000_READ_REG(hw, E1000_ICTXQMTC);
3003         stats->icrxdmtc += E1000_READ_REG(hw, E1000_ICRXDMTC);
3004         stats->icrxoc += E1000_READ_REG(hw, E1000_ICRXOC);
3005
3006         /* Host to Card Statistics */
3007
3008         stats->cbtmpc += E1000_READ_REG(hw, E1000_CBTMPC);
3009         stats->htdpmc += E1000_READ_REG(hw, E1000_HTDPMC);
3010         stats->cbrdpc += E1000_READ_REG(hw, E1000_CBRDPC);
3011         stats->cbrmpc += E1000_READ_REG(hw, E1000_CBRMPC);
3012         stats->rpthc += E1000_READ_REG(hw, E1000_RPTHC);
3013         stats->hgptc += E1000_READ_REG(hw, E1000_HGPTC);
3014         stats->htcbdpc += E1000_READ_REG(hw, E1000_HTCBDPC);
3015         stats->hgorc += (E1000_READ_REG(hw, E1000_HGORCL) +
3016             ((uint64_t)E1000_READ_REG(hw, E1000_HGORCH) << 32));
3017         stats->hgotc += (E1000_READ_REG(hw, E1000_HGOTCL) +
3018             ((uint64_t)E1000_READ_REG(hw, E1000_HGOTCH) << 32));
3019         stats->lenerrs += E1000_READ_REG(hw, E1000_LENERRS);
3020         stats->scvpc += E1000_READ_REG(hw, E1000_SCVPC);
3021         stats->hrmpc += E1000_READ_REG(hw, E1000_HRMPC);
3022
3023         stats->algnerrc += E1000_READ_REG(hw, E1000_ALGNERRC);
3024         stats->rxerrc += E1000_READ_REG(hw, E1000_RXERRC);
3025         stats->tncrs += E1000_READ_REG(hw, E1000_TNCRS);
3026         stats->cexterr += E1000_READ_REG(hw, E1000_CEXTERR);
3027         stats->tsctc += E1000_READ_REG(hw, E1000_TSCTC);
3028         stats->tsctfc += E1000_READ_REG(hw, E1000_TSCTFC);
3029
3030         IFNET_STAT_SET(ifp, collisions, stats->colc);
3031
3032         /* Rx Errors */
3033         IFNET_STAT_SET(ifp, ierrors,
3034             stats->rxerrc + stats->crcerrs + stats->algnerrc +
3035             stats->ruc + stats->roc + stats->mpc + stats->cexterr);
3036
3037         /* Tx Errors */
3038         IFNET_STAT_SET(ifp, oerrors,
3039             stats->ecol + stats->latecol + sc->watchdog_events);
3040
3041         /* Driver specific counters */
3042         sc->device_control = E1000_READ_REG(hw, E1000_CTRL);
3043         sc->rx_control = E1000_READ_REG(hw, E1000_RCTL);
3044         sc->int_mask = E1000_READ_REG(hw, E1000_IMS);
3045         sc->eint_mask = E1000_READ_REG(hw, E1000_EIMS);
3046         sc->packet_buf_alloc_tx =
3047             ((E1000_READ_REG(hw, E1000_PBA) & 0xffff0000) >> 16);
3048         sc->packet_buf_alloc_rx =
3049             (E1000_READ_REG(hw, E1000_PBA) & 0xffff);
3050 }
3051
3052 static void
3053 igb_vf_init_stats(struct igb_softc *sc)
3054 {
3055         struct e1000_hw *hw = &sc->hw;
3056         struct e1000_vf_stats *stats;
3057
3058         stats = sc->stats;
3059         stats->last_gprc = E1000_READ_REG(hw, E1000_VFGPRC);
3060         stats->last_gorc = E1000_READ_REG(hw, E1000_VFGORC);
3061         stats->last_gptc = E1000_READ_REG(hw, E1000_VFGPTC);
3062         stats->last_gotc = E1000_READ_REG(hw, E1000_VFGOTC);
3063         stats->last_mprc = E1000_READ_REG(hw, E1000_VFMPRC);
3064 }
3065  
3066 static void
3067 igb_update_vf_stats_counters(struct igb_softc *sc)
3068 {
3069         struct e1000_hw *hw = &sc->hw;
3070         struct e1000_vf_stats *stats;
3071
3072         if (sc->link_speed == 0)
3073                 return;
3074
3075         stats = sc->stats;
3076         UPDATE_VF_REG(E1000_VFGPRC, stats->last_gprc, stats->gprc);
3077         UPDATE_VF_REG(E1000_VFGORC, stats->last_gorc, stats->gorc);
3078         UPDATE_VF_REG(E1000_VFGPTC, stats->last_gptc, stats->gptc);
3079         UPDATE_VF_REG(E1000_VFGOTC, stats->last_gotc, stats->gotc);
3080         UPDATE_VF_REG(E1000_VFMPRC, stats->last_mprc, stats->mprc);
3081 }
3082
3083 #ifdef IFPOLL_ENABLE
3084
3085 static void
3086 igb_npoll_status(struct ifnet *ifp)
3087 {
3088         struct igb_softc *sc = ifp->if_softc;
3089         uint32_t reg_icr;
3090
3091         ASSERT_SERIALIZED(&sc->main_serialize);
3092
3093         reg_icr = E1000_READ_REG(&sc->hw, E1000_ICR);
3094         if (reg_icr & (E1000_ICR_RXSEQ | E1000_ICR_LSC)) {
3095                 sc->hw.mac.get_link_status = 1;
3096                 igb_update_link_status(sc);
3097         }
3098 }
3099
3100 static void
3101 igb_npoll_tx(struct ifnet *ifp, void *arg, int cycle __unused)
3102 {
3103         struct igb_tx_ring *txr = arg;
3104
3105         ASSERT_SERIALIZED(&txr->tx_serialize);
3106
3107         igb_txeof(txr);
3108         if (!ifsq_is_empty(txr->ifsq))
3109                 ifsq_devstart(txr->ifsq);
3110 }
3111
3112 static void
3113 igb_npoll_rx(struct ifnet *ifp __unused, void *arg, int cycle)
3114 {
3115         struct igb_rx_ring *rxr = arg;
3116
3117         ASSERT_SERIALIZED(&rxr->rx_serialize);
3118
3119         igb_rxeof(rxr, cycle);
3120 }
3121
3122 static void
3123 igb_npoll(struct ifnet *ifp, struct ifpoll_info *info)
3124 {
3125         struct igb_softc *sc = ifp->if_softc;
3126         int i, txr_cnt, rxr_cnt;
3127
3128         ASSERT_IFNET_SERIALIZED_ALL(ifp);
3129
3130         if (info) {
3131                 int off;
3132
3133                 info->ifpi_status.status_func = igb_npoll_status;
3134                 info->ifpi_status.serializer = &sc->main_serialize;
3135
3136                 txr_cnt = igb_get_txring_inuse(sc, TRUE);
3137                 off = sc->tx_npoll_off;
3138                 for (i = 0; i < txr_cnt; ++i) {
3139                         struct igb_tx_ring *txr = &sc->tx_rings[i];
3140                         int idx = i + off;
3141
3142                         KKASSERT(idx < ncpus2);
3143                         info->ifpi_tx[idx].poll_func = igb_npoll_tx;
3144                         info->ifpi_tx[idx].arg = txr;
3145                         info->ifpi_tx[idx].serializer = &txr->tx_serialize;
3146                         ifsq_set_cpuid(txr->ifsq, idx);
3147                 }
3148
3149                 rxr_cnt = igb_get_rxring_inuse(sc, TRUE);
3150                 off = sc->rx_npoll_off;
3151                 for (i = 0; i < rxr_cnt; ++i) {
3152                         struct igb_rx_ring *rxr = &sc->rx_rings[i];
3153                         int idx = i + off;
3154
3155                         KKASSERT(idx < ncpus2);
3156                         info->ifpi_rx[idx].poll_func = igb_npoll_rx;
3157                         info->ifpi_rx[idx].arg = rxr;
3158                         info->ifpi_rx[idx].serializer = &rxr->rx_serialize;
3159                 }
3160
3161                 if (ifp->if_flags & IFF_RUNNING) {
3162                         if (rxr_cnt == sc->rx_ring_inuse &&
3163                             txr_cnt == sc->tx_ring_inuse)
3164                                 igb_disable_intr(sc);
3165                         else
3166                                 igb_init(sc);
3167                 }
3168         } else {
3169                 for (i = 0; i < sc->tx_ring_cnt; ++i) {
3170                         struct igb_tx_ring *txr = &sc->tx_rings[i];
3171
3172                         ifsq_set_cpuid(txr->ifsq, txr->tx_intr_cpuid);
3173                 }
3174
3175                 if (ifp->if_flags & IFF_RUNNING) {
3176                         txr_cnt = igb_get_txring_inuse(sc, FALSE);
3177                         rxr_cnt = igb_get_rxring_inuse(sc, FALSE);
3178
3179                         if (rxr_cnt == sc->rx_ring_inuse &&
3180                             txr_cnt == sc->tx_ring_inuse)
3181                                 igb_enable_intr(sc);
3182                         else
3183                                 igb_init(sc);
3184                 }
3185         }
3186 }
3187
3188 #endif /* IFPOLL_ENABLE */
3189
3190 static void
3191 igb_intr(void *xsc)
3192 {
3193         struct igb_softc *sc = xsc;
3194         struct ifnet *ifp = &sc->arpcom.ac_if;
3195         uint32_t eicr;
3196
3197         ASSERT_SERIALIZED(&sc->main_serialize);
3198
3199         eicr = E1000_READ_REG(&sc->hw, E1000_EICR);
3200
3201         if (eicr == 0)
3202                 return;
3203
3204         if (ifp->if_flags & IFF_RUNNING) {
3205                 struct igb_tx_ring *txr = &sc->tx_rings[0];
3206                 int i;
3207
3208                 for (i = 0; i < sc->rx_ring_inuse; ++i) {
3209                         struct igb_rx_ring *rxr = &sc->rx_rings[i];
3210
3211                         if (eicr & rxr->rx_intr_mask) {
3212                                 lwkt_serialize_enter(&rxr->rx_serialize);
3213                                 igb_rxeof(rxr, -1);
3214                                 lwkt_serialize_exit(&rxr->rx_serialize);
3215                         }
3216                 }
3217
3218                 if (eicr & txr->tx_intr_mask) {
3219                         lwkt_serialize_enter(&txr->tx_serialize);
3220                         igb_txeof(txr);
3221                         if (!ifsq_is_empty(txr->ifsq))
3222                                 ifsq_devstart(txr->ifsq);
3223                         lwkt_serialize_exit(&txr->tx_serialize);
3224                 }
3225         }
3226
3227         if (eicr & E1000_EICR_OTHER) {
3228                 uint32_t icr = E1000_READ_REG(&sc->hw, E1000_ICR);
3229
3230                 /* Link status change */
3231                 if (icr & E1000_ICR_LSC) {
3232                         sc->hw.mac.get_link_status = 1;
3233                         igb_update_link_status(sc);
3234                 }
3235         }
3236
3237         /*
3238          * Reading EICR has the side effect to clear interrupt mask,
3239          * so all interrupts need to be enabled here.
3240          */
3241         E1000_WRITE_REG(&sc->hw, E1000_EIMS, sc->intr_mask);
3242 }
3243
3244 static void
3245 igb_intr_shared(void *xsc)
3246 {
3247         struct igb_softc *sc = xsc;
3248         struct ifnet *ifp = &sc->arpcom.ac_if;
3249         uint32_t reg_icr;
3250
3251         ASSERT_SERIALIZED(&sc->main_serialize);
3252
3253         reg_icr = E1000_READ_REG(&sc->hw, E1000_ICR);
3254
3255         /* Hot eject?  */
3256         if (reg_icr == 0xffffffff)
3257                 return;
3258
3259         /* Definitely not our interrupt.  */
3260         if (reg_icr == 0x0)
3261                 return;
3262
3263         if ((reg_icr & E1000_ICR_INT_ASSERTED) == 0)
3264                 return;
3265
3266         if (ifp->if_flags & IFF_RUNNING) {
3267                 if (reg_icr &
3268                     (E1000_ICR_RXT0 | E1000_ICR_RXDMT0 | E1000_ICR_RXO)) {
3269                         int i;
3270
3271                         for (i = 0; i < sc->rx_ring_inuse; ++i) {
3272                                 struct igb_rx_ring *rxr = &sc->rx_rings[i];
3273
3274                                 lwkt_serialize_enter(&rxr->rx_serialize);
3275                                 igb_rxeof(rxr, -1);
3276                                 lwkt_serialize_exit(&rxr->rx_serialize);
3277                         }
3278                 }
3279
3280                 if (reg_icr & E1000_ICR_TXDW) {
3281                         struct igb_tx_ring *txr = &sc->tx_rings[0];
3282
3283                         lwkt_serialize_enter(&txr->tx_serialize);
3284                         igb_txeof(txr);
3285                         if (!ifsq_is_empty(txr->ifsq))
3286                                 ifsq_devstart(txr->ifsq);
3287                         lwkt_serialize_exit(&txr->tx_serialize);
3288                 }
3289         }
3290
3291         /* Link status change */
3292         if (reg_icr & (E1000_ICR_RXSEQ | E1000_ICR_LSC)) {
3293                 sc->hw.mac.get_link_status = 1;
3294                 igb_update_link_status(sc);
3295         }
3296
3297         if (reg_icr & E1000_ICR_RXO)
3298                 sc->rx_overruns++;
3299 }
3300
3301 static int
3302 igb_encap(struct igb_tx_ring *txr, struct mbuf **m_headp,
3303     int *segs_used, int *idx)
3304 {
3305         bus_dma_segment_t segs[IGB_MAX_SCATTER];
3306         bus_dmamap_t map;
3307         struct igb_tx_buf *tx_buf, *tx_buf_mapped;
3308         union e1000_adv_tx_desc *txd = NULL;
3309         struct mbuf *m_head = *m_headp;
3310         uint32_t olinfo_status = 0, cmd_type_len = 0, cmd_rs = 0;
3311         int maxsegs, nsegs, i, j, error;
3312         uint32_t hdrlen = 0;
3313
3314         if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3315                 error = igb_tso_pullup(txr, m_headp);
3316                 if (error)
3317                         return error;
3318                 m_head = *m_headp;
3319         }
3320
3321         /* Set basic descriptor constants */
3322         cmd_type_len |= E1000_ADVTXD_DTYP_DATA;
3323         cmd_type_len |= E1000_ADVTXD_DCMD_IFCS | E1000_ADVTXD_DCMD_DEXT;
3324         if (m_head->m_flags & M_VLANTAG)
3325                 cmd_type_len |= E1000_ADVTXD_DCMD_VLE;
3326
3327         /*
3328          * Map the packet for DMA.
3329          */
3330         tx_buf = &txr->tx_buf[txr->next_avail_desc];
3331         tx_buf_mapped = tx_buf;
3332         map = tx_buf->map;
3333
3334         maxsegs = txr->tx_avail - IGB_TX_RESERVED;
3335         KASSERT(maxsegs >= txr->spare_desc, ("not enough spare TX desc\n"));
3336         if (maxsegs > IGB_MAX_SCATTER)
3337                 maxsegs = IGB_MAX_SCATTER;
3338
3339         error = bus_dmamap_load_mbuf_defrag(txr->tx_tag, map, m_headp,
3340             segs, maxsegs, &nsegs, BUS_DMA_NOWAIT);
3341         if (error) {
3342                 if (error == ENOBUFS)
3343                         txr->sc->mbuf_defrag_failed++;
3344                 else
3345                         txr->sc->no_tx_dma_setup++;
3346
3347                 m_freem(*m_headp);
3348                 *m_headp = NULL;
3349                 return error;
3350         }
3351         bus_dmamap_sync(txr->tx_tag, map, BUS_DMASYNC_PREWRITE);
3352
3353         m_head = *m_headp;
3354
3355         /*
3356          * Set up the TX context descriptor, if any hardware offloading is
3357          * needed.  This includes CSUM, VLAN, and TSO.  It will consume one
3358          * TX descriptor.
3359          *
3360          * Unlike these chips' predecessors (em/emx), TX context descriptor
3361          * will _not_ interfere TX data fetching pipelining.
3362          */
3363         if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3364                 igb_tso_ctx(txr, m_head, &hdrlen);
3365                 cmd_type_len |= E1000_ADVTXD_DCMD_TSE;
3366                 olinfo_status |= E1000_TXD_POPTS_IXSM << 8;
3367                 olinfo_status |= E1000_TXD_POPTS_TXSM << 8;
3368                 txr->tx_nsegs++;
3369                 (*segs_used)++;
3370         } else if (igb_txcsum_ctx(txr, m_head)) {
3371                 if (m_head->m_pkthdr.csum_flags & CSUM_IP)
3372                         olinfo_status |= (E1000_TXD_POPTS_IXSM << 8);
3373                 if (m_head->m_pkthdr.csum_flags & (CSUM_UDP | CSUM_TCP))
3374                         olinfo_status |= (E1000_TXD_POPTS_TXSM << 8);
3375                 txr->tx_nsegs++;
3376                 (*segs_used)++;
3377         }
3378
3379         *segs_used += nsegs;
3380         txr->tx_nsegs += nsegs;
3381         if (txr->tx_nsegs >= txr->intr_nsegs) {
3382                 /*
3383                  * Report Status (RS) is turned on every intr_nsegs
3384                  * descriptors (roughly).
3385                  */
3386                 txr->tx_nsegs = 0;
3387                 cmd_rs = E1000_ADVTXD_DCMD_RS;
3388         }
3389
3390         /* Calculate payload length */
3391         olinfo_status |= ((m_head->m_pkthdr.len - hdrlen)
3392             << E1000_ADVTXD_PAYLEN_SHIFT);
3393
3394         /* 82575 needs the queue index added */
3395         if (txr->sc->hw.mac.type == e1000_82575)
3396                 olinfo_status |= txr->me << 4;
3397
3398         /* Set up our transmit descriptors */
3399         i = txr->next_avail_desc;
3400         for (j = 0; j < nsegs; j++) {
3401                 bus_size_t seg_len;
3402                 bus_addr_t seg_addr;
3403
3404                 tx_buf = &txr->tx_buf[i];
3405                 txd = (union e1000_adv_tx_desc *)&txr->tx_base[i];
3406                 seg_addr = segs[j].ds_addr;
3407                 seg_len = segs[j].ds_len;
3408
3409                 txd->read.buffer_addr = htole64(seg_addr);
3410                 txd->read.cmd_type_len = htole32(cmd_type_len | seg_len);
3411                 txd->read.olinfo_status = htole32(olinfo_status);
3412                 if (++i == txr->num_tx_desc)
3413                         i = 0;
3414                 tx_buf->m_head = NULL;
3415         }
3416
3417         KASSERT(txr->tx_avail > nsegs, ("invalid avail TX desc\n"));
3418         txr->next_avail_desc = i;
3419         txr->tx_avail -= nsegs;
3420
3421         tx_buf->m_head = m_head;
3422         tx_buf_mapped->map = tx_buf->map;
3423         tx_buf->map = map;
3424
3425         /*
3426          * Last Descriptor of Packet needs End Of Packet (EOP)
3427          */
3428         txd->read.cmd_type_len |= htole32(E1000_ADVTXD_DCMD_EOP | cmd_rs);
3429
3430         /*
3431          * Defer TDT updating, until enough descrptors are setup
3432          */
3433         *idx = i;
3434 #ifdef IGB_TSS_DEBUG
3435         ++txr->tx_packets;
3436 #endif
3437
3438         return 0;
3439 }
3440
3441 static void
3442 igb_start(struct ifnet *ifp, struct ifaltq_subque *ifsq)
3443 {
3444         struct igb_softc *sc = ifp->if_softc;
3445         struct igb_tx_ring *txr = ifsq_get_priv(ifsq);
3446         struct mbuf *m_head;
3447         int idx = -1, nsegs = 0;
3448
3449         KKASSERT(txr->ifsq == ifsq);
3450         ASSERT_SERIALIZED(&txr->tx_serialize);
3451
3452         if ((ifp->if_flags & IFF_RUNNING) == 0 || ifsq_is_oactive(ifsq))
3453                 return;
3454
3455         if (!sc->link_active || (txr->tx_flags & IGB_TXFLAG_ENABLED) == 0) {
3456                 ifsq_purge(ifsq);
3457                 return;
3458         }
3459
3460         if (!IGB_IS_NOT_OACTIVE(txr))
3461                 igb_txeof(txr);
3462
3463         while (!ifsq_is_empty(ifsq)) {
3464                 if (IGB_IS_OACTIVE(txr)) {
3465                         ifsq_set_oactive(ifsq);
3466                         /* Set watchdog on */
3467                         txr->tx_watchdog.wd_timer = 5;
3468                         break;
3469                 }
3470
3471                 m_head = ifsq_dequeue(ifsq, NULL);
3472                 if (m_head == NULL)
3473                         break;
3474
3475                 if (igb_encap(txr, &m_head, &nsegs, &idx)) {
3476                         IFNET_STAT_INC(ifp, oerrors, 1);
3477                         continue;
3478                 }
3479
3480                 if (nsegs >= txr->wreg_nsegs) {
3481                         E1000_WRITE_REG(&txr->sc->hw, E1000_TDT(txr->me), idx);
3482                         idx = -1;
3483                         nsegs = 0;
3484                 }
3485
3486                 /* Send a copy of the frame to the BPF listener */
3487                 ETHER_BPF_MTAP(ifp, m_head);
3488         }
3489         if (idx >= 0)
3490                 E1000_WRITE_REG(&txr->sc->hw, E1000_TDT(txr->me), idx);
3491 }
3492
3493 static void
3494 igb_watchdog(struct ifaltq_subque *ifsq)
3495 {
3496         struct igb_tx_ring *txr = ifsq_get_priv(ifsq);
3497         struct ifnet *ifp = ifsq_get_ifp(ifsq);
3498         struct igb_softc *sc = ifp->if_softc;
3499         int i;
3500
3501         KKASSERT(txr->ifsq == ifsq);
3502         ASSERT_IFNET_SERIALIZED_ALL(ifp);
3503
3504         /* 
3505          * If flow control has paused us since last checking
3506          * it invalidates the watchdog timing, so dont run it.
3507          */
3508         if (sc->pause_frames) {
3509                 sc->pause_frames = 0;
3510                 txr->tx_watchdog.wd_timer = 5;
3511                 return;
3512         }
3513
3514         if_printf(ifp, "Watchdog timeout -- resetting\n");
3515         if_printf(ifp, "Queue(%d) tdh = %d, hw tdt = %d\n", txr->me,
3516             E1000_READ_REG(&sc->hw, E1000_TDH(txr->me)),
3517             E1000_READ_REG(&sc->hw, E1000_TDT(txr->me)));
3518         if_printf(ifp, "TX(%d) desc avail = %d, "
3519             "Next TX to Clean = %d\n",
3520             txr->me, txr->tx_avail, txr->next_to_clean);
3521
3522         IFNET_STAT_INC(ifp, oerrors, 1);
3523         sc->watchdog_events++;
3524
3525         igb_init(sc);
3526         for (i = 0; i < sc->tx_ring_inuse; ++i)
3527                 ifsq_devstart_sched(sc->tx_rings[i].ifsq);
3528 }
3529
3530 static void
3531 igb_set_eitr(struct igb_softc *sc, int idx, int rate)
3532 {
3533         uint32_t eitr = 0;
3534
3535         if (rate > 0) {
3536                 if (sc->hw.mac.type == e1000_82575) {
3537                         eitr = 1000000000 / 256 / rate;
3538                         /*
3539                          * NOTE:
3540                          * Document is wrong on the 2 bits left shift
3541                          */
3542                 } else {
3543                         eitr = 1000000 / rate;
3544                         eitr <<= IGB_EITR_INTVL_SHIFT;
3545                 }
3546
3547                 if (eitr == 0) {
3548                         /* Don't disable it */
3549                         eitr = 1 << IGB_EITR_INTVL_SHIFT;
3550                 } else if (eitr > IGB_EITR_INTVL_MASK) {
3551                         /* Don't allow it to be too large */
3552                         eitr = IGB_EITR_INTVL_MASK;
3553                 }
3554         }
3555         if (sc->hw.mac.type == e1000_82575)
3556                 eitr |= eitr << 16;
3557         else
3558                 eitr |= E1000_EITR_CNT_IGNR;
3559         E1000_WRITE_REG(&sc->hw, E1000_EITR(idx), eitr);
3560 }
3561
3562 static int
3563 igb_sysctl_intr_rate(SYSCTL_HANDLER_ARGS)
3564 {
3565         struct igb_softc *sc = (void *)arg1;
3566         struct ifnet *ifp = &sc->arpcom.ac_if;
3567         int error, intr_rate;
3568
3569         intr_rate = sc->intr_rate;
3570         error = sysctl_handle_int(oidp, &intr_rate, 0, req);
3571         if (error || req->newptr == NULL)
3572                 return error;
3573         if (intr_rate < 0)
3574                 return EINVAL;
3575
3576         ifnet_serialize_all(ifp);
3577
3578         sc->intr_rate = intr_rate;
3579         if (ifp->if_flags & IFF_RUNNING)
3580                 igb_set_eitr(sc, 0, sc->intr_rate);
3581
3582         if (bootverbose)
3583                 if_printf(ifp, "interrupt rate set to %d/sec\n", sc->intr_rate);
3584
3585         ifnet_deserialize_all(ifp);
3586
3587         return 0;
3588 }
3589
3590 static int
3591 igb_sysctl_msix_rate(SYSCTL_HANDLER_ARGS)
3592 {
3593         struct igb_msix_data *msix = (void *)arg1;
3594         struct igb_softc *sc = msix->msix_sc;
3595         struct ifnet *ifp = &sc->arpcom.ac_if;
3596         int error, msix_rate;
3597
3598         msix_rate = msix->msix_rate;
3599         error = sysctl_handle_int(oidp, &msix_rate, 0, req);
3600         if (error || req->newptr == NULL)
3601                 return error;
3602         if (msix_rate < 0)
3603                 return EINVAL;
3604
3605         lwkt_serialize_enter(msix->msix_serialize);
3606
3607         msix->msix_rate = msix_rate;
3608         if (ifp->if_flags & IFF_RUNNING)
3609                 igb_set_eitr(sc, msix->msix_vector, msix->msix_rate);
3610
3611         if (bootverbose) {
3612                 if_printf(ifp, "%s set to %d/sec\n", msix->msix_rate_desc,
3613                     msix->msix_rate);
3614         }
3615
3616         lwkt_serialize_exit(msix->msix_serialize);
3617
3618         return 0;
3619 }
3620
3621 static int
3622 igb_sysctl_tx_intr_nsegs(SYSCTL_HANDLER_ARGS)
3623 {
3624         struct igb_softc *sc = (void *)arg1;
3625         struct ifnet *ifp = &sc->arpcom.ac_if;
3626         struct igb_tx_ring *txr = &sc->tx_rings[0];
3627         int error, nsegs;
3628
3629         nsegs = txr->intr_nsegs;
3630         error = sysctl_handle_int(oidp, &nsegs, 0, req);
3631         if (error || req->newptr == NULL)
3632                 return error;
3633         if (nsegs <= 0)
3634                 return EINVAL;
3635
3636         ifnet_serialize_all(ifp);
3637
3638         if (nsegs >= txr->num_tx_desc - txr->oact_lo_desc ||
3639             nsegs >= txr->oact_hi_desc - IGB_MAX_SCATTER) {
3640                 error = EINVAL;
3641         } else {
3642                 int i;
3643
3644                 error = 0;
3645                 for (i = 0; i < sc->tx_ring_cnt; ++i)
3646                         sc->tx_rings[i].intr_nsegs = nsegs;
3647         }
3648
3649         ifnet_deserialize_all(ifp);
3650
3651         return error;
3652 }
3653
3654 static int
3655 igb_sysctl_rx_wreg_nsegs(SYSCTL_HANDLER_ARGS)
3656 {
3657         struct igb_softc *sc = (void *)arg1;
3658         struct ifnet *ifp = &sc->arpcom.ac_if;
3659         int error, nsegs, i;
3660
3661         nsegs = sc->rx_rings[0].wreg_nsegs;
3662         error = sysctl_handle_int(oidp, &nsegs, 0, req);
3663         if (error || req->newptr == NULL)
3664                 return error;
3665
3666         ifnet_serialize_all(ifp);
3667         for (i = 0; i < sc->rx_ring_cnt; ++i)
3668                 sc->rx_rings[i].wreg_nsegs =nsegs;
3669         ifnet_deserialize_all(ifp);
3670
3671         return 0;
3672 }
3673
3674 static int
3675 igb_sysctl_tx_wreg_nsegs(SYSCTL_HANDLER_ARGS)
3676 {
3677         struct igb_softc *sc = (void *)arg1;
3678         struct ifnet *ifp = &sc->arpcom.ac_if;
3679         int error, nsegs, i;
3680
3681         nsegs = sc->tx_rings[0].wreg_nsegs;
3682         error = sysctl_handle_int(oidp, &nsegs, 0, req);
3683         if (error || req->newptr == NULL)
3684                 return error;
3685
3686         ifnet_serialize_all(ifp);
3687         for (i = 0; i < sc->tx_ring_cnt; ++i)
3688                 sc->tx_rings[i].wreg_nsegs =nsegs;
3689         ifnet_deserialize_all(ifp);
3690
3691         return 0;
3692 }
3693
3694 #ifdef IFPOLL_ENABLE
3695
3696 static int
3697 igb_sysctl_npoll_rxoff(SYSCTL_HANDLER_ARGS)
3698 {
3699         struct igb_softc *sc = (void *)arg1;
3700         struct ifnet *ifp = &sc->arpcom.ac_if;
3701         int error, off;
3702
3703         off = sc->rx_npoll_off;
3704         error = sysctl_handle_int(oidp, &off, 0, req);
3705         if (error || req->newptr == NULL)
3706                 return error;
3707         if (off < 0)
3708                 return EINVAL;
3709
3710         ifnet_serialize_all(ifp);
3711         if (off >= ncpus2 || off % sc->rx_ring_cnt != 0) {
3712                 error = EINVAL;
3713         } else {
3714                 error = 0;
3715                 sc->rx_npoll_off = off;
3716         }
3717         ifnet_deserialize_all(ifp);
3718
3719         return error;
3720 }
3721
3722 static int
3723 igb_sysctl_npoll_txoff(SYSCTL_HANDLER_ARGS)
3724 {
3725         struct igb_softc *sc = (void *)arg1;
3726         struct ifnet *ifp = &sc->arpcom.ac_if;
3727         int error, off;
3728
3729         off = sc->tx_npoll_off;
3730         error = sysctl_handle_int(oidp, &off, 0, req);
3731         if (error || req->newptr == NULL)
3732                 return error;
3733         if (off < 0)
3734                 return EINVAL;
3735
3736         ifnet_serialize_all(ifp);
3737         if (off >= ncpus2 || off % sc->tx_ring_cnt != 0) {
3738                 error = EINVAL;
3739         } else {
3740                 error = 0;
3741                 sc->tx_npoll_off = off;
3742         }
3743         ifnet_deserialize_all(ifp);
3744
3745         return error;
3746 }
3747
3748 #endif  /* IFPOLL_ENABLE */
3749
3750 static void
3751 igb_init_intr(struct igb_softc *sc)
3752 {
3753         igb_set_intr_mask(sc);
3754
3755         if ((sc->flags & IGB_FLAG_SHARED_INTR) == 0)
3756                 igb_init_unshared_intr(sc);
3757
3758         if (sc->intr_type != PCI_INTR_TYPE_MSIX) {
3759                 igb_set_eitr(sc, 0, sc->intr_rate);
3760         } else {
3761                 int i;
3762
3763                 for (i = 0; i < sc->msix_cnt; ++i)
3764                         igb_set_eitr(sc, i, sc->msix_data[i].msix_rate);
3765         }
3766 }
3767
3768 static void
3769 igb_init_unshared_intr(struct igb_softc *sc)
3770 {
3771         struct e1000_hw *hw = &sc->hw;
3772         const struct igb_rx_ring *rxr;
3773         const struct igb_tx_ring *txr;
3774         uint32_t ivar, index;
3775         int i;
3776
3777         /*
3778          * Enable extended mode
3779          */
3780         if (sc->hw.mac.type != e1000_82575) {
3781                 uint32_t gpie;
3782                 int ivar_max;
3783
3784                 gpie = E1000_GPIE_NSICR;
3785                 if (sc->intr_type == PCI_INTR_TYPE_MSIX) {
3786                         gpie |= E1000_GPIE_MSIX_MODE |
3787                             E1000_GPIE_EIAME |
3788                             E1000_GPIE_PBA;
3789                 }
3790                 E1000_WRITE_REG(hw, E1000_GPIE, gpie);
3791
3792                 /*
3793                  * Clear IVARs
3794                  */
3795                 switch (sc->hw.mac.type) {
3796                 case e1000_82576:
3797                         ivar_max = IGB_MAX_IVAR_82576;
3798                         break;
3799
3800                 case e1000_82580:
3801                         ivar_max = IGB_MAX_IVAR_82580;
3802                         break;
3803
3804                 case e1000_i350:
3805                         ivar_max = IGB_MAX_IVAR_I350;
3806                         break;
3807
3808                 case e1000_vfadapt:
3809                 case e1000_vfadapt_i350:
3810                         ivar_max = IGB_MAX_IVAR_VF;
3811                         break;
3812
3813                 case e1000_i210:
3814                         ivar_max = IGB_MAX_IVAR_I210;
3815                         break;
3816
3817                 case e1000_i211:
3818                         ivar_max = IGB_MAX_IVAR_I211;
3819                         break;
3820
3821                 default:
3822                         panic("unknown mac type %d\n", sc->hw.mac.type);
3823                 }
3824                 for (i = 0; i < ivar_max; ++i)
3825                         E1000_WRITE_REG_ARRAY(hw, E1000_IVAR0, i, 0);
3826                 E1000_WRITE_REG(hw, E1000_IVAR_MISC, 0);
3827         } else {
3828                 uint32_t tmp;
3829
3830                 KASSERT(sc->intr_type != PCI_INTR_TYPE_MSIX,
3831                     ("82575 w/ MSI-X"));
3832                 tmp = E1000_READ_REG(hw, E1000_CTRL_EXT);
3833                 tmp |= E1000_CTRL_EXT_IRCA;
3834                 E1000_WRITE_REG(hw, E1000_CTRL_EXT, tmp);
3835         }
3836
3837         /*
3838          * Map TX/RX interrupts to EICR
3839          */
3840         switch (sc->hw.mac.type) {
3841         case e1000_82580:
3842         case e1000_i350:
3843         case e1000_vfadapt:
3844         case e1000_vfadapt_i350:
3845         case e1000_i210:
3846         case e1000_i211:
3847                 /* RX entries */
3848                 for (i = 0; i < sc->rx_ring_inuse; ++i) {
3849                         rxr = &sc->rx_rings[i];
3850
3851                         index = i >> 1;
3852                         ivar = E1000_READ_REG_ARRAY(hw, E1000_IVAR0, index);
3853
3854                         if (i & 1) {
3855                                 ivar &= 0xff00ffff;
3856                                 ivar |=
3857                                 (rxr->rx_intr_bit | E1000_IVAR_VALID) << 16;
3858                         } else {
3859                                 ivar &= 0xffffff00;
3860                                 ivar |=
3861                                 (rxr->rx_intr_bit | E1000_IVAR_VALID);
3862                         }
3863                         E1000_WRITE_REG_ARRAY(hw, E1000_IVAR0, index, ivar);
3864                 }
3865                 /* TX entries */
3866                 for (i = 0; i < sc->tx_ring_inuse; ++i) {
3867                         txr = &sc->tx_rings[i];
3868
3869                         index = i >> 1;
3870                         ivar = E1000_READ_REG_ARRAY(hw, E1000_IVAR0, index);
3871
3872                         if (i & 1) {
3873                                 ivar &= 0x00ffffff;
3874                                 ivar |=
3875                                 (txr->tx_intr_bit | E1000_IVAR_VALID) << 24;
3876                         } else {
3877                                 ivar &= 0xffff00ff;
3878                                 ivar |=
3879                                 (txr->tx_intr_bit | E1000_IVAR_VALID) << 8;
3880                         }
3881                         E1000_WRITE_REG_ARRAY(hw, E1000_IVAR0, index, ivar);
3882                 }
3883                 if (sc->intr_type == PCI_INTR_TYPE_MSIX) {
3884                         ivar = (sc->sts_intr_bit | E1000_IVAR_VALID) << 8;
3885                         E1000_WRITE_REG(hw, E1000_IVAR_MISC, ivar);
3886                 }
3887                 break;
3888
3889         case e1000_82576:
3890                 /* RX entries */
3891                 for (i = 0; i < sc->rx_ring_inuse; ++i) {
3892                         rxr = &sc->rx_rings[i];
3893
3894                         index = i & 0x7; /* Each IVAR has two entries */
3895                         ivar = E1000_READ_REG_ARRAY(hw, E1000_IVAR0, index);
3896
3897                         if (i < 8) {
3898                                 ivar &= 0xffffff00;
3899                                 ivar |=
3900                                 (rxr->rx_intr_bit | E1000_IVAR_VALID);
3901                         } else {
3902                                 ivar &= 0xff00ffff;
3903                                 ivar |=
3904                                 (rxr->rx_intr_bit | E1000_IVAR_VALID) << 16;
3905                         }
3906                         E1000_WRITE_REG_ARRAY(hw, E1000_IVAR0, index, ivar);
3907                 }
3908                 /* TX entries */
3909                 for (i = 0; i < sc->tx_ring_inuse; ++i) {
3910                         txr = &sc->tx_rings[i];
3911
3912                         index = i & 0x7; /* Each IVAR has two entries */
3913                         ivar = E1000_READ_REG_ARRAY(hw, E1000_IVAR0, index);
3914
3915                         if (i < 8) {
3916                                 ivar &= 0xffff00ff;
3917                                 ivar |=
3918                                 (txr->tx_intr_bit | E1000_IVAR_VALID) << 8;
3919                         } else {
3920                                 ivar &= 0x00ffffff;
3921                                 ivar |=
3922                                 (txr->tx_intr_bit | E1000_IVAR_VALID) << 24;
3923                         }
3924                         E1000_WRITE_REG_ARRAY(hw, E1000_IVAR0, index, ivar);
3925                 }
3926                 if (sc->intr_type == PCI_INTR_TYPE_MSIX) {
3927                         ivar = (sc->sts_intr_bit | E1000_IVAR_VALID) << 8;
3928                         E1000_WRITE_REG(hw, E1000_IVAR_MISC, ivar);
3929                 }
3930                 break;
3931
3932         case e1000_82575:
3933                 /*
3934                  * Enable necessary interrupt bits.
3935                  *
3936                  * The name of the register is confusing; in addition to
3937                  * configuring the first vector of MSI-X, it also configures
3938                  * which bits of EICR could be set by the hardware even when
3939                  * MSI or line interrupt is used; it thus controls interrupt
3940                  * generation.  It MUST be configured explicitly; the default
3941                  * value mentioned in the datasheet is wrong: RX queue0 and
3942                  * TX queue0 are NOT enabled by default.
3943                  */
3944                 E1000_WRITE_REG(&sc->hw, E1000_MSIXBM(0), sc->intr_mask);
3945                 break;
3946
3947         default:
3948                 panic("unknown mac type %d\n", sc->hw.mac.type);
3949         }
3950 }
3951
3952 static int
3953 igb_setup_intr(struct igb_softc *sc)
3954 {
3955         int error, i;
3956
3957         if (sc->intr_type == PCI_INTR_TYPE_MSIX)
3958                 return igb_msix_setup(sc);
3959
3960         error = bus_setup_intr(sc->dev, sc->intr_res, INTR_MPSAFE,
3961             (sc->flags & IGB_FLAG_SHARED_INTR) ? igb_intr_shared : igb_intr,
3962             sc, &sc->intr_tag, &sc->main_serialize);
3963         if (error) {
3964                 device_printf(sc->dev, "Failed to register interrupt handler");
3965                 return error;
3966         }
3967
3968         for (i = 0; i < sc->tx_ring_cnt; ++i)
3969                 sc->tx_rings[i].tx_intr_cpuid = rman_get_cpuid(sc->intr_res);
3970
3971         return 0;
3972 }
3973
3974 static void
3975 igb_set_txintr_mask(struct igb_tx_ring *txr, int *intr_bit0, int intr_bitmax)
3976 {
3977         if (txr->sc->hw.mac.type == e1000_82575) {
3978                 txr->tx_intr_bit = 0;   /* unused */
3979                 switch (txr->me) {
3980                 case 0:
3981                         txr->tx_intr_mask = E1000_EICR_TX_QUEUE0;
3982                         break;
3983                 case 1:
3984                         txr->tx_intr_mask = E1000_EICR_TX_QUEUE1;
3985                         break;
3986                 case 2:
3987                         txr->tx_intr_mask = E1000_EICR_TX_QUEUE2;
3988                         break;
3989                 case 3:
3990                         txr->tx_intr_mask = E1000_EICR_TX_QUEUE3;
3991                         break;
3992                 default:
3993                         panic("unsupported # of TX ring, %d\n", txr->me);
3994                 }
3995         } else {
3996                 int intr_bit = *intr_bit0;
3997
3998                 txr->tx_intr_bit = intr_bit % intr_bitmax;
3999                 txr->tx_intr_mask = 1 << txr->tx_intr_bit;
4000
4001                 *intr_bit0 = intr_bit + 1;
4002         }
4003 }
4004
4005 static void
4006 igb_set_rxintr_mask(struct igb_rx_ring *rxr, int *intr_bit0, int intr_bitmax)
4007 {
4008         if (rxr->sc->hw.mac.type == e1000_82575) {
4009                 rxr->rx_intr_bit = 0;   /* unused */
4010                 switch (rxr->me) {
4011                 case 0:
4012                         rxr->rx_intr_mask = E1000_EICR_RX_QUEUE0;
4013                         break;
4014                 case 1:
4015                         rxr->rx_intr_mask = E1000_EICR_RX_QUEUE1;
4016                         break;
4017                 case 2:
4018                         rxr->rx_intr_mask = E1000_EICR_RX_QUEUE2;
4019                         break;
4020                 case 3:
4021                         rxr->rx_intr_mask = E1000_EICR_RX_QUEUE3;
4022                         break;
4023                 default:
4024                         panic("unsupported # of RX ring, %d\n", rxr->me);
4025                 }
4026         } else {
4027                 int intr_bit = *intr_bit0;
4028
4029                 rxr->rx_intr_bit = intr_bit % intr_bitmax;
4030                 rxr->rx_intr_mask = 1 << rxr->rx_intr_bit;
4031
4032                 *intr_bit0 = intr_bit + 1;
4033         }
4034 }
4035
4036 static void
4037 igb_serialize(struct ifnet *ifp, enum ifnet_serialize slz)
4038 {
4039         struct igb_softc *sc = ifp->if_softc;
4040
4041         ifnet_serialize_array_enter(sc->serializes, sc->serialize_cnt,
4042             sc->tx_serialize, sc->rx_serialize, slz);
4043 }
4044
4045 static void
4046 igb_deserialize(struct ifnet *ifp, enum ifnet_serialize slz)
4047 {
4048         struct igb_softc *sc = ifp->if_softc;
4049
4050         ifnet_serialize_array_exit(sc->serializes, sc->serialize_cnt,
4051             sc->tx_serialize, sc->rx_serialize, slz);
4052 }
4053
4054 static int
4055 igb_tryserialize(struct ifnet *ifp, enum ifnet_serialize slz)
4056 {
4057         struct igb_softc *sc = ifp->if_softc;
4058
4059         return ifnet_serialize_array_try(sc->serializes, sc->serialize_cnt,
4060             sc->tx_serialize, sc->rx_serialize, slz);
4061 }
4062
4063 #ifdef INVARIANTS
4064
4065 static void
4066 igb_serialize_assert(struct ifnet *ifp, enum ifnet_serialize slz,
4067     boolean_t serialized)
4068 {
4069         struct igb_softc *sc = ifp->if_softc;
4070
4071         ifnet_serialize_array_assert(sc->serializes, sc->serialize_cnt,
4072             sc->tx_serialize, sc->rx_serialize, slz, serialized);
4073 }
4074
4075 #endif  /* INVARIANTS */
4076
4077 static void
4078 igb_set_intr_mask(struct igb_softc *sc)
4079 {
4080         int i;
4081
4082         sc->intr_mask = sc->sts_intr_mask;
4083         for (i = 0; i < sc->rx_ring_inuse; ++i)
4084                 sc->intr_mask |= sc->rx_rings[i].rx_intr_mask;
4085         for (i = 0; i < sc->tx_ring_inuse; ++i)
4086                 sc->intr_mask |= sc->tx_rings[i].tx_intr_mask;
4087         if (bootverbose) {
4088                 if_printf(&sc->arpcom.ac_if, "intr mask 0x%08x\n",
4089                     sc->intr_mask);
4090         }
4091 }
4092
4093 static int
4094 igb_alloc_intr(struct igb_softc *sc)
4095 {
4096         int i, intr_bit, intr_bitmax;
4097         u_int intr_flags;
4098
4099         igb_msix_try_alloc(sc);
4100         if (sc->intr_type == PCI_INTR_TYPE_MSIX)
4101                 goto done;
4102
4103         /*
4104          * Allocate MSI/legacy interrupt resource
4105          */
4106         sc->intr_type = pci_alloc_1intr(sc->dev, igb_msi_enable,
4107             &sc->intr_rid, &intr_flags);
4108
4109         if (sc->intr_type == PCI_INTR_TYPE_LEGACY) {
4110                 int unshared;
4111
4112                 unshared = device_getenv_int(sc->dev, "irq.unshared", 0);
4113                 if (!unshared) {
4114                         sc->flags |= IGB_FLAG_SHARED_INTR;
4115                         if (bootverbose)
4116                                 device_printf(sc->dev, "IRQ shared\n");
4117                 } else {
4118                         intr_flags &= ~RF_SHAREABLE;
4119                         if (bootverbose)
4120                                 device_printf(sc->dev, "IRQ unshared\n");
4121                 }
4122         }
4123
4124         sc->intr_res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ,
4125             &sc->intr_rid, intr_flags);
4126         if (sc->intr_res == NULL) {
4127                 device_printf(sc->dev, "Unable to allocate bus resource: "
4128                     "interrupt\n");
4129                 return ENXIO;
4130         }
4131
4132         /*
4133          * Setup MSI/legacy interrupt mask
4134          */
4135         switch (sc->hw.mac.type) {
4136         case e1000_82575:
4137                 intr_bitmax = IGB_MAX_TXRXINT_82575;
4138                 break;
4139
4140         case e1000_82576:
4141                 intr_bitmax = IGB_MAX_TXRXINT_82576;
4142                 break;
4143
4144         case e1000_82580:
4145                 intr_bitmax = IGB_MAX_TXRXINT_82580;
4146                 break;
4147
4148         case e1000_i350:
4149                 intr_bitmax = IGB_MAX_TXRXINT_I350;
4150                 break;
4151
4152         case e1000_i210:
4153                 intr_bitmax = IGB_MAX_TXRXINT_I210;
4154                 break;
4155
4156         case e1000_i211:
4157                 intr_bitmax = IGB_MAX_TXRXINT_I211;
4158                 break;
4159
4160         default:
4161                 intr_bitmax = IGB_MIN_TXRXINT;
4162                 break;
4163         }
4164         intr_bit = 0;
4165         for (i = 0; i < sc->tx_ring_cnt; ++i)
4166                 igb_set_txintr_mask(&sc->tx_rings[i], &intr_bit, intr_bitmax);
4167         for (i = 0; i < sc->rx_ring_cnt; ++i)
4168                 igb_set_rxintr_mask(&sc->rx_rings[i], &intr_bit, intr_bitmax);
4169         sc->sts_intr_bit = 0;
4170         sc->sts_intr_mask = E1000_EICR_OTHER;
4171
4172         /* Initialize interrupt rate */
4173         sc->intr_rate = IGB_INTR_RATE;
4174 done:
4175         igb_set_ring_inuse(sc, FALSE);
4176         igb_set_intr_mask(sc);
4177         return 0;
4178 }
4179
4180 static void
4181 igb_free_intr(struct igb_softc *sc)
4182 {
4183         if (sc->intr_type != PCI_INTR_TYPE_MSIX) {
4184                 if (sc->intr_res != NULL) {
4185                         bus_release_resource(sc->dev, SYS_RES_IRQ, sc->intr_rid,
4186                             sc->intr_res);
4187                 }
4188                 if (sc->intr_type == PCI_INTR_TYPE_MSI)
4189                         pci_release_msi(sc->dev);
4190         } else {
4191                 igb_msix_free(sc, TRUE);
4192         }
4193 }
4194
4195 static void
4196 igb_teardown_intr(struct igb_softc *sc)
4197 {
4198         if (sc->intr_type != PCI_INTR_TYPE_MSIX)
4199                 bus_teardown_intr(sc->dev, sc->intr_res, sc->intr_tag);
4200         else
4201                 igb_msix_teardown(sc, sc->msix_cnt);
4202 }
4203
4204 static void
4205 igb_msix_try_alloc(struct igb_softc *sc)
4206 {
4207         int msix_enable, msix_cnt, msix_cnt2, alloc_cnt;
4208         int i, x, error;
4209         int offset, offset_def;
4210         struct igb_msix_data *msix;
4211         boolean_t aggregate, setup = FALSE;
4212
4213         /*
4214          * Don't enable MSI-X on 82575, see:
4215          * 82575 specification update errata #25
4216          */
4217         if (sc->hw.mac.type == e1000_82575)
4218                 return;
4219
4220         /* Don't enable MSI-X on VF */
4221         if (sc->vf_ifp)
4222                 return;
4223
4224         msix_enable = device_getenv_int(sc->dev, "msix.enable",
4225             igb_msix_enable);
4226         if (!msix_enable)
4227                 return;
4228
4229         msix_cnt = pci_msix_count(sc->dev);
4230 #ifdef IGB_MSIX_DEBUG
4231         msix_cnt = device_getenv_int(sc->dev, "msix.count", msix_cnt);
4232 #endif
4233         if (msix_cnt <= 1) {
4234                 /* One MSI-X model does not make sense */
4235                 return;
4236         }
4237
4238         i = 0;
4239         while ((1 << (i + 1)) <= msix_cnt)
4240                 ++i;
4241         msix_cnt2 = 1 << i;
4242
4243         if (bootverbose) {
4244                 device_printf(sc->dev, "MSI-X count %d/%d\n",
4245                     msix_cnt2, msix_cnt);
4246         }
4247
4248         KKASSERT(msix_cnt2 <= msix_cnt);
4249         if (msix_cnt == msix_cnt2) {
4250                 /* We need at least one MSI-X for link status */
4251                 msix_cnt2 >>= 1;
4252                 if (msix_cnt2 <= 1) {
4253                         /* One MSI-X for RX/TX does not make sense */
4254                         device_printf(sc->dev, "not enough MSI-X for TX/RX, "
4255                             "MSI-X count %d/%d\n", msix_cnt2, msix_cnt);
4256                         return;
4257                 }
4258                 KKASSERT(msix_cnt > msix_cnt2);
4259
4260                 if (bootverbose) {
4261                         device_printf(sc->dev, "MSI-X count fixup %d/%d\n",
4262                             msix_cnt2, msix_cnt);
4263                 }
4264         }
4265
4266         sc->rx_ring_msix = sc->rx_ring_cnt;
4267         if (sc->rx_ring_msix > msix_cnt2)
4268                 sc->rx_ring_msix = msix_cnt2;
4269
4270         sc->tx_ring_msix = sc->tx_ring_cnt;
4271         if (sc->tx_ring_msix > msix_cnt2)
4272                 sc->tx_ring_msix = msix_cnt2;
4273
4274         if (msix_cnt >= sc->tx_ring_msix + sc->rx_ring_msix + 1) {
4275                 /*
4276                  * Independent TX/RX MSI-X
4277                  */
4278                 aggregate = FALSE;
4279                 if (bootverbose)
4280                         device_printf(sc->dev, "independent TX/RX MSI-X\n");
4281                 alloc_cnt = sc->tx_ring_msix + sc->rx_ring_msix;
4282         } else {
4283                 /*
4284                  * Aggregate TX/RX MSI-X
4285                  */
4286                 aggregate = TRUE;
4287                 if (bootverbose)
4288                         device_printf(sc->dev, "aggregate TX/RX MSI-X\n");
4289                 alloc_cnt = msix_cnt2;
4290                 if (alloc_cnt > ncpus2)
4291                         alloc_cnt = ncpus2;
4292                 if (sc->rx_ring_msix > alloc_cnt)
4293                         sc->rx_ring_msix = alloc_cnt;
4294                 if (sc->tx_ring_msix > alloc_cnt)
4295                         sc->tx_ring_msix = alloc_cnt;
4296         }
4297         ++alloc_cnt;    /* For link status */
4298
4299         if (bootverbose) {
4300                 device_printf(sc->dev, "MSI-X alloc %d, "
4301                     "RX ring %d, TX ring %d\n", alloc_cnt,
4302                     sc->rx_ring_msix, sc->tx_ring_msix);
4303         }
4304
4305         sc->msix_mem_rid = PCIR_BAR(IGB_MSIX_BAR);
4306         sc->msix_mem_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4307             &sc->msix_mem_rid, RF_ACTIVE);
4308         if (sc->msix_mem_res == NULL) {
4309                 device_printf(sc->dev, "Unable to map MSI-X table\n");
4310                 return;
4311         }
4312
4313         sc->msix_cnt = alloc_cnt;
4314         sc->msix_data = kmalloc_cachealign(
4315             sizeof(struct igb_msix_data) * sc->msix_cnt,
4316             M_DEVBUF, M_WAITOK | M_ZERO);
4317         for (x = 0; x < sc->msix_cnt; ++x) {
4318                 msix = &sc->msix_data[x];
4319
4320                 lwkt_serialize_init(&msix->msix_serialize0);
4321                 msix->msix_sc = sc;
4322                 msix->msix_rid = -1;
4323                 msix->msix_vector = x;
4324                 msix->msix_mask = 1 << msix->msix_vector;
4325                 msix->msix_rate = IGB_INTR_RATE;
4326         }
4327
4328         x = 0;
4329         if (!aggregate) {
4330                 /*
4331                  * RX rings
4332                  */
4333                 if (sc->rx_ring_msix == ncpus2) {
4334                         offset = 0;
4335                 } else {
4336                         offset_def = (sc->rx_ring_msix *
4337                             device_get_unit(sc->dev)) % ncpus2;
4338
4339                         offset = device_getenv_int(sc->dev,
4340                             "msix.rxoff", offset_def);
4341                         if (offset >= ncpus2 ||
4342                             offset % sc->rx_ring_msix != 0) {
4343                                 device_printf(sc->dev,
4344                                     "invalid msix.rxoff %d, use %d\n",
4345                                     offset, offset_def);
4346                                 offset = offset_def;
4347                         }
4348                 }
4349                 igb_msix_rx_conf(sc, 0, &x, offset);
4350
4351                 /*
4352                  * TX rings
4353                  */
4354                 if (sc->tx_ring_msix == ncpus2) {
4355                         offset = 0;
4356                 } else {
4357                         offset_def = (sc->tx_ring_msix *
4358                             device_get_unit(sc->dev)) % ncpus2;
4359
4360                         offset = device_getenv_int(sc->dev,
4361                             "msix.txoff", offset_def);
4362                         if (offset >= ncpus2 ||
4363                             offset % sc->tx_ring_msix != 0) {
4364                                 device_printf(sc->dev,
4365                                     "invalid msix.txoff %d, use %d\n",
4366                                     offset, offset_def);
4367                                 offset = offset_def;
4368                         }
4369                 }
4370                 igb_msix_tx_conf(sc, 0, &x, offset);
4371         } else {
4372                 int ring_agg, ring_max;
4373
4374                 ring_agg = sc->rx_ring_msix;
4375                 if (ring_agg > sc->tx_ring_msix)
4376                         ring_agg = sc->tx_ring_msix;
4377
4378                 ring_max = sc->rx_ring_msix;
4379                 if (ring_max < sc->tx_ring_msix)
4380                         ring_max = sc->tx_ring_msix;
4381
4382                 if (ring_max == ncpus2) {
4383                         offset = 0;
4384                 } else {
4385                         offset_def = (ring_max * device_get_unit(sc->dev)) %
4386                             ncpus2;
4387
4388                         offset = device_getenv_int(sc->dev, "msix.off",
4389                             offset_def);
4390                         if (offset >= ncpus2 || offset % ring_max != 0) {
4391                                 device_printf(sc->dev,
4392                                     "invalid msix.off %d, use %d\n",
4393                                     offset, offset_def);
4394                                 offset = offset_def;
4395                         }
4396                 }
4397
4398                 for (i = 0; i < ring_agg; ++i) {
4399                         struct igb_tx_ring *txr = &sc->tx_rings[i];
4400                         struct igb_rx_ring *rxr = &sc->rx_rings[i];
4401
4402                         KKASSERT(x < sc->msix_cnt);
4403                         msix = &sc->msix_data[x++];
4404
4405                         txr->tx_intr_bit = msix->msix_vector;
4406                         txr->tx_intr_mask = msix->msix_mask;
4407                         rxr->rx_intr_bit = msix->msix_vector;
4408                         rxr->rx_intr_mask = msix->msix_mask;
4409
4410                         msix->msix_serialize = &msix->msix_serialize0;
4411                         msix->msix_func = igb_msix_rxtx;
4412                         msix->msix_arg = msix;
4413                         msix->msix_rx = rxr;
4414                         msix->msix_tx = txr;
4415
4416                         msix->msix_cpuid = i + offset;
4417                         KKASSERT(msix->msix_cpuid < ncpus2);
4418                         txr->tx_intr_cpuid = msix->msix_cpuid;
4419
4420                         ksnprintf(msix->msix_desc, sizeof(msix->msix_desc),
4421                             "%s rxtx%d", device_get_nameunit(sc->dev), i);
4422                         msix->msix_rate = IGB_MSIX_RX_RATE;
4423                         ksnprintf(msix->msix_rate_desc,
4424                             sizeof(msix->msix_rate_desc),
4425                             "RXTX%d interrupt rate", i);
4426                 }
4427
4428                 if (ring_agg != ring_max) {
4429                         if (ring_max == sc->tx_ring_msix)
4430                                 igb_msix_tx_conf(sc, i, &x, offset);
4431                         else
4432                                 igb_msix_rx_conf(sc, i, &x, offset);
4433                 }
4434         }
4435
4436         /*
4437          * Link status
4438          */
4439         KKASSERT(x < sc->msix_cnt);
4440         msix = &sc->msix_data[x++];
4441         sc->sts_intr_bit = msix->msix_vector;
4442         sc->sts_intr_mask = msix->msix_mask;
4443
4444         msix->msix_serialize = &sc->main_serialize;
4445         msix->msix_func = igb_msix_status;
4446         msix->msix_arg = sc;
4447         msix->msix_cpuid = 0;
4448         ksnprintf(msix->msix_desc, sizeof(msix->msix_desc), "%s sts",
4449             device_get_nameunit(sc->dev));
4450         ksnprintf(msix->msix_rate_desc, sizeof(msix->msix_rate_desc),
4451             "status interrupt rate");
4452
4453         KKASSERT(x == sc->msix_cnt);
4454
4455         error = pci_setup_msix(sc->dev);
4456         if (error) {
4457                 device_printf(sc->dev, "Setup MSI-X failed\n");
4458                 goto back;
4459         }
4460         setup = TRUE;
4461
4462         for (i = 0; i < sc->msix_cnt; ++i) {
4463                 msix = &sc->msix_data[i];
4464
4465                 error = pci_alloc_msix_vector(sc->dev, msix->msix_vector,
4466                     &msix->msix_rid, msix->msix_cpuid);
4467                 if (error) {
4468                         device_printf(sc->dev,
4469                             "Unable to allocate MSI-X %d on cpu%d\n",
4470                             msix->msix_vector, msix->msix_cpuid);
4471                         goto back;
4472                 }
4473
4474                 msix->msix_res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ,
4475                     &msix->msix_rid, RF_ACTIVE);
4476                 if (msix->msix_res == NULL) {
4477                         device_printf(sc->dev,
4478                             "Unable to allocate MSI-X %d resource\n",
4479                             msix->msix_vector);
4480                         error = ENOMEM;
4481                         goto back;
4482                 }
4483         }
4484
4485         pci_enable_msix(sc->dev);
4486         sc->intr_type = PCI_INTR_TYPE_MSIX;
4487 back:
4488         if (error)
4489                 igb_msix_free(sc, setup);
4490 }
4491
4492 static void
4493 igb_msix_free(struct igb_softc *sc, boolean_t setup)
4494 {
4495         int i;
4496
4497         KKASSERT(sc->msix_cnt > 1);
4498
4499         for (i = 0; i < sc->msix_cnt; ++i) {
4500                 struct igb_msix_data *msix = &sc->msix_data[i];
4501
4502                 if (msix->msix_res != NULL) {
4503                         bus_release_resource(sc->dev, SYS_RES_IRQ,
4504                             msix->msix_rid, msix->msix_res);
4505                 }
4506                 if (msix->msix_rid >= 0)
4507                         pci_release_msix_vector(sc->dev, msix->msix_rid);
4508         }
4509         if (setup)
4510                 pci_teardown_msix(sc->dev);
4511
4512         sc->msix_cnt = 0;
4513         kfree(sc->msix_data, M_DEVBUF);
4514         sc->msix_data = NULL;
4515 }
4516
4517 static int
4518 igb_msix_setup(struct igb_softc *sc)
4519 {
4520         int i;
4521
4522         for (i = 0; i < sc->msix_cnt; ++i) {
4523                 struct igb_msix_data *msix = &sc->msix_data[i];
4524                 int error;
4525
4526                 error = bus_setup_intr_descr(sc->dev, msix->msix_res,
4527                     INTR_MPSAFE, msix->msix_func, msix->msix_arg,
4528                     &msix->msix_handle, msix->msix_serialize, msix->msix_desc);
4529                 if (error) {
4530                         device_printf(sc->dev, "could not set up %s "
4531                             "interrupt handler.\n", msix->msix_desc);
4532                         igb_msix_teardown(sc, i);
4533                         return error;
4534                 }
4535         }
4536         return 0;
4537 }
4538
4539 static void
4540 igb_msix_teardown(struct igb_softc *sc, int msix_cnt)
4541 {
4542         int i;
4543
4544         for (i = 0; i < msix_cnt; ++i) {
4545                 struct igb_msix_data *msix = &sc->msix_data[i];
4546
4547                 bus_teardown_intr(sc->dev, msix->msix_res, msix->msix_handle);
4548         }
4549 }
4550
4551 static void
4552 igb_msix_rx(void *arg)
4553 {
4554         struct igb_rx_ring *rxr = arg;
4555
4556         ASSERT_SERIALIZED(&rxr->rx_serialize);
4557         igb_rxeof(rxr, -1);
4558
4559         E1000_WRITE_REG(&rxr->sc->hw, E1000_EIMS, rxr->rx_intr_mask);
4560 }
4561
4562 static void
4563 igb_msix_tx(void *arg)
4564 {
4565         struct igb_tx_ring *txr = arg;
4566
4567         ASSERT_SERIALIZED(&txr->tx_serialize);
4568
4569         igb_txeof(txr);
4570         if (!ifsq_is_empty(txr->ifsq))
4571                 ifsq_devstart(txr->ifsq);
4572
4573         E1000_WRITE_REG(&txr->sc->hw, E1000_EIMS, txr->tx_intr_mask);
4574 }
4575
4576 static void
4577 igb_msix_status(void *arg)
4578 {
4579         struct igb_softc *sc = arg;
4580         uint32_t icr;
4581
4582         ASSERT_SERIALIZED(&sc->main_serialize);
4583
4584         icr = E1000_READ_REG(&sc->hw, E1000_ICR);
4585         if (icr & E1000_ICR_LSC) {
4586                 sc->hw.mac.get_link_status = 1;
4587                 igb_update_link_status(sc);
4588         }
4589
4590         E1000_WRITE_REG(&sc->hw, E1000_EIMS, sc->sts_intr_mask);
4591 }
4592
4593 static void
4594 igb_set_ring_inuse(struct igb_softc *sc, boolean_t polling)
4595 {
4596         sc->rx_ring_inuse = igb_get_rxring_inuse(sc, polling);
4597         sc->tx_ring_inuse = igb_get_txring_inuse(sc, polling);
4598         if (bootverbose) {
4599                 if_printf(&sc->arpcom.ac_if, "RX rings %d/%d, TX rings %d/%d\n",
4600                     sc->rx_ring_inuse, sc->rx_ring_cnt,
4601                     sc->tx_ring_inuse, sc->tx_ring_cnt);
4602         }
4603 }
4604
4605 static int
4606 igb_get_rxring_inuse(const struct igb_softc *sc, boolean_t polling)
4607 {
4608         if (!IGB_ENABLE_HWRSS(sc))
4609                 return 1;
4610
4611         if (polling)
4612                 return sc->rx_ring_cnt;
4613         else if (sc->intr_type != PCI_INTR_TYPE_MSIX)
4614                 return IGB_MIN_RING_RSS;
4615         else
4616                 return sc->rx_ring_msix;
4617 }
4618
4619 static int
4620 igb_get_txring_inuse(const struct igb_softc *sc, boolean_t polling)
4621 {
4622         if (!IGB_ENABLE_HWTSS(sc))
4623                 return 1;
4624
4625         if (polling)
4626                 return sc->tx_ring_cnt;
4627         else if (sc->intr_type != PCI_INTR_TYPE_MSIX)
4628                 return IGB_MIN_RING;
4629         else
4630                 return sc->tx_ring_msix;
4631 }
4632
4633 static int
4634 igb_tso_pullup(struct igb_tx_ring *txr, struct mbuf **mp)
4635 {
4636         int hoff, iphlen, thoff;
4637         struct mbuf *m;
4638
4639         m = *mp;
4640         KASSERT(M_WRITABLE(m), ("TSO mbuf not writable"));
4641
4642         iphlen = m->m_pkthdr.csum_iphlen;
4643         thoff = m->m_pkthdr.csum_thlen;
4644         hoff = m->m_pkthdr.csum_lhlen;
4645
4646         KASSERT(iphlen > 0, ("invalid ip hlen"));
4647         KASSERT(thoff > 0, ("invalid tcp hlen"));
4648         KASSERT(hoff > 0, ("invalid ether hlen"));
4649
4650         if (__predict_false(m->m_len < hoff + iphlen + thoff)) {
4651                 m = m_pullup(m, hoff + iphlen + thoff);
4652                 if (m == NULL) {
4653                         *mp = NULL;
4654                         return ENOBUFS;
4655                 }
4656                 *mp = m;
4657         }
4658         if (txr->tx_flags & IGB_TXFLAG_TSO_IPLEN0) {
4659                 struct ip *ip;
4660
4661                 ip = mtodoff(m, struct ip *, hoff);
4662                 ip->ip_len = 0;
4663         }
4664
4665         return 0;
4666 }
4667
4668 static void
4669 igb_tso_ctx(struct igb_tx_ring *txr, struct mbuf *m, uint32_t *hlen)
4670 {
4671         struct e1000_adv_tx_context_desc *TXD;
4672         uint32_t vlan_macip_lens, type_tucmd_mlhl, mss_l4len_idx;
4673         int hoff, ctxd, iphlen, thoff;
4674
4675         iphlen = m->m_pkthdr.csum_iphlen;
4676         thoff = m->m_pkthdr.csum_thlen;
4677         hoff = m->m_pkthdr.csum_lhlen;
4678
4679         vlan_macip_lens = type_tucmd_mlhl = mss_l4len_idx = 0;
4680
4681         ctxd = txr->next_avail_desc;
4682         TXD = (struct e1000_adv_tx_context_desc *)&txr->tx_base[ctxd];
4683
4684         if (m->m_flags & M_VLANTAG) {
4685                 uint16_t vlantag;
4686
4687                 vlantag = htole16(m->m_pkthdr.ether_vlantag);
4688                 vlan_macip_lens |= (vlantag << E1000_ADVTXD_VLAN_SHIFT);
4689         }
4690
4691         vlan_macip_lens |= (hoff << E1000_ADVTXD_MACLEN_SHIFT);
4692         vlan_macip_lens |= iphlen;
4693
4694         type_tucmd_mlhl |= E1000_ADVTXD_DCMD_DEXT | E1000_ADVTXD_DTYP_CTXT;
4695         type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_TCP;
4696         type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_IPV4;
4697
4698         mss_l4len_idx |= (m->m_pkthdr.tso_segsz << E1000_ADVTXD_MSS_SHIFT);
4699         mss_l4len_idx |= (thoff << E1000_ADVTXD_L4LEN_SHIFT);
4700         /* 82575 needs the queue index added */
4701         if (txr->sc->hw.mac.type == e1000_82575)
4702                 mss_l4len_idx |= txr->me << 4;
4703
4704         TXD->vlan_macip_lens = htole32(vlan_macip_lens);
4705         TXD->type_tucmd_mlhl = htole32(type_tucmd_mlhl);
4706         TXD->seqnum_seed = htole32(0);
4707         TXD->mss_l4len_idx = htole32(mss_l4len_idx);
4708
4709         /* We've consumed the first desc, adjust counters */
4710         if (++ctxd == txr->num_tx_desc)
4711                 ctxd = 0;
4712         txr->next_avail_desc = ctxd;
4713         --txr->tx_avail;
4714
4715         *hlen = hoff + iphlen + thoff;
4716 }
4717
4718 static void
4719 igb_setup_serializer(struct igb_softc *sc)
4720 {
4721         const struct igb_msix_data *msix;
4722         int i, j;
4723
4724         /*
4725          * Allocate serializer array
4726          */
4727
4728         /* Main + TX + RX */
4729         sc->serialize_cnt = 1 + sc->tx_ring_cnt + sc->rx_ring_cnt;
4730
4731         /* Aggregate TX/RX MSI-X */
4732         for (i = 0; i < sc->msix_cnt; ++i) {
4733                 msix = &sc->msix_data[i];
4734                 if (msix->msix_serialize == &msix->msix_serialize0)
4735                         sc->serialize_cnt++;
4736         }
4737
4738         sc->serializes =
4739             kmalloc(sc->serialize_cnt * sizeof(struct lwkt_serialize *),
4740                 M_DEVBUF, M_WAITOK | M_ZERO);
4741
4742         /*
4743          * Setup serializers
4744          *
4745          * NOTE: Order is critical
4746          */
4747
4748         i = 0;
4749         KKASSERT(i < sc->serialize_cnt);
4750         sc->serializes[i++] = &sc->main_serialize;
4751
4752         for (j = 0; j < sc->msix_cnt; ++j) {
4753                 msix = &sc->msix_data[j];
4754                 if (msix->msix_serialize == &msix->msix_serialize0) {
4755                         KKASSERT(i < sc->serialize_cnt);
4756                         sc->serializes[i++] = msix->msix_serialize;
4757                 }
4758         }
4759
4760         sc->tx_serialize = i;
4761         for (j = 0; j < sc->tx_ring_cnt; ++j) {
4762                 KKASSERT(i < sc->serialize_cnt);
4763                 sc->serializes[i++] = &sc->tx_rings[j].tx_serialize;
4764         }
4765
4766         sc->rx_serialize = i;
4767         for (j = 0; j < sc->rx_ring_cnt; ++j) {
4768                 KKASSERT(i < sc->serialize_cnt);
4769                 sc->serializes[i++] = &sc->rx_rings[j].rx_serialize;
4770         }
4771
4772         KKASSERT(i == sc->serialize_cnt);
4773 }
4774
4775 static void
4776 igb_msix_rx_conf(struct igb_softc *sc, int i, int *x0, int offset)
4777 {
4778         int x = *x0;
4779
4780         for (; i < sc->rx_ring_msix; ++i) {
4781                 struct igb_rx_ring *rxr = &sc->rx_rings[i];
4782                 struct igb_msix_data *msix;
4783
4784                 KKASSERT(x < sc->msix_cnt);
4785                 msix = &sc->msix_data[x++];
4786
4787                 rxr->rx_intr_bit = msix->msix_vector;
4788                 rxr->rx_intr_mask = msix->msix_mask;
4789
4790                 msix->msix_serialize = &rxr->rx_serialize;
4791                 msix->msix_func = igb_msix_rx;
4792                 msix->msix_arg = rxr;
4793
4794                 msix->msix_cpuid = i + offset;
4795                 KKASSERT(msix->msix_cpuid < ncpus2);
4796
4797                 ksnprintf(msix->msix_desc, sizeof(msix->msix_desc), "%s rx%d",
4798                     device_get_nameunit(sc->dev), i);
4799
4800                 msix->msix_rate = IGB_MSIX_RX_RATE;
4801                 ksnprintf(msix->msix_rate_desc, sizeof(msix->msix_rate_desc),
4802                     "RX%d interrupt rate", i);
4803         }
4804         *x0 = x;
4805 }
4806
4807 static void
4808 igb_msix_tx_conf(struct igb_softc *sc, int i, int *x0, int offset)
4809 {
4810         int x = *x0;
4811
4812         for (; i < sc->tx_ring_msix; ++i) {
4813                 struct igb_tx_ring *txr = &sc->tx_rings[i];
4814                 struct igb_msix_data *msix;
4815
4816                 KKASSERT(x < sc->msix_cnt);
4817                 msix = &sc->msix_data[x++];
4818
4819                 txr->tx_intr_bit = msix->msix_vector;
4820                 txr->tx_intr_mask = msix->msix_mask;
4821
4822                 msix->msix_serialize = &txr->tx_serialize;
4823                 msix->msix_func = igb_msix_tx;
4824                 msix->msix_arg = txr;
4825
4826                 msix->msix_cpuid = i + offset;
4827                 KKASSERT(msix->msix_cpuid < ncpus2);
4828                 txr->tx_intr_cpuid = msix->msix_cpuid;
4829
4830                 ksnprintf(msix->msix_desc, sizeof(msix->msix_desc), "%s tx%d",
4831                     device_get_nameunit(sc->dev), i);
4832
4833                 msix->msix_rate = IGB_MSIX_TX_RATE;
4834                 ksnprintf(msix->msix_rate_desc, sizeof(msix->msix_rate_desc),
4835                     "TX%d interrupt rate", i);
4836         }
4837         *x0 = x;
4838 }
4839
4840 static void
4841 igb_msix_rxtx(void *arg)
4842 {
4843         struct igb_msix_data *msix = arg;
4844         struct igb_rx_ring *rxr = msix->msix_rx;
4845         struct igb_tx_ring *txr = msix->msix_tx;
4846
4847         ASSERT_SERIALIZED(&msix->msix_serialize0);
4848
4849         lwkt_serialize_enter(&rxr->rx_serialize);
4850         igb_rxeof(rxr, -1);
4851         lwkt_serialize_exit(&rxr->rx_serialize);
4852
4853         lwkt_serialize_enter(&txr->tx_serialize);
4854         igb_txeof(txr);
4855         if (!ifsq_is_empty(txr->ifsq))
4856                 ifsq_devstart(txr->ifsq);
4857         lwkt_serialize_exit(&txr->tx_serialize);
4858
4859         E1000_WRITE_REG(&msix->msix_sc->hw, E1000_EIMS, msix->msix_mask);
4860 }