mxge: properly remove the sysctls
[dragonfly.git] / sys / dev / netif / mxge / if_mxge.c
CommitLineData
8892ea20
AE
1/******************************************************************************
2
3Copyright (c) 2006-2009, Myricom Inc.
4All rights reserved.
5
6Redistribution and use in source and binary forms, with or without
7modification, are permitted provided that the following conditions are met:
8
9 1. Redistributions of source code must retain the above copyright notice,
10 this list of conditions and the following disclaimer.
11
12 2. Neither the name of the Myricom Inc, nor the names of its
13 contributors may be used to endorse or promote products derived from
14 this software without specific prior written permission.
15
16THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26POSSIBILITY OF SUCH DAMAGE.
27
28***************************************************************************/
29
30#include <sys/cdefs.h>
b3535a6f 31/*__FBSDID("$FreeBSD: src/sys/dev/mxge/if_mxge.c,v 1.63 2009/06/26 11:45:06 rwatson Exp $");*/
8892ea20
AE
32
33#include <sys/param.h>
34#include <sys/systm.h>
35#include <sys/linker.h>
36#include <sys/firmware.h>
37#include <sys/endian.h>
05e71c89 38#include <sys/in_cksum.h>
8892ea20
AE
39#include <sys/sockio.h>
40#include <sys/mbuf.h>
41#include <sys/malloc.h>
8892ea20 42#include <sys/kernel.h>
8892ea20 43#include <sys/module.h>
2e8181d0 44#include <sys/serialize.h>
8892ea20
AE
45#include <sys/socket.h>
46#include <sys/sysctl.h>
8892ea20
AE
47
48/* count xmits ourselves, rather than via drbr */
49#define NO_SLOW_STATS
50#include <net/if.h>
51#include <net/if_arp.h>
f2f758df 52#include <net/ifq_var.h>
8892ea20
AE
53#include <net/ethernet.h>
54#include <net/if_dl.h>
55#include <net/if_media.h>
56
57#include <net/bpf.h>
58
59#include <net/if_types.h>
b3535a6f 60#include <net/vlan/if_vlan_var.h>
8892ea20
AE
61#include <net/zlib.h>
62
63#include <netinet/in_systm.h>
64#include <netinet/in.h>
65#include <netinet/ip.h>
66#include <netinet/tcp.h>
67
8892ea20
AE
68#include <sys/bus.h>
69#include <sys/rman.h>
8892ea20 70
b3535a6f
AE
71#include <bus/pci/pcireg.h>
72#include <bus/pci/pcivar.h>
73#include <bus/pci/pci_private.h> /* XXX for pci_cfg_restore */
8892ea20
AE
74
75#include <vm/vm.h> /* for pmap_mapdev() */
76#include <vm/pmap.h>
77
78#if defined(__i386) || defined(__amd64)
79#include <machine/specialreg.h>
80#endif
81
b3535a6f
AE
82#include <dev/netif/mxge/mxge_mcp.h>
83#include <dev/netif/mxge/mcp_gen_header.h>
8892ea20 84/*#define MXGE_FAKE_IFP*/
b3535a6f 85#include <dev/netif/mxge/if_mxge_var.h>
8892ea20
AE
86#ifdef IFNET_BUF_RING
87#include <sys/buf_ring.h>
88#endif
89
90#include "opt_inet.h"
91
/* tunable params */
/* zero makes mxge_enable_nvidia_ecrc() return without touching the bridge */
static int mxge_nvidia_ecrc_enable = 1;
/* 0: probe the host; 1: assume aligned completions; other: assume unaligned */
static int mxge_force_firmware = 0;
static int mxge_intr_coal_delay = 30;
static int mxge_deassert_wait = 1;
static int mxge_flow_control = 1;
/* nonzero enables additional informational device_printf() output */
static int mxge_verbose = 0;
static int mxge_lro_cnt = 8;
static int mxge_ticks;
static int mxge_max_slices = 1;
static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
/* nonzero makes mxge_change_promisc() always enable promiscuous mode */
static int mxge_always_promisc = 0;
static int mxge_initial_mtu = ETHERMTU_JUMBO;
/* firmware image names handed to the firmware loader via sc->fw_name */
static char *mxge_fw_unaligned = "mxge_ethp_z8e";
static char *mxge_fw_aligned = "mxge_eth_z8e";
static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";

/* newbus entry points and the interrupt handler */
static int mxge_probe(device_t dev);
static int mxge_attach(device_t dev);
static int mxge_detach(device_t dev);
static int mxge_shutdown(device_t dev);
static void mxge_intr(void *arg);

/* method table binding the device interface to this driver's handlers */
static device_method_t mxge_methods[] =
{
	/* Device interface */
	DEVMETHOD(device_probe, mxge_probe),
	DEVMETHOD(device_attach, mxge_attach),
	DEVMETHOD(device_detach, mxge_detach),
	DEVMETHOD(device_shutdown, mxge_shutdown),
	{0, 0}	/* table terminator */
};

/* driver description: name, method table and per-device softc size */
static driver_t mxge_driver =
{
	"mxge",
	mxge_methods,
	sizeof(mxge_softc_t),
};

static devclass_t mxge_devclass;

/* Declare ourselves to be a child of the PCI bus.*/
DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
MODULE_DEPEND(mxge, firmware, 1, 1, 1);
MODULE_DEPEND(mxge, zlib, 1, 1, 1);

/* forward declarations for helpers that are used before their definition */
static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
static int mxge_close(mxge_softc_t *sc);
static int mxge_open(mxge_softc_t *sc);
static void mxge_tick(void *arg);
145
87353c03
AE
146/* XXX: we don't have Large Receive Offload support yet */
/*
 * Large Receive Offload stub: every argument is ignored and the
 * function always returns 1.
 * NOTE(review): presumably a nonzero return tells the caller that the
 * mbuf was not consumed by LRO -- confirm against the receive path.
 */
inline int
mxge_lro_rx(struct mxge_slice_state *ss, struct mbuf *m_head, uint32_t csum)
{
	(void)ss;
	(void)m_head;
	(void)csum;
	return 1;
}
155
/*
 * Large Receive Offload stub: does nothing; both arguments are only
 * referenced to silence unused-parameter warnings.
 */
inline void
mxge_lro_flush(struct mxge_slice_state *ss, struct lro_entry *lro)
{
	(void)ss;
	(void)lro;
}
162
8892ea20
AE
163static int
164mxge_probe(device_t dev)
165{
166 int rev;
167
168
169 if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
170 ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
171 (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
172 rev = pci_get_revid(dev);
173 switch (rev) {
174 case MXGE_PCI_REV_Z8E:
175 device_set_desc(dev, "Myri10G-PCIE-8A");
176 break;
177 case MXGE_PCI_REV_Z8ES:
178 device_set_desc(dev, "Myri10G-PCIE-8B");
179 break;
180 default:
181 device_set_desc(dev, "Myri10G-PCIE-8??");
182 device_printf(dev, "Unrecognized rev %d NIC\n",
183 rev);
184 break;
185 }
186 return 0;
187 }
188 return ENXIO;
189}
190
/*
 * Record whether the NIC SRAM mapping uses write combining.
 *
 * The code that would switch the mapping to write combining through
 * pmap_change_attr() is compiled out (#if 0), so this currently only
 * stores 0 in sc->wc.
 */
static void
mxge_enable_wc(mxge_softc_t *sc)
{
#if 0
#if defined(__i386) || defined(__amd64)
	vm_offset_t len;
	int err;

	sc->wc = 1;
	len = rman_get_size(sc->mem_res);
	err = pmap_change_attr((vm_offset_t) sc->sram,
			       len, PAT_WRITE_COMBINING);
	if (err != 0) {
		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
			      err);
		sc->wc = 0;
	}
#endif
#else
	sc->wc = 0;	/* TBD: PAT support */
#endif
}
213
214
215/* callback to get our DMA address */
216static void
217mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
218 int error)
219{
220 if (error == 0) {
221 *(bus_addr_t *) arg = segs->ds_addr;
222 }
223}
224
225static int
226mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
227 bus_size_t alignment)
228{
229 int err;
230 device_t dev = sc->dev;
231 bus_size_t boundary, maxsegsize;
232
233 if (bytes > 4096 && alignment == 4096) {
234 boundary = 0;
235 maxsegsize = bytes;
236 } else {
237 boundary = 4096;
238 maxsegsize = 4096;
239 }
240
241 /* allocate DMAable memory tags */
242 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
243 alignment, /* alignment */
244 boundary, /* boundary */
245 BUS_SPACE_MAXADDR, /* low */
246 BUS_SPACE_MAXADDR, /* high */
247 NULL, NULL, /* filter */
248 bytes, /* maxsize */
249 1, /* num segs */
250 maxsegsize, /* maxsegsize */
251 BUS_DMA_COHERENT, /* flags */
8892ea20
AE
252 &dma->dmat); /* tag */
253 if (err != 0) {
254 device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
255 return err;
256 }
257
258 /* allocate DMAable memory & map */
259 err = bus_dmamem_alloc(dma->dmat, &dma->addr,
260 (BUS_DMA_WAITOK | BUS_DMA_COHERENT
261 | BUS_DMA_ZERO), &dma->map);
262 if (err != 0) {
263 device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
264 goto abort_with_dmat;
265 }
266
267 /* load the memory */
268 err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
269 mxge_dmamap_callback,
270 (void *)&dma->bus_addr, 0);
271 if (err != 0) {
272 device_printf(dev, "couldn't load map (err = %d)\n", err);
273 goto abort_with_mem;
274 }
275 return 0;
276
277abort_with_mem:
278 bus_dmamem_free(dma->dmat, dma->addr, dma->map);
279abort_with_dmat:
280 (void)bus_dma_tag_destroy(dma->dmat);
281 return err;
282}
283
284
285static void
286mxge_dma_free(mxge_dma_t *dma)
287{
288 bus_dmamap_unload(dma->dmat, dma->map);
289 bus_dmamem_free(dma->dmat, dma->addr, dma->map);
290 (void)bus_dma_tag_destroy(dma->dmat);
291}
292
293/*
294 * The eeprom strings on the lanaiX have the format
295 * SN=x\0
296 * MAC=x:x:x:x:x:x\0
297 * PC=text\0
298 */
299
300static int
301mxge_parse_strings(mxge_softc_t *sc)
302{
303#define MXGE_NEXT_STRING(p) while(ptr < limit && *ptr++)
304
305 char *ptr, *limit;
306 int i, found_mac;
307
308 ptr = sc->eeprom_strings;
309 limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
310 found_mac = 0;
311 while (ptr < limit && *ptr != '\0') {
312 if (memcmp(ptr, "MAC=", 4) == 0) {
313 ptr += 1;
314 sc->mac_addr_string = ptr;
315 for (i = 0; i < 6; i++) {
316 ptr += 3;
317 if ((ptr + 2) > limit)
318 goto abort;
319 sc->mac_addr[i] = strtoul(ptr, NULL, 16);
320 found_mac = 1;
321 }
322 } else if (memcmp(ptr, "PC=", 3) == 0) {
323 ptr += 3;
324 strncpy(sc->product_code_string, ptr,
325 sizeof (sc->product_code_string) - 1);
326 } else if (memcmp(ptr, "SN=", 3) == 0) {
327 ptr += 3;
328 strncpy(sc->serial_number_string, ptr,
329 sizeof (sc->serial_number_string) - 1);
330 }
331 MXGE_NEXT_STRING(ptr);
332 }
333
334 if (found_mac)
335 return 0;
336
337 abort:
338 device_printf(sc->dev, "failed to parse eeprom_strings\n");
339
340 return ENXIO;
341}
342
343#if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
344static void
345mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
346{
347 uint32_t val;
348 unsigned long base, off;
349 char *va, *cfgptr;
350 device_t pdev, mcp55;
351 uint16_t vendor_id, device_id, word;
352 uintptr_t bus, slot, func, ivend, idev;
353 uint32_t *ptr32;
354
355
356 if (!mxge_nvidia_ecrc_enable)
357 return;
358
359 pdev = device_get_parent(device_get_parent(sc->dev));
360 if (pdev == NULL) {
361 device_printf(sc->dev, "could not find parent?\n");
362 return;
363 }
364 vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
365 device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
366
367 if (vendor_id != 0x10de)
368 return;
369
370 base = 0;
371
372 if (device_id == 0x005d) {
373 /* ck804, base address is magic */
374 base = 0xe0000000UL;
375 } else if (device_id >= 0x0374 && device_id <= 0x378) {
376 /* mcp55, base address stored in chipset */
377 mcp55 = pci_find_bsf(0, 0, 0);
378 if (mcp55 &&
379 0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
380 0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
381 word = pci_read_config(mcp55, 0x90, 2);
382 base = ((unsigned long)word & 0x7ffeU) << 25;
383 }
384 }
385 if (!base)
386 return;
387
388 /* XXXX
389 Test below is commented because it is believed that doing
390 config read/write beyond 0xff will access the config space
391 for the next larger function. Uncomment this and remove
392 the hacky pmap_mapdev() way of accessing config space when
393 FreeBSD grows support for extended pcie config space access
394 */
395#if 0
396 /* See if we can, by some miracle, access the extended
397 config space */
398 val = pci_read_config(pdev, 0x178, 4);
399 if (val != 0xffffffff) {
400 val |= 0x40;
401 pci_write_config(pdev, 0x178, val, 4);
402 return;
403 }
404#endif
405 /* Rather than using normal pci config space writes, we must
406 * map the Nvidia config space ourselves. This is because on
407 * opteron/nvidia class machine the 0xe000000 mapping is
408 * handled by the nvidia chipset, that means the internal PCI
409 * device (the on-chip northbridge), or the amd-8131 bridge
410 * and things behind them are not visible by this method.
411 */
412
413 BUS_READ_IVAR(device_get_parent(pdev), pdev,
414 PCI_IVAR_BUS, &bus);
415 BUS_READ_IVAR(device_get_parent(pdev), pdev,
416 PCI_IVAR_SLOT, &slot);
417 BUS_READ_IVAR(device_get_parent(pdev), pdev,
418 PCI_IVAR_FUNCTION, &func);
419 BUS_READ_IVAR(device_get_parent(pdev), pdev,
420 PCI_IVAR_VENDOR, &ivend);
421 BUS_READ_IVAR(device_get_parent(pdev), pdev,
422 PCI_IVAR_DEVICE, &idev);
423
424 off = base
425 + 0x00100000UL * (unsigned long)bus
426 + 0x00001000UL * (unsigned long)(func
427 + 8 * slot);
428
429 /* map it into the kernel */
430 va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
431
432
433 if (va == NULL) {
434 device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
435 return;
436 }
437 /* get a pointer to the config space mapped into the kernel */
438 cfgptr = va + (off & PAGE_MASK);
439
440 /* make sure that we can really access it */
441 vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
442 device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
443 if (! (vendor_id == ivend && device_id == idev)) {
444 device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
445 vendor_id, device_id);
446 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
447 return;
448 }
449
450 ptr32 = (uint32_t*)(cfgptr + 0x178);
451 val = *ptr32;
452
453 if (val == 0xffffffff) {
454 device_printf(sc->dev, "extended mapping failed\n");
455 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
456 return;
457 }
458 *ptr32 = val | 0x40;
459 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
460 if (mxge_verbose)
461 device_printf(sc->dev,
462 "Enabled ECRC on upstream Nvidia bridge "
463 "at %d:%d:%d\n",
464 (int)bus, (int)slot, (int)func);
465 return;
466}
467#else
468static void
469mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
470{
471 device_printf(sc->dev,
472 "Nforce 4 chipset on non-x86/amd64!?!?!\n");
473 return;
474}
475#endif
476
477
478static int
479mxge_dma_test(mxge_softc_t *sc, int test_type)
480{
481 mxge_cmd_t cmd;
482 bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
483 int status;
484 uint32_t len;
485 char *test = " ";
486
487
488 /* Run a small DMA test.
489 * The magic multipliers to the length tell the firmware
490 * to do DMA read, write, or read+write tests. The
491 * results are returned in cmd.data0. The upper 16
492 * bits of the return is the number of transfers completed.
493 * The lower 16 bits is the time in 0.5us ticks that the
494 * transfers took to complete.
495 */
496
497 len = sc->tx_boundary;
498
499 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
500 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
501 cmd.data2 = len * 0x10000;
502 status = mxge_send_cmd(sc, test_type, &cmd);
503 if (status != 0) {
504 test = "read";
505 goto abort;
506 }
507 sc->read_dma = ((cmd.data0>>16) * len * 2) /
508 (cmd.data0 & 0xffff);
509 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
510 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
511 cmd.data2 = len * 0x1;
512 status = mxge_send_cmd(sc, test_type, &cmd);
513 if (status != 0) {
514 test = "write";
515 goto abort;
516 }
517 sc->write_dma = ((cmd.data0>>16) * len * 2) /
518 (cmd.data0 & 0xffff);
519
520 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
521 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
522 cmd.data2 = len * 0x10001;
523 status = mxge_send_cmd(sc, test_type, &cmd);
524 if (status != 0) {
525 test = "read/write";
526 goto abort;
527 }
528 sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
529 (cmd.data0 & 0xffff);
530
531abort:
532 if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
533 device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
534 test, status);
535
536 return status;
537}
538
539/*
540 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
541 * when the PCI-E Completion packets are aligned on an 8-byte
542 * boundary. Some PCI-E chip sets always align Completion packets; on
543 * the ones that do not, the alignment can be enforced by enabling
544 * ECRC generation (if supported).
545 *
546 * When PCI-E Completion packets are not aligned, it is actually more
547 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
548 *
549 * If the driver can neither enable ECRC nor verify that it has
550 * already been enabled, then it must use a firmware image which works
551 * around unaligned completion packets (ethp_z8e.dat), and it should
552 * also ensure that it never gives the device a Read-DMA which is
553 * larger than 2KB by setting the tx_boundary to 2KB. If ECRC is
554 * enabled, then the driver should use the aligned (eth_z8e.dat)
555 * firmware image, and set tx_boundary to 4KB.
556 */
557
558static int
559mxge_firmware_probe(mxge_softc_t *sc)
560{
561 device_t dev = sc->dev;
562 int reg, status;
563 uint16_t pectl;
564
565 sc->tx_boundary = 4096;
566 /*
567 * Verify the max read request size was set to 4KB
568 * before trying the test with 4KB.
569 */
570 if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
571 pectl = pci_read_config(dev, reg + 0x8, 2);
572 if ((pectl & (5 << 12)) != (5 << 12)) {
573 device_printf(dev, "Max Read Req. size != 4k (0x%x\n",
574 pectl);
575 sc->tx_boundary = 2048;
576 }
577 }
578
579 /*
580 * load the optimized firmware (which assumes aligned PCIe
581 * completions) in order to see if it works on this host.
582 */
583 sc->fw_name = mxge_fw_aligned;
584 status = mxge_load_firmware(sc, 1);
585 if (status != 0) {
586 return status;
587 }
588
589 /*
590 * Enable ECRC if possible
591 */
592 mxge_enable_nvidia_ecrc(sc);
593
594 /*
595 * Run a DMA test which watches for unaligned completions and
596 * aborts on the first one seen.
597 */
598
599 status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
600 if (status == 0)
601 return 0; /* keep the aligned firmware */
602
603 if (status != E2BIG)
604 device_printf(dev, "DMA test failed: %d\n", status);
605 if (status == ENOSYS)
606 device_printf(dev, "Falling back to ethp! "
607 "Please install up to date fw\n");
608 return status;
609}
610
611static int
612mxge_select_firmware(mxge_softc_t *sc)
613{
614 int aligned = 0;
615
616
617 if (mxge_force_firmware != 0) {
618 if (mxge_force_firmware == 1)
619 aligned = 1;
620 else
621 aligned = 0;
622 if (mxge_verbose)
623 device_printf(sc->dev,
624 "Assuming %s completions (forced)\n",
625 aligned ? "aligned" : "unaligned");
626 goto abort;
627 }
628
629 /* if the PCIe link width is 4 or less, we can use the aligned
630 firmware and skip any checks */
631 if (sc->link_width != 0 && sc->link_width <= 4) {
632 device_printf(sc->dev,
633 "PCIe x%d Link, expect reduced performance\n",
634 sc->link_width);
635 aligned = 1;
636 goto abort;
637 }
638
639 if (0 == mxge_firmware_probe(sc))
640 return 0;
641
642abort:
643 if (aligned) {
644 sc->fw_name = mxge_fw_aligned;
645 sc->tx_boundary = 4096;
646 } else {
647 sc->fw_name = mxge_fw_unaligned;
648 sc->tx_boundary = 2048;
649 }
650 return (mxge_load_firmware(sc, 0));
651}
652
/*
 * Overlay of a const and a non-const char pointer.
 * NOTE(review): presumably used to drop a const qualifier without a
 * cast -- no user is visible in this part of the file.
 */
union qualhack
{
	const char *ro_char;
	char *rw_char;
};
658
659static int
660mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
661{
662
663
664 if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
665 device_printf(sc->dev, "Bad firmware type: 0x%x\n",
666 be32toh(hdr->mcp_type));
667 return EIO;
668 }
669
670 /* save firmware version for sysctl */
671 strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
672 if (mxge_verbose)
673 device_printf(sc->dev, "firmware id: %s\n", hdr->version);
674
b6670ba0 675 ksscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
8892ea20
AE
676 &sc->fw_ver_minor, &sc->fw_ver_tiny);
677
678 if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
679 && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
680 device_printf(sc->dev, "Found firmware version %s\n",
681 sc->fw_version);
682 device_printf(sc->dev, "Driver needs %d.%d\n",
683 MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
684 return EINVAL;
685 }
686 return 0;
687
688}
689
#if 0
/*
 * zlib allocation hooks, compiled out together with the firmware
 * decompression code that used them.
 */

/* zlib allocator: items * size bytes from M_TEMP, without sleeping */
static void *
z_alloc(void *nil, u_int items, u_int size)
{
	void *ptr;

	ptr = kmalloc(items * size, M_TEMP, M_NOWAIT);
	return ptr;
}

/* zlib deallocator matching z_alloc() */
static void
z_free(void *nil, void *ptr)
{
	kfree(ptr, M_TEMP);
}
#endif
8892ea20
AE
706
707static int
708mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
709{
e16aed9b 710 struct fw_image *fw;
8892ea20
AE
711 const mcp_gen_header_t *hdr;
712 unsigned hdr_offset;
713 int status;
714 unsigned int i;
715 char dummy;
716 size_t fw_len;
717
e16aed9b 718 fw = firmware_image_load(sc->fw_name, NULL);
8892ea20
AE
719 if (fw == NULL) {
720 device_printf(sc->dev, "Could not find firmware image %s\n",
721 sc->fw_name);
722 return ENOENT;
723 }
e16aed9b 724#if 0
8892ea20
AE
725 /* setup zlib and decompress f/w */
726 bzero(&zs, sizeof (zs));
727 zs.zalloc = z_alloc;
728 zs.zfree = z_free;
729 status = inflateInit(&zs);
730 if (status != Z_OK) {
731 status = EIO;
732 goto abort_with_fw;
733 }
734
735 /* the uncompressed size is stored as the firmware version,
736 which would otherwise go unused */
737 fw_len = (size_t) fw->version;
d777b84f 738 inflate_buffer = kmalloc(fw_len, M_TEMP, M_NOWAIT);
8892ea20
AE
739 if (inflate_buffer == NULL)
740 goto abort_with_zs;
741 zs.avail_in = fw->datasize;
742 zs.next_in = __DECONST(char *, fw->data);
743 zs.avail_out = fw_len;
744 zs.next_out = inflate_buffer;
745 status = inflate(&zs, Z_FINISH);
746 if (status != Z_STREAM_END) {
747 device_printf(sc->dev, "zlib %d\n", status);
748 status = EIO;
749 goto abort_with_buffer;
750 }
e16aed9b 751#endif
d281a118 752 fw_len = fw->fw_imglen;
8892ea20
AE
753 /* check id */
754 hdr_offset = htobe32(*(const uint32_t *)
e16aed9b 755 (fw->fw_image + MCP_HEADER_PTR_OFFSET));
8892ea20
AE
756 if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
757 device_printf(sc->dev, "Bad firmware file");
758 status = EIO;
e16aed9b 759 goto abort_with_fw;
8892ea20 760 }
e16aed9b 761 hdr = (const void*)(fw->fw_image + hdr_offset);
8892ea20
AE
762
763 status = mxge_validate_firmware(sc, hdr);
764 if (status != 0)
e16aed9b 765 goto abort_with_fw;
8892ea20
AE
766
767 /* Copy the inflated firmware to NIC SRAM. */
768 for (i = 0; i < fw_len; i += 256) {
769 mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
e16aed9b 770 fw->fw_image + i,
8892ea20
AE
771 min(256U, (unsigned)(fw_len - i)));
772 wmb();
773 dummy = *sc->sram;
774 wmb();
775 }
776
777 *limit = fw_len;
778 status = 0;
e16aed9b 779#if 0
8892ea20 780abort_with_buffer:
d777b84f 781 kfree(inflate_buffer, M_TEMP);
8892ea20
AE
782abort_with_zs:
783 inflateEnd(&zs);
e16aed9b 784#endif
8892ea20 785abort_with_fw:
e16aed9b 786 firmware_image_unload(fw);
8892ea20
AE
787 return status;
788}
789
790/*
791 * Enable or disable periodic RDMAs from the host to make certain
792 * chipsets resend dropped PCIe messages
793 */
794
795static void
796mxge_dummy_rdma(mxge_softc_t *sc, int enable)
797{
798 char buf_bytes[72];
799 volatile uint32_t *confirm;
800 volatile char *submit;
801 uint32_t *buf, dma_low, dma_high;
802 int i;
803
804 buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
805
806 /* clear confirmation addr */
807 confirm = (volatile uint32_t *)sc->cmd;
808 *confirm = 0;
809 wmb();
810
811 /* send an rdma command to the PCIe engine, and wait for the
812 response in the confirmation address. The firmware should
813 write a -1 there to indicate it is alive and well
814 */
815
816 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
817 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
818 buf[0] = htobe32(dma_high); /* confirm addr MSW */
819 buf[1] = htobe32(dma_low); /* confirm addr LSW */
820 buf[2] = htobe32(0xffffffff); /* confirm data */
821 dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
822 dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
823 buf[3] = htobe32(dma_high); /* dummy addr MSW */
824 buf[4] = htobe32(dma_low); /* dummy addr LSW */
825 buf[5] = htobe32(enable); /* enable? */
826
827
828 submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
829
830 mxge_pio_copy(submit, buf, 64);
831 wmb();
832 DELAY(1000);
833 wmb();
834 i = 0;
835 while (*confirm != 0xffffffff && i < 20) {
836 DELAY(1000);
837 i++;
838 }
839 if (*confirm != 0xffffffff) {
840 device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
841 (enable ? "enable" : "disable"), confirm,
842 *confirm);
843 }
844 return;
845}
846
847static int
848mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
849{
850 mcp_cmd_t *buf;
851 char buf_bytes[sizeof(*buf) + 8];
852 volatile mcp_cmd_response_t *response = sc->cmd;
853 volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
854 uint32_t dma_low, dma_high;
855 int err, sleep_total = 0;
856
cd0543ff
AE
857 /*
858 * We may be called during attach, before if_serializer is available.
859 * This is not a fast path, just check for NULL
860 */
861
862 if (sc->ifp->if_serializer)
863 ASSERT_SERIALIZED(sc->ifp->if_serializer);
864
8892ea20
AE
865 /* ensure buf is aligned to 8 bytes */
866 buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
867
868 buf->data0 = htobe32(data->data0);
869 buf->data1 = htobe32(data->data1);
870 buf->data2 = htobe32(data->data2);
871 buf->cmd = htobe32(cmd);
872 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
873 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
874
875 buf->response_addr.low = htobe32(dma_low);
876 buf->response_addr.high = htobe32(dma_high);
2e8181d0 877
2e8181d0 878
8892ea20
AE
879 response->result = 0xffffffff;
880 wmb();
881 mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
882
883 /* wait up to 20ms */
884 err = EAGAIN;
885 for (sleep_total = 0; sleep_total < 20; sleep_total++) {
886 bus_dmamap_sync(sc->cmd_dma.dmat,
887 sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
888 wmb();
889 switch (be32toh(response->result)) {
890 case 0:
891 data->data0 = be32toh(response->data);
892 err = 0;
893 break;
894 case 0xffffffff:
895 DELAY(1000);
896 break;
897 case MXGEFW_CMD_UNKNOWN:
898 err = ENOSYS;
899 break;
900 case MXGEFW_CMD_ERROR_UNALIGNED:
901 err = E2BIG;
902 break;
903 case MXGEFW_CMD_ERROR_BUSY:
904 err = EBUSY;
905 break;
906 default:
907 device_printf(sc->dev,
908 "mxge: command %d "
909 "failed, result = %d\n",
910 cmd, be32toh(response->result));
911 err = ENXIO;
912 break;
913 }
914 if (err != EAGAIN)
915 break;
916 }
917 if (err == EAGAIN)
918 device_printf(sc->dev, "mxge: command %d timed out"
919 "result = %d\n",
920 cmd, be32toh(response->result));
8892ea20
AE
921 return err;
922}
923
924static int
925mxge_adopt_running_firmware(mxge_softc_t *sc)
926{
927 struct mcp_gen_header *hdr;
928 const size_t bytes = sizeof (struct mcp_gen_header);
929 size_t hdr_offset;
930 int status;
931
932 /* find running firmware header */
933 hdr_offset = htobe32(*(volatile uint32_t *)
934 (sc->sram + MCP_HEADER_PTR_OFFSET));
935
936 if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
937 device_printf(sc->dev,
938 "Running firmware has bad header offset (%d)\n",
939 (int)hdr_offset);
940 return EIO;
941 }
942
943 /* copy header of running firmware from SRAM to host memory to
944 * validate firmware */
d777b84f 945 hdr = kmalloc(bytes, M_DEVBUF, M_NOWAIT);
8892ea20 946 if (hdr == NULL) {
d777b84f 947 device_printf(sc->dev, "could not kmalloc firmware hdr\n");
8892ea20
AE
948 return ENOMEM;
949 }
950 bus_space_read_region_1(rman_get_bustag(sc->mem_res),
951 rman_get_bushandle(sc->mem_res),
952 hdr_offset, (char *)hdr, bytes);
953 status = mxge_validate_firmware(sc, hdr);
d777b84f 954 kfree(hdr, M_DEVBUF);
8892ea20
AE
955
956 /*
957 * check to see if adopted firmware has bug where adopting
958 * it will cause broadcasts to be filtered unless the NIC
959 * is kept in ALLMULTI mode
960 */
961 if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
962 sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
963 sc->adopted_rx_filter_bug = 1;
964 device_printf(sc->dev, "Adopting fw %d.%d.%d: "
965 "working around rx filter bug\n",
966 sc->fw_ver_major, sc->fw_ver_minor,
967 sc->fw_ver_tiny);
968 }
969
970 return status;
971}
972
973
974static int
975mxge_load_firmware(mxge_softc_t *sc, int adopt)
976{
977 volatile uint32_t *confirm;
978 volatile char *submit;
979 char buf_bytes[72];
980 uint32_t *buf, size, dma_low, dma_high;
981 int status, i;
982
983 buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
984
985 size = sc->sram_size;
986 status = mxge_load_firmware_helper(sc, &size);
987 if (status) {
988 if (!adopt)
989 return status;
990 /* Try to use the currently running firmware, if
991 it is new enough */
992 status = mxge_adopt_running_firmware(sc);
993 if (status) {
994 device_printf(sc->dev,
995 "failed to adopt running firmware\n");
996 return status;
997 }
998 device_printf(sc->dev,
999 "Successfully adopted running firmware\n");
1000 if (sc->tx_boundary == 4096) {
1001 device_printf(sc->dev,
1002 "Using firmware currently running on NIC"
1003 ". For optimal\n");
1004 device_printf(sc->dev,
1005 "performance consider loading optimized "
1006 "firmware\n");
1007 }
1008 sc->fw_name = mxge_fw_unaligned;
1009 sc->tx_boundary = 2048;
1010 return 0;
1011 }
1012 /* clear confirmation addr */
1013 confirm = (volatile uint32_t *)sc->cmd;
1014 *confirm = 0;
1015 wmb();
1016 /* send a reload command to the bootstrap MCP, and wait for the
1017 response in the confirmation address. The firmware should
1018 write a -1 there to indicate it is alive and well
1019 */
1020
1021 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
1022 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
1023
1024 buf[0] = htobe32(dma_high); /* confirm addr MSW */
1025 buf[1] = htobe32(dma_low); /* confirm addr LSW */
1026 buf[2] = htobe32(0xffffffff); /* confirm data */
1027
1028 /* FIX: All newest firmware should un-protect the bottom of
1029 the sram before handoff. However, the very first interfaces
1030 do not. Therefore the handoff copy must skip the first 8 bytes
1031 */
1032 /* where the code starts*/
1033 buf[3] = htobe32(MXGE_FW_OFFSET + 8);
1034 buf[4] = htobe32(size - 8); /* length of code */
1035 buf[5] = htobe32(8); /* where to copy to */
1036 buf[6] = htobe32(0); /* where to jump to */
1037
1038 submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
1039 mxge_pio_copy(submit, buf, 64);
1040 wmb();
1041 DELAY(1000);
1042 wmb();
1043 i = 0;
1044 while (*confirm != 0xffffffff && i < 20) {
1045 DELAY(1000*10);
1046 i++;
1047 bus_dmamap_sync(sc->cmd_dma.dmat,
1048 sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
1049 }
1050 if (*confirm != 0xffffffff) {
1051 device_printf(sc->dev,"handoff failed (%p = 0x%x)",
1052 confirm, *confirm);
1053
1054 return ENXIO;
1055 }
1056 return 0;
1057}
1058
1059static int
1060mxge_update_mac_address(mxge_softc_t *sc)
1061{
1062 mxge_cmd_t cmd;
1063 uint8_t *addr = sc->mac_addr;
1064 int status;
1065
1066
1067 cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1068 | (addr[2] << 8) | addr[3]);
1069
1070 cmd.data1 = ((addr[4] << 8) | (addr[5]));
1071
1072 status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
1073 return status;
1074}
1075
1076static int
1077mxge_change_pause(mxge_softc_t *sc, int pause)
1078{
1079 mxge_cmd_t cmd;
1080 int status;
1081
1082 if (pause)
1083 status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1084 &cmd);
1085 else
1086 status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1087 &cmd);
1088
1089 if (status) {
1090 device_printf(sc->dev, "Failed to set flow control mode\n");
1091 return ENXIO;
1092 }
1093 sc->pause = pause;
1094 return 0;
1095}
1096
1097static void
1098mxge_change_promisc(mxge_softc_t *sc, int promisc)
1099{
1100 mxge_cmd_t cmd;
1101 int status;
1102
cd0543ff
AE
1103 if( sc->ifp->if_serializer)
1104 ASSERT_SERIALIZED(sc->ifp->if_serializer);
8892ea20
AE
1105 if (mxge_always_promisc)
1106 promisc = 1;
1107
1108 if (promisc)
1109 status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1110 &cmd);
1111 else
1112 status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1113 &cmd);
1114
1115 if (status) {
1116 device_printf(sc->dev, "Failed to set promisc mode\n");
1117 }
1118}
1119
1120static void
1121mxge_set_multicast_list(mxge_softc_t *sc)
1122{
1123 mxge_cmd_t cmd;
1124 struct ifmultiaddr *ifma;
1125 struct ifnet *ifp = sc->ifp;
1126 int err;
1127
cd0543ff
AE
1128 if (ifp->if_serializer)
1129 ASSERT_SERIALIZED(ifp->if_serializer);
1130
8892ea20
AE
1131 /* This firmware is known to not support multicast */
1132 if (!sc->fw_multicast_support)
1133 return;
1134
1135 /* Disable multicast filtering while we play with the lists*/
1136 err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1137 if (err != 0) {
1138 device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
1139 " error status: %d\n", err);
1140 return;
1141 }
1142
1143 if (sc->adopted_rx_filter_bug)
1144 return;
1145
1146 if (ifp->if_flags & IFF_ALLMULTI)
1147 /* request to disable multicast filtering, so quit here */
1148 return;
1149
1150 /* Flush all the filters */
1151
1152 err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1153 if (err != 0) {
1154 device_printf(sc->dev,
1155 "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
1156 ", error status: %d\n", err);
1157 return;
1158 }
1159
1160 /* Walk the multicast list, and add each address */
1161
b915556e 1162 LIST_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
8892ea20
AE
1163 if (ifma->ifma_addr->sa_family != AF_LINK)
1164 continue;
1165 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1166 &cmd.data0, 4);
1167 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
1168 &cmd.data1, 2);
1169 cmd.data0 = htonl(cmd.data0);
1170 cmd.data1 = htonl(cmd.data1);
1171 err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1172 if (err != 0) {
1173 device_printf(sc->dev, "Failed "
1174 "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
1175 "%d\t", err);
1176 /* abort, leaving multicast filtering off */
8892ea20
AE
1177 return;
1178 }
1179 }
8892ea20
AE
1180 /* Enable multicast filtering */
1181 err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1182 if (err != 0) {
1183 device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
1184 ", error status: %d\n", err);
1185 }
1186}
1187
1188static int
1189mxge_max_mtu(mxge_softc_t *sc)
1190{
1191 mxge_cmd_t cmd;
1192 int status;
1193
1194 if (MJUMPAGESIZE - MXGEFW_PAD > MXGEFW_MAX_MTU)
1195 return MXGEFW_MAX_MTU - MXGEFW_PAD;
1196
1197 /* try to set nbufs to see if it we can
1198 use virtually contiguous jumbos */
1199 cmd.data0 = 0;
1200 status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1201 &cmd);
1202 if (status == 0)
1203 return MXGEFW_MAX_MTU - MXGEFW_PAD;
1204
1205 /* otherwise, we're limited to MJUMPAGESIZE */
1206 return MJUMPAGESIZE - MXGEFW_PAD;
1207}
1208
1209static int
1210mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1211{
1212 struct mxge_slice_state *ss;
1213 mxge_rx_done_t *rx_done;
1214 volatile uint32_t *irq_claim;
1215 mxge_cmd_t cmd;
1216 int slice, status;
1217
1218 /* try to send a reset command to the card to see if it
1219 is alive */
1220 memset(&cmd, 0, sizeof (cmd));
1221 status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1222 if (status != 0) {
1223 device_printf(sc->dev, "failed reset\n");
1224 return ENXIO;
1225 }
1226
1227 mxge_dummy_rdma(sc, 1);
1228
1229
1230 /* set the intrq size */
1231 cmd.data0 = sc->rx_ring_size;
1232 status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1233
1234 /*
1235 * Even though we already know how many slices are supported
1236 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
1237 * has magic side effects, and must be called after a reset.
1238 * It must be called prior to calling any RSS related cmds,
1239 * including assigning an interrupt queue for anything but
1240 * slice 0. It must also be called *after*
1241 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1242 * the firmware to compute offsets.
1243 */
1244
1245 if (sc->num_slices > 1) {
1246 /* ask the maximum number of slices it supports */
1247 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
1248 &cmd);
1249 if (status != 0) {
1250 device_printf(sc->dev,
1251 "failed to get number of slices\n");
1252 return status;
1253 }
1254 /*
1255 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1256 * to setting up the interrupt queue DMA
1257 */
1258 cmd.data0 = sc->num_slices;
1259 cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
1260#ifdef IFNET_BUF_RING
1261 cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1262#endif
1263 status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
1264 &cmd);
1265 if (status != 0) {
1266 device_printf(sc->dev,
1267 "failed to set number of slices\n");
1268 return status;
1269 }
1270 }
1271
1272
1273 if (interrupts_setup) {
1274 /* Now exchange information about interrupts */
1275 for (slice = 0; slice < sc->num_slices; slice++) {
1276 rx_done = &sc->ss[slice].rx_done;
1277 memset(rx_done->entry, 0, sc->rx_ring_size);
1278 cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
1279 cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
1280 cmd.data2 = slice;
1281 status |= mxge_send_cmd(sc,
1282 MXGEFW_CMD_SET_INTRQ_DMA,
1283 &cmd);
1284 }
1285 }
1286
1287 status |= mxge_send_cmd(sc,
1288 MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
1289
1290
1291 sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1292
1293 status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1294 irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1295
1296
1297 status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
1298 &cmd);
1299 sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1300 if (status != 0) {
1301 device_printf(sc->dev, "failed set interrupt parameters\n");
1302 return status;
1303 }
1304
1305
1306 *sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1307
1308
1309 /* run a DMA benchmark */
1310 (void) mxge_dma_test(sc, MXGEFW_DMA_TEST);
1311
1312 for (slice = 0; slice < sc->num_slices; slice++) {
1313 ss = &sc->ss[slice];
1314
1315 ss->irq_claim = irq_claim + (2 * slice);
1316 /* reset mcp/driver shared state back to 0 */
1317 ss->rx_done.idx = 0;
1318 ss->rx_done.cnt = 0;
1319 ss->tx.req = 0;
1320 ss->tx.done = 0;
1321 ss->tx.pkt_done = 0;
1322 ss->tx.queue_active = 0;
1323 ss->tx.activate = 0;
1324 ss->tx.deactivate = 0;
1325 ss->tx.wake = 0;
1326 ss->tx.defrag = 0;
1327 ss->tx.stall = 0;
1328 ss->rx_big.cnt = 0;
1329 ss->rx_small.cnt = 0;
1330 ss->lro_bad_csum = 0;
1331 ss->lro_queued = 0;
1332 ss->lro_flushed = 0;
1333 if (ss->fw_stats != NULL) {
1334 ss->fw_stats->valid = 0;
1335 ss->fw_stats->send_done_count = 0;
1336 }
1337 }
1338 sc->rdma_tags_available = 15;
1339 status = mxge_update_mac_address(sc);
1340 mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
1341 mxge_change_pause(sc, sc->pause);
1342 mxge_set_multicast_list(sc);
1343 return status;
1344}
1345
1346static int
1347mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1348{
1349 mxge_softc_t *sc;
1350 unsigned int intr_coal_delay;
1351 int err;
1352
1353 sc = arg1;
1354 intr_coal_delay = sc->intr_coal_delay;
1355 err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1356 if (err != 0) {
1357 return err;
1358 }
1359 if (intr_coal_delay == sc->intr_coal_delay)
1360 return 0;
1361
1362 if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1363 return EINVAL;
1364
2e8181d0 1365 lwkt_serialize_enter(sc->ifp->if_serializer);
8892ea20
AE
1366 *sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1367 sc->intr_coal_delay = intr_coal_delay;
2e8181d0
AE
1368
1369 lwkt_serialize_exit(sc->ifp->if_serializer);
8892ea20
AE
1370 return err;
1371}
1372
1373static int
1374mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1375{
1376 mxge_softc_t *sc;
1377 unsigned int enabled;
1378 int err;
1379
1380 sc = arg1;
1381 enabled = sc->pause;
1382 err = sysctl_handle_int(oidp, &enabled, arg2, req);
1383 if (err != 0) {
1384 return err;
1385 }
1386 if (enabled == sc->pause)
1387 return 0;
1388
2e8181d0 1389 lwkt_serialize_enter(sc->ifp->if_serializer);
8892ea20 1390 err = mxge_change_pause(sc, enabled);
2e8181d0 1391 lwkt_serialize_exit(sc->ifp->if_serializer);
8892ea20
AE
1392 return err;
1393}
1394
1395static int
1396mxge_change_lro_locked(mxge_softc_t *sc, int lro_cnt)
1397{
1398 struct ifnet *ifp;
1399 int err = 0;
1400
1401 ifp = sc->ifp;
1402 if (lro_cnt == 0)
1403 ifp->if_capenable &= ~IFCAP_LRO;
1404 else
1405 ifp->if_capenable |= IFCAP_LRO;
1406 sc->lro_cnt = lro_cnt;
2ab1b8a9 1407 if (ifp->if_flags & IFF_RUNNING) {
8892ea20
AE
1408 mxge_close(sc);
1409 err = mxge_open(sc);
1410 }
1411 return err;
1412}
1413
1414static int
1415mxge_change_lro(SYSCTL_HANDLER_ARGS)
1416{
1417 mxge_softc_t *sc;
1418 unsigned int lro_cnt;
1419 int err;
1420
1421 sc = arg1;
1422 lro_cnt = sc->lro_cnt;
1423 err = sysctl_handle_int(oidp, &lro_cnt, arg2, req);
1424 if (err != 0)
1425 return err;
1426
1427 if (lro_cnt == sc->lro_cnt)
1428 return 0;
1429
1430 if (lro_cnt > 128)
1431 return EINVAL;
1432
2e8181d0 1433 lwkt_serialize_enter(sc->ifp->if_serializer);
8892ea20 1434 err = mxge_change_lro_locked(sc, lro_cnt);
2e8181d0 1435 lwkt_serialize_exit(sc->ifp->if_serializer);
8892ea20
AE
1436 return err;
1437}
1438
1439static int
1440mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1441{
1442 int err;
1443
1444 if (arg1 == NULL)
1445 return EFAULT;
1446 arg2 = be32toh(*(int *)arg1);
1447 arg1 = NULL;
1448 err = sysctl_handle_int(oidp, arg1, arg2, req);
1449
1450 return err;
1451}
1452
1453static void
1454mxge_rem_sysctls(mxge_softc_t *sc)
1455{
1456 struct mxge_slice_state *ss;
1457 int slice;
1458
1459 if (sc->slice_sysctl_tree == NULL)
1460 return;
1461
1462 for (slice = 0; slice < sc->num_slices; slice++) {
1463 ss = &sc->ss[slice];
1464 if (ss == NULL || ss->sysctl_tree == NULL)
1465 continue;
1466 sysctl_ctx_free(&ss->sysctl_ctx);
1467 ss->sysctl_tree = NULL;
1468 }
1469 sysctl_ctx_free(&sc->slice_sysctl_ctx);
1470 sc->slice_sysctl_tree = NULL;
bbac37fb
AE
1471 sysctl_ctx_free(&sc->sysctl_ctx);
1472 sc->sysctl_tree = NULL;
1473
8892ea20
AE
1474}
1475
1476static void
1477mxge_add_sysctls(mxge_softc_t *sc)
1478{
1479 struct sysctl_ctx_list *ctx;
1480 struct sysctl_oid_list *children;
1481 mcp_irq_data_t *fw;
1482 struct mxge_slice_state *ss;
1483 int slice;
1484 char slice_num[8];
1485
b6737651
AE
1486 ctx = &sc->sysctl_ctx;
1487 sysctl_ctx_init(ctx);
1488 sc->sysctl_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_STATIC_CHILDREN(_hw),
1489 OID_AUTO,
1490 device_get_nameunit(sc->dev),
1491 CTLFLAG_RD, 0, "");
1492 if (sc->sysctl_tree == NULL) {
1493 device_printf(sc->dev, "can't add sysctl node\n");
1494 return;
1495 }
1496
1497 children = SYSCTL_CHILDREN(sc->sysctl_tree);
8892ea20
AE
1498 fw = sc->ss[0].fw_stats;
1499
1500 /* random information */
1501 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1502 "firmware_version",
1503 CTLFLAG_RD, &sc->fw_version,
1504 0, "firmware version");
1505 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1506 "serial_number",
1507 CTLFLAG_RD, &sc->serial_number_string,
1508 0, "serial number");
1509 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1510 "product_code",
1511 CTLFLAG_RD, &sc->product_code_string,
1512 0, "product_code");
1513 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1514 "pcie_link_width",
1515 CTLFLAG_RD, &sc->link_width,
1516 0, "tx_boundary");
1517 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1518 "tx_boundary",
1519 CTLFLAG_RD, &sc->tx_boundary,
1520 0, "tx_boundary");
1521 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1522 "write_combine",
1523 CTLFLAG_RD, &sc->wc,
1524 0, "write combining PIO?");
1525 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1526 "read_dma_MBs",
1527 CTLFLAG_RD, &sc->read_dma,
1528 0, "DMA Read speed in MB/s");
1529 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1530 "write_dma_MBs",
1531 CTLFLAG_RD, &sc->write_dma,
1532 0, "DMA Write speed in MB/s");
1533 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1534 "read_write_dma_MBs",
1535 CTLFLAG_RD, &sc->read_write_dma,
1536 0, "DMA concurrent Read/Write speed in MB/s");
1537
1538
1539 /* performance related tunables */
1540 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1541 "intr_coal_delay",
1542 CTLTYPE_INT|CTLFLAG_RW, sc,
1543 0, mxge_change_intr_coal,
1544 "I", "interrupt coalescing delay in usecs");
1545
1546 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1547 "flow_control_enabled",
1548 CTLTYPE_INT|CTLFLAG_RW, sc,
1549 0, mxge_change_flow_control,
1550 "I", "interrupt coalescing delay in usecs");
1551
1552 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1553 "deassert_wait",
1554 CTLFLAG_RW, &mxge_deassert_wait,
1555 0, "Wait for IRQ line to go low in ihandler");
1556
1557 /* stats block from firmware is in network byte order.
1558 Need to swap it */
1559 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1560 "link_up",
1561 CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1562 0, mxge_handle_be32,
1563 "I", "link up");
1564 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1565 "rdma_tags_available",
1566 CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1567 0, mxge_handle_be32,
1568 "I", "rdma_tags_available");
1569 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1570 "dropped_bad_crc32",
1571 CTLTYPE_INT|CTLFLAG_RD,
1572 &fw->dropped_bad_crc32,
1573 0, mxge_handle_be32,
1574 "I", "dropped_bad_crc32");
1575 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1576 "dropped_bad_phy",
1577 CTLTYPE_INT|CTLFLAG_RD,
1578 &fw->dropped_bad_phy,
1579 0, mxge_handle_be32,
1580 "I", "dropped_bad_phy");
1581 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1582 "dropped_link_error_or_filtered",
1583 CTLTYPE_INT|CTLFLAG_RD,
1584 &fw->dropped_link_error_or_filtered,
1585 0, mxge_handle_be32,
1586 "I", "dropped_link_error_or_filtered");
1587 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1588 "dropped_link_overflow",
1589 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1590 0, mxge_handle_be32,
1591 "I", "dropped_link_overflow");
1592 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1593 "dropped_multicast_filtered",
1594 CTLTYPE_INT|CTLFLAG_RD,
1595 &fw->dropped_multicast_filtered,
1596 0, mxge_handle_be32,
1597 "I", "dropped_multicast_filtered");
1598 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1599 "dropped_no_big_buffer",
1600 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1601 0, mxge_handle_be32,
1602 "I", "dropped_no_big_buffer");
1603 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1604 "dropped_no_small_buffer",
1605 CTLTYPE_INT|CTLFLAG_RD,
1606 &fw->dropped_no_small_buffer,
1607 0, mxge_handle_be32,
1608 "I", "dropped_no_small_buffer");
1609 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1610 "dropped_overrun",
1611 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1612 0, mxge_handle_be32,
1613 "I", "dropped_overrun");
1614 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1615 "dropped_pause",
1616 CTLTYPE_INT|CTLFLAG_RD,
1617 &fw->dropped_pause,
1618 0, mxge_handle_be32,
1619 "I", "dropped_pause");
1620 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1621 "dropped_runt",
1622 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1623 0, mxge_handle_be32,
1624 "I", "dropped_runt");
1625
1626 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1627 "dropped_unicast_filtered",
1628 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
1629 0, mxge_handle_be32,
1630 "I", "dropped_unicast_filtered");
1631
1632 /* verbose printing? */
1633 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1634 "verbose",
1635 CTLFLAG_RW, &mxge_verbose,
1636 0, "verbose printing");
1637
1638 /* lro */
1639 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1640 "lro_cnt",
1641 CTLTYPE_INT|CTLFLAG_RW, sc,
1642 0, mxge_change_lro,
1643 "I", "number of lro merge queues");
1644
1645
1646 /* add counters exported for debugging from all slices */
1647 sysctl_ctx_init(&sc->slice_sysctl_ctx);
1648 sc->slice_sysctl_tree =
1649 SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1650 "slice", CTLFLAG_RD, 0, "");
1651
1652 for (slice = 0; slice < sc->num_slices; slice++) {
1653 ss = &sc->ss[slice];
1654 sysctl_ctx_init(&ss->sysctl_ctx);
1655 ctx = &ss->sysctl_ctx;
1656 children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
b6737651 1657 ksprintf(slice_num, "%d", slice);
8892ea20
AE
1658 ss->sysctl_tree =
1659 SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1660 CTLFLAG_RD, 0, "");
1661 children = SYSCTL_CHILDREN(ss->sysctl_tree);
1662 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1663 "rx_small_cnt",
1664 CTLFLAG_RD, &ss->rx_small.cnt,
1665 0, "rx_small_cnt");
1666 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1667 "rx_big_cnt",
1668 CTLFLAG_RD, &ss->rx_big.cnt,
1669 0, "rx_small_cnt");
1670 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1671 "lro_flushed", CTLFLAG_RD, &ss->lro_flushed,
1672 0, "number of lro merge queues flushed");
1673
1674 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1675 "lro_queued", CTLFLAG_RD, &ss->lro_queued,
1676 0, "number of frames appended to lro merge"
1677 "queues");
1678
1679#ifndef IFNET_BUF_RING
1680 /* only transmit from slice 0 for now */
1681 if (slice > 0)
1682 continue;
1683#endif
1684 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1685 "tx_req",
1686 CTLFLAG_RD, &ss->tx.req,
1687 0, "tx_req");
1688
1689 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1690 "tx_done",
1691 CTLFLAG_RD, &ss->tx.done,
1692 0, "tx_done");
1693 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1694 "tx_pkt_done",
1695 CTLFLAG_RD, &ss->tx.pkt_done,
1696 0, "tx_done");
1697 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1698 "tx_stall",
1699 CTLFLAG_RD, &ss->tx.stall,
1700 0, "tx_stall");
1701 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1702 "tx_wake",
1703 CTLFLAG_RD, &ss->tx.wake,
1704 0, "tx_wake");
1705 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1706 "tx_defrag",
1707 CTLFLAG_RD, &ss->tx.defrag,
1708 0, "tx_defrag");
1709 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1710 "tx_queue_active",
1711 CTLFLAG_RD, &ss->tx.queue_active,
1712 0, "tx_queue_active");
1713 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1714 "tx_activate",
1715 CTLFLAG_RD, &ss->tx.activate,
1716 0, "tx_activate");
1717 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1718 "tx_deactivate",
1719 CTLFLAG_RD, &ss->tx.deactivate,
1720 0, "tx_deactivate");
1721 }
1722}
1723
1724/* copy an array of mcp_kreq_ether_send_t's to the mcp. Copy
1725 backwards one at a time and handle ring wraps */
1726
1727static inline void
1728mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1729 mcp_kreq_ether_send_t *src, int cnt)
1730{
1731 int idx, starting_slot;
1732 starting_slot = tx->req;
1733 while (cnt > 1) {
1734 cnt--;
1735 idx = (starting_slot + cnt) & tx->mask;
1736 mxge_pio_copy(&tx->lanai[idx],
1737 &src[cnt], sizeof(*src));
1738 wmb();
1739 }
1740}
1741
1742/*
1743 * copy an array of mcp_kreq_ether_send_t's to the mcp. Copy
1744 * at most 32 bytes at a time, so as to avoid involving the software
1745 * pio handler in the nic. We re-write the first segment's flags
1746 * to mark them valid only after writing the entire chain
1747 */
1748
1749static inline void
1750mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
1751 int cnt)
1752{
1753 int idx, i;
1754 uint32_t *src_ints;
1755 volatile uint32_t *dst_ints;
1756 mcp_kreq_ether_send_t *srcp;
1757 volatile mcp_kreq_ether_send_t *dstp, *dst;
1758 uint8_t last_flags;
1759
1760 idx = tx->req & tx->mask;
1761
1762 last_flags = src->flags;
1763 src->flags = 0;
1764 wmb();
1765 dst = dstp = &tx->lanai[idx];
1766 srcp = src;
1767
1768 if ((idx + cnt) < tx->mask) {
1769 for (i = 0; i < (cnt - 1); i += 2) {
1770 mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1771 wmb(); /* force write every 32 bytes */
1772 srcp += 2;
1773 dstp += 2;
1774 }
1775 } else {
1776 /* submit all but the first request, and ensure
1777 that it is submitted below */
1778 mxge_submit_req_backwards(tx, src, cnt);
1779 i = 0;
1780 }
1781 if (i < cnt) {
1782 /* submit the first request */
1783 mxge_pio_copy(dstp, srcp, sizeof(*src));
1784 wmb(); /* barrier before setting valid flag */
1785 }
1786
1787 /* re-write the last 32-bits with the valid flags */
1788 src->flags = last_flags;
1789 src_ints = (uint32_t *)src;
1790 src_ints+=3;
1791 dst_ints = (volatile uint32_t *)dst;
1792 dst_ints+=3;
1793 *dst_ints = *src_ints;
1794 tx->req += cnt;
1795 wmb();
1796}
1797
1798#if IFCAP_TSO4
1799
1800static void
1801mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1802 int busdma_seg_cnt, int ip_off)
1803{
1804 mxge_tx_ring_t *tx;
1805 mcp_kreq_ether_send_t *req;
1806 bus_dma_segment_t *seg;
1807 struct ip *ip;
1808 struct tcphdr *tcp;
1809 uint32_t low, high_swapped;
1810 int len, seglen, cum_len, cum_len_next;
1811 int next_is_first, chop, cnt, rdma_count, small;
1812 uint16_t pseudo_hdr_offset, cksum_offset, mss;
1813 uint8_t flags, flags_next;
1814 static int once;
1815
1816 mss = m->m_pkthdr.tso_segsz;
1817
1818 /* negative cum_len signifies to the
1819 * send loop that we are still in the
1820 * header portion of the TSO packet.
1821 */
1822
1823 /* ensure we have the ethernet, IP and TCP
1824 header together in the first mbuf, copy
1825 it to a scratch buffer if not */
1826 if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
1827 m_copydata(m, 0, ip_off + sizeof (*ip),
1828 ss->scratch);
1829 ip = (struct ip *)(ss->scratch + ip_off);
1830 } else {
1831 ip = (struct ip *)(mtod(m, char *) + ip_off);
1832 }
1833 if (__predict_false(m->m_len < ip_off + (ip->ip_hl << 2)
1834 + sizeof (*tcp))) {
1835 m_copydata(m, 0, ip_off + (ip->ip_hl << 2)
1836 + sizeof (*tcp), ss->scratch);
1837 ip = (struct ip *)(mtod(m, char *) + ip_off);
1838 }
1839
1840 tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
1841 cum_len = -(ip_off + ((ip->ip_hl + tcp->th_off) << 2));
1842
1843 /* TSO implies checksum offload on this hardware */
1844 cksum_offset = ip_off + (ip->ip_hl << 2);
1845 flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1846
1847
1848 /* for TSO, pseudo_hdr_offset holds mss.
1849 * The firmware figures out where to put
1850 * the checksum by parsing the header. */
1851 pseudo_hdr_offset = htobe16(mss);
1852
1853 tx = &ss->tx;
1854 req = tx->req_list;
1855 seg = tx->seg_list;
1856 cnt = 0;
1857 rdma_count = 0;
1858 /* "rdma_count" is the number of RDMAs belonging to the
1859 * current packet BEFORE the current send request. For
1860 * non-TSO packets, this is equal to "count".
1861 * For TSO packets, rdma_count needs to be reset
1862 * to 0 after a segment cut.
1863 *
1864 * The rdma_count field of the send request is
1865 * the number of RDMAs of the packet starting at
1866 * that request. For TSO send requests with one ore more cuts
1867 * in the middle, this is the number of RDMAs starting
1868 * after the last cut in the request. All previous
1869 * segments before the last cut implicitly have 1 RDMA.
1870 *
1871 * Since the number of RDMAs is not known beforehand,
1872 * it must be filled-in retroactively - after each
1873 * segmentation cut or at the end of the entire packet.
1874 */
1875
1876 while (busdma_seg_cnt) {
1877 /* Break the busdma segment up into pieces*/
1878 low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1879 high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1880 len = seg->ds_len;
1881
1882 while (len) {
1883 flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1884 seglen = len;
1885 cum_len_next = cum_len + seglen;
1886 (req-rdma_count)->rdma_count = rdma_count + 1;
1887 if (__predict_true(cum_len >= 0)) {
1888 /* payload */
1889 chop = (cum_len_next > mss);
1890 cum_len_next = cum_len_next % mss;
1891 next_is_first = (cum_len_next == 0);
1892 flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1893 flags_next |= next_is_first *
1894 MXGEFW_FLAGS_FIRST;
1895 rdma_count |= -(chop | next_is_first);
1896 rdma_count += chop & !next_is_first;
1897 } else if (cum_len_next >= 0) {
1898 /* header ends */
1899 rdma_count = -1;
1900 cum_len_next = 0;
1901 seglen = -cum_len;
1902 small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1903 flags_next = MXGEFW_FLAGS_TSO_PLD |
1904 MXGEFW_FLAGS_FIRST |
1905 (small * MXGEFW_FLAGS_SMALL);
1906 }
1907
1908 req->addr_high = high_swapped;
1909 req->addr_low = htobe32(low);
1910 req->pseudo_hdr_offset = pseudo_hdr_offset;
1911 req->pad = 0;
1912 req->rdma_count = 1;
1913 req->length = htobe16(seglen);
1914 req->cksum_offset = cksum_offset;
1915 req->flags = flags | ((cum_len & 1) *
1916 MXGEFW_FLAGS_ALIGN_ODD);
1917 low += seglen;
1918 len -= seglen;
1919 cum_len = cum_len_next;
1920 flags = flags_next;
1921 req++;
1922 cnt++;
1923 rdma_count++;
1924 if (__predict_false(cksum_offset > seglen))
1925 cksum_offset -= seglen;
1926 else
1927 cksum_offset = 0;
1928 if (__predict_false(cnt > tx->max_desc))
1929 goto drop;
1930 }
1931 busdma_seg_cnt--;
1932 seg++;
1933 }
1934 (req-rdma_count)->rdma_count = rdma_count;
1935
1936 do {
1937 req--;
1938 req->flags |= MXGEFW_FLAGS_TSO_LAST;
1939 } while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1940
1941 tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1942 mxge_submit_req(tx, tx->req_list, cnt);
1943#ifdef IFNET_BUF_RING
1944 if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1945 /* tell the NIC to start polling this slice */
1946 *tx->send_go = 1;
1947 tx->queue_active = 1;
1948 tx->activate++;
1949 wmb();
1950 }
1951#endif
1952 return;
1953
1954drop:
1955 bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1956 m_freem(m);
1957 ss->oerrors++;
1958 if (!once) {
6c348da6
AE
1959 kprintf("tx->max_desc exceeded via TSO!\n");
1960 kprintf("mss = %d, %ld, %d!\n", mss,
8892ea20
AE
1961 (long)seg - (long)tx->seg_list, tx->max_desc);
1962 once = 1;
1963 }
1964 return;
1965
1966}
1967
1968#endif /* IFCAP_TSO4 */
1969
1970#ifdef MXGE_NEW_VLAN_API
1971/*
1972 * We reproduce the software vlan tag insertion from
1973 * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
1974 * vlan tag insertion. We need to advertise this in order to have the
1975 * vlan interface respect our csum offload flags.
1976 */
1977static struct mbuf *
1978mxge_vlan_tag_insert(struct mbuf *m)
1979{
1980 struct ether_vlan_header *evl;
1981
b915556e 1982 M_PREPEND(m, EVL_ENCAPLEN, MB_DONTWAIT);
8892ea20
AE
1983 if (__predict_false(m == NULL))
1984 return NULL;
1985 if (m->m_len < sizeof(*evl)) {
1986 m = m_pullup(m, sizeof(*evl));
1987 if (__predict_false(m == NULL))
1988 return NULL;
1989 }
1990 /*
1991 * Transform the Ethernet header into an Ethernet header
1992 * with 802.1Q encapsulation.
1993 */
1994 evl = mtod(m, struct ether_vlan_header *);
b915556e 1995 bcopy((char *)evl + EVL_ENCAPLEN,
8892ea20
AE
1996 (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
1997 evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
b915556e 1998 evl->evl_tag = htons(m->m_pkthdr.ether_vlantag);
8892ea20
AE
1999 m->m_flags &= ~M_VLANTAG;
2000 return m;
2001}
2002#endif /* MXGE_NEW_VLAN_API */
2003
2004static void
2005mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
2006{
2007 mxge_softc_t *sc;
2008 mcp_kreq_ether_send_t *req;
2009 bus_dma_segment_t *seg;
2010 struct mbuf *m_tmp;
2011 struct ifnet *ifp;
2012 mxge_tx_ring_t *tx;
2013 struct ip *ip;
2014 int cnt, cum_len, err, i, idx, odd_flag, ip_off;
2015 uint16_t pseudo_hdr_offset;
2016 uint8_t flags, cksum_offset;
2017
2018
2019 sc = ss->sc;
2020 ifp = sc->ifp;
2021 tx = &ss->tx;
2022
2023 ip_off = sizeof (struct ether_header);
2024#ifdef MXGE_NEW_VLAN_API
2025 if (m->m_flags & M_VLANTAG) {
2026 m = mxge_vlan_tag_insert(m);
2027 if (__predict_false(m == NULL))
2028 goto drop;
b915556e 2029 ip_off += EVL_ENCAPLEN;
8892ea20
AE
2030 }
2031#endif
2032 /* (try to) map the frame for DMA */
2033 idx = tx->req & tx->mask;
7d8771d4
AE
2034 err = bus_dmamap_load_mbuf_segment(tx->dmat, tx->info[idx].map,
2035 m, tx->seg_list, 1, &cnt,
2036 BUS_DMA_NOWAIT);
8892ea20
AE
2037 if (__predict_false(err == EFBIG)) {
2038 /* Too many segments in the chain. Try
2039 to defrag */
2040 m_tmp = m_defrag(m, M_NOWAIT);
2041 if (m_tmp == NULL) {
2042 goto drop;
2043 }
2044 ss->tx.defrag++;
2045 m = m_tmp;
7d8771d4 2046 err = bus_dmamap_load_mbuf_segment(tx->dmat,
8892ea20 2047 tx->info[idx].map,
7d8771d4 2048 m, tx->seg_list, 1, &cnt,
8892ea20
AE
2049 BUS_DMA_NOWAIT);
2050 }
2051 if (__predict_false(err != 0)) {
7d8771d4 2052 device_printf(sc->dev, "bus_dmamap_load_mbuf_segment returned %d"
8892ea20
AE
2053 " packet len = %d\n", err, m->m_pkthdr.len);
2054 goto drop;
2055 }
2056 bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2057 BUS_DMASYNC_PREWRITE);
2058 tx->info[idx].m = m;
2059
2060#if IFCAP_TSO4
2061 /* TSO is different enough, we handle it in another routine */
2062 if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2063 mxge_encap_tso(ss, m, cnt, ip_off);
2064 return;
2065 }
2066#endif
2067
2068 req = tx->req_list;
2069 cksum_offset = 0;
2070 pseudo_hdr_offset = 0;
2071 flags = MXGEFW_FLAGS_NO_TSO;
2072
2073 /* checksum offloading? */
2074 if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
2075 /* ensure ip header is in first mbuf, copy
2076 it to a scratch buffer if not */
2077 if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
2078 m_copydata(m, 0, ip_off + sizeof (*ip),
2079 ss->scratch);
2080 ip = (struct ip *)(ss->scratch + ip_off);
2081 } else {
2082 ip = (struct ip *)(mtod(m, char *) + ip_off);
2083 }
2084 cksum_offset = ip_off + (ip->ip_hl << 2);
2085 pseudo_hdr_offset = cksum_offset + m->m_pkthdr.csum_data;
2086 pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2087 req->cksum_offset = cksum_offset;
2088 flags |= MXGEFW_FLAGS_CKSUM;
2089 odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2090 } else {
2091 odd_flag = 0;
2092 }
2093 if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2094 flags |= MXGEFW_FLAGS_SMALL;
2095
2096 /* convert segments into a request list */
2097 cum_len = 0;
2098 seg = tx->seg_list;
2099 req->flags = MXGEFW_FLAGS_FIRST;
2100 for (i = 0; i < cnt; i++) {
2101 req->addr_low =
2102 htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2103 req->addr_high =
2104 htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2105 req->length = htobe16(seg->ds_len);
2106 req->cksum_offset = cksum_offset;
2107 if (cksum_offset > seg->ds_len)
2108 cksum_offset -= seg->ds_len;
2109 else
2110 cksum_offset = 0;
2111 req->pseudo_hdr_offset = pseudo_hdr_offset;
2112 req->pad = 0; /* complete solid 16-byte block */
2113 req->rdma_count = 1;
2114 req->flags |= flags | ((cum_len & 1) * odd_flag);
2115 cum_len += seg->ds_len;
2116 seg++;
2117 req++;
2118 req->flags = 0;
2119 }
2120 req--;
2121 /* pad runts to 60 bytes */
2122 if (cum_len < 60) {
2123 req++;
2124 req->addr_low =
2125 htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2126 req->addr_high =
2127 htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2128 req->length = htobe16(60 - cum_len);
2129 req->cksum_offset = 0;
2130 req->pseudo_hdr_offset = pseudo_hdr_offset;
2131 req->pad = 0; /* complete solid 16-byte block */
2132 req->rdma_count = 1;
2133 req->flags |= flags | ((cum_len & 1) * odd_flag);
2134 cnt++;
2135 }
2136
2137 tx->req_list[0].rdma_count = cnt;
2138#if 0
2139 /* print what the firmware will see */
2140 for (i = 0; i < cnt; i++) {
6c348da6 2141 kprintf("%d: addr: 0x%x 0x%x len:%d pso%d,"
8892ea20
AE
2142 "cso:%d, flags:0x%x, rdma:%d\n",
2143 i, (int)ntohl(tx->req_list[i].addr_high),
2144 (int)ntohl(tx->req_list[i].addr_low),
2145 (int)ntohs(tx->req_list[i].length),
2146 (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2147 tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2148 tx->req_list[i].rdma_count);
2149 }
6c348da6 2150 kprintf("--------------\n");
8892ea20
AE
2151#endif
2152 tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2153 mxge_submit_req(tx, tx->req_list, cnt);
2154#ifdef IFNET_BUF_RING
2155 if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2156 /* tell the NIC to start polling this slice */
2157 *tx->send_go = 1;
2158 tx->queue_active = 1;
2159 tx->activate++;
2160 wmb();
2161 }
2162#endif
2163 return;
2164
2165drop:
2166 m_freem(m);
2167 ss->oerrors++;
2168 return;
2169}
2170
2171#ifdef IFNET_BUF_RING
2172static void
2173mxge_qflush(struct ifnet *ifp)
2174{
2175 mxge_softc_t *sc = ifp->if_softc;
2176 mxge_tx_ring_t *tx;
2177 struct mbuf *m;
2178 int slice;
2179
2180 for (slice = 0; slice < sc->num_slices; slice++) {
2181 tx = &sc->ss[slice].tx;
2e8181d0 2182 lwkt_serialize_enter(sc->ifp->if_serializer);
8892ea20
AE
2183 while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2184 m_freem(m);
2e8181d0 2185 lwkt_serialize_exit(sc->ifp->if_serializer);
8892ea20
AE
2186 }
2187 if_qflush(ifp);
2188}
2189
2190static inline void
2191mxge_start_locked(struct mxge_slice_state *ss)
2192{
2193 mxge_softc_t *sc;
2194 struct mbuf *m;
2195 struct ifnet *ifp;
2196 mxge_tx_ring_t *tx;
2197
2198 sc = ss->sc;
2199 ifp = sc->ifp;
2200 tx = &ss->tx;
2201
2202 while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2203 m = drbr_dequeue(ifp, tx->br);
2204 if (m == NULL) {
2205 return;
2206 }
2207 /* let BPF see it */
2208 BPF_MTAP(ifp, m);
2209
2210 /* give it to the nic */
2211 mxge_encap(ss, m);
2212 }
2213 /* ran out of transmit slots */
2ab1b8a9 2214 if (((ss->if_flags & IFF_OACTIVE) == 0)
8892ea20 2215 && (!drbr_empty(ifp, tx->br))) {
2ab1b8a9 2216 ss->if_flags |= IFF_OACTIVE;
8892ea20
AE
2217 tx->stall++;
2218 }
2219}
2220
2221static int
2222mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2223{
2224 mxge_softc_t *sc;
2225 struct ifnet *ifp;
2226 mxge_tx_ring_t *tx;
2227 int err;
2228
2229 sc = ss->sc;
2230 ifp = sc->ifp;
2231 tx = &ss->tx;
2232
2ab1b8a9
AE
2233 if ((ss->if_flags & (IFF_RUNNING|IFF_OACTIVE)) !=
2234 IFF_RUNNING) {
8892ea20
AE
2235 err = drbr_enqueue(ifp, tx->br, m);
2236 return (err);
2237 }
2238
2239 if (drbr_empty(ifp, tx->br) &&
2240 ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2241 /* let BPF see it */
2242 BPF_MTAP(ifp, m);
2243 /* give it to the nic */
2244 mxge_encap(ss, m);
2245 } else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2246 return (err);
2247 }
2248 if (!drbr_empty(ifp, tx->br))
2249 mxge_start_locked(ss);
2250 return (0);
2251}
2252
2253static int
2254mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2255{
2256 mxge_softc_t *sc = ifp->if_softc;
2257 struct mxge_slice_state *ss;
2258 mxge_tx_ring_t *tx;
2259 int err = 0;
2260 int slice;
2261
deef6e3e 2262#if 0
8892ea20 2263 slice = m->m_pkthdr.flowid;
deef6e3e 2264#endif
8892ea20
AE
2265 slice &= (sc->num_slices - 1); /* num_slices always power of 2 */
2266
2267 ss = &sc->ss[slice];
2268 tx = &ss->tx;
2269
2e8181d0 2270 if(lwkt_serialize_try(ifp->if_serializer)) {
8892ea20 2271 err = mxge_transmit_locked(ss, m);
2e8181d0 2272 lwkt_serialize_exit(ifp->if_serializer);
8892ea20
AE
2273 } else {
2274 err = drbr_enqueue(ifp, tx->br, m);
2275 }
2276
2277 return (err);
2278}
2279
2280#else
2281
2282static inline void
2283mxge_start_locked(struct mxge_slice_state *ss)
2284{
2285 mxge_softc_t *sc;
2286 struct mbuf *m;
2287 struct ifnet *ifp;
2288 mxge_tx_ring_t *tx;
2289
2290 sc = ss->sc;
2291 ifp = sc->ifp;
2292 tx = &ss->tx;
2293 while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
f2f758df 2294 m = ifq_dequeue(&ifp->if_snd, NULL);
8892ea20
AE
2295 if (m == NULL) {
2296 return;
2297 }
2298 /* let BPF see it */
2299 BPF_MTAP(ifp, m);
2300
2301 /* give it to the nic */
2302 mxge_encap(ss, m);
2303 }
2304 /* ran out of transmit slots */
2ab1b8a9
AE
2305 if ((sc->ifp->if_flags & IFF_OACTIVE) == 0) {
2306 sc->ifp->if_flags |= IFF_OACTIVE;
8892ea20
AE
2307 tx->stall++;
2308 }
2309}
2310#endif
2311static void
2312mxge_start(struct ifnet *ifp)
2313{
2314 mxge_softc_t *sc = ifp->if_softc;
2315 struct mxge_slice_state *ss;
2316
cd0543ff 2317 ASSERT_SERIALIZED(sc->ifp->if_serializer);
8892ea20
AE
2318 /* only use the first slice for now */
2319 ss = &sc->ss[0];
8892ea20 2320 mxge_start_locked(ss);
8892ea20
AE
2321}
2322
2323/*
2324 * copy an array of mcp_kreq_ether_recv_t's to the mcp. Copy
2325 * at most 32 bytes at a time, so as to avoid involving the software
2326 * pio handler in the nic. We re-write the first segment's low
2327 * DMA address to mark it valid only after we write the entire chunk
2328 * in a burst
2329 */
2330static inline void
2331mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2332 mcp_kreq_ether_recv_t *src)
2333{
2334 uint32_t low;
2335
2336 low = src->addr_low;
2337 src->addr_low = 0xffffffff;
2338 mxge_pio_copy(dst, src, 4 * sizeof (*src));
2339 wmb();
2340 mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2341 wmb();
2342 src->addr_low = low;
2343 dst->addr_low = low;
2344 wmb();
2345}
2346
2347static int
2348mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2349{
2350 bus_dma_segment_t seg;
2351 struct mbuf *m;
2352 mxge_rx_ring_t *rx = &ss->rx_small;
2353 int cnt, err;
2354
17eb0737 2355 m = m_gethdr(MB_DONTWAIT, MT_DATA);
8892ea20
AE
2356 if (m == NULL) {
2357 rx->alloc_fail++;
2358 err = ENOBUFS;
2359 goto done;
2360 }
2361 m->m_len = MHLEN;
7d8771d4
AE
2362 err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2363 &seg, 1, &cnt, BUS_DMA_NOWAIT);
8892ea20
AE
2364 if (err != 0) {
2365 m_free(m);
2366 goto done;
2367 }
2368 rx->info[idx].m = m;
2369 rx->shadow[idx].addr_low =
2370 htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2371 rx->shadow[idx].addr_high =
2372 htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2373
2374done:
2375 if ((idx & 7) == 7)
2376 mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2377 return err;
2378}
2379
87353c03 2380
8892ea20
AE
2381static int
2382mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2383{
2384 bus_dma_segment_t seg[3];
2385 struct mbuf *m;
2386 mxge_rx_ring_t *rx = &ss->rx_big;
2387 int cnt, err, i;
2388
2389 if (rx->cl_size == MCLBYTES)
17eb0737 2390 m = m_getcl(MB_DONTWAIT, MT_DATA, M_PKTHDR);
0a61435d
AE
2391 else {
2392#if 0
17eb0737 2393 m = m_getjcl(MB_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
0a61435d 2394#else
5f2b9102
AE
2395 /*
2396 * XXX: allocate normal sized buffers for big buffers.
2397 * We should be fine as long as we don't get any jumbo frames
2398 */
0a61435d
AE
2399 m = m_getcl(MB_DONTWAIT, MT_DATA, M_PKTHDR);
2400#endif
2401 }
8892ea20
AE
2402 if (m == NULL) {
2403 rx->alloc_fail++;
2404 err = ENOBUFS;
2405 goto done;
2406 }
2407 m->m_len = rx->mlen;
7d8771d4
AE
2408 err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2409 seg, 1, &cnt, BUS_DMA_NOWAIT);
8892ea20
AE
2410 if (err != 0) {
2411 m_free(m);
2412 goto done;
2413 }
2414 rx->info[idx].m = m;
2415 rx->shadow[idx].addr_low =
2416 htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2417 rx->shadow[idx].addr_high =
2418 htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2419
2420#if MXGE_VIRT_JUMBOS
2421 for (i = 1; i < cnt; i++) {
2422 rx->shadow[idx + i].addr_low =
2423 htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2424 rx->shadow[idx + i].addr_high =
2425 htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2426 }
2427#endif
2428
2429done:
2430 for (i = 0; i < rx->nbufs; i++) {
2431 if ((idx & 7) == 7) {
2432 mxge_submit_8rx(&rx->lanai[idx - 7],
2433 &rx->shadow[idx - 7]);
2434 }
2435 idx++;
2436 }
2437 return err;
2438}
2439
2440/*
2441 * Myri10GE hardware checksums are not valid if the sender
2442 * padded the frame with non-zero padding. This is because
2443 * the firmware just does a simple 16-bit 1s complement
2444 * checksum across the entire frame, excluding the first 14
2445 * bytes. It is best to simply to check the checksum and
2446 * tell the stack about it only if the checksum is good
2447 */
2448
2449static inline uint16_t
2450mxge_rx_csum(struct mbuf *m, int csum)
2451{
2452 struct ether_header *eh;
2453 struct ip *ip;
2454 uint16_t c;
2455
2456 eh = mtod(m, struct ether_header *);
2457
2458 /* only deal with IPv4 TCP & UDP for now */
2459 if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2460 return 1;
2461 ip = (struct ip *)(eh + 1);
2462 if (__predict_false(ip->ip_p != IPPROTO_TCP &&
2463 ip->ip_p != IPPROTO_UDP))
2464 return 1;
2465#ifdef INET
2466 c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2467 htonl(ntohs(csum) + ntohs(ip->ip_len) +
2468 - (ip->ip_hl << 2) + ip->ip_p));
2469#else
2470 c = 1;
2471#endif
2472 c ^= 0xffff;
2473 return (c);
2474}
2475
2476static void
2477mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2478{
2479 struct ether_vlan_header *evl;
2480 struct ether_header *eh;
2481 uint32_t partial;
2482
2483 evl = mtod(m, struct ether_vlan_header *);
2484 eh = mtod(m, struct ether_header *);
2485
2486 /*
b915556e 2487 * fix checksum by subtracting EVL_ENCAPLEN bytes
8892ea20
AE
2488 * after what the firmware thought was the end of the ethernet
2489 * header.
2490 */
2491
2492 /* put checksum into host byte order */
2493 *csum = ntohs(*csum);
2494 partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2495 (*csum) += ~partial;
2496 (*csum) += ((*csum) < ~partial);
2497 (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2498 (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2499
2500 /* restore checksum to network byte order;
2501 later consumers expect this */
2502 *csum = htons(*csum);
2503
2504 /* save the tag */
2505#ifdef MXGE_NEW_VLAN_API
b915556e 2506 m->m_pkthdr.ether_vlantag = ntohs(evl->evl_tag);
8892ea20
AE
2507#else
2508 {
2509 struct m_tag *mtag;
2510 mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
b915556e 2511 MB_DONTWAIT);
8892ea20
AE
2512 if (mtag == NULL)
2513 return;
2514 VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2515 m_tag_prepend(m, mtag);
2516 }
2517
2518#endif
2519 m->m_flags |= M_VLANTAG;
2520
2521 /*
2522 * Remove the 802.1q header by copying the Ethernet
2523 * addresses over it and adjusting the beginning of
2524 * the data in the mbuf. The encapsulated Ethernet
2525 * type field is already in place.
2526 */
b915556e 2527 bcopy((char *)evl, (char *)evl + EVL_ENCAPLEN,
8892ea20 2528 ETHER_HDR_LEN - ETHER_TYPE_LEN);
b915556e 2529 m_adj(m, EVL_ENCAPLEN);
8892ea20
AE
2530}
2531
2532
2533static inline void
2534mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2535{
2536 mxge_softc_t *sc;
2537 struct ifnet *ifp;
2538 struct mbuf *m;
2539 struct ether_header *eh;
2540 mxge_rx_ring_t *rx;
2541 bus_dmamap_t old_map;
2542 int idx;
2543 uint16_t tcpudp_csum;
2544
2545 sc = ss->sc;
2546 ifp = sc->ifp;
2547 rx = &ss->rx_big;
2548 idx = rx->cnt & rx->mask;
2549 rx->cnt += rx->nbufs;
2550 /* save a pointer to the received mbuf */
2551 m = rx->info[idx].m;
2552 /* try to replace the received mbuf */
2553 if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2554 /* drop the frame -- the old mbuf is re-cycled */
2555 ifp->if_ierrors++;
2556 return;
2557 }
2558
2559 /* unmap the received buffer */
2560 old_map = rx->info[idx].map;
2561 bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2562 bus_dmamap_unload(rx->dmat, old_map);
2563
2564 /* swap the bus_dmamap_t's */
2565 rx->info[idx].map = rx->extra_map;
2566 rx->extra_map = old_map;
2567
2568 /* mcp implicitly skips 1st 2 bytes so that packet is properly
2569 * aligned */
2570 m->m_data += MXGEFW_PAD;
2571
2572 m->m_pkthdr.rcvif = ifp;
2573 m->m_len = m->m_pkthdr.len = len;
2574 ss->ipackets++;
2575 eh = mtod(m, struct ether_header *);
2576 if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2577 mxge_vlan_tag_remove(m, &csum);
2578 }
2579 /* if the checksum is valid, mark it in the mbuf header */
2580 if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2581 if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2582 return;
2583 /* otherwise, it was a UDP frame, or a TCP frame which
2584 we could not do LRO on. Tell the stack that the
2585 checksum is good */
2586 m->m_pkthdr.csum_data = 0xffff;
2587 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2588 }
deef6e3e 2589#if 0
8892ea20
AE
2590 /* flowid only valid if RSS hashing is enabled */
2591 if (sc->num_slices > 1) {
2592 m->m_pkthdr.flowid = (ss - sc->ss);
2593 m->m_flags |= M_FLOWID;
2594 }
deef6e3e 2595#endif
8892ea20
AE
2596 /* pass the frame up the stack */
2597 (*ifp->if_input)(ifp, m);
2598}
2599
2600static inline void
2601mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2602{
2603 mxge_softc_t *sc;
2604 struct ifnet *ifp;
2605 struct ether_header *eh;
2606 struct mbuf *m;
2607 mxge_rx_ring_t *rx;
2608 bus_dmamap_t old_map;
2609 int idx;
2610 uint16_t tcpudp_csum;
2611
2612 sc = ss->sc;
2613 ifp = sc->ifp;
2614 rx = &ss->rx_small;
2615 idx = rx->cnt & rx->mask;
2616 rx->cnt++;
2617 /* save a pointer to the received mbuf */
2618 m = rx->info[idx].m;
2619 /* try to replace the received mbuf */
2620 if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2621 /* drop the frame -- the old mbuf is re-cycled */
2622 ifp->if_ierrors++;
2623 return;
2624 }
2625
2626 /* unmap the received buffer */
2627 old_map = rx->info[idx].map;
2628 bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2629 bus_dmamap_unload(rx->dmat, old_map);
2630
2631 /* swap the bus_dmamap_t's */
2632 rx->info[idx].map = rx->extra_map;
2633 rx->extra_map = old_map;
2634
2635 /* mcp implicitly skips 1st 2 bytes so that packet is properly
2636 * aligned */
2637 m->m_data += MXGEFW_PAD;
2638
2639 m->m_pkthdr.rcvif = ifp;
2640 m->m_len = m->m_pkthdr.len = len;
2641 ss->ipackets++;
2642 eh = mtod(m, struct ether_header *);
2643 if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2644 mxge_vlan_tag_remove(m, &csum);
2645 }
2646 /* if the checksum is valid, mark it in the mbuf header */
2647 if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2648 if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2649 return;
2650 /* otherwise, it was a UDP frame, or a TCP frame which
2651 we could not do LRO on. Tell the stack that the
2652 checksum is good */
2653 m->m_pkthdr.csum_data = 0xffff;
2654 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2655 }
deef6e3e 2656#if 0
8892ea20
AE
2657 /* flowid only valid if RSS hashing is enabled */
2658 if (sc->num_slices > 1) {
2659 m->m_pkthdr.flowid = (ss - sc->ss);
2660 m->m_flags |= M_FLOWID;
2661 }
deef6e3e 2662#endif
8892ea20
AE
2663 /* pass the frame up the stack */
2664 (*ifp->if_input)(ifp, m);
2665}
2666
2667static inline void
2668mxge_clean_rx_done(struct mxge_slice_state *ss)
2669{
2670 mxge_rx_done_t *rx_done = &ss->rx_done;
2671 int limit = 0;
2672 uint16_t length;
2673 uint16_t checksum;
2674
2675
2676 while (rx_done->entry[rx_done->idx].length != 0) {
2677 length = ntohs(rx_done->entry[rx_done->idx].length);
2678 rx_done->entry[rx_done->idx].length = 0;
2679 checksum = rx_done->entry[rx_done->idx].checksum;
2680 if (length <= (MHLEN - MXGEFW_PAD))
2681 mxge_rx_done_small(ss, length, checksum);
2682 else
2683 mxge_rx_done_big(ss, length, checksum);
2684 rx_done->cnt++;
2685 rx_done->idx = rx_done->cnt & rx_done->mask;
2686
2687 /* limit potential for livelock */
2688 if (__predict_false(++limit > rx_done->mask / 2))
2689 break;
2690 }
2691#ifdef INET
2692 while (!SLIST_EMPTY(&ss->lro_active)) {
2693 struct lro_entry *lro = SLIST_FIRST(&ss->lro_active);
2694 SLIST_REMOVE_HEAD(&ss->lro_active, next);
2695 mxge_lro_flush(ss, lro);
2696 }
2697#endif
2698}
2699
2700
2701static inline void
2702mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2703{
2704 struct ifnet *ifp;
2705 mxge_tx_ring_t *tx;
2706 struct mbuf *m;
2707 bus_dmamap_t map;
2708 int idx;
2709 int *flags;
2710
2711 tx = &ss->tx;
2712 ifp = ss->sc->ifp;
cd0543ff 2713 ASSERT_SERIALIZED(ifp->if_serializer);
8892ea20
AE
2714 while (tx->pkt_done != mcp_idx) {
2715 idx = tx->done & tx->mask;
2716 tx->done++;
2717 m = tx->info[idx].m;
2718 /* mbuf and DMA map only attached to the first
2719 segment per-mbuf */
2720 if (m != NULL) {
2721 ss->obytes += m->m_pkthdr.len;
2722 if (m->m_flags & M_MCAST)
2723 ss->omcasts++;
2724 ss->opackets++;
2725 tx->info[idx].m = NULL;
2726 map = tx->info[idx].map;
2727 bus_dmamap_unload(tx->dmat, map);
2728 m_freem(m);
2729 }
2730 if (tx->info[idx].flag) {
2731 tx->info[idx].flag = 0;
2732 tx->pkt_done++;
2733 }
2734 }
2735
2736 /* If we have space, clear IFF_OACTIVE to tell the stack that
2737 its OK to send packets */
2738#ifdef IFNET_BUF_RING
2ab1b8a9 2739 flags = &ss->if_flags;
8892ea20 2740#else
2ab1b8a9 2741 flags = &ifp->if_flags;
8892ea20 2742#endif
2ab1b8a9 2743 if ((*flags) & IFF_OACTIVE &&
8892ea20 2744 tx->req - tx->done < (tx->mask + 1)/4) {
2ab1b8a9 2745 *(flags) &= ~IFF_OACTIVE;
8892ea20
AE
2746 ss->tx.wake++;
2747 mxge_start_locked(ss);
2748 }
2749#ifdef IFNET_BUF_RING
2750 if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2751 /* let the NIC stop polling this queue, since there
2752 * are no more transmits pending */
2753 if (tx->req == tx->done) {
2754 *tx->send_stop = 1;
2755 tx->queue_active = 0;
2756 tx->deactivate++;
2757 wmb();
2758 }
2759 }
2760#endif
8892ea20
AE
2761
2762}
2763
2764static struct mxge_media_type mxge_xfp_media_types[] =
2765{
2766 {IFM_10G_CX4, 0x7f, "10GBASE-CX4 (module)"},
2767 {IFM_10G_SR, (1 << 7), "10GBASE-SR"},
2768 {IFM_10G_LR, (1 << 6), "10GBASE-LR"},
2769 {0, (1 << 5), "10GBASE-ER"},
2770 {IFM_10G_LRM, (1 << 4), "10GBASE-LRM"},
2771 {0, (1 << 3), "10GBASE-SW"},
2772 {0, (1 << 2), "10GBASE-LW"},
2773 {0, (1 << 1), "10GBASE-EW"},
2774 {0, (1 << 0), "Reserved"}
2775};
2776static struct mxge_media_type mxge_sfp_media_types[] =
2777{
2778 {0, (1 << 7), "Reserved"},
2779 {IFM_10G_LRM, (1 << 6), "10GBASE-LRM"},
2780 {IFM_10G_LR, (1 << 5), "10GBASE-LR"},
2781 {IFM_10G_SR, (1 << 4), "10GBASE-SR"}
2782};
2783
2784static void
2785mxge_set_media(mxge_softc_t *sc, int type)
2786{
2787 sc->media_flags |= type;
2788 ifmedia_add(&sc->media, sc->media_flags, 0, NULL);
2789 ifmedia_set(&sc->media, sc->media_flags);
2790}
2791
2792
2793/*
2794 * Determine the media type for a NIC. Some XFPs will identify
2795 * themselves only when their link is up, so this is initiated via a
2796 * link up interrupt. However, this can potentially take up to
2797 * several milliseconds, so it is run via the watchdog routine, rather
2798 * than in the interrupt handler itself. This need only be done
2799 * once, not each time the link is up.
2800 */
2801static void
2802mxge_media_probe(mxge_softc_t *sc)
2803{
2804 mxge_cmd_t cmd;
2805 char *cage_type;
2806 char *ptr;
2807 struct mxge_media_type *mxge_media_types = NULL;
2808 int i, err, ms, mxge_media_type_entries;
2809 uint32_t byte;
2810
2811 sc->need_media_probe = 0;
2812
2813 /* if we've already set a media type, we're done */
2814 if (sc->media_flags != (IFM_ETHER | IFM_AUTO))
2815 return;
2816
2817 /*
2818 * parse the product code to deterimine the interface type
2819 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2820 * after the 3rd dash in the driver's cached copy of the
2821 * EEPROM's product code string.
2822 */
2823 ptr = sc->product_code_string;
2824 if (ptr == NULL) {
2825 device_printf(sc->dev, "Missing product code\n");
2826 }
2827
2828 for (i = 0; i < 3; i++, ptr++) {
2829 ptr = index(ptr, '-');
2830 if (ptr == NULL) {
2831 device_printf(sc->dev,
2832 "only %d dashes in PC?!?\n", i);
2833 return;
2834 }
2835 }
2836 if (*ptr == 'C') {
2837 /* -C is CX4 */
2838 mxge_set_media(sc, IFM_10G_CX4);
2839 return;
2840 }
2841 else if (*ptr == 'Q') {
2842 /* -Q is Quad Ribbon Fiber */
2843 device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2844 /* FreeBSD has no media type for Quad ribbon fiber */
2845 return;
2846 }
2847
2848 if (*ptr == 'R') {
2849 /* -R is XFP */
2850 mxge_media_types = mxge_xfp_media_types;
2851 mxge_media_type_entries =
2852 sizeof (mxge_xfp_media_types) /
2853 sizeof (mxge_xfp_media_types[0]);
2854 byte = MXGE_XFP_COMPLIANCE_BYTE;
2855 cage_type = "XFP";
2856 }
2857
2858 if (*ptr == 'S' || *(ptr +1) == 'S') {
2859 /* -S or -2S is SFP+ */
2860 mxge_media_types = mxge_sfp_media_types;
2861 mxge_media_type_entries =
2862 sizeof (mxge_sfp_media_types) /
2863 sizeof (mxge_sfp_media_types[0]);
2864 cage_type = "SFP+";
2865 byte = 3;
2866 }
2867
2868 if (mxge_media_types == NULL) {
2869 device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2870 return;
2871 }
2872
2873 /*
2874 * At this point we know the NIC has an XFP cage, so now we
2875 * try to determine what is in the cage by using the
2876 * firmware's XFP I2C commands to read the XFP 10GbE compilance
2877 * register. We read just one byte, which may take over
2878 * a millisecond
2879 */
2880
2881 cmd.data0 = 0; /* just fetch 1 byte, not all 256 */
2882 cmd.data1 = byte;
2883 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2884 if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
2885 device_printf(sc->dev, "failed to read XFP\n");
2886 }
2887 if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
2888 device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
2889 }
2890 if (err != MXGEFW_CMD_OK) {
2891 return;
2892 }
2893
2894 /* now we wait for the data to be cached */
2895 cmd.data0 = byte;
2896 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2897 for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
2898 DELAY(1000);
2899 cmd.data0 = byte;
2900 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2901 }
2902 if (err != MXGEFW_CMD_OK) {
2903 device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
2904 cage_type, err, ms);
2905 return;
2906 }
2907
2908 if (cmd.data0 == mxge_media_types[0].bitmask) {
2909 if (mxge_verbose)
2910 device_printf(sc->dev, "%s:%s\n", cage_type,
2911 mxge_media_types[0].name);
2912 mxge_set_media(sc, IFM_10G_CX4);
2913 return;
2914 }
2915 for (i = 1; i < mxge_media_type_entries; i++) {
2916 if (cmd.data0 & mxge_media_types[i].bitmask) {
2917 if (mxge_verbose)
2918 device_printf(sc->dev, "%s:%s\n",
2919 cage_type,
2920 mxge_media_types[i].name);
2921
2922 mxge_set_media(sc, mxge_media_types[i].flag);
2923 return;
2924 }
2925 }
2926 device_printf(sc->dev, "%s media 0x%x unknown\n", cage_type,
2927 cmd.data0);
2928
2929 return;
2930}
2931
2932static void
2933mxge_intr(void *arg)
2934{
2935 struct mxge_slice_state *ss = arg;
2936 mxge_softc_t *sc = ss->sc;
2937 mcp_irq_data_t *stats = ss->fw_stats;
2938 mxge_tx_ring_t *tx = &ss->tx;
2939 mxge_rx_done_t *rx_done = &ss->rx_done;
2940 uint32_t send_done_count;
2941 uint8_t valid;
2942
2943
2944#ifndef IFNET_BUF_RING
2945 /* an interrupt on a non-zero slice is implicitly valid
2946 since MSI-X irqs are not shared */
2947 if (ss != sc->ss) {
2948 mxge_clean_rx_done(ss);
2949 *ss->irq_claim = be32toh(3);
2950 return;
2951 }
2952#endif
2953
2954 /* make sure the DMA has finished */
2955 if (!stats->valid) {
2956 return;
2957 }
2958 valid = stats->valid;
2959
2960 if (sc->legacy_irq) {
2961 /* lower legacy IRQ */
2962 *sc->irq_deassert = 0;
2963 if (!mxge_deassert_wait)
2964 /* don't wait for conf. that irq is low */
2965 stats->valid = 0;
2966 } else {
2967 stats->valid = 0;
2968 }
2969
2970 /* loop while waiting for legacy irq deassertion */
2971 do {
2972 /* check for transmit completes and receives */
2973 send_done_count = be32toh(stats->send_done_count);
2974 while ((send_done_count != tx->pkt_done) ||
2975 (rx_done->entry[rx_done->idx].length != 0)) {
2976 if (send_done_count != tx->pkt_done)
2977 mxge_tx_done(ss, (int)send_done_count);
2978 mxge_clean_rx_done(ss);
2979 send_done_count = be32toh(stats->send_done_count);
2980 }
2981 if (sc->legacy_irq && mxge_deassert_wait)
2982 wmb();
2983 } while (*((volatile uint8_t *) &stats->valid));
2984
2985 /* fw link & error stats meaningful only on the first slice */
2986 if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
2987 if (sc->link_state != stats->link_up) {
2988 sc->link_state = stats->link_up;
2989 if (sc->link_state) {
73a22abe
AE
2990 sc->ifp->if_link_state = LINK_STATE_UP;
2991 if_link_state_change(sc->ifp);
8892ea20
AE
2992 if (mxge_verbose)
2993 device_printf(sc->dev, "link up\n");
2994 } else {
73a22abe
AE
2995 sc->ifp->if_link_state = LINK_STATE_DOWN;
2996 if_link_state_change(sc->ifp);
8892ea20
AE
2997 if (mxge_verbose)
2998 device_printf(sc->dev, "link down\n");
2999 }
3000 sc->need_media_probe = 1;
3001 }
3002 if (sc->rdma_tags_available !=
3003 be32toh(stats->rdma_tags_available)) {
3004 sc->rdma_tags_available =
3005 be32toh(stats->rdma_tags_available);
3006 device_printf(sc->dev, "RDMA timed out! %d tags "
3007 "left\n", sc->rdma_tags_available);
3008 }
3009
3010 if (stats->link_down) {
3011 sc->down_cnt += stats->link_down;
3012 sc->link_state = 0;
f0115d64
AE
3013 sc->ifp->if_link_state = LINK_STATE_DOWN;
3014 if_link_state_change(sc->ifp);
8892ea20
AE
3015 }
3016 }
3017
3018 /* check to see if we have rx token to pass back */
3019 if (valid & 0x1)
3020 *ss->irq_claim = be32toh(3);
3021 *(ss->irq_claim + 1) = be32toh(3);
3022}
3023
/*
 * Intentionally empty: the argument is ignored and nothing is done.
 */
static void
mxge_init(void *arg)
{
	/* nothing to do */
}
3028
3029
3030
3031static void
3032mxge_free_slice_mbufs(struct mxge_slice_state *ss)
3033{
3034 struct lro_entry *lro_entry;
3035 int i;
3036
3037 while (!SLIST_EMPTY(&ss->lro_free)) {
3038 lro_entry = SLIST_FIRST(&ss->lro_free);
3039 SLIST_REMOVE_HEAD(&ss->lro_free, next);
d777b84f 3040 kfree(lro_entry, M_DEVBUF);
8892ea20
AE
3041 }
3042
3043 for (i = 0; i <= ss->rx_big.mask; i++) {
3044 if (ss->rx_big.info[i].m == NULL)
3045 continue;
3046 bus_dmamap_unload(ss->rx_big.dmat,
3047 ss->rx_big.info[i].map);
3048 m_freem(ss->rx_big.info[i].m);
3049 ss->rx_big.info[i].m = NULL;
3050 }
3051
3052 for (i = 0; i <= ss->rx_small.mask; i++) {
3053 if (ss->rx_small.info[i].m == NULL)
3054 continue;
3055 bus_dmamap_unload(ss->rx_small.dmat,
3056 ss->rx_small.info[i].map);
3057 m_freem(ss->rx_small.info[i].m);
3058 ss->rx_small.info[i].m = NULL;
3059 }
3060
3061 /* transmit ring used only on the first slice */
3062 if (ss->tx.info == NULL)
3063 return;
3064
3065 for (i = 0; i <= ss->tx.mask; i++) {
3066 ss->tx.info[i].flag = 0;
3067 if (ss->tx.info[i].m == NULL)
3068 continue;
3069 bus_dmamap_unload(ss->tx.dmat,
3070 ss->tx.info[i].map);
3071 m_freem(ss->tx.info[i].m);
3072 ss->tx.info[i].m = NULL;
3073 }
3074}
3075
3076static void
3077mxge_free_mbufs(mxge_softc_t *sc)
3078{
3079 int slice;
3080
3081 for (slice = 0; slice < sc->num_slices; slice++)
3082 mxge_free_slice_mbufs(&sc->ss[slice]);
3083}
3084
3085static void
3086mxge_free_slice_rings(struct mxge_slice_state *ss)
3087{
3088 int i;
3089
3090
3091 if (ss->rx_done.entry != NULL)
3092 mxge_dma_free(&ss->rx_done.dma);
3093 ss->rx_done.entry = NULL;
3094
3095 if (ss->tx.req_bytes != NULL)
d777b84f 3096 kfree(ss->tx.req_bytes, M_DEVBUF);
8892ea20
AE
3097 ss->tx.req_bytes = NULL;
3098
3099 if (ss->tx.seg_list != NULL)
d777b84f 3100 kfree(ss->tx.seg_list, M_DEVBUF);
8892ea20
AE
3101 ss->tx.seg_list = NULL;
3102
3103 if (ss->rx_small.shadow != NULL)
d777b84f 3104 kfree(ss->rx_small.shadow, M_DEVBUF);
8892ea20
AE
3105 ss->rx_small.shadow = NULL;
3106
3107 if (ss->rx_big.shadow != NULL)
d777b84f 3108 kfree(ss->rx_big.shadow, M_DEVBUF);
8892ea20
AE
3109 ss->rx_big.shadow = NULL;
3110
3111 if (ss->tx.info != NULL) {
3112 if (ss->tx.dmat != NULL) {
3113 for (i = 0; i <= ss->tx.mask; i++) {
3114 bus_dmamap_destroy(ss->tx.dmat,
3115 ss->tx.info[i].map);
3116 }
3117 bus_dma_tag_destroy(ss->tx.dmat);
3118 }
d777b84f 3119 kfree(ss->tx.info, M_DEVBUF);
8892ea20
AE
3120 }
3121 ss->tx.info = NULL;
3122
3123 if (ss->rx_small.info != NULL) {
3124 if (ss->rx_small.dmat != NULL) {
3125 for (i = 0; i <= ss->rx_small.mask; i++) {
3126 bus_dmamap_destroy(ss->rx_small.dmat,
3127 ss->rx_small.info[i].map);
3128 }
3129 bus_dmamap_destroy(ss->rx_small.dmat,
3130 ss->rx_small.extra_map);
3131 bus_dma_tag_destroy(ss->rx_small.dmat);
3132 }
d777b84f 3133 kfree(ss->rx_small.info, M_DEVBUF);
8892ea20
AE
3134 }
3135 ss->rx_small.info = NULL;
3136
3137 if (ss->rx_big.info != NULL) {
3138 if (ss->rx_big.dmat != NULL) {
3139 for (i = 0; i <= ss->rx_big.mask; i++) {
3140 bus_dmamap_destroy(ss->rx_big.dmat,
3141 ss->rx_big.info[i].map);
3142 }
3143 bus_dmamap_destroy(ss->rx_big.dmat,
3144 ss->rx_big.extra_map);
3145 bus_dma_tag_destroy(ss->rx_big.dmat);
3146 }
d777b84f 3147 kfree(ss->rx_big.info, M_DEVBUF);
8892ea20
AE
3148 }
3149 ss->rx_big.info = NULL;
3150}
3151
3152static void
3153mxge_free_rings(mxge_softc_t *sc)
3154{
3155 int slice;
3156
3157 for (slice = 0; slice < sc->num_slices; slice++)
3158 mxge_free_slice_rings(&sc->ss[slice]);
3159}
3160
3161static int
3162mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3163 int tx_ring_entries)
3164{
3165 mxge_softc_t *sc = ss->sc;
3166 size_t bytes;
3167 int err, i;
3168
3169 err = ENOMEM;
3170
3171 /* allocate per-slice receive resources */
3172
3173 ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
3174 ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3175
3176 /* allocate the rx shadow rings */
3177 bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
d777b84f 3178 ss->rx_small.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
8892ea20
AE
3179 if (ss->rx_small.shadow == NULL)
3180 return err;;
3181
3182 bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
d777b84f 3183 ss->rx_big.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
8892ea20
AE
3184 if (ss->rx_big.shadow == NULL)
3185 return err;;
3186
3187 /* allocate the rx host info rings */
3188 bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
d777b84f 3189 ss->rx_small.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
8892ea20
AE
3190 if (ss->rx_small.info == NULL)
3191 return err;;
3192
3193 bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
d777b84f 3194 ss->rx_big.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
8892ea20
AE
3195 if (ss->rx_big.info == NULL)
3196 return err;;
3197
3198 /* allocate the rx busdma resources */
3199 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3200 1, /* alignment */
3201 4096, /* boundary */
3202 BUS_SPACE_MAXADDR, /* low */
3203 BUS_SPACE_MAXADDR, /* high */
3204 NULL, NULL, /* filter */
3205 MHLEN, /* maxsize */
3206 1, /* num segs */
3207 MHLEN, /* maxsegsize */
3208 BUS_DMA_ALLOCNOW, /* flags */
8892ea20
AE
3209 &ss->rx_small.dmat); /* tag */
3210 if (err != 0) {
3211 device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3212 err);
3213 return err;;
3214 }
3215
3216 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3217 1, /* alignment */
3218#if MXGE_VIRT_JUMBOS
3219 4096, /* boundary */
3220#else
3221 0, /* boundary */
3222#endif
3223 BUS_SPACE_MAXADDR, /* low */
3224 BUS_SPACE_MAXADDR, /* high */
3225 NULL, NULL, /* filter */
3226 3*4096, /* maxsize */
3227#if MXGE_VIRT_JUMBOS
3228 3, /* num segs */
3229 4096, /* maxsegsize*/
3230#else
3231 1, /* num segs */
3232 MJUM9BYTES, /* maxsegsize*/
3233#endif
3234 BUS_DMA_ALLOCNOW, /* flags */
8892ea20
AE
3235 &ss->rx_big.dmat); /* tag */
3236 if (err != 0) {
3237 device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3238 err);
3239 return err;;
3240 }
3241 for (i = 0; i <= ss->rx_small.mask; i++) {
3242 err = bus_dmamap_create(ss->rx_small.dmat, 0,
3243 &ss->rx_small.info[i].map);
3244 if (err != 0) {
3245 device_printf(sc->dev, "Err %d rx_small dmamap\n",
3246 err);
3247 return err;;
3248 }
3249 }
3250 err = bus_dmamap_create(ss->rx_small.dmat, 0,
3251 &ss->rx_small.extra_map);
3252 if (err != 0) {
3253 device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3254 err);
3255 return err;;
3256 }
3257
3258 for (i = 0; i <= ss->rx_big.mask; i++) {
3259 err = bus_dmamap_create(ss->rx_big.dmat, 0,
3260 &ss->rx_big.info[i].map);
3261 if (err != 0) {
3262 device_printf(sc->dev, "Err %d rx_big dmamap\n",
3263 err);
3264 return err;;
3265 }
3266 }
3267 err = bus_dmamap_create(ss->rx_big.dmat, 0,
3268 &ss->rx_big.extra_map);
3269 if (err != 0) {
3270 device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3271 err);
3272 return err;;
3273 }
3274
3275 /* now allocate TX resouces */
3276
3277#ifndef IFNET_BUF_RING
3278 /* only use a single TX ring for now */
3279 if (ss != ss->sc->ss)
3280 return 0;
3281#endif
3282
3283 ss->tx.mask = tx_ring_entries - 1;
3284 ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3285
3286
3287 /* allocate the tx request copy block */
3288 bytes = 8 +
3289 sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
d777b84f 3290 ss->tx.req_bytes = kmalloc(bytes, M_DEVBUF, M_WAITOK);
8892ea20
AE
3291 if (ss->tx.req_bytes == NULL)
3292 return err;;
3293 /* ensure req_list entries are aligned to 8 bytes */
3294 ss->tx.req_list = (mcp_kreq_ether_send_t *)
3295 ((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
3296
3297 /* allocate the tx busdma segment list */
3298 bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3299 ss->tx.seg_list = (bus_dma_segment_t *)
d777b84f 3300 kmalloc(bytes, M_DEVBUF, M_WAITOK);
8892ea20
AE
3301 if (ss->tx.seg_list == NULL)
3302 return err;;
3303
3304 /* allocate the tx host info ring */
3305 bytes = tx_ring_entries * sizeof (*ss->tx.info);
d777b84f 3306 ss->tx.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
8892ea20
AE
3307 if (ss->tx.info == NULL)
3308 return err;;
3309
3310 /* allocate the tx busdma resources */
3311 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3312 1, /* alignment */
3313 sc->tx_boundary, /* boundary */
3314 BUS_SPACE_MAXADDR, /* low */
3315 BUS_SPACE_MAXADDR, /* high */
3316 NULL, NULL, /* filter */
3317 65536 + 256, /* maxsize */
3318 ss->tx.max_desc - 2, /* num segs */
3319 sc->tx_boundary, /* maxsegsz */
3320 BUS_DMA_ALLOCNOW, /* flags */
8892ea20
AE
3321 &ss->tx.dmat); /* tag */
3322
3323 if (err != 0) {
3324 device_printf(sc->dev, "Err %d allocating tx dmat\n",
3325 err);
3326 return err;;
3327 }
3328
3329 /* now use these tags to setup dmamaps for each slot
3330 in the ring */
3331 for (i = 0; i <= ss->tx.mask; i++) {
3332 err = bus_dmamap_create(ss->tx.dmat, 0,
3333 &ss->tx.info[i].map);
3334 if (err != 0) {
3335 device_printf(sc->dev, "Err %d tx dmamap\n",
3336 err);
3337 return err;;
3338 }
3339 }
3340 return 0;
3341
3342}
3343
3344static int
3345mxge_alloc_rings(mxge_softc_t *sc)
3346{
3347 mxge_cmd_t cmd;
3348 int tx_ring_size;
3349 int tx_ring_entries, rx_ring_entries;
3350 int err, slice;
3351
3352 /* get ring sizes */
3353 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3354 tx_ring_size = cmd.data0;
3355 if (err != 0) {
3356 device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3357 goto abort;
3358 }
3359
3360 tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3361 rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
f2f758df
AE
3362 ifq_set_maxlen(&sc->ifp->if_snd, tx_ring_entries - 1);
3363 ifq_set_ready(&sc->ifp->if_snd);
8892ea20
AE
3364
3365 for (slice = 0; slice < sc->num_slices; slice++) {
3366 err = mxge_alloc_slice_rings(&sc->ss[slice],
3367 rx_ring_entries,
3368 tx_ring_entries);
3369 if (err != 0)
3370 goto abort;
3371 }
3372 return 0;
3373
3374abort:
3375 mxge_free_rings(sc);
3376 return err;
3377
3378}
3379
3380
3381static void
3382mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3383{
b915556e 3384 int bufsize = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN + MXGEFW_PAD;
8892ea20
AE
3385
3386 if (bufsize < MCLBYTES) {
3387 /* easy, everything fits in a single buffer */
3388 *big_buf_size = MCLBYTES;
3389 *cl_size = MCLBYTES;
3390 *nbufs = 1;
3391 return;
3392 }
3393
3394 if (bufsize < MJUMPAGESIZE) {
3395 /* still easy, everything still fits in a single buffer */
3396 *big_buf_size = MJUMPAGESIZE;
3397 *cl_size = MJUMPAGESIZE;
3398 *nbufs = 1;
3399 return;
3400 }
3401#if MXGE_VIRT_JUMBOS
3402 /* now we need to use virtually contiguous buffers */
3403 *cl_size = MJUM9BYTES;
3404 *big_buf_size = 4096;
3405 *nbufs = mtu / 4096 + 1;
3406 /* needs to be a power of two, so round up */
3407 if (*nbufs == 3)
3408 *nbufs = 4;
3409#else
3410 *cl_size = MJUM9BYTES;
3411 *big_buf_size = MJUM9BYTES;
3412 *nbufs = 1;
3413#endif
3414}
3415
3416static int
3417mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3418{
3419 mxge_softc_t *sc;
3420 mxge_cmd_t cmd;
3421 bus_dmamap_t map;
3422 struct lro_entry *lro_entry;
3423 int err, i, slice;
3424
3425
3426 sc = ss->sc;
3427 slice = ss - sc->ss;
3428
3429 SLIST_INIT(&ss->lro_free);
3430 SLIST_INIT(&ss->lro_active);
3431
3432 for (i = 0; i < sc->lro_cnt; i++) {
3433 lro_entry = (struct lro_entry *)
d777b84f 3434 kmalloc(sizeof (*lro_entry), M_DEVBUF,
8892ea20
AE
3435 M_NOWAIT | M_ZERO);
3436 if (lro_entry == NULL) {
3437 sc->lro_cnt = i;
3438 break;
3439 }
3440 SLIST_INSERT_HEAD(&ss->lro_free, lro_entry, next);
3441 }
3442 /* get the lanai pointers to the send and receive rings */
3443
3444 err = 0;
3445#ifndef IFNET_BUF_RING
3446 /* We currently only send from the first slice */
3447 if (slice == 0) {
3448#endif
3449 cmd.data0 = slice;
3450 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3451 ss->tx.lanai =
3452 (volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3453 ss->tx.send_go = (volatile uint32_t *)
3454 (sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3455 ss->tx.send_stop = (volatile uint32_t *)
3456 (sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3457#ifndef IFNET_BUF_RING
3458 }
3459#endif
3460 cmd.data0 = slice;
3461 err |= mxge_send_cmd(sc,
3462 MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3463 ss->rx_small.lanai =
3464 (volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3465 cmd.data0 = slice;
3466 err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3467 ss->rx_big.lanai =
3468 (volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3469
3470 if (err != 0) {
3471 device_printf(sc->dev,
3472 "failed to get ring sizes or locations\n");
3473 return EIO;
3474 }
3475
3476 /* stock receive rings */
3477 for (i = 0; i <= ss->rx_small.mask; i++) {
3478 map = ss->rx_small.info[i].map;
3479 err = mxge_get_buf_small(ss, map, i);
3480 if (err) {
3481 device_printf(sc->dev, "alloced %d/%d smalls\n",
3482 i, ss->rx_small.mask + 1);
3483 return ENOMEM;
3484 }
3485 }
3486 for (i = 0; i <= ss->rx_big.mask; i++) {
3487 ss->rx_big.shadow[i].addr_low = 0xffffffff;
3488 ss->rx_big.shadow[i].addr_high = 0xffffffff;
3489 }
3490 ss->rx_big.nbufs = nbufs;
3491 ss->rx_big.cl_size = cl_size;
3492 ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
b915556e 3493 EVL_ENCAPLEN + MXGEFW_PAD;
8892ea20
AE
3494 for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3495 map = ss->rx_big.info[i].map;
3496 err = mxge_get_buf_big(ss, map, i);
3497 if (err) {
3498 device_printf(sc->dev, "alloced %d/%d bigs\n",
3499 i, ss->rx_big.mask + 1);
3500 return ENOMEM;
3501 }
3502 }
3503 return 0;
3504}
3505
3506static int
3507mxge_open(mxge_softc_t *sc)
3508{
3509 mxge_cmd_t cmd;
3510 int err, big_bytes, nbufs, slice, cl_size, i;
3511 bus_addr_t bus;
3512 volatile uint8_t *itable;
3513 struct mxge_slice_state *ss;
3514
cd0543ff 3515 ASSERT_SERIALIZED(sc->ifp->if_serializer);
8892ea20
AE
3516 /* Copy the MAC address in case it was overridden */
3517 bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3518
3519 err = mxge_reset(sc, 1);
3520 if (err != 0) {
3521 device_printf(sc->dev, "failed to reset\n");
3522 return EIO;
3523 }
3524
3525 if (sc->num_slices > 1) {
3526 /* setup the indirection table */
3527 cmd.data0 = sc->num_slices;
3528 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3529 &cmd);
3530
3531 err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3532 &cmd);
3533 if (err != 0) {
3534 device_printf(sc->dev,
3535 "failed to setup rss tables\n");
3536 return err;
3537 }
3538
3539 /* just enable an identity mapping */
3540 itable = sc->sram + cmd.data0;
3541 for (i = 0; i < sc->num_slices; i++)
3542 itable[i] = (uint8_t)i;
3543
3544 cmd.data0 = 1;
3545 cmd.data1 = mxge_rss_hash_type;
3546 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3547 if (err != 0) {
3548 device_printf(sc->dev, "failed to enable slices\n");
3549 return err;
3550 }
3551 }
3552
3553
3554 mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3555
3556 cmd.data0 = nbufs;
3557 err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3558 &cmd);
3559 /* error is only meaningful if we're trying to set
3560 MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3561 if (err && nbufs > 1) {
3562 device_printf(sc->dev,
3563 "Failed to set alway-use-n to %d\n",
3564 nbufs);
3565 return EIO;
3566 }
3567 /* Give the firmware the mtu and the big and small buffer
3568 sizes. The firmware wants the big buf size to be a power
3569 of two. Luckily, FreeBSD's clusters are powers of two */
b915556e 3570 cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
8892ea20
AE
3571 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3572 cmd.data0 = MHLEN - MXGEFW_PAD;
3573 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3574 &cmd);
3575 cmd.data0 = big_bytes;
3576 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3577
3578 if (err != 0) {
3579 device_printf(sc->dev, "failed to setup params\n");
3580 goto abort;
3581 }
3582
3583 /* Now give him the pointer to the stats block */
3584 for (slice = 0;
3585#ifdef IFNET_BUF_RING
3586 slice < sc->num_slices;
3587#else
3588 slice < 1;
3589#endif
3590 slice++) {
3591 ss = &sc->ss[slice];
3592 cmd.data0 =
3593 MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3594 cmd.data1 =
3595 MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3596 cmd.data2 = sizeof(struct mcp_irq_data);
3597 cmd.data2 |= (slice << 16);
3598 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3599 }
3600
3601 if (err != 0) {
3602 bus = sc->ss->fw_stats_dma.bus_addr;
3603 bus += offsetof(struct mcp_irq_data, send_done_count);
3604 cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3605 cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3606 err = mxge_send_cmd(sc,
3607 MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3608 &cmd);
3609 /* Firmware cannot support multicast without STATS_DMA_V2 */
3610 sc->fw_multicast_support = 0;
3611 } else {
3612 sc->fw_multicast_support = 1;
3613 }
3614
3615 if (err != 0) {
3616 device_printf(sc->dev, "failed to setup params\n");
3617 goto abort;
3618 }
3619
3620 for (slice = 0; slice < sc->num_slices; slice++) {
3621 err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3622 if (err != 0) {
3623 device_printf(sc->dev, "couldn't open slice %d\n",
3624 slice);
3625 goto abort;
3626 }
3627 }
3628
3629 /* Finally, start the firmware running */
3630 err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3631 if (err) {
3632 device_printf(sc->dev, "Couldn't bring up link\n");
3633 goto abort;
3634 }
3635#ifdef IFNET_BUF_RING
3636 for (slice = 0; slice < sc->num_slices; slice++) {
3637 ss = &sc->ss[slice];
2ab1b8a9
AE
3638 ss->if_flags |= IFF_RUNNING;
3639 ss->if_flags &= ~IFF_OACTIVE;
8892ea20
AE
3640 }
3641#endif
2ab1b8a9
AE
3642 sc->ifp->if_flags |= IFF_RUNNING;
3643 sc->ifp->if_flags &= ~IFF_OACTIVE;
8892ea20
AE
3644 callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3645
3646 return 0;
3647
3648
3649abort:
3650 mxge_free_mbufs(sc);
3651
3652 return err;
3653}
3654
3655static int
3656mxge_close(mxge_softc_t *sc)
3657{
3658 mxge_cmd_t cmd;
3659 int err, old_down_cnt;
3660#ifdef IFNET_BUF_RING
3661 struct mxge_slice_state *ss;
3662 int slice;
3663#endif
3664
cd0543ff 3665 ASSERT_SERIALIZED(sc->ifp->if_serializer);
8892ea20
AE
3666 callout_stop(&sc->co_hdl);
3667#ifdef IFNET_BUF_RING
3668 for (slice = 0; slice < sc->num_slices; slice++) {
3669 ss = &sc->ss[slice];
2ab1b8a9 3670 ss->if_flags &= ~IFF_RUNNING;
8892ea20
AE
3671 }
3672#endif
2ab1b8a9 3673 sc->ifp->if_flags &= ~IFF_RUNNING;
8892ea20
AE
3674 old_down_cnt = sc->down_cnt;
3675 wmb();
3676 err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3677 if (err) {
3678 device_printf(sc->dev, "Couldn't bring down link\n");
3679 }
3680 if (old_down_cnt == sc->down_cnt) {
3681 /* wait for down irq */
3682 DELAY(10 * sc->intr_coal_delay);
3683 }
3684 wmb();
3685 if (old_down_cnt == sc->down_cnt) {
3686 device_printf(sc->dev, "never got down irq\n");
3687 }
3688
3689 mxge_free_mbufs(sc);
3690
3691 return 0;
3692}
3693
3694static void
3695mxge_setup_cfg_space(mxge_softc_t *sc)
3696{
3697 device_t dev = sc->dev;
3698 int reg;
3699 uint16_t cmd, lnk, pectl;
3700
3701 /* find the PCIe link width and set max read request to 4KB*/
3702 if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
3703 lnk = pci_read_config(dev, reg + 0x12, 2);
3704 sc->link_width = (lnk >> 4) & 0x3f;
3705
3706 pectl = pci_read_config(dev, reg + 0x8, 2);
3707 pectl = (pectl & ~0x7000) | (5 << 12);
3708 pci_write_config(dev, reg + 0x8, pectl, 2);
3709 }
3710
3711 /* Enable DMA and Memory space access */
3712 pci_enable_busmaster(dev);
3713 cmd = pci_read_config(dev, PCIR_COMMAND, 2);
3714 cmd |= PCIM_CMD_MEMEN;
3715 pci_write_config(dev, PCIR_COMMAND, cmd, 2);
3716}
3717
3718static uint32_t
3719mxge_read_reboot(mxge_softc_t *sc)
3720{
3721 device_t dev = sc->dev;
3722 uint32_t vs;
3723
3724 /* find the vendor specific offset */
3725 if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
3726 device_printf(sc->dev,
3727 "could not find vendor specific offset\n");
3728 return (uint32_t)-1;
3729 }
3730 /* enable read32 mode */
3731 pci_write_config(dev, vs + 0x10, 0x3, 1);
3732 /* tell NIC which register to read */
3733 pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3734 return (pci_read_config(dev, vs + 0x14, 4));
3735}
3736
3737static int
3738mxge_watchdog_reset(mxge_softc_t *sc, int slice)
3739{
3740 struct pci_devinfo *dinfo;
3741 mxge_tx_ring_t *tx;
3742 int err;
3743 uint32_t reboot;
3744 uint16_t cmd;
3745
3746 err = ENXIO;
3747
3748 device_printf(sc->dev, "Watchdog reset!\n");
3749
3750 /*
3751 * check to see if the NIC rebooted. If it did, then all of
3752 * PCI config space has been reset, and things like the
3753 * busmaster bit will be zero. If this is the case, then we
3754 * must restore PCI config space before the NIC can be used
3755 * again
3756 */
3757 cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3758 if (cmd == 0xffff) {
3759 /*
3760 * maybe the watchdog caught the NIC rebooting; wait
3761 * up to 100ms for it to finish. If it does not come
3762 * back, then give up
3763 */
3764 DELAY(1000*100);
3765 cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3766 if (cmd == 0xffff) {
3767 device_printf(sc->dev, "NIC disappeared!\n");
3768 return (err);
3769 }
3770 }
3771 if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3772 /* print the reboot status */
3773 reboot = mxge_read_reboot(sc);
3774 device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3775 reboot);
3776 /* restore PCI configuration space */
3777 dinfo = device_get_ivars(sc->dev);
3778 pci_cfg_restore(sc->dev, dinfo);
3779
3780 /* and redo any changes we made to our config space */
3781 mxge_setup_cfg_space(sc);
3782
2ab1b8a9 3783 if (sc->ifp->if_flags & IFF_RUNNING) {
8892ea20
AE
3784 mxge_close(sc);
3785 err = mxge_open(sc);
3786 }
3787 } else {
3788 tx = &sc->ss[slice].tx;
3789 device_printf(sc->dev,
3790 "NIC did not reboot, slice %d ring state:\n",
3791 slice);
3792 device_printf(sc->dev,
3793 "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3794 tx->req, tx->done, tx->queue_active);
3795 device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3796 tx->activate, tx->deactivate);
3797 device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3798 tx->pkt_done,
3799 be32toh(sc->ss->fw_stats->send_done_count));
3800 device_printf(sc->dev, "not resetting\n");
3801 }
3802 return (err);
3803}
3804
3805static int
3806mxge_watchdog(mxge_softc_t *sc)
3807{
3808 mxge_tx_ring_t *tx;
3809 uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3810 int i, err = 0;
3811
3812 /* see if we have outstanding transmits, which
3813 have been pending for more than mxge_ticks */
3814 for (i = 0;
3815#ifdef IFNET_BUF_RING
3816 (i < sc->num_slices) && (err == 0);
3817#else
3818 (i < 1) && (err == 0);
3819#endif
3820 i++) {
3821 tx = &sc->ss[i].tx;
3822 if (tx->req != tx->done &&
3823 tx->watchdog_req != tx->watchdog_done &&
3824 tx->done == tx->watchdog_done) {
3825 /* check for pause blocking before resetting */
3826 if (tx->watchdog_rx_pause == rx_pause)
3827 err = mxge_watchdog_reset(sc, i);
3828 else
3829 device_printf(sc->dev, "Flow control blocking "
3830 "xmits, check link partner\n");
3831 }
3832
3833 tx->watchdog_req = tx->req;
3834 tx->watchdog_done = tx->done;
3835 tx->watchdog_rx_pause = rx_pause;
3836 }
3837
3838 if (sc->need_media_probe)
3839 mxge_media_probe(sc);
3840 return (err);
3841}
3842
3843static void
3844mxge_update_stats(mxge_softc_t *sc)
3845{
3846 struct mxge_slice_state *ss;
3847 u_long ipackets = 0;
3848 u_long opackets = 0;
3849#ifdef IFNET_BUF_RING
3850 u_long obytes = 0;
3851 u_long omcasts = 0;
3852 u_long odrops = 0;
3853#endif
3854 u_long oerrors = 0;
3855 int slice;
3856
3857 for (slice = 0; slice < sc->num_slices; slice++) {
3858 ss = &sc->ss[slice];
3859 ipackets += ss->ipackets;
3860 opackets += ss->opackets;
3861#ifdef IFNET_BUF_RING
3862 obytes += ss->obytes;
3863 omcasts += ss->omcasts;
3864 odrops += ss->tx.br->br_drops;
3865#endif
3866 oerrors += ss->oerrors;
3867 }
3868 sc->ifp->if_ipackets = ipackets;
3869 sc->ifp->if_opackets = opackets;
3870#ifdef IFNET_BUF_RING
3871 sc->ifp->if_obytes = obytes;
3872 sc->ifp->if_omcasts = omcasts;
3873 sc->ifp->if_snd.ifq_drops = odrops;
3874#endif
3875 sc->ifp->if_oerrors = oerrors;
3876}
3877
3878static void
3879mxge_tick(void *arg)
3880{
3881 mxge_softc_t *sc = arg;
3882 int err = 0;
3883
2e8181d0 3884 lwkt_serialize_enter(sc->ifp->if_serializer);
8892ea20
AE
3885 /* aggregate stats from different slices */
3886 mxge_update_stats(sc);
3887 if (!sc->watchdog_countdown) {
3888 err = mxge_watchdog(sc);
3889 sc->watchdog_countdown = 4;
3890 }
3891 sc->watchdog_countdown--;
3892 if (err == 0)
3893 callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
2e8181d0 3894 lwkt_serialize_exit(sc->ifp->if_serializer);
8892ea20
AE
3895}
3896
/*
 * Media change request handler.  Every request is rejected with EINVAL;
 * the ifp argument is not examined.
 */
static int
mxge_media_change(struct ifnet *ifp)
{
	return (EINVAL);
}
3902
3903static int
3904mxge_change_mtu(mxge_softc_t *sc, int mtu)
3905{
3906 struct ifnet *ifp = sc->ifp;
3907 int real_mtu, old_mtu;
3908 int err = 0;
3909
cd0543ff
AE
3910 if (ifp->if_serializer)
3911 ASSERT_SERIALIZED(ifp->if_serializer);
8892ea20 3912
b915556e 3913 real_mtu = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
8892ea20
AE
3914 if ((real_mtu > sc->max_mtu) || real_mtu < 60)
3915 return EINVAL;
8892ea20
AE
3916 old_mtu = ifp->if_mtu;
3917 ifp->if_mtu = mtu;
2ab1b8a9 3918 if (ifp->if_flags & IFF_RUNNING) {
8892ea20
AE
3919 mxge_close(sc);
3920 err = mxge_open(sc);
3921 if (err != 0) {
3922 ifp->if_mtu = old_mtu;
3923 mxge_close(sc);
3924 (void) mxge_open(sc);
3925 }
3926 }
8892ea20
AE
3927 return err;
3928}
3929
3930static void
3931mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3932{
3933 mxge_softc_t *sc = ifp->if_softc;
3934
3935
3936 if (sc == NULL)
3937 return;
3938 ifmr->ifm_status = IFM_AVALID;
3939 ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
3940 ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
3941 ifmr->ifm_active |= sc->link_state ? IFM_FDX : 0;
3942}
3943
3944static int
137195a6 3945mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data, struct ucred *cr)
8892ea20
AE
3946{
3947 mxge_softc_t *sc = ifp->if_softc;
3948 struct ifreq *ifr = (struct ifreq *)data;
3949 int err, mask;