netmap: change header includes
[dragonfly.git] / sys / net / netmap / netmap_vale.c
/*
 * Copyright (C) 2013 Universita` di Pisa. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */


/*
 * This module implements the VALE switch for netmap

--- VALE SWITCH ---

NMG_LOCK() serializes all modifications to switches and ports.
A switch cannot be deleted until all ports are gone.

For each switch, an SX lock (RWlock on linux) protects
deletion of ports. When configuring or deleting a port, the
lock is acquired in exclusive mode (after holding NMG_LOCK).
When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
The lock is held throughout the entire forwarding cycle,
during which the thread may incur a page fault.
Hence it is important that sleepable shared locks are used.

On the rx ring, the per-port lock is grabbed initially to reserve
a number of slots in the ring, then the lock is released,
packets are copied from source to destination, and then
the lock is acquired again and the receive ring is updated.
(A similar thing is done on the tx ring for NIC and host stack
ports attached to the switch)

 */
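
/*
 * A minimal sketch (illustration only, compiled out) of the
 * reserve/copy/publish pattern described above, using the helpers that
 * appear later in this file (kring->q_lock, nkr_hwlease, nm_kr_space(),
 * nm_kr_lease()). nm_bdg_flush() below is the real implementation.
 */
#if 0
	lockmgr(&kring->q_lock, LK_EXCLUSIVE);
	start = kring->nkr_hwlease;		/* first slot we may fill */
	howmany = nm_kr_space(kring, 1);	/* slots we can claim */
	lease_idx = nm_kr_lease(kring, howmany, 1);
	lockmgr(&kring->q_lock, LK_RELEASE);

	/* ... copy packets into the claimed slots, lock not held ... */

	lockmgr(&kring->q_lock, LK_EXCLUSIVE);
	kring->nkr_leases[lease_idx] = next;	/* report completion */
	/* if all earlier leases are complete, advance nr_hwavail
	 * and notify the receiver
	 */
	lockmgr(&kring->q_lock, LK_RELEASE);
#endif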

/*
 * OS-specific code that is used only within this file.
 * Other OS-specific code that must be accessed by drivers
 * is present in netmap_kern.h
 */


#include <sys/types.h>
#include <sys/errno.h>
#include <sys/param.h>		/* defines used in kernel.h */
#include <sys/kernel.h>		/* types used in module initialization */
#include <sys/conf.h>		/* cdevsw struct, UID, GID */
#include <sys/sockio.h>
#include <sys/socketvar.h>	/* struct socket */
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/lock.h>
#include <sys/socket.h>		/* sockaddrs */
#include <sys/sysctl.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/bpf.h>		/* BIOCIMMEDIATE */
#include <sys/bus.h>		/* bus_dmamap_* */
#include <sys/endian.h>
#include <sys/refcount.h>


#define BDG_RWLOCK_T		struct lock

#define BDG_RWINIT(b)		\
	lockinit(&(b)->bdg_lock, "bdg lock", 0, LK_CANRECURSE)
#define BDG_WLOCK(b)		lockmgr(&(b)->bdg_lock, LK_EXCLUSIVE)
#define BDG_WUNLOCK(b)		lockmgr(&(b)->bdg_lock, LK_RELEASE)
#define BDG_RLOCK(b)		lockmgr(&(b)->bdg_lock, LK_SHARED)
/* lockmgr() returns 0 on success, so invert the result: callers
 * treat a non-zero BDG_RTRYLOCK() as "lock acquired".
 */
#define BDG_RTRYLOCK(b)		(lockmgr(&(b)->bdg_lock, LK_SHARED|LK_NOWAIT) == 0)
#define BDG_RUNLOCK(b)		lockmgr(&(b)->bdg_lock, LK_RELEASE)
#define BDG_RWDESTROY(b)	lockuninit(&(b)->bdg_lock)

/*
 * common headers
 */

#include <net/netmap.h>

#include <net/netmap/netmap_kern.h>
#include <net/netmap/netmap_mem2.h>

#ifdef WITH_VALE

/*
 * system parameters (most of them in netmap_kern.h)
 * NM_NAME	prefix for switch port names, default "vale"
 * NM_BDG_MAXPORTS	number of ports
 * NM_BRIDGES	max number of switches in the system.
 *	XXX should become a sysctl or tunable
 *
 * Switch ports are named valeX:Y where X is the switch name and Y
 * is the port. If Y matches a physical interface name, the port is
 * connected to a physical device. (See the usage sketch after these
 * definitions.)
 *
 * Unlike physical interfaces, switch ports use their own memory region
 * for rings and buffers.
 * The virtual interfaces use per-queue locks instead of the core lock.
 * In the tx loop, we aggregate traffic in batches to make all operations
 * faster. The batch size is bridge_batch.
 */
#define NM_BDG_MAXRINGS		16	/* XXX unclear how many. */
#define NM_BDG_MAXSLOTS		4096	/* XXX same as above */
#define NM_BRIDGE_RINGSIZE	1024	/* in the device */
#define NM_BDG_HASH		1024	/* forwarding table entries */
#define NM_BDG_BATCH		1024	/* entries in the forwarding buffer */
#define NM_MULTISEG		64	/* max size of a chain of bufs */
/* actual size of the tables */
#define NM_BDG_BATCH_MAX	(NM_BDG_BATCH + NM_MULTISEG)
/* NM_FT_NULL terminates a list of slots in the ft */
#define NM_FT_NULL		NM_BDG_BATCH_MAX
#define NM_BRIDGES		8	/* number of bridges */
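
/*
 * Illustration only (a hedged, 2013-era userspace sketch, not part of
 * this file): attaching to port 1 of switch "vale0" just means passing
 * the name to NIOCREGIF; the port springs into existence on first
 * reference (see netmap_get_bdg_na() below).
 */
#if 0
	/* userspace, not kernel code: */
	struct nmreq nmr;
	int fd = open("/dev/netmap", O_RDWR);

	bzero(&nmr, sizeof(nmr));
	nmr.nr_version = NETMAP_API;
	strncpy(nmr.nr_name, "vale0:1", sizeof(nmr.nr_name));
	if (ioctl(fd, NIOCREGIF, &nmr) < 0)	/* creates the port if needed */
		err(1, "NIOCREGIF");
#endif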


/*
 * bridge_batch is set via sysctl to the max batch size to be
 * used in the bridge. The actual value may be larger as the
 * last packet in the block may overflow the size.
 */
int bridge_batch = NM_BDG_BATCH; /* bridge batch size */
SYSCTL_DECL(_dev_netmap);
SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0, "");


static int bdg_netmap_attach(struct netmap_adapter *);
static int bdg_netmap_reg(struct netmap_adapter *na, int onoff);
static int netmap_bwrap_attach(struct ifnet *, struct ifnet *);
static int netmap_bwrap_register(struct netmap_adapter *, int onoff);
int kern_netmap_regif(struct nmreq *nmr);

/*
 * Each transmit queue accumulates a batch of packets into
 * a structure before forwarding. Packets to the same
 * destination are put in a list using ft_next as a link field.
 * ft_frags and ft_next are valid only on the first fragment.
 */
struct nm_bdg_fwd {	/* forwarding entry for a bridge */
	void *ft_buf;		/* netmap or indirect buffer */
	uint8_t ft_frags;	/* how many fragments (only on 1st frag) */
	uint8_t _ft_port;	/* dst port (unused) */
	uint16_t ft_flags;	/* flags, e.g. indirect */
	uint16_t ft_len;	/* src fragment len */
	uint16_t ft_next;	/* next packet to same destination */
};

/*
 * For each output interface, nm_bdg_q is used to construct a list.
 * bq_len is the number of output buffers (we can have coalescing
 * during the copy).
 */
struct nm_bdg_q {
	uint16_t bq_head;
	uint16_t bq_tail;
	uint32_t bq_len;	/* number of buffers */
};

/* XXX revise this */
struct nm_hash_ent {
	uint64_t	mac;	/* the top 2 bytes are the epoch */
	uint64_t	ports;
};

/*
 * nm_bridge is a descriptor for a VALE switch.
 * Interfaces for a bridge are all in bdg_ports[].
 * The array has fixed size, an empty entry does not terminate
 * the search, but lookups only occur on attach/detach so we
 * don't mind if they are slow.
 *
 * The bridge is non blocking on the transmit ports: excess
 * packets are dropped if there is no room on the output port.
 *
 * bdg_lock protects accesses to the bdg_ports array.
 * This is a rw lock (or equivalent).
 */
struct nm_bridge {
	/* XXX what is the proper alignment/layout ? */
	BDG_RWLOCK_T	bdg_lock;	/* protects bdg_ports */
	int		bdg_namelen;
	uint32_t	bdg_active_ports; /* 0 means free */
	char		bdg_basename[IFNAMSIZ];

	/* Indexes of active ports (up to active_ports)
	 * and all other remaining ports.
	 */
	uint8_t bdg_port_index[NM_BDG_MAXPORTS];

	struct netmap_vp_adapter *bdg_ports[NM_BDG_MAXPORTS];


	/*
	 * The function to decide the destination port.
	 * It returns either the index of the destination port,
	 * NM_BDG_BROADCAST to broadcast this packet, or NM_BDG_NOPORT
	 * not to forward this packet. ring_nr is the source ring index,
	 * and the function may overwrite this value to forward this
	 * packet to a different ring index.
	 * This function must be set by netmap_bdg_ctl().
	 */
	bdg_lookup_fn_t nm_bdg_lookup;

	/* the forwarding table, MAC+ports.
	 * XXX should be changed to an argument to be passed to
	 * the lookup function, and allocated on attach
	 */
	struct nm_hash_ent ht[NM_BDG_HASH];
};


/*
 * XXX in principle nm_bridges could be created dynamically
 * Right now we have a static array and deletions are protected
 * by an exclusive lock.
 */
struct nm_bridge nm_bridges[NM_BRIDGES];


/*
 * A few functions to tell which kind of port we are using.
 * XXX should we hold a lock ?
 *
 * nma_is_vp()		virtual port
 * nma_is_host()	port connected to the host stack
 * nma_is_hw()		port connected to a NIC
 * nma_is_generic()	generic netmap adapter XXX stop this madness
 */
static __inline int
nma_is_vp(struct netmap_adapter *na)
{
	return na->nm_register == bdg_netmap_reg;
}


static __inline int
nma_is_host(struct netmap_adapter *na)
{
	return na->nm_register == NULL;
}


static __inline int
nma_is_hw(struct netmap_adapter *na)
{
	/* In case of sw adapter, nm_register is NULL */
	return !nma_is_vp(na) && !nma_is_host(na) && !nma_is_generic(na);
}

static __inline int
nma_is_bwrap(struct netmap_adapter *na)
{
	return na->nm_register == netmap_bwrap_register;
}


/*
 * this is a slightly optimized copy routine which rounds
 * to multiple of 64 bytes and is often faster than dealing
 * with other odd sizes. We assume there is enough room
 * in the source and destination buffers.
 *
 * XXX only for multiples of 64 bytes, non overlapped.
 */
static inline void
pkt_copy(void *_src, void *_dst, int l)
{
	uint64_t *src = _src;
	uint64_t *dst = _dst;
	if (unlikely(l >= 1024)) {
		memcpy(dst, src, l);
		return;
	}
	for (; likely(l > 0); l -= 64) {
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
	}
}
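
/*
 * Callers are expected to round the length up themselves, as
 * nm_bdg_flush() does below (a descriptive note, not new API):
 *
 *	pkt_copy(src, dst, (len + 63) & ~63);
 *
 * so the unrolled loop above never copies a partial 64-byte chunk.
 */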


/*
 * locate a bridge among the existing ones.
 * MUST BE CALLED WITH NMG_LOCK()
 *
 * a ':' in the name terminates the bridge name; otherwise just the
 * NM_NAME prefix is used. E.g. "vale1:eth0" selects bridge "vale1".
 * We assume that this is called with a name of at least NM_NAME chars.
 */
static struct nm_bridge *
nm_find_bridge(const char *name, int create)
{
	int i, l, namelen;
	struct nm_bridge *b = NULL;

	NMG_LOCK_ASSERT();

	namelen = strlen(NM_NAME);	/* base length */
	l = name ? strlen(name) : 0;	/* actual length */
	if (l < namelen) {
		D("invalid bridge name %s", name ? name : "(null)");
		return NULL;
	}
	for (i = namelen + 1; i < l; i++) {
		if (name[i] == ':') {
			namelen = i;
			break;
		}
	}
	if (namelen >= IFNAMSIZ)
		namelen = IFNAMSIZ;
	ND("--- prefix is '%.*s' ---", namelen, name);

	/* lookup the name, remember empty slot if there is one */
	for (i = 0; i < NM_BRIDGES; i++) {
		struct nm_bridge *x = nm_bridges + i;

		if (x->bdg_active_ports == 0) {
			if (create && b == NULL)
				b = x;	/* record empty slot */
		} else if (x->bdg_namelen != namelen) {
			continue;
		} else if (strncmp(name, x->bdg_basename, namelen) == 0) {
			ND("found '%.*s' at %d", namelen, name, i);
			b = x;
			break;
		}
	}
	if (i == NM_BRIDGES && b) { /* name not found, can create entry */
		/* initialize the bridge */
		strncpy(b->bdg_basename, name, namelen);
		ND("create new bridge %s with ports %d", b->bdg_basename,
			b->bdg_active_ports);
		b->bdg_namelen = namelen;
		b->bdg_active_ports = 0;
		for (i = 0; i < NM_BDG_MAXPORTS; i++)
			b->bdg_port_index[i] = i;
		/* set the default function */
		b->nm_bdg_lookup = netmap_bdg_learning;
		/* reset the MAC address table */
		bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH);
	}
	return b;
}


/*
 * Free the forwarding tables for rings attached to switch ports.
 */
static void
nm_free_bdgfwd(struct netmap_adapter *na)
{
	int nrings, i;
	struct netmap_kring *kring;

	NMG_LOCK_ASSERT();
	nrings = nma_is_vp(na) ? na->num_tx_rings : na->num_rx_rings;
	kring = nma_is_vp(na) ? na->tx_rings : na->rx_rings;
	for (i = 0; i < nrings; i++) {
		if (kring[i].nkr_ft) {
			kfree(kring[i].nkr_ft, M_DEVBUF);
			kring[i].nkr_ft = NULL; /* protect from freeing twice */
		}
	}
}


/*
 * Allocate the forwarding tables for the rings attached to the bridge ports.
 */
static int
nm_alloc_bdgfwd(struct netmap_adapter *na)
{
	int nrings, l, i, num_dstq;
	struct netmap_kring *kring;

	NMG_LOCK_ASSERT();
	/* all port:rings + broadcast */
	num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1;
	l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX;
	l += sizeof(struct nm_bdg_q) * num_dstq;
	l += sizeof(uint16_t) * NM_BDG_BATCH_MAX;

	nrings = na->num_tx_rings + 1;
	kring = na->tx_rings;
	for (i = 0; i < nrings; i++) {
		struct nm_bdg_fwd *ft;
		struct nm_bdg_q *dstq;
		int j;

		ft = kmalloc(l, M_DEVBUF, M_NOWAIT | M_ZERO);
		if (!ft) {
			nm_free_bdgfwd(na);
			return ENOMEM;
		}
		dstq = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
		for (j = 0; j < num_dstq; j++) {
			dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL;
			dstq[j].bq_len = 0;
		}
		kring[i].nkr_ft = ft;
	}
	return 0;
}
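
/*
 * Layout of the per-ring work area allocated above (a descriptive
 * sketch, reconstructed from the pointer arithmetic in
 * nm_alloc_bdgfwd() and nm_bdg_flush()):
 *
 *	ft[NM_BDG_BATCH_MAX]	struct nm_bdg_fwd, the packet batch
 *	dstq[num_dstq]		struct nm_bdg_q, one queue per port:ring,
 *				plus one for broadcast
 *	dsts[NM_BDG_BATCH_MAX]	uint16_t, indexes of the active queues
 */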


static void
netmap_bdg_detach_common(struct nm_bridge *b, int hw, int sw)
{
	int s_hw = hw, s_sw = sw;
	int i, lim = b->bdg_active_ports;
	uint8_t tmp[NM_BDG_MAXPORTS];

	/*
	New algorithm:
	    make a copy of bdg_port_index;
	    lookup NA(ifp)->bdg_port and SWNA(ifp)->bdg_port
	    in the array of bdg_port_index, replacing them with
	    entries from the bottom of the array;
	    decrement bdg_active_ports;
	    acquire BDG_WLOCK() and copy back the array.
	 */

	D("detach %d and %d (lim %d)", hw, sw, lim);
	/* make a copy of the list of active ports, update it,
	 * and then copy back within BDG_WLOCK().
	 */
	memcpy(tmp, b->bdg_port_index, sizeof(tmp));
	for (i = 0; (hw >= 0 || sw >= 0) && i < lim; ) {
		if (hw >= 0 && tmp[i] == hw) {
			ND("detach hw %d at %d", hw, i);
			lim--;			/* point to last active port */
			tmp[i] = tmp[lim];	/* swap with i */
			tmp[lim] = hw;		/* now this is inactive */
			hw = -1;
		} else if (sw >= 0 && tmp[i] == sw) {
			ND("detach sw %d at %d", sw, i);
			lim--;
			tmp[i] = tmp[lim];
			tmp[lim] = sw;
			sw = -1;
		} else {
			i++;
		}
	}
	if (hw >= 0 || sw >= 0) {
		D("XXX delete failed hw %d sw %d, should panic...", hw, sw);
	}

	BDG_WLOCK(b);
	b->bdg_ports[s_hw] = NULL;
	if (s_sw >= 0) {
		b->bdg_ports[s_sw] = NULL;
	}
	memcpy(b->bdg_port_index, tmp, sizeof(tmp));
	b->bdg_active_ports = lim;
	BDG_WUNLOCK(b);

	ND("now %d active ports", lim);
	if (lim == 0) {
		ND("marking bridge %s as free", b->bdg_basename);
		b->nm_bdg_lookup = NULL;
	}
}

static void
netmap_adapter_vp_dtor(struct netmap_adapter *na)
{
	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na;
	struct nm_bridge *b = vpna->na_bdg;
	struct ifnet *ifp = na->ifp;

	ND("%s has %d references", NM_IFPNAME(ifp), na->na_refcount);

	if (b) {
		netmap_bdg_detach_common(b, vpna->bdg_port, -1);
	}

	bzero(ifp, sizeof(*ifp));
	kfree(ifp, M_DEVBUF);
	na->ifp = NULL;
}

int
netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
{
	const char *name = nmr->nr_name;
	struct ifnet *ifp;
	int error = 0;
	struct netmap_adapter *ret;
	struct netmap_vp_adapter *vpna;
	struct nm_bridge *b;
	int i, j, cand = -1, cand2 = -1;
	int needed;

	*na = NULL;	/* default return value */

	/* first try to see if this is a bridge port. */
	NMG_LOCK_ASSERT();
	if (strncmp(name, NM_NAME, sizeof(NM_NAME) - 1)) {
		return 0;	/* no error, but no VALE prefix */
	}

	b = nm_find_bridge(name, create);
	if (b == NULL) {
		D("no bridges available for '%s'", name);
		return (ENXIO);
	}

	/* Now we are sure that name starts with the bridge's name,
	 * lookup the port in the bridge. We need to scan the entire
	 * list. It is not important to hold a WLOCK on the bridge
	 * during the search because NMG_LOCK already guarantees
	 * that there are no other possible writers.
	 */

	/* lookup in the local list of ports */
	for (j = 0; j < b->bdg_active_ports; j++) {
		i = b->bdg_port_index[j];
		vpna = b->bdg_ports[i];
		// KASSERT(na != NULL);
		ifp = vpna->up.ifp;
		/* XXX make sure the name only contains one : */
		if (!strcmp(NM_IFPNAME(ifp), name)) {
			netmap_adapter_get(&vpna->up);
			ND("found existing if %s refs %d", name,
				vpna->na_bdg_refcount);
			*na = (struct netmap_adapter *)vpna;
			return 0;
		}
	}
	/* not found, should we create it? */
	if (!create)
		return ENXIO;
	/* yes we should, see if we have space to attach entries */
	needed = 2; /* in some cases we only need 1 */
	if (b->bdg_active_ports + needed >= NM_BDG_MAXPORTS) {
		D("bridge full %d, cannot create new port", b->bdg_active_ports);
		return EINVAL;
	}
	/* record the next two ports available, but do not allocate yet */
	cand = b->bdg_port_index[b->bdg_active_ports];
	cand2 = b->bdg_port_index[b->bdg_active_ports + 1];
	ND("+++ bridge %s port %s used %d avail %d %d",
		b->bdg_basename, name, b->bdg_active_ports, cand, cand2);

	/*
	 * try to see if there is a matching NIC with this name
	 * (after the bridge's name)
	 */
	ifp = ifunit(name + b->bdg_namelen + 1);
	if (!ifp) { /* this is a virtual port */
		/* Create a temporary NA with arguments, then
		 * bdg_netmap_attach() will allocate the real one
		 * and attach it to the ifp
		 */
		struct netmap_adapter tmp_na;

		if (nmr->nr_cmd) {
			/* nr_cmd must be 0 for a virtual port */
			return EINVAL;
		}
		bzero(&tmp_na, sizeof(tmp_na));
		/* bound checking */
		tmp_na.num_tx_rings = nmr->nr_tx_rings;
		nm_bound_var(&tmp_na.num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
		nmr->nr_tx_rings = tmp_na.num_tx_rings; // write back
		tmp_na.num_rx_rings = nmr->nr_rx_rings;
		nm_bound_var(&tmp_na.num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
		nmr->nr_rx_rings = tmp_na.num_rx_rings; // write back
		nm_bound_var(&nmr->nr_tx_slots, NM_BRIDGE_RINGSIZE,
				1, NM_BDG_MAXSLOTS, NULL);
		tmp_na.num_tx_desc = nmr->nr_tx_slots;
		nm_bound_var(&nmr->nr_rx_slots, NM_BRIDGE_RINGSIZE,
				1, NM_BDG_MAXSLOTS, NULL);
		tmp_na.num_rx_desc = nmr->nr_rx_slots;

		/* create a struct ifnet for the new port.
		 * need M_NOWAIT as we are under nma_lock
		 */
		ifp = kmalloc(sizeof(*ifp), M_DEVBUF, M_NOWAIT | M_ZERO);
		if (!ifp)
			return ENOMEM;

		strcpy(ifp->if_xname, name);
		tmp_na.ifp = ifp;
		/* bdg_netmap_attach creates a struct netmap_adapter */
		error = bdg_netmap_attach(&tmp_na);
		if (error) {
			D("error %d", error);
			kfree(ifp, M_DEVBUF);
			return error;
		}
		ret = NA(ifp);
		cand2 = -1; /* only need one port */
	} else { /* this is a NIC */
		struct ifnet *fake_ifp;

		error = netmap_get_hw_na(ifp, &ret);
		if (error || ret == NULL)
			goto out;

		/* make sure the NIC is not already in use */
		if (NETMAP_OWNED_BY_ANY(ret)) {
			D("NIC %s busy, cannot attach to bridge",
				NM_IFPNAME(ifp));
			error = EINVAL;
			goto out;
		}
		/* create a fake interface */
		fake_ifp = kmalloc(sizeof(*ifp), M_DEVBUF, M_NOWAIT | M_ZERO);
		if (!fake_ifp) {
			error = ENOMEM;
			goto out;
		}
		strcpy(fake_ifp->if_xname, name);
		error = netmap_bwrap_attach(fake_ifp, ifp);
		if (error) {
			kfree(fake_ifp, M_DEVBUF);
			goto out;
		}
		ret = NA(fake_ifp);
		if (nmr->nr_arg1 != NETMAP_BDG_HOST)
			cand2 = -1; /* only need one port */
#if 0
		if_rele(ifp);
#endif
	}
	vpna = (struct netmap_vp_adapter *)ret;

	BDG_WLOCK(b);
	vpna->bdg_port = cand;
	ND("NIC %p to bridge port %d", vpna, cand);
	/* bind the port to the bridge (virtual ports are not active) */
	b->bdg_ports[cand] = vpna;
	vpna->na_bdg = b;
	b->bdg_active_ports++;
	if (cand2 >= 0) {
		struct netmap_vp_adapter *hostna = vpna + 1;
		/* also bind the host stack to the bridge */
		b->bdg_ports[cand2] = hostna;
		hostna->bdg_port = cand2;
		hostna->na_bdg = b;
		b->bdg_active_ports++;
		ND("host %p to bridge port %d", hostna, cand2);
	}
	ND("if %s refs %d", name, vpna->up.na_refcount);
	BDG_WUNLOCK(b);
	*na = ret;
	netmap_adapter_get(ret);
	return 0;

out:
#if 0
	if_rele(ifp);
#endif

	return error;
}


/* Process NETMAP_BDG_ATTACH and NETMAP_BDG_DETACH */
static int
nm_bdg_attach(struct nmreq *nmr)
{
	struct netmap_adapter *na;
	struct netmap_if *nifp;
	struct netmap_priv_d *npriv;
	struct netmap_bwrap_adapter *bna;
	int error;

	npriv = kmalloc(sizeof(*npriv), M_DEVBUF, M_NOWAIT|M_ZERO);
	if (npriv == NULL)
		return ENOMEM;
	NMG_LOCK();
	/* XXX probably netmap_get_bdg_na() */
	error = netmap_get_na(nmr, &na, 1 /* create if not exists */);
	if (error) /* no device, or another bridge or user owns the device */
		goto unlock_exit;
	/* netmap_get_na() sets na_bdg if this is a physical interface
	 * that we can attach to a switch.
	 */
	if (!nma_is_bwrap(na)) {
		/* got a reference to a virtual port or direct access to
		 * a NIC; perhaps no bridge prefix or a wrong NIC name
		 * was specified.
		 */
		error = EINVAL;
		goto unref_exit;
	}

	if (na->active_fds > 0) { /* already registered */
		error = EBUSY;
		goto unref_exit;
	}

	nifp = netmap_do_regif(npriv, na, nmr->nr_ringid, &error);
	if (!nifp) {
		goto unref_exit;
	}

	bna = (struct netmap_bwrap_adapter*)na;
	bna->na_kpriv = npriv;
	NMG_UNLOCK();
	ND("registered %s to netmap-mode", NM_IFPNAME(na->ifp));
	return 0;

unref_exit:
	netmap_adapter_put(na);
unlock_exit:
	NMG_UNLOCK();
	bzero(npriv, sizeof(*npriv));
	kfree(npriv, M_DEVBUF);
	return error;
}

static int
nm_bdg_detach(struct nmreq *nmr)
{
	struct netmap_adapter *na;
	int error;
	struct netmap_bwrap_adapter *bna;
	int last_instance;

	NMG_LOCK();
	error = netmap_get_na(nmr, &na, 0 /* don't create */);
	if (error) { /* no device, or another bridge or user owns the device */
		goto unlock_exit;
	}
	if (!nma_is_bwrap(na)) {
		/* got a reference to a virtual port or direct access to
		 * a NIC; perhaps no bridge prefix or a wrong NIC name
		 * was specified.
		 */
		error = EINVAL;
		goto unref_exit;
	}
	bna = (struct netmap_bwrap_adapter *)na;

	if (na->active_fds == 0) { /* not registered */
		error = EINVAL;
		goto unref_exit;
	}

	last_instance = netmap_dtor_locked(bna->na_kpriv); /* unregister */
	if (!last_instance) {
		D("--- error, trying to detach an entry with active mmaps");
		error = EINVAL;
	} else {
		struct netmap_priv_d *npriv = bna->na_kpriv;

		bna->na_kpriv = NULL;
		D("deleting priv");

		bzero(npriv, sizeof(*npriv));
		kfree(npriv, M_DEVBUF);
	}

unref_exit:
	netmap_adapter_put(na);
unlock_exit:
	NMG_UNLOCK();
	return error;

}


/* exported to kernel callers, e.g. OVS ?
 * Entry point.
 * Called without NMG_LOCK.
 */
int
netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func)
{
	struct nm_bridge *b;
	struct netmap_vp_adapter *na;
	struct ifnet *iter;
	char *name = nmr->nr_name;
	int cmd = nmr->nr_cmd, namelen = strlen(name);
	int error = 0, i, j;

	switch (cmd) {
	case NETMAP_BDG_ATTACH:
		error = nm_bdg_attach(nmr);
		break;

	case NETMAP_BDG_DETACH:
		error = nm_bdg_detach(nmr);
		break;

	case NETMAP_BDG_LIST:
		/* this is used to enumerate bridges and ports */
		if (namelen) { /* look up indexes of bridge and port */
			if (strncmp(name, NM_NAME, strlen(NM_NAME))) {
				error = EINVAL;
				break;
			}
			NMG_LOCK();
			b = nm_find_bridge(name, 0 /* don't create */);
			if (!b) {
				error = ENOENT;
				NMG_UNLOCK();
				break;
			}

			error = ENOENT;
			for (j = 0; j < b->bdg_active_ports; j++) {
				i = b->bdg_port_index[j];
				na = b->bdg_ports[i];
				if (na == NULL) {
					D("---AAAAAAAAARGH-------");
					continue;
				}
				iter = na->up.ifp;
				/* the former and the latter identify a
				 * virtual port and a NIC, respectively
				 */
				if (!strcmp(iter->if_xname, name)) {
					/* bridge index */
					nmr->nr_arg1 = b - nm_bridges;
					nmr->nr_arg2 = i; /* port index */
					error = 0;
					break;
				}
			}
			NMG_UNLOCK();
		} else {
			/* return the first non-empty entry starting from
			 * bridge nr_arg1 and port nr_arg2.
			 *
			 * Users can detect the end of the same bridge by
			 * seeing the new and old value of nr_arg1, and can
			 * detect the end of all the bridges by error != 0
			 */
			i = nmr->nr_arg1;
			j = nmr->nr_arg2;

			NMG_LOCK();
			for (error = ENOENT; i < NM_BRIDGES; i++) {
				b = nm_bridges + i;
				if (j >= b->bdg_active_ports) {
					j = 0; /* following bridges scan from 0 */
					continue;
				}
				nmr->nr_arg1 = i;
				nmr->nr_arg2 = j;
				j = b->bdg_port_index[j];
				na = b->bdg_ports[j];
				iter = na->up.ifp;
				strncpy(name, iter->if_xname, (size_t)IFNAMSIZ);
				error = 0;
				break;
			}
			NMG_UNLOCK();
		}
		break;

	case NETMAP_BDG_LOOKUP_REG:
		/* register a lookup function to the given bridge.
		 * nmr->nr_name may be just the bridge name (including ':'
		 * if it is not just NM_NAME).
		 */
		if (!func) {
			error = EINVAL;
			break;
		}
		NMG_LOCK();
		b = nm_find_bridge(name, 0 /* don't create */);
		if (!b) {
			error = EINVAL;
		} else {
			b->nm_bdg_lookup = func;
		}
		NMG_UNLOCK();
		break;

	default:
		D("invalid cmd (nmr->nr_cmd) (0x%x)", cmd);
		error = EINVAL;
		break;
	}
	return error;
}
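
/*
 * Illustration only (a hedged sketch of what an in-kernel client such
 * as the OVS datapath mentioned above might do; my_lookup and
 * my_register_lookup are hypothetical names, not part of this file):
 * replace the default learning function on bridge "vale0" with one
 * that floods every frame. The signature matches bdg_lookup_fn_t as
 * implemented by netmap_bdg_learning() below.
 */
#if 0
static u_int
my_lookup(char *buf, u_int buf_len, uint8_t *dst_ring,
	struct netmap_vp_adapter *na)
{
	*dst_ring = 0;			/* everything goes to ring 0 */
	return NM_BDG_BROADCAST;	/* flood every frame */
}

static int
my_register_lookup(void)
{
	struct nmreq nmr;

	bzero(&nmr, sizeof(nmr));
	strncpy(nmr.nr_name, "vale0:", sizeof(nmr.nr_name));
	nmr.nr_cmd = NETMAP_BDG_LOOKUP_REG;
	return netmap_bdg_ctl(&nmr, my_lookup);
}
#endif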


static int
netmap_vp_krings_create(struct netmap_adapter *na)
{
	u_int ntx, nrx, tailroom;
	int error, i;
	uint32_t *leases;

	/* XXX vps do not need host rings,
	 * but we crash if we don't have one
	 */
	ntx = na->num_tx_rings + 1;
	nrx = na->num_rx_rings + 1;

	/*
	 * Leases are attached to RX rings on vale ports
	 */
	tailroom = sizeof(uint32_t) * na->num_rx_desc * nrx;

	error = netmap_krings_create(na, ntx, nrx, tailroom);
	if (error)
		return error;

	leases = na->tailroom;

	for (i = 0; i < nrx; i++) { /* Receive rings */
		na->rx_rings[i].nkr_leases = leases;
		leases += na->num_rx_desc;
	}

	error = nm_alloc_bdgfwd(na);
	if (error) {
		netmap_krings_delete(na);
		return error;
	}

	return 0;
}

static void
netmap_vp_krings_delete(struct netmap_adapter *na)
{
	nm_free_bdgfwd(na);
	netmap_krings_delete(na);
}


static int
nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n,
	struct netmap_vp_adapter *na, u_int ring_nr);


/*
 * Grab packets from a kring, move them into the ft structure
 * associated to the tx (input) port. Max one instance per port,
 * filtered on input (ioctl, poll or XXX).
 * Returns the next position in the ring.
 */
static int
nm_bdg_preflush(struct netmap_vp_adapter *na, u_int ring_nr,
	struct netmap_kring *kring, u_int end)
{
	struct netmap_ring *ring = kring->ring;
	struct nm_bdg_fwd *ft;
	u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1;
	u_int ft_i = 0;		/* start from 0 */
	u_int frags = 1;	/* how many frags ? */
	struct nm_bridge *b = na->na_bdg;

	/* To protect against modifications to the bridge we acquire a
	 * shared lock, waiting if we can sleep (if the source port is
	 * attached to a user process) or with a trylock otherwise (NICs).
	 */
	ND("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j);
	if (na->up.na_flags & NAF_BDG_MAYSLEEP)
		BDG_RLOCK(b);
	else if (!BDG_RTRYLOCK(b))
		return 0;
	ND(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j);
	ft = kring->nkr_ft;

	for (; likely(j != end); j = nm_next(j, lim)) {
		struct netmap_slot *slot = &ring->slot[j];
		char *buf;

		ft[ft_i].ft_len = slot->len;
		ft[ft_i].ft_flags = slot->flags;

		ND("flags is 0x%x", slot->flags);
		/* this slot goes into a list so initialize the link field */
		ft[ft_i].ft_next = NM_FT_NULL;
		buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ?
			(void *)(uintptr_t)slot->ptr : BDG_NMB(&na->up, slot);
		prefetch(buf);
		++ft_i;
		if (slot->flags & NS_MOREFRAG) {
			frags++;
			continue;
		}
		if (unlikely(netmap_verbose && frags > 1))
			RD(5, "%d frags at %d", frags, ft_i - frags);
		ft[ft_i - frags].ft_frags = frags;
		frags = 1;
		if (unlikely((int)ft_i >= bridge_batch))
			ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
	}
	if (frags > 1) {
		D("truncate incomplete fragment at %d (%d frags)", ft_i, frags);
		// ft_i > 0, ft[ft_i-1].ft_flags has NS_MOREFRAG
		ft[ft_i - 1].ft_flags &= ~NS_MOREFRAG;
		ft[ft_i - frags].ft_frags = frags - 1;
	}
	if (ft_i)
		ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
	BDG_RUNLOCK(b);
	return j;
}


/*
 *---- support for virtual bridge -----
 */

/* ----- FreeBSD if_bridge hash function ------- */

/*
 * The following hash function is adapted from "Hash Functions" by Bob Jenkins
 * ("Algorithm Alley", Dr. Dobbs Journal, September 1997).
 *
 * http://www.burtleburtle.net/bob/hash/spooky.html
 */
#define mix(a, b, c)                                                    \
do {                                                                    \
	a -= b; a -= c; a ^= (c >> 13);                                 \
	b -= c; b -= a; b ^= (a << 8);                                  \
	c -= a; c -= b; c ^= (b >> 13);                                 \
	a -= b; a -= c; a ^= (c >> 12);                                 \
	b -= c; b -= a; b ^= (a << 16);                                 \
	c -= a; c -= b; c ^= (b >> 5);                                  \
	a -= b; a -= c; a ^= (c >> 3);                                  \
	b -= c; b -= a; b ^= (a << 10);                                 \
	c -= a; c -= b; c ^= (b >> 15);                                 \
} while (/*CONSTCOND*/0)

static __inline uint32_t
nm_bridge_rthash(const uint8_t *addr)
{
	uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hash key

	b += addr[5] << 8;
	b += addr[4];
	a += addr[3] << 24;
	a += addr[2] << 16;
	a += addr[1] << 8;
	a += addr[0];

	mix(a, b, c);
#define BRIDGE_RTHASH_MASK	(NM_BDG_HASH-1)
	return (c & BRIDGE_RTHASH_MASK);
}

#undef mix


static int
bdg_netmap_reg(struct netmap_adapter *na, int onoff)
{
	struct netmap_vp_adapter *vpna =
		(struct netmap_vp_adapter*)na;
	struct ifnet *ifp = na->ifp;

	/* the interface is already attached to the bridge,
	 * so we only need to toggle IFCAP_NETMAP.
	 */
	BDG_WLOCK(vpna->na_bdg);
	if (onoff) {
		ifp->if_capenable |= IFCAP_NETMAP;
	} else {
		ifp->if_capenable &= ~IFCAP_NETMAP;
	}
	BDG_WUNLOCK(vpna->na_bdg);
	return 0;
}


/*
 * Lookup function for a learning bridge.
 * Update the hash table with the source address,
 * and then return the destination port index, and the
 * ring in *dst_ring (at the moment, always use ring 0).
 */
u_int
netmap_bdg_learning(char *buf, u_int buf_len, uint8_t *dst_ring,
		struct netmap_vp_adapter *na)
{
	struct nm_hash_ent *ht = na->na_bdg->ht;
	uint32_t sh, dh;
	u_int dst, mysrc = na->bdg_port;
	uint64_t smac, dmac;

	if (buf_len < 14) {
		D("invalid buf length %d", buf_len);
		return NM_BDG_NOPORT;
	}
	dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff;
	smac = le64toh(*(uint64_t *)(buf + 4));
	smac >>= 16;

	/*
	 * The hash is somewhat expensive, there might be some
	 * worthwhile optimizations here.
	 */
	if ((buf[6] & 1) == 0) { /* valid src */
		uint8_t *s = buf+6;
		sh = nm_bridge_rthash(s); // XXX hash of source
		/* update source port forwarding entry */
		ht[sh].mac = smac;	/* XXX expire ? */
		ht[sh].ports = mysrc;
		if (netmap_verbose)
			D("src %02x:%02x:%02x:%02x:%02x:%02x on port %d",
				s[0], s[1], s[2], s[3], s[4], s[5], mysrc);
	}
	dst = NM_BDG_BROADCAST;
	if ((buf[0] & 1) == 0) { /* unicast */
		dh = nm_bridge_rthash(buf); // XXX hash of dst
		if (ht[dh].mac == dmac) {	/* found dst */
			dst = ht[dh].ports;
		}
		/* XXX otherwise return NM_BDG_UNKNOWN ? */
	}
	*dst_ring = 0;
	return dst;
}


/*
 * This flush routine supports only unicast and broadcast but a large
 * number of ports, and lets us replace the learn and dispatch functions.
 */
int
nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
		u_int ring_nr)
{
	struct nm_bdg_q *dst_ents, *brddst;
	uint16_t num_dsts = 0, *dsts;
	struct nm_bridge *b = na->na_bdg;
	u_int i, j, me = na->bdg_port;

	/*
	 * The work area (pointed by ft) is followed by an array of
	 * pointers to queues, dst_ents; there are NM_BDG_MAXRINGS
	 * queues per port plus one for the broadcast traffic.
	 * Then we have an array of destination indexes.
	 */
	dst_ents = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
	dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1);

	/* first pass: find a destination for each packet in the batch */
	for (i = 0; likely(i < n); i += ft[i].ft_frags) {
		uint8_t dst_ring = ring_nr; /* default, same ring as origin */
		uint16_t dst_port, d_i;
		struct nm_bdg_q *d;

		ND("slot %d frags %d", i, ft[i].ft_frags);
		dst_port = b->nm_bdg_lookup(ft[i].ft_buf, ft[i].ft_len,
			&dst_ring, na);
		if (netmap_verbose > 255)
			RD(5, "slot %d port %d -> %d", i, me, dst_port);
		if (dst_port == NM_BDG_NOPORT)
			continue; /* this packet is identified to be dropped */
		else if (unlikely(dst_port > NM_BDG_MAXPORTS))
			continue;
		else if (dst_port == NM_BDG_BROADCAST)
			dst_ring = 0; /* broadcasts always go to ring 0 */
		else if (unlikely(dst_port == me ||
		    !b->bdg_ports[dst_port]))
			continue;

		/* get a position in the scratch pad */
		d_i = dst_port * NM_BDG_MAXRINGS + dst_ring;
		d = dst_ents + d_i;

		/* append the first fragment to the list */
		if (d->bq_head == NM_FT_NULL) { /* new destination */
			d->bq_head = d->bq_tail = i;
			/* remember this position to be scanned later */
			if (dst_port != NM_BDG_BROADCAST)
				dsts[num_dsts++] = d_i;
		} else {
			ft[d->bq_tail].ft_next = i;
			d->bq_tail = i;
		}
		d->bq_len += ft[i].ft_frags;
	}

	/*
	 * Broadcast traffic goes to ring 0 on all destinations.
	 * So we need to add these rings to the list of ports to scan.
	 * XXX at the moment we scan all NM_BDG_MAXPORTS ports, which is
	 * expensive. We should keep a compact list of active destinations
	 * so we could shorten this loop.
	 */
	brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS;
	if (brddst->bq_head != NM_FT_NULL) {
		for (j = 0; likely(j < b->bdg_active_ports); j++) {
			uint16_t d_i;
			i = b->bdg_port_index[j];
			if (unlikely(i == me))
				continue;
			d_i = i * NM_BDG_MAXRINGS;
			if (dst_ents[d_i].bq_head == NM_FT_NULL)
				dsts[num_dsts++] = d_i;
		}
	}

	ND(5, "pass 1 done %d pkts %d dsts", n, num_dsts);
	/* second pass: scan destinations (XXX will be modular somehow) */
	for (i = 0; i < num_dsts; i++) {
		struct ifnet *dst_ifp;
		struct netmap_vp_adapter *dst_na;
		struct netmap_kring *kring;
		struct netmap_ring *ring;
		u_int dst_nr, lim, j, sent = 0, d_i, next, brd_next;
		u_int needed, howmany;
		int retry = netmap_txsync_retry;
		struct nm_bdg_q *d;
		uint32_t my_start = 0, lease_idx = 0;
		int nrings;

		d_i = dsts[i];
		ND("second pass %d port %d", i, d_i);
		d = dst_ents + d_i;
		// XXX fix the division
		dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS];
		/* protect from the lookup function returning an inactive
		 * destination port
		 */
		if (unlikely(dst_na == NULL))
			goto cleanup;
		if (dst_na->up.na_flags & NAF_SW_ONLY)
			goto cleanup;
		dst_ifp = dst_na->up.ifp;
		/*
		 * The interface may be in !netmap mode in two cases:
		 * - when na is attached but not activated yet;
		 * - when na is being deactivated but is still attached.
		 */
		if (unlikely(!(dst_ifp->if_capenable & IFCAP_NETMAP))) {
			ND("not in netmap mode!");
			goto cleanup;
		}

		/* there is at least one either unicast or broadcast packet */
		brd_next = brddst->bq_head;
		next = d->bq_head;
		/* we need to reserve this many slots. If fewer are
		 * available, some packets will be dropped.
		 * Packets may have multiple fragments, so there is a chance
		 * that we may not use all of the slots we have claimed, and
		 * we will need to handle the leftover ones when we regain
		 * the lock.
		 */
		needed = d->bq_len + brddst->bq_len;

		ND(5, "pass 2 dst %d is %x", i, d_i);
		dst_nr = d_i & (NM_BDG_MAXRINGS-1);
		nrings = dst_na->up.num_rx_rings;
		if (dst_nr >= nrings)
			dst_nr = dst_nr % nrings;
		kring = &dst_na->up.rx_rings[dst_nr];
		ring = kring->ring;
		lim = kring->nkr_num_slots - 1;

retry:

		/* reserve the buffers in the queue and an entry
		 * to report completion, and drop lock.
		 * XXX this might become a helper function.
		 */
		lockmgr(&kring->q_lock, LK_EXCLUSIVE);
		if (kring->nkr_stopped) {
			lockmgr(&kring->q_lock, LK_RELEASE);
			goto cleanup;
		}
		if (dst_na->retry) {
			dst_na->up.nm_notify(&dst_na->up, dst_nr, NR_RX, 0);
		}
		my_start = j = kring->nkr_hwlease;
		howmany = nm_kr_space(kring, 1);
		if (needed < howmany)
			howmany = needed;
		lease_idx = nm_kr_lease(kring, howmany, 1);
		lockmgr(&kring->q_lock, LK_RELEASE);

		/* only retry if we need more than available slots */
		if (retry && needed <= howmany)
			retry = 0;

		/* copy to the destination queue */
		while (howmany > 0) {
			struct netmap_slot *slot;
			struct nm_bdg_fwd *ft_p, *ft_end;
			u_int cnt;

			/* find the queue from which we pick next packet.
			 * NM_FT_NULL is always higher than valid indexes
			 * so we never dereference it if the other list
			 * has packets (and if both are empty we never
			 * get here).
			 */
			if (next < brd_next) {
				ft_p = ft + next;
				next = ft_p->ft_next;
			} else { /* insert broadcast */
				ft_p = ft + brd_next;
				brd_next = ft_p->ft_next;
			}
			cnt = ft_p->ft_frags; // cnt > 0
			if (unlikely(cnt > howmany))
				break; /* no more space */
			howmany -= cnt;
			if (netmap_verbose && cnt > 1)
				RD(5, "rx %d frags to %d", cnt, j);
			ft_end = ft_p + cnt;
			do {
				void *dst, *src = ft_p->ft_buf;
				/* round to a multiple of 64 */
				size_t len = (ft_p->ft_len + 63) & ~63;

				slot = &ring->slot[j];
				dst = BDG_NMB(&dst_na->up, slot);

				ND("send %d %d bytes at %s:%d",
					i, ft_p->ft_len, NM_IFPNAME(dst_ifp), j);
				if (ft_p->ft_flags & NS_INDIRECT) {
					if (copyin(src, dst, len)) {
						// invalid user pointer, pretend len is 0
						ft_p->ft_len = 0;
					}
				} else {
					//memcpy(dst, src, len);
					pkt_copy(src, dst, (int)len);
				}
				slot->len = ft_p->ft_len;
				slot->flags = (cnt << 8) | NS_MOREFRAG;
				j = nm_next(j, lim);
				ft_p++;
				sent++;
			} while (ft_p != ft_end);
			slot->flags = (cnt << 8); /* clear flag on last entry */
			/* are we done ? */
			if (next == NM_FT_NULL && brd_next == NM_FT_NULL)
				break;
		}
		{
		    /* current position */
		    uint32_t *p = kring->nkr_leases; /* shorthand */
		    uint32_t update_pos;
		    int still_locked = 1;

		    lockmgr(&kring->q_lock, LK_EXCLUSIVE);
		    if (unlikely(howmany > 0)) {
			/* not used all bufs. If i am the last one
			 * i can recover the slots, otherwise must
			 * fill them with 0 to mark empty packets.
			 */
			ND("leftover %d bufs", howmany);
			if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) {
			    /* yes i am the last one */
			    ND("roll back nkr_hwlease to %d", j);
			    kring->nkr_hwlease = j;
			} else {
			    while (howmany-- > 0) {
				ring->slot[j].len = 0;
				ring->slot[j].flags = 0;
				j = nm_next(j, lim);
			    }
			}
		    }
		    p[lease_idx] = j; /* report I am done */

		    update_pos = nm_kr_rxpos(kring);

		    if (my_start == update_pos) {
			/* all slots before my_start have been reported,
			 * so scan subsequent leases to see if other ranges
			 * have been completed, and do a selwakeup or txsync.
			 */
			while (lease_idx != kring->nkr_lease_idx &&
				p[lease_idx] != NR_NOSLOT) {
			    j = p[lease_idx];
			    p[lease_idx] = NR_NOSLOT;
			    lease_idx = nm_next(lease_idx, lim);
			}
			/* j is the new 'write' position. j != my_start
			 * means there are new buffers to report
			 */
			if (likely(j != my_start)) {
			    uint32_t old_avail = kring->nr_hwavail;

			    kring->nr_hwavail = (j >= kring->nr_hwcur) ?
				j - kring->nr_hwcur :
				j + lim + 1 - kring->nr_hwcur;
			    if (kring->nr_hwavail < old_avail) {
				D("avail shrink %d -> %d",
					old_avail, kring->nr_hwavail);
			    }
			    dst_na->up.nm_notify(&dst_na->up, dst_nr, NR_RX, 0);
			    still_locked = 0;
			    lockmgr(&kring->q_lock, LK_RELEASE);
			    if (dst_na->retry && retry--)
				goto retry;
			}
		    }
		    if (still_locked)
			lockmgr(&kring->q_lock, LK_RELEASE);
		}
cleanup:
		d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */
		d->bq_len = 0;
	}
	brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */
	brddst->bq_len = 0;
	return 0;
}

static int
netmap_vp_txsync(struct netmap_vp_adapter *na, u_int ring_nr, int flags)
{
	struct netmap_kring *kring = &na->up.tx_rings[ring_nr];
	struct netmap_ring *ring = kring->ring;
	u_int j, k, lim = kring->nkr_num_slots - 1;

	k = ring->cur;
	if (k > lim)
		return netmap_ring_reinit(kring);

	if (bridge_batch <= 0) { /* testing only */
		j = k; // used all
		goto done;
	}
	if (bridge_batch > NM_BDG_BATCH)
		bridge_batch = NM_BDG_BATCH;

	j = nm_bdg_preflush(na, ring_nr, kring, k);
	if (j != k)
		D("early break at %d/ %d, avail %d", j, k, kring->nr_hwavail);
	/* k-j modulo ring size is the number of slots processed */
	if (k < j)
		k += kring->nkr_num_slots;
	kring->nr_hwavail = lim - (k - j);

done:
	kring->nr_hwcur = j;
	ring->avail = kring->nr_hwavail;
	if (netmap_verbose)
		D("%s ring %d flags %d", NM_IFPNAME(na->up.ifp), ring_nr, flags);
	return 0;
}


/*
 * main dispatch routine for the bridge.
 * We already know that only one thread is running this.
 * we must run nm_bdg_preflush without lock.
 */
static int
bdg_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
{
	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na;
	return netmap_vp_txsync(vpna, ring_nr, flags);
}


/*
 * user process reading from a VALE switch.
 * Already protected against concurrent calls from userspace,
 * but we must acquire the queue's lock to protect against
 * writers on the same queue.
 */
static int
bdg_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
{
	struct netmap_kring *kring = &na->rx_rings[ring_nr];
	struct netmap_ring *ring = kring->ring;
	u_int j, lim = kring->nkr_num_slots - 1;
	u_int k = ring->cur, resvd = ring->reserved;
	int n;

	lockmgr(&kring->q_lock, LK_EXCLUSIVE);
	if (k > lim) {
		D("ouch dangerous reset!!!");
		n = netmap_ring_reinit(kring);
		goto done;
	}

	/* skip past packets that userspace has released */
	j = kring->nr_hwcur;	/* netmap ring index */
	if (resvd > 0) {
		if (resvd + ring->avail >= lim + 1) {
			D("XXX invalid reserve/avail %d %d", resvd, ring->avail);
			ring->reserved = resvd = 0; // XXX panic...
		}
		k = (k >= resvd) ? k - resvd : k + lim + 1 - resvd;
	}

	if (j != k) { /* userspace has released some packets. */
		n = k - j;
		if (n < 0)
			n += kring->nkr_num_slots;
		ND("userspace releases %d packets", n);
		for (n = 0; likely(j != k); n++) {
			struct netmap_slot *slot = &ring->slot[j];
			void *addr = BDG_NMB(na, slot);

			if (addr == netmap_buffer_base) { /* bad buf */
				D("bad buffer index %d, ignore ?",
					slot->buf_idx);
			}
			slot->flags &= ~NS_BUF_CHANGED;
			j = nm_next(j, lim);
		}
		kring->nr_hwavail -= n;
		kring->nr_hwcur = k;
	}
	/* tell userspace that there are new packets */
	ring->avail = kring->nr_hwavail - resvd;
	n = 0;
done:
	lockmgr(&kring->q_lock, LK_RELEASE);
	return n;
}

static int
bdg_netmap_attach(struct netmap_adapter *arg)
{
	struct netmap_vp_adapter *vpna;
	struct netmap_adapter *na;
	int error;

	vpna = kmalloc(sizeof(*vpna), M_DEVBUF, M_NOWAIT | M_ZERO);
	if (vpna == NULL)
		return ENOMEM;
	na = &vpna->up;
	*na = *arg;
	na->na_flags |= NAF_BDG_MAYSLEEP | NAF_MEM_OWNER;
	na->nm_txsync = bdg_netmap_txsync;
	na->nm_rxsync = bdg_netmap_rxsync;
	na->nm_register = bdg_netmap_reg;
	na->nm_dtor = netmap_adapter_vp_dtor;
	na->nm_krings_create = netmap_vp_krings_create;
	na->nm_krings_delete = netmap_vp_krings_delete;
	na->nm_mem = netmap_mem_private_new(NM_IFPNAME(arg->ifp),
			na->num_tx_rings, na->num_tx_desc,
			na->num_rx_rings, na->num_rx_desc);
	/* other nmd fields are set in the common routine */
	error = netmap_attach_common(na);
	if (error) {
		kfree(vpna, M_DEVBUF);
		return error;
	}
	return 0;
}

static void
netmap_bwrap_dtor(struct netmap_adapter *na)
{
	struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
	struct netmap_adapter *hwna = bna->hwna;
	struct nm_bridge *b = bna->up.na_bdg,
		*bh = bna->host.na_bdg;
	struct ifnet *ifp = na->ifp;

	ND("na %p", na);

	if (b) {
		netmap_bdg_detach_common(b, bna->up.bdg_port,
			(bh ? bna->host.bdg_port : -1));
	}

	hwna->na_private = NULL;
	netmap_adapter_put(hwna);

	bzero(ifp, sizeof(*ifp));
	kfree(ifp, M_DEVBUF);
	na->ifp = NULL;
}

/*
 * Pass packets from nic to the bridge.
 * XXX TODO check locking: this is called from the interrupt
 * handler so we should make sure that the interface is not
 * disconnected while passing down an interrupt.
 *
 * Note, no user process can access this NIC so we can ignore
 * the info in the 'ring'.
 */
/* callback that overwrites the hwna notify callback.
 * Packets come from the outside or from the host stack and are put on an
 * hwna rx ring. The bridge wrapper then sends the packets through the
 * bridge.
 */
static int
netmap_bwrap_intr_notify(struct netmap_adapter *na, u_int ring_nr, enum txrx tx, int flags)
{
	struct ifnet *ifp = na->ifp;
	struct netmap_bwrap_adapter *bna = na->na_private;
	struct netmap_vp_adapter *hostna = &bna->host;
	struct netmap_kring *kring, *bkring;
	struct netmap_ring *ring;
	int is_host_ring = ring_nr == na->num_rx_rings;
	struct netmap_vp_adapter *vpna = &bna->up;
	int error = 0;

	ND("%s[%d] %s %x", NM_IFPNAME(ifp), ring_nr, (tx == NR_TX ? "TX" : "RX"), flags);

	if (flags & NAF_DISABLE_NOTIFY) {
		kring = tx == NR_TX ? na->tx_rings : na->rx_rings;
		bkring = tx == NR_TX ? vpna->up.rx_rings : vpna->up.tx_rings;
		if (kring->nkr_stopped)
			netmap_disable_ring(bkring);
		else
			bkring->nkr_stopped = 0;
		return 0;
	}

	if (ifp == NULL || !(ifp->if_capenable & IFCAP_NETMAP))
		return 0;

	if (tx == NR_TX)
		return 0;

	kring = &na->rx_rings[ring_nr];
	ring = kring->ring;

	/* make sure the ring is not disabled */
	if (nm_kr_tryget(kring))
		return 0;

	if (is_host_ring && hostna->na_bdg == NULL) {
		error = bna->save_notify(na, ring_nr, tx, flags);
		goto put_out;
	}

	if (is_host_ring) {
		vpna = hostna;
		ring_nr = 0;
	} else {
		/* fetch packets that have arrived.
		 * XXX maybe do this in a loop ?
		 */
		error = na->nm_rxsync(na, ring_nr, 0);
		if (error)
			goto put_out;
	}
	if (kring->nr_hwavail == 0 && netmap_verbose) {
		D("how strange, interrupt with no packets on %s",
			NM_IFPNAME(ifp));
		goto put_out;
	}
	/* XXX avail ? */
	ring->cur = nm_kr_rxpos(kring);
	netmap_vp_txsync(vpna, ring_nr, flags);

	if (!is_host_ring)
		error = na->nm_rxsync(na, ring_nr, 0);

put_out:
	nm_kr_put(kring);
	return error;
}

static int
netmap_bwrap_register(struct netmap_adapter *na, int onoff)
{
	struct netmap_bwrap_adapter *bna =
		(struct netmap_bwrap_adapter *)na;
	struct netmap_adapter *hwna = bna->hwna;
	struct netmap_vp_adapter *hostna = &bna->host;
	int error;

	ND("%s %d", NM_IFPNAME(na->ifp), onoff);

	if (onoff) {
		int i;

		hwna->na_lut = na->na_lut;
		hwna->na_lut_objtotal = na->na_lut_objtotal;

		if (hostna->na_bdg) {
			hostna->up.na_lut = na->na_lut;
			hostna->up.na_lut_objtotal = na->na_lut_objtotal;
		}

		/* cross-link the netmap rings */
		for (i = 0; i <= na->num_tx_rings; i++) {
			hwna->tx_rings[i].nkr_num_slots = na->rx_rings[i].nkr_num_slots;
			hwna->tx_rings[i].ring = na->rx_rings[i].ring;
		}
		for (i = 0; i <= na->num_rx_rings; i++) {
			hwna->rx_rings[i].nkr_num_slots = na->tx_rings[i].nkr_num_slots;
			hwna->rx_rings[i].ring = na->tx_rings[i].ring;
		}
	}

	if (hwna->ifp) {
		error = hwna->nm_register(hwna, onoff);
		if (error)
			return error;
	}

	bdg_netmap_reg(na, onoff);

	if (onoff) {
		bna->save_notify = hwna->nm_notify;
		hwna->nm_notify = netmap_bwrap_intr_notify;
	} else {
		hwna->nm_notify = bna->save_notify;
		hwna->na_lut = NULL;
		hwna->na_lut_objtotal = 0;
	}

	return 0;
}

static int
netmap_bwrap_config(struct netmap_adapter *na, u_int *txr, u_int *txd,
				    u_int *rxr, u_int *rxd)
{
	struct netmap_bwrap_adapter *bna =
		(struct netmap_bwrap_adapter *)na;
	struct netmap_adapter *hwna = bna->hwna;

	/* forward the request */
	netmap_update_config(hwna);
	/* swap the results */
	*txr = hwna->num_rx_rings;
	*txd = hwna->num_rx_desc;
	*rxr = hwna->num_tx_rings;
	*rxd = hwna->num_tx_desc;	/* swapped, like the fields above */

	return 0;
}

static int
netmap_bwrap_krings_create(struct netmap_adapter *na)
{
	struct netmap_bwrap_adapter *bna =
		(struct netmap_bwrap_adapter *)na;
	struct netmap_adapter *hwna = bna->hwna;
	struct netmap_adapter *hostna = &bna->host.up;
	int error;

	ND("%s", NM_IFPNAME(na->ifp));

	error = netmap_vp_krings_create(na);
	if (error)
		return error;

	error = hwna->nm_krings_create(hwna);
	if (error) {
		netmap_vp_krings_delete(na);
		return error;
	}

	hostna->tx_rings = na->tx_rings + na->num_tx_rings;
	hostna->rx_rings = na->rx_rings + na->num_rx_rings;

	return 0;
}

static void
netmap_bwrap_krings_delete(struct netmap_adapter *na)
{
	struct netmap_bwrap_adapter *bna =
		(struct netmap_bwrap_adapter *)na;
	struct netmap_adapter *hwna = bna->hwna;

	ND("%s", NM_IFPNAME(na->ifp));

	hwna->nm_krings_delete(hwna);
	netmap_vp_krings_delete(na);
}

/* notify method for the bridge-->hwna direction */
static int
netmap_bwrap_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int flags)
{
	struct netmap_bwrap_adapter *bna =
		(struct netmap_bwrap_adapter *)na;
	struct netmap_adapter *hwna = bna->hwna;
	struct netmap_kring *kring, *hw_kring;
	struct netmap_ring *ring;
	u_int lim, k;
	int error = 0;

	if (tx == NR_TX)
		return ENXIO;

	kring = &na->rx_rings[ring_n];
	hw_kring = &hwna->tx_rings[ring_n];
	ring = kring->ring;

	lim = kring->nkr_num_slots - 1;
	k = nm_kr_rxpos(kring);

	if (hwna->ifp == NULL || !(hwna->ifp->if_capenable & IFCAP_NETMAP))
		return 0;
	ring->cur = k;
	ND("%s[%d] PRE rx(%d, %d, %d, %d) ring(%d, %d, %d) tx(%d, %d)",
		NM_IFPNAME(na->ifp), ring_n,
		kring->nr_hwcur, kring->nr_hwavail, kring->nkr_hwlease, kring->nr_hwreserved,
		ring->cur, ring->avail, ring->reserved,
		hw_kring->nr_hwcur, hw_kring->nr_hwavail);
	if (ring_n == na->num_rx_rings) {
		netmap_txsync_to_host(hwna);
	} else {
		error = hwna->nm_txsync(hwna, ring_n, flags);
	}
	kring->nr_hwcur = ring->cur;
	kring->nr_hwavail = 0;
	kring->nr_hwreserved = lim - ring->avail;
	ND("%s[%d] PST rx(%d, %d, %d, %d) ring(%d, %d, %d) tx(%d, %d)",
		NM_IFPNAME(na->ifp), ring_n,
		kring->nr_hwcur, kring->nr_hwavail, kring->nkr_hwlease, kring->nr_hwreserved,
		ring->cur, ring->avail, ring->reserved,
		hw_kring->nr_hwcur, hw_kring->nr_hwavail);

	return error;
}

static int
netmap_bwrap_host_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int flags)
{
	struct netmap_bwrap_adapter *bna = na->na_private;
	struct netmap_adapter *port_na = &bna->up.up;
	if (tx == NR_TX || ring_n != 0)
		return ENXIO;
	return netmap_bwrap_notify(port_na, port_na->num_rx_rings, NR_RX, flags);
}

/* attach a bridge wrapper to the 'real' device */
static int
netmap_bwrap_attach(struct ifnet *fake, struct ifnet *real)
{
	struct netmap_bwrap_adapter *bna;
	struct netmap_adapter *na;
	struct netmap_adapter *hwna = NA(real);
	struct netmap_adapter *hostna;
	int error;


	bna = kmalloc(sizeof(*bna), M_DEVBUF, M_NOWAIT | M_ZERO);
	if (bna == NULL)
		return ENOMEM;

	na = &bna->up.up;
	na->ifp = fake;
	/* fill the ring data for the bwrap adapter with rx/tx meanings
	 * swapped. The real cross-linking will be done during register,
	 * when all the krings will have been created.
	 */
	na->num_rx_rings = hwna->num_tx_rings;
	na->num_tx_rings = hwna->num_rx_rings;
	na->num_tx_desc = hwna->num_rx_desc;
	na->num_rx_desc = hwna->num_tx_desc;
	na->nm_dtor = netmap_bwrap_dtor;
	na->nm_register = netmap_bwrap_register;
	// na->nm_txsync = netmap_bwrap_txsync;
	// na->nm_rxsync = netmap_bwrap_rxsync;
	na->nm_config = netmap_bwrap_config;
	na->nm_krings_create = netmap_bwrap_krings_create;
	na->nm_krings_delete = netmap_bwrap_krings_delete;
	na->nm_notify = netmap_bwrap_notify;
	na->nm_mem = hwna->nm_mem;
	na->na_private = na; /* prevent NIOCREGIF */
	bna->up.retry = 1; /* XXX maybe this should depend on the hwna */

	bna->hwna = hwna;
	netmap_adapter_get(hwna);
	hwna->na_private = bna; /* weak reference */

	hostna = &bna->host.up;
	hostna->ifp = hwna->ifp;
	hostna->num_tx_rings = 1;
	hostna->num_tx_desc = hwna->num_rx_desc;
	hostna->num_rx_rings = 1;
	hostna->num_rx_desc = hwna->num_tx_desc;
	// hostna->nm_txsync = netmap_bwrap_host_txsync;
	// hostna->nm_rxsync = netmap_bwrap_host_rxsync;
	hostna->nm_notify = netmap_bwrap_host_notify;
	hostna->nm_mem = na->nm_mem;
	hostna->na_private = bna;

	D("%s<->%s txr %d txd %d rxr %d rxd %d", fake->if_xname, real->if_xname,
		na->num_tx_rings, na->num_tx_desc,
		na->num_rx_rings, na->num_rx_desc);

	error = netmap_attach_common(na);
	if (error) {
		netmap_adapter_put(hwna);
		kfree(bna, M_DEVBUF);
		return error;
	}
	return 0;
}
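
/*
 * A descriptive sketch of the resulting plumbing (reconstructed from
 * netmap_bwrap_register() and netmap_bwrap_notify() above; arrows are
 * data flow, not pointers):
 *
 *   wire --> NIC rx ring == bwrap tx ring --> nm_bdg_flush --> switch ports
 *   switch ports --> bwrap rx ring == NIC tx ring --> wire
 *
 * The host adapter (bna->host) is wired the same way, moving traffic
 * between the host stack rings and the switch.
 */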

void
netmap_init_bridges(void)
{
	int i;
	bzero(nm_bridges, sizeof(struct nm_bridge) * NM_BRIDGES); /* safety */
	for (i = 0; i < NM_BRIDGES; i++)
		BDG_RWINIT(&nm_bridges[i]);
}
#endif /* WITH_VALE */