| 1 | /* |
| 2 | * Copyright (C) 2013 Universita` di Pisa. All rights reserved. |
| 3 | * |
| 4 | * Redistribution and use in source and binary forms, with or without |
| 5 | * modification, are permitted provided that the following conditions |
| 6 | * are met: |
| 7 | * 1. Redistributions of source code must retain the above copyright |
| 8 | * notice, this list of conditions and the following disclaimer. |
| 9 | * 2. Redistributions in binary form must reproduce the above copyright |
| 10 | * notice, this list of conditions and the following disclaimer in the |
| 11 | * documentation and/or other materials provided with the distribution. |
| 12 | * |
| 13 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND |
| 14 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 15 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| 16 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE |
| 17 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
| 18 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
| 19 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
| 20 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
| 21 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
| 22 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
| 23 | * SUCH DAMAGE. |
| 24 | */ |
| 25 | |
| 26 | |
| 27 | /* |
| 28 | * This module implements the VALE switch for netmap |
| 29 | |
| 30 | --- VALE SWITCH --- |
| 31 | |
| 32 | NMG_LOCK() serializes all modifications to switches and ports. |
| 33 | A switch cannot be deleted until all ports are gone. |
| 34 | |
For each switch, an SX lock (rwlock on Linux) protects
deletion of ports. When a port is added or removed, the
lock is acquired in exclusive mode (after holding NMG_LOCK).
When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
The lock is held throughout the entire forwarding cycle,
during which the thread may incur a page fault.
Hence it is important that sleepable shared locks are used.

On the rx ring, the per-port lock is grabbed initially to reserve
a number of slots in the ring, then the lock is released,
packets are copied from source to destination, and then
the lock is acquired again and the receive ring is updated.
(A similar thing is done on the tx ring for NIC and host stack
ports attached to the switch.)
| 49 | |
| 50 | */ |
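/*
 * Illustrative code fragment (not compiled) of the reserve/copy/complete
 * pattern described above for a destination rx ring. The helpers used
 * here (q_lock, nkr_hwlease, nkr_leases, nm_kr_space(), nm_kr_lease())
 * are the same ones used by nm_bdg_flush() later in this file; the
 * variable names are only examples.
 */
#if 0
	lockmgr(&kring->q_lock, LK_EXCLUSIVE);
	start = j = kring->nkr_hwlease;		/* first slot we may use */
	howmany = nm_kr_space(kring, 1);	/* slots available */
	lease_idx = nm_kr_lease(kring, howmany, 1); /* reserve them */
	lockmgr(&kring->q_lock, LK_RELEASE);

	/* ... copy packets into the reserved slots, no lock held ... */

	lockmgr(&kring->q_lock, LK_EXCLUSIVE);
	kring->nkr_leases[lease_idx] = j;	/* report completion */
	/* when all earlier leases are complete, advance nr_hwavail */
	lockmgr(&kring->q_lock, LK_RELEASE);
#endif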
| 51 | |
| 52 | /* |
| 53 | * OS-specific code that is used only within this file. |
| 54 | * Other OS-specific code that must be accessed by drivers |
| 55 | * is present in netmap_kern.h |
| 56 | */ |
| 57 | |
| 58 | |
| 59 | #include <sys/types.h> |
| 60 | #include <sys/errno.h> |
| 61 | #include <sys/param.h> /* defines used in kernel.h */ |
| 62 | #include <sys/kernel.h> /* types used in module initialization */ |
| 63 | #include <sys/conf.h> /* cdevsw struct, UID, GID */ |
| 64 | #include <sys/sockio.h> |
| 65 | #include <sys/socketvar.h> /* struct socket */ |
| 66 | #include <sys/malloc.h> |
| 67 | #include <sys/poll.h> |
| 68 | #include <sys/lock.h> |
| 69 | #include <sys/socket.h> /* sockaddrs */ |
| 70 | #include <sys/sysctl.h> |
| 71 | #include <net/if.h> |
| 72 | #include <net/if_var.h> |
| 73 | #include <net/bpf.h> /* BIOCIMMEDIATE */ |
| 74 | #include <sys/bus.h> /* bus_dmamap_* */ |
| 75 | #include <sys/endian.h> |
| 76 | #include <sys/refcount.h> |
| 77 | |
| 78 | |
| 79 | #define BDG_RWLOCK_T struct lock |
| 80 | |
| 81 | #define BDG_RWINIT(b) \ |
| 82 | lockinit(&(b)->bdg_lock, "bdg lock", 0, LK_CANRECURSE) |
| 83 | #define BDG_WLOCK(b) lockmgr(&(b)->bdg_lock, LK_EXCLUSIVE) |
| 84 | #define BDG_WUNLOCK(b) lockmgr(&(b)->bdg_lock, LK_RELEASE) |
| 85 | #define BDG_RLOCK(b) lockmgr(&(b)->bdg_lock, LK_SHARED) |
| 86 | #define BDG_RTRYLOCK(b) lockmgr(&(b)->bdg_lock, LK_SHARED|LK_NOWAIT) |
| 87 | #define BDG_RUNLOCK(b) lockmgr(&(b)->bdg_lock, LK_RELEASE) |
| 88 | #define BDG_RWDESTROY(b) lockuninit(&(b)->bdg_lock) |
| 89 | |
| 90 | /* |
| 91 | * common headers |
| 92 | */ |
| 93 | |
| 94 | #include <net/netmap.h> |
| 95 | |
| 96 | #include <net/netmap/netmap_kern.h> |
| 97 | #include <net/netmap/netmap_mem2.h> |
| 98 | |
| 99 | #ifdef WITH_VALE |
| 100 | |
| 101 | /* |
| 102 | * system parameters (most of them in netmap_kern.h) |
| 103 | * NM_NAME prefix for switch port names, default "vale" |
| 104 | * NM_BDG_MAXPORTS number of ports |
| 105 | * NM_BRIDGES max number of switches in the system. |
| 106 | * XXX should become a sysctl or tunable |
| 107 | * |
| 108 | * Switch ports are named valeX:Y where X is the switch name and Y |
| 109 | * is the port. If Y matches a physical interface name, the port is |
| 110 | * connected to a physical device. |
| 111 | * |
| 112 | * Unlike physical interfaces, switch ports use their own memory region |
| 113 | * for rings and buffers. |
 * The virtual interfaces use a per-queue lock instead of the core lock.
| 115 | * In the tx loop, we aggregate traffic in batches to make all operations |
| 116 | * faster. The batch size is bridge_batch. |
| 117 | */ |
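/*
 * Example (illustrative only): "vale0:1" and "vale0:2" are two virtual
 * ports of the switch "vale0", while a name such as "vale0:em0" would
 * connect the physical interface em0 (if it exists) to the same switch.
 * The interface name em0 is just an example.
 */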
| 118 | #define NM_BDG_MAXRINGS 16 /* XXX unclear how many. */ |
| 119 | #define NM_BDG_MAXSLOTS 4096 /* XXX same as above */ |
| 120 | #define NM_BRIDGE_RINGSIZE 1024 /* in the device */ |
| 121 | #define NM_BDG_HASH 1024 /* forwarding table entries */ |
| 122 | #define NM_BDG_BATCH 1024 /* entries in the forwarding buffer */ |
| 123 | #define NM_MULTISEG 64 /* max size of a chain of bufs */ |
| 124 | /* actual size of the tables */ |
| 125 | #define NM_BDG_BATCH_MAX (NM_BDG_BATCH + NM_MULTISEG) |
| 126 | /* NM_FT_NULL terminates a list of slots in the ft */ |
| 127 | #define NM_FT_NULL NM_BDG_BATCH_MAX |
| 128 | #define NM_BRIDGES 8 /* number of bridges */ |
| 129 | |
| 130 | |
| 131 | /* |
| 132 | * bridge_batch is set via sysctl to the max batch size to be |
 * used in the bridge. The actual value may be larger because the
 * last packet in a batch is not split, even if its fragments
 * overflow the limit.
| 135 | */ |
| 136 | int bridge_batch = NM_BDG_BATCH; /* bridge batch size */ |
| 137 | SYSCTL_DECL(_dev_netmap); |
SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0, "");
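/*
 * Example (illustrative): since the knob lives under the dev.netmap
 * sysctl tree, it can be tuned at runtime with e.g.
 *	sysctl dev.netmap.bridge_batch=256
 */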
| 139 | |
| 140 | |
| 141 | static int bdg_netmap_attach(struct netmap_adapter *); |
| 142 | static int bdg_netmap_reg(struct netmap_adapter *na, int onoff); |
| 143 | static int netmap_bwrap_attach(struct ifnet *, struct ifnet *); |
| 144 | static int netmap_bwrap_register(struct netmap_adapter *, int onoff); |
| 145 | int kern_netmap_regif(struct nmreq *nmr); |
| 146 | |
| 147 | /* |
| 148 | * Each transmit queue accumulates a batch of packets into |
| 149 | * a structure before forwarding. Packets to the same |
| 150 | * destination are put in a list using ft_next as a link field. |
| 151 | * ft_frags and ft_next are valid only on the first fragment. |
| 152 | */ |
| 153 | struct nm_bdg_fwd { /* forwarding entry for a bridge */ |
| 154 | void *ft_buf; /* netmap or indirect buffer */ |
| 155 | uint8_t ft_frags; /* how many fragments (only on 1st frag) */ |
| 156 | uint8_t _ft_port; /* dst port (unused) */ |
| 157 | uint16_t ft_flags; /* flags, e.g. indirect */ |
| 158 | uint16_t ft_len; /* src fragment len */ |
| 159 | uint16_t ft_next; /* next packet to same destination */ |
| 160 | }; |
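/*
 * Example (illustrative): a packet made of 3 fragments occupies three
 * consecutive entries ft[k] .. ft[k+2]; only ft[k] carries ft_frags = 3
 * and a meaningful ft_next, which links to the first fragment of the
 * next packet queued for the same destination.
 */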
| 161 | |
| 162 | /* |
| 163 | * For each output interface, nm_bdg_q is used to construct a list. |
| 164 | * bq_len is the number of output buffers (we can have coalescing |
| 165 | * during the copy). |
| 166 | */ |
| 167 | struct nm_bdg_q { |
| 168 | uint16_t bq_head; |
| 169 | uint16_t bq_tail; |
| 170 | uint32_t bq_len; /* number of buffers */ |
| 171 | }; |
| 172 | |
| 173 | /* XXX revise this */ |
| 174 | struct nm_hash_ent { |
| 175 | uint64_t mac; /* the top 2 bytes are the epoch */ |
| 176 | uint64_t ports; |
| 177 | }; |
| 178 | |
| 179 | /* |
| 180 | * nm_bridge is a descriptor for a VALE switch. |
| 181 | * Interfaces for a bridge are all in bdg_ports[]. |
 * The array has a fixed size; an empty entry does not terminate
 * the search, but lookups only occur on attach/detach so we
 * don't mind if they are slow.
 *
 * The bridge is non-blocking on the transmit ports: excess
 * packets are dropped if there is no room on the output port.
| 188 | * |
| 189 | * bdg_lock protects accesses to the bdg_ports array. |
| 190 | * This is a rw lock (or equivalent). |
| 191 | */ |
| 192 | struct nm_bridge { |
| 193 | /* XXX what is the proper alignment/layout ? */ |
| 194 | BDG_RWLOCK_T bdg_lock; /* protects bdg_ports */ |
| 195 | int bdg_namelen; |
| 196 | uint32_t bdg_active_ports; /* 0 means free */ |
| 197 | char bdg_basename[IFNAMSIZ]; |
| 198 | |
	/* Indexes of the active ports (the first bdg_active_ports
	 * entries), followed by all the remaining, inactive ports.
	 */
| 202 | uint8_t bdg_port_index[NM_BDG_MAXPORTS]; |
| 203 | |
| 204 | struct netmap_vp_adapter *bdg_ports[NM_BDG_MAXPORTS]; |
| 205 | |
| 206 | |
| 207 | /* |
	 * The function that decides the destination port.
	 * It returns either the index of the destination port,
	 * NM_BDG_BROADCAST to broadcast this packet, or NM_BDG_NOPORT to
	 * drop this packet. ring_nr is the source ring index, and the
	 * function may overwrite this value to forward this packet to a
	 * different ring index.
	 * This function must be set by netmap_bdg_ctl().
| 215 | */ |
| 216 | bdg_lookup_fn_t nm_bdg_lookup; |
| 217 | |
| 218 | /* the forwarding table, MAC+ports. |
| 219 | * XXX should be changed to an argument to be passed to |
| 220 | * the lookup function, and allocated on attach |
| 221 | */ |
| 222 | struct nm_hash_ent ht[NM_BDG_HASH]; |
| 223 | }; |
| 224 | |
| 225 | |
| 226 | /* |
 * XXX in principle nm_bridges could be created dynamically.
| 228 | * Right now we have a static array and deletions are protected |
| 229 | * by an exclusive lock. |
| 230 | */ |
| 231 | struct nm_bridge nm_bridges[NM_BRIDGES]; |
| 232 | |
| 233 | |
| 234 | /* |
 * A few functions to tell which kind of port we are using.
| 236 | * XXX should we hold a lock ? |
| 237 | * |
| 238 | * nma_is_vp() virtual port |
| 239 | * nma_is_host() port connected to the host stack |
| 240 | * nma_is_hw() port connected to a NIC |
| 241 | * nma_is_generic() generic netmap adapter XXX stop this madness |
| 242 | */ |
| 243 | static __inline int |
| 244 | nma_is_vp(struct netmap_adapter *na) |
| 245 | { |
| 246 | return na->nm_register == bdg_netmap_reg; |
| 247 | } |
| 248 | |
| 249 | |
| 250 | static __inline int |
| 251 | nma_is_host(struct netmap_adapter *na) |
| 252 | { |
| 253 | return na->nm_register == NULL; |
| 254 | } |
| 255 | |
| 256 | |
| 257 | static __inline int |
| 258 | nma_is_hw(struct netmap_adapter *na) |
| 259 | { |
| 260 | /* In case of sw adapter, nm_register is NULL */ |
| 261 | return !nma_is_vp(na) && !nma_is_host(na) && !nma_is_generic(na); |
| 262 | } |
| 263 | |
| 264 | static __inline int |
| 265 | nma_is_bwrap(struct netmap_adapter *na) |
| 266 | { |
| 267 | return na->nm_register == netmap_bwrap_register; |
| 268 | } |
| 269 | |
| 270 | |
| 271 | |
| 272 | /* |
 * this is a slightly optimized copy routine which rounds
 * the length up to a multiple of 64 bytes and is often faster
 * than dealing with other odd sizes. We assume there is enough
 * room in the source and destination buffers.
| 277 | * |
| 278 | * XXX only for multiples of 64 bytes, non overlapped. |
| 279 | */ |
| 280 | static inline void |
| 281 | pkt_copy(void *_src, void *_dst, int l) |
| 282 | { |
| 283 | uint64_t *src = _src; |
| 284 | uint64_t *dst = _dst; |
| 285 | if (unlikely(l >= 1024)) { |
| 286 | memcpy(dst, src, l); |
| 287 | return; |
| 288 | } |
| 289 | for (; likely(l > 0); l-=64) { |
| 290 | *dst++ = *src++; |
| 291 | *dst++ = *src++; |
| 292 | *dst++ = *src++; |
| 293 | *dst++ = *src++; |
| 294 | *dst++ = *src++; |
| 295 | *dst++ = *src++; |
| 296 | *dst++ = *src++; |
| 297 | *dst++ = *src++; |
| 298 | } |
| 299 | } |
| 300 | |
| 301 | |
| 302 | |
| 303 | /* |
| 304 | * locate a bridge among the existing ones. |
| 305 | * MUST BE CALLED WITH NMG_LOCK() |
| 306 | * |
| 307 | * a ':' in the name terminates the bridge name. Otherwise, just NM_NAME. |
| 308 | * We assume that this is called with a name of at least NM_NAME chars. |
| 309 | */ |
| 310 | static struct nm_bridge * |
| 311 | nm_find_bridge(const char *name, int create) |
| 312 | { |
| 313 | int i, l, namelen; |
| 314 | struct nm_bridge *b = NULL; |
| 315 | |
| 316 | NMG_LOCK_ASSERT(); |
| 317 | |
| 318 | namelen = strlen(NM_NAME); /* base length */ |
| 319 | l = name ? strlen(name) : 0; /* actual length */ |
| 320 | if (l < namelen) { |
		D("invalid bridge name %s", name ? name : "(null)");
| 322 | return NULL; |
| 323 | } |
| 324 | for (i = namelen + 1; i < l; i++) { |
| 325 | if (name[i] == ':') { |
| 326 | namelen = i; |
| 327 | break; |
| 328 | } |
| 329 | } |
| 330 | if (namelen >= IFNAMSIZ) |
| 331 | namelen = IFNAMSIZ; |
| 332 | ND("--- prefix is '%.*s' ---", namelen, name); |
| 333 | |
| 334 | /* lookup the name, remember empty slot if there is one */ |
| 335 | for (i = 0; i < NM_BRIDGES; i++) { |
| 336 | struct nm_bridge *x = nm_bridges + i; |
| 337 | |
| 338 | if (x->bdg_active_ports == 0) { |
| 339 | if (create && b == NULL) |
| 340 | b = x; /* record empty slot */ |
| 341 | } else if (x->bdg_namelen != namelen) { |
| 342 | continue; |
| 343 | } else if (strncmp(name, x->bdg_basename, namelen) == 0) { |
| 344 | ND("found '%.*s' at %d", namelen, name, i); |
| 345 | b = x; |
| 346 | break; |
| 347 | } |
| 348 | } |
| 349 | if (i == NM_BRIDGES && b) { /* name not found, can create entry */ |
| 350 | /* initialize the bridge */ |
| 351 | strncpy(b->bdg_basename, name, namelen); |
| 352 | ND("create new bridge %s with ports %d", b->bdg_basename, |
| 353 | b->bdg_active_ports); |
| 354 | b->bdg_namelen = namelen; |
| 355 | b->bdg_active_ports = 0; |
| 356 | for (i = 0; i < NM_BDG_MAXPORTS; i++) |
| 357 | b->bdg_port_index[i] = i; |
| 358 | /* set the default function */ |
| 359 | b->nm_bdg_lookup = netmap_bdg_learning; |
| 360 | /* reset the MAC address table */ |
| 361 | bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH); |
| 362 | } |
| 363 | return b; |
| 364 | } |
| 365 | |
| 366 | |
| 367 | /* |
| 368 | * Free the forwarding tables for rings attached to switch ports. |
| 369 | */ |
| 370 | static void |
| 371 | nm_free_bdgfwd(struct netmap_adapter *na) |
| 372 | { |
| 373 | int nrings, i; |
| 374 | struct netmap_kring *kring; |
| 375 | |
| 376 | NMG_LOCK_ASSERT(); |
| 377 | nrings = nma_is_vp(na) ? na->num_tx_rings : na->num_rx_rings; |
| 378 | kring = nma_is_vp(na) ? na->tx_rings : na->rx_rings; |
| 379 | for (i = 0; i < nrings; i++) { |
| 380 | if (kring[i].nkr_ft) { |
| 381 | kfree(kring[i].nkr_ft, M_DEVBUF); |
| 382 | kring[i].nkr_ft = NULL; /* protect from freeing twice */ |
| 383 | } |
| 384 | } |
| 385 | } |
| 386 | |
| 387 | |
| 388 | /* |
| 389 | * Allocate the forwarding tables for the rings attached to the bridge ports. |
| 390 | */ |
| 391 | static int |
| 392 | nm_alloc_bdgfwd(struct netmap_adapter *na) |
| 393 | { |
| 394 | int nrings, l, i, num_dstq; |
| 395 | struct netmap_kring *kring; |
| 396 | |
| 397 | NMG_LOCK_ASSERT(); |
| 398 | /* all port:rings + broadcast */ |
| 399 | num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1; |
| 400 | l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX; |
| 401 | l += sizeof(struct nm_bdg_q) * num_dstq; |
| 402 | l += sizeof(uint16_t) * NM_BDG_BATCH_MAX; |
| 403 | |
| 404 | nrings = na->num_tx_rings + 1; |
| 405 | kring = na->tx_rings; |
| 406 | for (i = 0; i < nrings; i++) { |
| 407 | struct nm_bdg_fwd *ft; |
| 408 | struct nm_bdg_q *dstq; |
| 409 | int j; |
| 410 | |
| 411 | ft = kmalloc(l, M_DEVBUF, M_NOWAIT | M_ZERO); |
| 412 | if (!ft) { |
| 413 | nm_free_bdgfwd(na); |
| 414 | return ENOMEM; |
| 415 | } |
| 416 | dstq = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX); |
| 417 | for (j = 0; j < num_dstq; j++) { |
| 418 | dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL; |
| 419 | dstq[j].bq_len = 0; |
| 420 | } |
| 421 | kring[i].nkr_ft = ft; |
| 422 | } |
| 423 | return 0; |
| 424 | } |
| 425 | |
| 426 | |
| 427 | static void |
| 428 | netmap_bdg_detach_common(struct nm_bridge *b, int hw, int sw) |
| 429 | { |
| 430 | int s_hw = hw, s_sw = sw; |
	int i, lim = b->bdg_active_ports;
| 432 | uint8_t tmp[NM_BDG_MAXPORTS]; |
| 433 | |
| 434 | /* |
| 435 | New algorithm: |
| 436 | make a copy of bdg_port_index; |
| 437 | lookup NA(ifp)->bdg_port and SWNA(ifp)->bdg_port |
| 438 | in the array of bdg_port_index, replacing them with |
| 439 | entries from the bottom of the array; |
| 440 | decrement bdg_active_ports; |
| 441 | acquire BDG_WLOCK() and copy back the array. |
| 442 | */ |
| 443 | |
| 444 | D("detach %d and %d (lim %d)", hw, sw, lim); |
| 445 | /* make a copy of the list of active ports, update it, |
| 446 | * and then copy back within BDG_WLOCK(). |
| 447 | */ |
| 448 | memcpy(tmp, b->bdg_port_index, sizeof(tmp)); |
| 449 | for (i = 0; (hw >= 0 || sw >= 0) && i < lim; ) { |
| 450 | if (hw >= 0 && tmp[i] == hw) { |
| 451 | ND("detach hw %d at %d", hw, i); |
| 452 | lim--; /* point to last active port */ |
| 453 | tmp[i] = tmp[lim]; /* swap with i */ |
| 454 | tmp[lim] = hw; /* now this is inactive */ |
| 455 | hw = -1; |
| 456 | } else if (sw >= 0 && tmp[i] == sw) { |
| 457 | ND("detach sw %d at %d", sw, i); |
| 458 | lim--; |
| 459 | tmp[i] = tmp[lim]; |
| 460 | tmp[lim] = sw; |
| 461 | sw = -1; |
| 462 | } else { |
| 463 | i++; |
| 464 | } |
| 465 | } |
| 466 | if (hw >= 0 || sw >= 0) { |
| 467 | D("XXX delete failed hw %d sw %d, should panic...", hw, sw); |
| 468 | } |
| 469 | |
| 470 | BDG_WLOCK(b); |
| 471 | b->bdg_ports[s_hw] = NULL; |
| 472 | if (s_sw >= 0) { |
| 473 | b->bdg_ports[s_sw] = NULL; |
| 474 | } |
| 475 | memcpy(b->bdg_port_index, tmp, sizeof(tmp)); |
| 476 | b->bdg_active_ports = lim; |
| 477 | BDG_WUNLOCK(b); |
| 478 | |
| 479 | ND("now %d active ports", lim); |
| 480 | if (lim == 0) { |
| 481 | ND("marking bridge %s as free", b->bdg_basename); |
| 482 | b->nm_bdg_lookup = NULL; |
| 483 | } |
| 484 | } |
| 485 | |
| 486 | static void |
| 487 | netmap_adapter_vp_dtor(struct netmap_adapter *na) |
| 488 | { |
| 489 | struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na; |
| 490 | struct nm_bridge *b = vpna->na_bdg; |
| 491 | struct ifnet *ifp = na->ifp; |
| 492 | |
| 493 | ND("%s has %d references", NM_IFPNAME(ifp), na->na_refcount); |
| 494 | |
| 495 | if (b) { |
| 496 | netmap_bdg_detach_common(b, vpna->bdg_port, -1); |
| 497 | } |
| 498 | |
| 499 | bzero(ifp, sizeof(*ifp)); |
| 500 | kfree(ifp, M_DEVBUF); |
| 501 | na->ifp = NULL; |
| 502 | } |
| 503 | |
| 504 | int |
| 505 | netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create) |
| 506 | { |
| 507 | const char *name = nmr->nr_name; |
| 508 | struct ifnet *ifp; |
| 509 | int error = 0; |
| 510 | struct netmap_adapter *ret; |
| 511 | struct netmap_vp_adapter *vpna; |
| 512 | struct nm_bridge *b; |
| 513 | int i, j, cand = -1, cand2 = -1; |
| 514 | int needed; |
| 515 | |
| 516 | *na = NULL; /* default return value */ |
| 517 | |
| 518 | /* first try to see if this is a bridge port. */ |
| 519 | NMG_LOCK_ASSERT(); |
| 520 | if (strncmp(name, NM_NAME, sizeof(NM_NAME) - 1)) { |
| 521 | return 0; /* no error, but no VALE prefix */ |
| 522 | } |
| 523 | |
| 524 | b = nm_find_bridge(name, create); |
| 525 | if (b == NULL) { |
| 526 | D("no bridges available for '%s'", name); |
| 527 | return (ENXIO); |
| 528 | } |
| 529 | |
	/* Now that we are sure the name starts with the bridge's name,
	 * look up the port in the bridge. We need to scan the entire
| 532 | * list. It is not important to hold a WLOCK on the bridge |
| 533 | * during the search because NMG_LOCK already guarantees |
| 534 | * that there are no other possible writers. |
| 535 | */ |
| 536 | |
| 537 | /* lookup in the local list of ports */ |
| 538 | for (j = 0; j < b->bdg_active_ports; j++) { |
| 539 | i = b->bdg_port_index[j]; |
| 540 | vpna = b->bdg_ports[i]; |
| 541 | // KASSERT(na != NULL); |
| 542 | ifp = vpna->up.ifp; |
| 543 | /* XXX make sure the name only contains one : */ |
| 544 | if (!strcmp(NM_IFPNAME(ifp), name)) { |
| 545 | netmap_adapter_get(&vpna->up); |
| 546 | ND("found existing if %s refs %d", name, |
| 547 | vpna->na_bdg_refcount); |
| 548 | *na = (struct netmap_adapter *)vpna; |
| 549 | return 0; |
| 550 | } |
| 551 | } |
| 552 | /* not found, should we create it? */ |
| 553 | if (!create) |
| 554 | return ENXIO; |
| 555 | /* yes we should, see if we have space to attach entries */ |
| 556 | needed = 2; /* in some cases we only need 1 */ |
| 557 | if (b->bdg_active_ports + needed >= NM_BDG_MAXPORTS) { |
| 558 | D("bridge full %d, cannot create new port", b->bdg_active_ports); |
| 559 | return EINVAL; |
| 560 | } |
| 561 | /* record the next two ports available, but do not allocate yet */ |
| 562 | cand = b->bdg_port_index[b->bdg_active_ports]; |
| 563 | cand2 = b->bdg_port_index[b->bdg_active_ports + 1]; |
| 564 | ND("+++ bridge %s port %s used %d avail %d %d", |
| 565 | b->bdg_basename, name, b->bdg_active_ports, cand, cand2); |
| 566 | |
| 567 | /* |
	 * try to see if there is a matching NIC with this name
| 569 | * (after the bridge's name) |
| 570 | */ |
| 571 | ifp = ifunit(name + b->bdg_namelen + 1); |
| 572 | if (!ifp) { /* this is a virtual port */ |
| 573 | /* Create a temporary NA with arguments, then |
| 574 | * bdg_netmap_attach() will allocate the real one |
| 575 | * and attach it to the ifp |
| 576 | */ |
| 577 | struct netmap_adapter tmp_na; |
| 578 | |
| 579 | if (nmr->nr_cmd) { |
| 580 | /* nr_cmd must be 0 for a virtual port */ |
| 581 | return EINVAL; |
| 582 | } |
| 583 | bzero(&tmp_na, sizeof(tmp_na)); |
		/* bounds checking */
| 585 | tmp_na.num_tx_rings = nmr->nr_tx_rings; |
| 586 | nm_bound_var(&tmp_na.num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL); |
| 587 | nmr->nr_tx_rings = tmp_na.num_tx_rings; // write back |
| 588 | tmp_na.num_rx_rings = nmr->nr_rx_rings; |
| 589 | nm_bound_var(&tmp_na.num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL); |
| 590 | nmr->nr_rx_rings = tmp_na.num_rx_rings; // write back |
| 591 | nm_bound_var(&nmr->nr_tx_slots, NM_BRIDGE_RINGSIZE, |
| 592 | 1, NM_BDG_MAXSLOTS, NULL); |
| 593 | tmp_na.num_tx_desc = nmr->nr_tx_slots; |
| 594 | nm_bound_var(&nmr->nr_rx_slots, NM_BRIDGE_RINGSIZE, |
| 595 | 1, NM_BDG_MAXSLOTS, NULL); |
| 596 | tmp_na.num_rx_desc = nmr->nr_rx_slots; |
| 597 | |
| 598 | /* create a struct ifnet for the new port. |
		 * need M_NOWAIT as we are under NMG_LOCK
| 600 | */ |
| 601 | ifp = kmalloc(sizeof(*ifp), M_DEVBUF, M_NOWAIT | M_ZERO); |
| 602 | if (!ifp) |
| 603 | return ENOMEM; |
| 604 | |
| 605 | strcpy(ifp->if_xname, name); |
| 606 | tmp_na.ifp = ifp; |
| 607 | /* bdg_netmap_attach creates a struct netmap_adapter */ |
| 608 | error = bdg_netmap_attach(&tmp_na); |
| 609 | if (error) { |
| 610 | D("error %d", error); |
| 611 | kfree(ifp, M_DEVBUF); |
| 612 | return error; |
| 613 | } |
| 614 | ret = NA(ifp); |
| 615 | cand2 = -1; /* only need one port */ |
| 616 | } else { /* this is a NIC */ |
| 617 | struct ifnet *fake_ifp; |
| 618 | |
| 619 | error = netmap_get_hw_na(ifp, &ret); |
| 620 | if (error || ret == NULL) |
| 621 | goto out; |
| 622 | |
| 623 | /* make sure the NIC is not already in use */ |
| 624 | if (NETMAP_OWNED_BY_ANY(ret)) { |
| 625 | D("NIC %s busy, cannot attach to bridge", |
| 626 | NM_IFPNAME(ifp)); |
| 627 | error = EINVAL; |
| 628 | goto out; |
| 629 | } |
| 630 | /* create a fake interface */ |
| 631 | fake_ifp = kmalloc(sizeof(*ifp), M_DEVBUF, M_NOWAIT | M_ZERO); |
| 632 | if (!fake_ifp) { |
| 633 | error = ENOMEM; |
| 634 | goto out; |
| 635 | } |
| 636 | strcpy(fake_ifp->if_xname, name); |
| 637 | error = netmap_bwrap_attach(fake_ifp, ifp); |
| 638 | if (error) { |
| 639 | kfree(fake_ifp, M_DEVBUF); |
| 640 | goto out; |
| 641 | } |
| 642 | ret = NA(fake_ifp); |
| 643 | if (nmr->nr_arg1 != NETMAP_BDG_HOST) |
| 644 | cand2 = -1; /* only need one port */ |
| 645 | #if 0 |
| 646 | if_rele(ifp); |
| 647 | #endif |
| 648 | } |
| 649 | vpna = (struct netmap_vp_adapter *)ret; |
| 650 | |
| 651 | BDG_WLOCK(b); |
| 652 | vpna->bdg_port = cand; |
| 653 | ND("NIC %p to bridge port %d", vpna, cand); |
| 654 | /* bind the port to the bridge (virtual ports are not active) */ |
| 655 | b->bdg_ports[cand] = vpna; |
| 656 | vpna->na_bdg = b; |
| 657 | b->bdg_active_ports++; |
| 658 | if (cand2 >= 0) { |
| 659 | struct netmap_vp_adapter *hostna = vpna + 1; |
| 660 | /* also bind the host stack to the bridge */ |
| 661 | b->bdg_ports[cand2] = hostna; |
| 662 | hostna->bdg_port = cand2; |
| 663 | hostna->na_bdg = b; |
| 664 | b->bdg_active_ports++; |
| 665 | ND("host %p to bridge port %d", hostna, cand2); |
| 666 | } |
| 667 | ND("if %s refs %d", name, vpna->up.na_refcount); |
| 668 | BDG_WUNLOCK(b); |
| 669 | *na = ret; |
| 670 | netmap_adapter_get(ret); |
| 671 | return 0; |
| 672 | |
| 673 | out: |
| 674 | #if 0 |
| 675 | if_rele(ifp); |
| 676 | #endif |
| 677 | |
| 678 | return error; |
| 679 | } |
| 680 | |
| 681 | |
| 682 | /* Process NETMAP_BDG_ATTACH and NETMAP_BDG_DETACH */ |
| 683 | static int |
| 684 | nm_bdg_attach(struct nmreq *nmr) |
| 685 | { |
| 686 | struct netmap_adapter *na; |
| 687 | struct netmap_if *nifp; |
| 688 | struct netmap_priv_d *npriv; |
| 689 | struct netmap_bwrap_adapter *bna; |
| 690 | int error; |
| 691 | |
| 692 | npriv = kmalloc(sizeof(*npriv), M_DEVBUF, M_NOWAIT|M_ZERO); |
| 693 | if (npriv == NULL) |
| 694 | return ENOMEM; |
| 695 | NMG_LOCK(); |
| 696 | /* XXX probably netmap_get_bdg_na() */ |
| 697 | error = netmap_get_na(nmr, &na, 1 /* create if not exists */); |
| 698 | if (error) /* no device, or another bridge or user owns the device */ |
| 699 | goto unlock_exit; |
| 700 | /* netmap_get_na() sets na_bdg if this is a physical interface |
| 701 | * that we can attach to a switch. |
| 702 | */ |
| 703 | if (!nma_is_bwrap(na)) { |
| 704 | /* got reference to a virtual port or direct access to a NIC. |
		 * perhaps no bridge prefix was specified, or the NIC name is wrong
| 706 | */ |
| 707 | error = EINVAL; |
| 708 | goto unref_exit; |
| 709 | } |
| 710 | |
| 711 | if (na->active_fds > 0) { /* already registered */ |
| 712 | error = EBUSY; |
| 713 | goto unref_exit; |
| 714 | } |
| 715 | |
| 716 | nifp = netmap_do_regif(npriv, na, nmr->nr_ringid, &error); |
| 717 | if (!nifp) { |
| 718 | goto unref_exit; |
| 719 | } |
| 720 | |
| 721 | bna = (struct netmap_bwrap_adapter*)na; |
| 722 | bna->na_kpriv = npriv; |
| 723 | NMG_UNLOCK(); |
| 724 | ND("registered %s to netmap-mode", NM_IFPNAME(na->ifp)); |
| 725 | return 0; |
| 726 | |
| 727 | unref_exit: |
| 728 | netmap_adapter_put(na); |
| 729 | unlock_exit: |
| 730 | NMG_UNLOCK(); |
| 731 | bzero(npriv, sizeof(*npriv)); |
| 732 | kfree(npriv, M_DEVBUF); |
| 733 | return error; |
| 734 | } |
| 735 | |
| 736 | static int |
| 737 | nm_bdg_detach(struct nmreq *nmr) |
| 738 | { |
| 739 | struct netmap_adapter *na; |
| 740 | int error; |
| 741 | struct netmap_bwrap_adapter *bna; |
| 742 | int last_instance; |
| 743 | |
| 744 | NMG_LOCK(); |
| 745 | error = netmap_get_na(nmr, &na, 0 /* don't create */); |
| 746 | if (error) { /* no device, or another bridge or user owns the device */ |
| 747 | goto unlock_exit; |
| 748 | } |
| 749 | if (!nma_is_bwrap(na)) { |
| 750 | /* got reference to a virtual port or direct access to a NIC. |
		 * perhaps no bridge prefix was specified, or the NIC name is wrong
| 752 | */ |
| 753 | error = EINVAL; |
| 754 | goto unref_exit; |
| 755 | } |
| 756 | bna = (struct netmap_bwrap_adapter *)na; |
| 757 | |
| 758 | if (na->active_fds == 0) { /* not registered */ |
| 759 | error = EINVAL; |
| 760 | goto unref_exit; |
| 761 | } |
| 762 | |
| 763 | last_instance = netmap_dtor_locked(bna->na_kpriv); /* unregister */ |
| 764 | if (!last_instance) { |
| 765 | D("--- error, trying to detach an entry with active mmaps"); |
| 766 | error = EINVAL; |
| 767 | } else { |
| 768 | struct netmap_priv_d *npriv = bna->na_kpriv; |
| 769 | |
| 770 | bna->na_kpriv = NULL; |
| 771 | D("deleting priv"); |
| 772 | |
| 773 | bzero(npriv, sizeof(*npriv)); |
| 774 | kfree(npriv, M_DEVBUF); |
| 775 | } |
| 776 | |
| 777 | unref_exit: |
| 778 | netmap_adapter_put(na); |
| 779 | unlock_exit: |
| 780 | NMG_UNLOCK(); |
| 781 | return error; |
| 782 | |
| 783 | } |
| 784 | |
| 785 | |
| 786 | /* exported to kernel callers, e.g. OVS ? |
| 787 | * Entry point. |
| 788 | * Called without NMG_LOCK. |
| 789 | */ |
| 790 | int |
| 791 | netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func) |
| 792 | { |
| 793 | struct nm_bridge *b; |
| 794 | struct netmap_vp_adapter *na; |
| 795 | struct ifnet *iter; |
| 796 | char *name = nmr->nr_name; |
| 797 | int cmd = nmr->nr_cmd, namelen = strlen(name); |
| 798 | int error = 0, i, j; |
| 799 | |
| 800 | switch (cmd) { |
| 801 | case NETMAP_BDG_ATTACH: |
| 802 | error = nm_bdg_attach(nmr); |
| 803 | break; |
| 804 | |
| 805 | case NETMAP_BDG_DETACH: |
| 806 | error = nm_bdg_detach(nmr); |
| 807 | break; |
| 808 | |
| 809 | case NETMAP_BDG_LIST: |
| 810 | /* this is used to enumerate bridges and ports */ |
| 811 | if (namelen) { /* look up indexes of bridge and port */ |
| 812 | if (strncmp(name, NM_NAME, strlen(NM_NAME))) { |
| 813 | error = EINVAL; |
| 814 | break; |
| 815 | } |
| 816 | NMG_LOCK(); |
| 817 | b = nm_find_bridge(name, 0 /* don't create */); |
| 818 | if (!b) { |
| 819 | error = ENOENT; |
| 820 | NMG_UNLOCK(); |
| 821 | break; |
| 822 | } |
| 823 | |
| 824 | error = ENOENT; |
| 825 | for (j = 0; j < b->bdg_active_ports; j++) { |
| 826 | i = b->bdg_port_index[j]; |
| 827 | na = b->bdg_ports[i]; |
| 828 | if (na == NULL) { |
| 829 | D("---AAAAAAAAARGH-------"); |
| 830 | continue; |
| 831 | } |
| 832 | iter = na->up.ifp; |
| 833 | /* the former and the latter identify a |
| 834 | * virtual port and a NIC, respectively |
| 835 | */ |
| 836 | if (!strcmp(iter->if_xname, name)) { |
| 837 | /* bridge index */ |
| 838 | nmr->nr_arg1 = b - nm_bridges; |
| 839 | nmr->nr_arg2 = i; /* port index */ |
| 840 | error = 0; |
| 841 | break; |
| 842 | } |
| 843 | } |
| 844 | NMG_UNLOCK(); |
| 845 | } else { |
| 846 | /* return the first non-empty entry starting from |
| 847 | * bridge nr_arg1 and port nr_arg2. |
| 848 | * |
			 * Users can detect the end of the same bridge by
			 * comparing the new and old values of nr_arg1, and can
			 * detect the end of all the bridges by error != 0.
| 852 | */ |
| 853 | i = nmr->nr_arg1; |
| 854 | j = nmr->nr_arg2; |
| 855 | |
| 856 | NMG_LOCK(); |
| 857 | for (error = ENOENT; i < NM_BRIDGES; i++) { |
| 858 | b = nm_bridges + i; |
| 859 | if (j >= b->bdg_active_ports) { |
| 860 | j = 0; /* following bridges scan from 0 */ |
| 861 | continue; |
| 862 | } |
| 863 | nmr->nr_arg1 = i; |
| 864 | nmr->nr_arg2 = j; |
| 865 | j = b->bdg_port_index[j]; |
| 866 | na = b->bdg_ports[j]; |
| 867 | iter = na->up.ifp; |
| 868 | strncpy(name, iter->if_xname, (size_t)IFNAMSIZ); |
| 869 | error = 0; |
| 870 | break; |
| 871 | } |
| 872 | NMG_UNLOCK(); |
| 873 | } |
| 874 | break; |
| 875 | |
| 876 | case NETMAP_BDG_LOOKUP_REG: |
		/* register a lookup function with the given bridge.
		 * nmr->nr_name may be just the bridge's name (including the ':'
		 * if it is not just NM_NAME).
| 880 | */ |
| 881 | if (!func) { |
| 882 | error = EINVAL; |
| 883 | break; |
| 884 | } |
| 885 | NMG_LOCK(); |
| 886 | b = nm_find_bridge(name, 0 /* don't create */); |
| 887 | if (!b) { |
| 888 | error = EINVAL; |
| 889 | } else { |
| 890 | b->nm_bdg_lookup = func; |
| 891 | } |
| 892 | NMG_UNLOCK(); |
| 893 | break; |
| 894 | |
| 895 | default: |
| 896 | D("invalid cmd (nmr->nr_cmd) (0x%x)", cmd); |
| 897 | error = EINVAL; |
| 898 | break; |
| 899 | } |
| 900 | return error; |
| 901 | } |
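/*
 * Illustrative sketch (not compiled) of how a kernel caller could use
 * NETMAP_BDG_LIST to enumerate every port of every switch. The nmreq
 * handling mirrors the code above; the function name and the D() output
 * are only examples.
 */
#if 0
static void
example_bdg_list(void)
{
	struct nmreq nmr;
	int error;

	bzero(&nmr, sizeof(nmr));
	nmr.nr_cmd = NETMAP_BDG_LIST;
	for (;;) {
		nmr.nr_name[0] = '\0';	/* an empty name selects enumeration */
		error = netmap_bdg_ctl(&nmr, NULL);
		if (error)
			break;		/* past the last bridge/port */
		D("bridge %d port %d is %s",
		    nmr.nr_arg1, nmr.nr_arg2, nmr.nr_name);
		nmr.nr_arg2++;		/* resume after this port */
	}
}
#endif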
| 902 | |
| 903 | |
| 904 | static int |
| 905 | netmap_vp_krings_create(struct netmap_adapter *na) |
| 906 | { |
| 907 | u_int ntx, nrx, tailroom; |
| 908 | int error, i; |
| 909 | uint32_t *leases; |
| 910 | |
| 911 | /* XXX vps do not need host rings, |
| 912 | * but we crash if we don't have one |
| 913 | */ |
| 914 | ntx = na->num_tx_rings + 1; |
| 915 | nrx = na->num_rx_rings + 1; |
| 916 | |
| 917 | /* |
| 918 | * Leases are attached to RX rings on vale ports |
| 919 | */ |
| 920 | tailroom = sizeof(uint32_t) * na->num_rx_desc * nrx; |
| 921 | |
| 922 | error = netmap_krings_create(na, ntx, nrx, tailroom); |
| 923 | if (error) |
| 924 | return error; |
| 925 | |
| 926 | leases = na->tailroom; |
| 927 | |
| 928 | for (i = 0; i < nrx; i++) { /* Receive rings */ |
| 929 | na->rx_rings[i].nkr_leases = leases; |
| 930 | leases += na->num_rx_desc; |
| 931 | } |
| 932 | |
| 933 | error = nm_alloc_bdgfwd(na); |
| 934 | if (error) { |
| 935 | netmap_krings_delete(na); |
| 936 | return error; |
| 937 | } |
| 938 | |
| 939 | return 0; |
| 940 | } |
| 941 | |
| 942 | static void |
| 943 | netmap_vp_krings_delete(struct netmap_adapter *na) |
| 944 | { |
| 945 | nm_free_bdgfwd(na); |
| 946 | netmap_krings_delete(na); |
| 947 | } |
| 948 | |
| 949 | |
| 950 | static int |
| 951 | nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, |
| 952 | struct netmap_vp_adapter *na, u_int ring_nr); |
| 953 | |
| 954 | |
| 955 | /* |
| 956 | * Grab packets from a kring, move them into the ft structure |
| 957 | * associated to the tx (input) port. Max one instance per port, |
| 958 | * filtered on input (ioctl, poll or XXX). |
| 959 | * Returns the next position in the ring. |
| 960 | */ |
| 961 | static int |
| 962 | nm_bdg_preflush(struct netmap_vp_adapter *na, u_int ring_nr, |
| 963 | struct netmap_kring *kring, u_int end) |
| 964 | { |
| 965 | struct netmap_ring *ring = kring->ring; |
| 966 | struct nm_bdg_fwd *ft; |
| 967 | u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1; |
| 968 | u_int ft_i = 0; /* start from 0 */ |
| 969 | u_int frags = 1; /* how many frags ? */ |
| 970 | struct nm_bridge *b = na->na_bdg; |
| 971 | |
| 972 | /* To protect against modifications to the bridge we acquire a |
| 973 | * shared lock, waiting if we can sleep (if the source port is |
| 974 | * attached to a user process) or with a trylock otherwise (NICs). |
| 975 | */ |
| 976 | ND("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j); |
| 977 | if (na->up.na_flags & NAF_BDG_MAYSLEEP) |
| 978 | BDG_RLOCK(b); |
| 979 | else if (!BDG_RTRYLOCK(b)) |
| 980 | return 0; |
| 981 | ND(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j); |
| 982 | ft = kring->nkr_ft; |
| 983 | |
| 984 | for (; likely(j != end); j = nm_next(j, lim)) { |
| 985 | struct netmap_slot *slot = &ring->slot[j]; |
| 986 | char *buf; |
| 987 | |
| 988 | ft[ft_i].ft_len = slot->len; |
| 989 | ft[ft_i].ft_flags = slot->flags; |
| 990 | |
| 991 | ND("flags is 0x%x", slot->flags); |
| 992 | /* this slot goes into a list so initialize the link field */ |
| 993 | ft[ft_i].ft_next = NM_FT_NULL; |
| 994 | buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ? |
| 995 | (void *)(uintptr_t)slot->ptr : BDG_NMB(&na->up, slot); |
| 996 | prefetch(buf); |
| 997 | ++ft_i; |
| 998 | if (slot->flags & NS_MOREFRAG) { |
| 999 | frags++; |
| 1000 | continue; |
| 1001 | } |
| 1002 | if (unlikely(netmap_verbose && frags > 1)) |
| 1003 | RD(5, "%d frags at %d", frags, ft_i - frags); |
| 1004 | ft[ft_i - frags].ft_frags = frags; |
| 1005 | frags = 1; |
| 1006 | if (unlikely((int)ft_i >= bridge_batch)) |
| 1007 | ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr); |
| 1008 | } |
| 1009 | if (frags > 1) { |
| 1010 | D("truncate incomplete fragment at %d (%d frags)", ft_i, frags); |
| 1011 | // ft_i > 0, ft[ft_i-1].flags has NS_MOREFRAG |
		ft[ft_i - 1].ft_flags &= ~NS_MOREFRAG;
| 1013 | ft[ft_i - frags].ft_frags = frags - 1; |
| 1014 | } |
| 1015 | if (ft_i) |
| 1016 | ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr); |
| 1017 | BDG_RUNLOCK(b); |
| 1018 | return j; |
| 1019 | } |
| 1020 | |
| 1021 | |
| 1022 | /* |
| 1023 | *---- support for virtual bridge ----- |
| 1024 | */ |
| 1025 | |
| 1026 | /* ----- FreeBSD if_bridge hash function ------- */ |
| 1027 | |
| 1028 | /* |
| 1029 | * The following hash function is adapted from "Hash Functions" by Bob Jenkins |
| 1030 | * ("Algorithm Alley", Dr. Dobbs Journal, September 1997). |
| 1031 | * |
| 1032 | * http://www.burtleburtle.net/bob/hash/spooky.html |
| 1033 | */ |
| 1034 | #define mix(a, b, c) \ |
| 1035 | do { \ |
| 1036 | a -= b; a -= c; a ^= (c >> 13); \ |
| 1037 | b -= c; b -= a; b ^= (a << 8); \ |
| 1038 | c -= a; c -= b; c ^= (b >> 13); \ |
| 1039 | a -= b; a -= c; a ^= (c >> 12); \ |
| 1040 | b -= c; b -= a; b ^= (a << 16); \ |
| 1041 | c -= a; c -= b; c ^= (b >> 5); \ |
| 1042 | a -= b; a -= c; a ^= (c >> 3); \ |
| 1043 | b -= c; b -= a; b ^= (a << 10); \ |
| 1044 | c -= a; c -= b; c ^= (b >> 15); \ |
| 1045 | } while (/*CONSTCOND*/0) |
| 1046 | |
| 1047 | static __inline uint32_t |
| 1048 | nm_bridge_rthash(const uint8_t *addr) |
| 1049 | { |
	uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hash key
| 1051 | |
| 1052 | b += addr[5] << 8; |
| 1053 | b += addr[4]; |
| 1054 | a += addr[3] << 24; |
| 1055 | a += addr[2] << 16; |
| 1056 | a += addr[1] << 8; |
| 1057 | a += addr[0]; |
| 1058 | |
| 1059 | mix(a, b, c); |
| 1060 | #define BRIDGE_RTHASH_MASK (NM_BDG_HASH-1) |
| 1061 | return (c & BRIDGE_RTHASH_MASK); |
| 1062 | } |
| 1063 | |
| 1064 | #undef mix |
| 1065 | |
| 1066 | |
| 1067 | static int |
| 1068 | bdg_netmap_reg(struct netmap_adapter *na, int onoff) |
| 1069 | { |
| 1070 | struct netmap_vp_adapter *vpna = |
| 1071 | (struct netmap_vp_adapter*)na; |
| 1072 | struct ifnet *ifp = na->ifp; |
| 1073 | |
| 1074 | /* the interface is already attached to the bridge, |
| 1075 | * so we only need to toggle IFCAP_NETMAP. |
| 1076 | */ |
| 1077 | BDG_WLOCK(vpna->na_bdg); |
| 1078 | if (onoff) { |
| 1079 | ifp->if_capenable |= IFCAP_NETMAP; |
| 1080 | } else { |
| 1081 | ifp->if_capenable &= ~IFCAP_NETMAP; |
| 1082 | } |
| 1083 | BDG_WUNLOCK(vpna->na_bdg); |
| 1084 | return 0; |
| 1085 | } |
| 1086 | |
| 1087 | |
| 1088 | /* |
| 1089 | * Lookup function for a learning bridge. |
| 1090 | * Update the hash table with the source address, |
| 1091 | * and then returns the destination port index, and the |
| 1092 | * ring in *dst_ring (at the moment, always use ring 0) |
| 1093 | */ |
| 1094 | u_int |
| 1095 | netmap_bdg_learning(char *buf, u_int buf_len, uint8_t *dst_ring, |
| 1096 | struct netmap_vp_adapter *na) |
| 1097 | { |
| 1098 | struct nm_hash_ent *ht = na->na_bdg->ht; |
| 1099 | uint32_t sh, dh; |
| 1100 | u_int dst, mysrc = na->bdg_port; |
| 1101 | uint64_t smac, dmac; |
| 1102 | |
| 1103 | if (buf_len < 14) { |
| 1104 | D("invalid buf length %d", buf_len); |
| 1105 | return NM_BDG_NOPORT; |
| 1106 | } |
| 1107 | dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff; |
| 1108 | smac = le64toh(*(uint64_t *)(buf + 4)); |
| 1109 | smac >>= 16; |
| 1110 | |
| 1111 | /* |
| 1112 | * The hash is somewhat expensive, there might be some |
| 1113 | * worthwhile optimizations here. |
| 1114 | */ |
| 1115 | if ((buf[6] & 1) == 0) { /* valid src */ |
| 1116 | uint8_t *s = buf+6; |
| 1117 | sh = nm_bridge_rthash(s); // XXX hash of source |
| 1118 | /* update source port forwarding entry */ |
| 1119 | ht[sh].mac = smac; /* XXX expire ? */ |
| 1120 | ht[sh].ports = mysrc; |
| 1121 | if (netmap_verbose) |
| 1122 | D("src %02x:%02x:%02x:%02x:%02x:%02x on port %d", |
| 1123 | s[0], s[1], s[2], s[3], s[4], s[5], mysrc); |
| 1124 | } |
| 1125 | dst = NM_BDG_BROADCAST; |
| 1126 | if ((buf[0] & 1) == 0) { /* unicast */ |
| 1127 | dh = nm_bridge_rthash(buf); // XXX hash of dst |
| 1128 | if (ht[dh].mac == dmac) { /* found dst */ |
| 1129 | dst = ht[dh].ports; |
| 1130 | } |
| 1131 | /* XXX otherwise return NM_BDG_UNKNOWN ? */ |
| 1132 | } |
| 1133 | *dst_ring = 0; |
| 1134 | return dst; |
| 1135 | } |
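/*
 * Minimal sketch (not compiled) of an alternative lookup function,
 * matching the signature of netmap_bdg_learning() above, that a kernel
 * module could install with NETMAP_BDG_LOOKUP_REG via netmap_bdg_ctl().
 * It floods every packet instead of learning; the function names and
 * the "vale0:" switch name are only examples, and the switch must
 * already exist.
 */
#if 0
static u_int
example_bdg_flood(char *buf, u_int buf_len, uint8_t *dst_ring,
	struct netmap_vp_adapter *na)
{
	(void)buf; (void)buf_len; (void)na;
	*dst_ring = 0;			/* broadcast always uses ring 0 */
	return NM_BDG_BROADCAST;	/* or NM_BDG_NOPORT to drop */
}

static int
example_bdg_flood_register(void)
{
	struct nmreq nmr;

	bzero(&nmr, sizeof(nmr));
	strlcpy(nmr.nr_name, "vale0:", sizeof(nmr.nr_name));
	nmr.nr_cmd = NETMAP_BDG_LOOKUP_REG;
	return netmap_bdg_ctl(&nmr, example_bdg_flood);
}
#endif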
| 1136 | |
| 1137 | |
| 1138 | /* |
| 1139 | * This flush routine supports only unicast and broadcast but a large |
| 1140 | * number of ports, and lets us replace the learn and dispatch functions. |
| 1141 | */ |
| 1142 | int |
| 1143 | nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na, |
| 1144 | u_int ring_nr) |
| 1145 | { |
| 1146 | struct nm_bdg_q *dst_ents, *brddst; |
| 1147 | uint16_t num_dsts = 0, *dsts; |
| 1148 | struct nm_bridge *b = na->na_bdg; |
| 1149 | u_int i, j, me = na->bdg_port; |
| 1150 | |
| 1151 | /* |
| 1152 | * The work area (pointed by ft) is followed by an array of |
	 * pointers to queues, dst_ents; there are NM_BDG_MAXRINGS
| 1154 | * queues per port plus one for the broadcast traffic. |
| 1155 | * Then we have an array of destination indexes. |
| 1156 | */ |
| 1157 | dst_ents = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX); |
| 1158 | dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1); |
| 1159 | |
| 1160 | /* first pass: find a destination for each packet in the batch */ |
| 1161 | for (i = 0; likely(i < n); i += ft[i].ft_frags) { |
| 1162 | uint8_t dst_ring = ring_nr; /* default, same ring as origin */ |
| 1163 | uint16_t dst_port, d_i; |
| 1164 | struct nm_bdg_q *d; |
| 1165 | |
| 1166 | ND("slot %d frags %d", i, ft[i].ft_frags); |
| 1167 | dst_port = b->nm_bdg_lookup(ft[i].ft_buf, ft[i].ft_len, |
| 1168 | &dst_ring, na); |
| 1169 | if (netmap_verbose > 255) |
| 1170 | RD(5, "slot %d port %d -> %d", i, me, dst_port); |
| 1171 | if (dst_port == NM_BDG_NOPORT) |
			continue; /* this packet is to be dropped */
| 1173 | else if (unlikely(dst_port > NM_BDG_MAXPORTS)) |
| 1174 | continue; |
| 1175 | else if (dst_port == NM_BDG_BROADCAST) |
| 1176 | dst_ring = 0; /* broadcasts always go to ring 0 */ |
| 1177 | else if (unlikely(dst_port == me || |
| 1178 | !b->bdg_ports[dst_port])) |
| 1179 | continue; |
| 1180 | |
| 1181 | /* get a position in the scratch pad */ |
| 1182 | d_i = dst_port * NM_BDG_MAXRINGS + dst_ring; |
| 1183 | d = dst_ents + d_i; |
| 1184 | |
| 1185 | /* append the first fragment to the list */ |
| 1186 | if (d->bq_head == NM_FT_NULL) { /* new destination */ |
| 1187 | d->bq_head = d->bq_tail = i; |
| 1188 | /* remember this position to be scanned later */ |
| 1189 | if (dst_port != NM_BDG_BROADCAST) |
| 1190 | dsts[num_dsts++] = d_i; |
| 1191 | } else { |
| 1192 | ft[d->bq_tail].ft_next = i; |
| 1193 | d->bq_tail = i; |
| 1194 | } |
| 1195 | d->bq_len += ft[i].ft_frags; |
| 1196 | } |
| 1197 | |
| 1198 | /* |
| 1199 | * Broadcast traffic goes to ring 0 on all destinations. |
| 1200 | * So we need to add these rings to the list of ports to scan. |
| 1201 | * XXX at the moment we scan all NM_BDG_MAXPORTS ports, which is |
| 1202 | * expensive. We should keep a compact list of active destinations |
| 1203 | * so we could shorten this loop. |
| 1204 | */ |
| 1205 | brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS; |
| 1206 | if (brddst->bq_head != NM_FT_NULL) { |
| 1207 | for (j = 0; likely(j < b->bdg_active_ports); j++) { |
| 1208 | uint16_t d_i; |
| 1209 | i = b->bdg_port_index[j]; |
| 1210 | if (unlikely(i == me)) |
| 1211 | continue; |
| 1212 | d_i = i * NM_BDG_MAXRINGS; |
| 1213 | if (dst_ents[d_i].bq_head == NM_FT_NULL) |
| 1214 | dsts[num_dsts++] = d_i; |
| 1215 | } |
| 1216 | } |
| 1217 | |
| 1218 | ND(5, "pass 1 done %d pkts %d dsts", n, num_dsts); |
| 1219 | /* second pass: scan destinations (XXX will be modular somehow) */ |
| 1220 | for (i = 0; i < num_dsts; i++) { |
| 1221 | struct ifnet *dst_ifp; |
| 1222 | struct netmap_vp_adapter *dst_na; |
| 1223 | struct netmap_kring *kring; |
| 1224 | struct netmap_ring *ring; |
| 1225 | u_int dst_nr, lim, j, sent = 0, d_i, next, brd_next; |
| 1226 | u_int needed, howmany; |
| 1227 | int retry = netmap_txsync_retry; |
| 1228 | struct nm_bdg_q *d; |
| 1229 | uint32_t my_start = 0, lease_idx = 0; |
| 1230 | int nrings; |
| 1231 | |
| 1232 | d_i = dsts[i]; |
| 1233 | ND("second pass %d port %d", i, d_i); |
| 1234 | d = dst_ents + d_i; |
| 1235 | // XXX fix the division |
| 1236 | dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS]; |
| 1237 | /* protect from the lookup function returning an inactive |
| 1238 | * destination port |
| 1239 | */ |
| 1240 | if (unlikely(dst_na == NULL)) |
| 1241 | goto cleanup; |
| 1242 | if (dst_na->up.na_flags & NAF_SW_ONLY) |
| 1243 | goto cleanup; |
| 1244 | dst_ifp = dst_na->up.ifp; |
| 1245 | /* |
| 1246 | * The interface may be in !netmap mode in two cases: |
| 1247 | * - when na is attached but not activated yet; |
| 1248 | * - when na is being deactivated but is still attached. |
| 1249 | */ |
| 1250 | if (unlikely(!(dst_ifp->if_capenable & IFCAP_NETMAP))) { |
| 1251 | ND("not in netmap mode!"); |
| 1252 | goto cleanup; |
| 1253 | } |
| 1254 | |
| 1255 | /* there is at least one either unicast or broadcast packet */ |
| 1256 | brd_next = brddst->bq_head; |
| 1257 | next = d->bq_head; |
		/* we need to reserve this many slots. If fewer are
		 * available, some packets will be dropped.
		 * Packets may have multiple fragments, so there is a chance
		 * that we will not use all of the slots we have claimed,
		 * and we will need to handle the leftover ones when we
		 * regain the lock.
| 1264 | */ |
| 1265 | needed = d->bq_len + brddst->bq_len; |
| 1266 | |
| 1267 | ND(5, "pass 2 dst %d is %x %s", |
| 1268 | i, d_i, is_vp ? "virtual" : "nic/host"); |
| 1269 | dst_nr = d_i & (NM_BDG_MAXRINGS-1); |
| 1270 | nrings = dst_na->up.num_rx_rings; |
| 1271 | if (dst_nr >= nrings) |
| 1272 | dst_nr = dst_nr % nrings; |
| 1273 | kring = &dst_na->up.rx_rings[dst_nr]; |
| 1274 | ring = kring->ring; |
| 1275 | lim = kring->nkr_num_slots - 1; |
| 1276 | |
| 1277 | retry: |
| 1278 | |
| 1279 | /* reserve the buffers in the queue and an entry |
| 1280 | * to report completion, and drop lock. |
| 1281 | * XXX this might become a helper function. |
| 1282 | */ |
| 1283 | lockmgr(&kring->q_lock, LK_EXCLUSIVE); |
| 1284 | if (kring->nkr_stopped) { |
| 1285 | lockmgr(&kring->q_lock, LK_RELEASE); |
| 1286 | goto cleanup; |
| 1287 | } |
| 1288 | if (dst_na->retry) { |
| 1289 | dst_na->up.nm_notify(&dst_na->up, dst_nr, NR_RX, 0); |
| 1290 | } |
| 1291 | my_start = j = kring->nkr_hwlease; |
| 1292 | howmany = nm_kr_space(kring, 1); |
| 1293 | if (needed < howmany) |
| 1294 | howmany = needed; |
| 1295 | lease_idx = nm_kr_lease(kring, howmany, 1); |
| 1296 | lockmgr(&kring->q_lock, LK_RELEASE); |
| 1297 | |
| 1298 | /* only retry if we need more than available slots */ |
| 1299 | if (retry && needed <= howmany) |
| 1300 | retry = 0; |
| 1301 | |
| 1302 | /* copy to the destination queue */ |
| 1303 | while (howmany > 0) { |
| 1304 | struct netmap_slot *slot; |
| 1305 | struct nm_bdg_fwd *ft_p, *ft_end; |
| 1306 | u_int cnt; |
| 1307 | |
| 1308 | /* find the queue from which we pick next packet. |
| 1309 | * NM_FT_NULL is always higher than valid indexes |
| 1310 | * so we never dereference it if the other list |
| 1311 | * has packets (and if both are empty we never |
| 1312 | * get here). |
| 1313 | */ |
| 1314 | if (next < brd_next) { |
| 1315 | ft_p = ft + next; |
| 1316 | next = ft_p->ft_next; |
| 1317 | } else { /* insert broadcast */ |
| 1318 | ft_p = ft + brd_next; |
| 1319 | brd_next = ft_p->ft_next; |
| 1320 | } |
| 1321 | cnt = ft_p->ft_frags; // cnt > 0 |
| 1322 | if (unlikely(cnt > howmany)) |
| 1323 | break; /* no more space */ |
| 1324 | howmany -= cnt; |
| 1325 | if (netmap_verbose && cnt > 1) |
| 1326 | RD(5, "rx %d frags to %d", cnt, j); |
| 1327 | ft_end = ft_p + cnt; |
| 1328 | do { |
				void *dst, *src = ft_p->ft_buf;
				/* lengths are rounded up to a multiple of 64 */
				size_t len = (ft_p->ft_len + 63) & ~63;

				slot = &ring->slot[j];
				dst = BDG_NMB(&dst_na->up, slot);
| 1335 | |
| 1336 | ND("send %d %d bytes at %s:%d", |
| 1337 | i, ft_p->ft_len, NM_IFPNAME(dst_ifp), j); |
| 1338 | if (ft_p->ft_flags & NS_INDIRECT) { |
| 1339 | if (copyin(src, dst, len)) { |
| 1340 | // invalid user pointer, pretend len is 0 |
| 1341 | ft_p->ft_len = 0; |
| 1342 | } |
| 1343 | } else { |
| 1344 | //memcpy(dst, src, len); |
| 1345 | pkt_copy(src, dst, (int)len); |
| 1346 | } |
| 1347 | slot->len = ft_p->ft_len; |
				slot->flags = (cnt << 8) | NS_MOREFRAG;
| 1349 | j = nm_next(j, lim); |
| 1350 | ft_p++; |
| 1351 | sent++; |
| 1352 | } while (ft_p != ft_end); |
| 1353 | slot->flags = (cnt << 8); /* clear flag on last entry */ |
| 1354 | /* are we done ? */ |
| 1355 | if (next == NM_FT_NULL && brd_next == NM_FT_NULL) |
| 1356 | break; |
| 1357 | } |
| 1358 | { |
| 1359 | /* current position */ |
| 1360 | uint32_t *p = kring->nkr_leases; /* shorthand */ |
| 1361 | uint32_t update_pos; |
| 1362 | int still_locked = 1; |
| 1363 | |
| 1364 | lockmgr(&kring->q_lock, LK_EXCLUSIVE); |
| 1365 | if (unlikely(howmany > 0)) { |
			/* we did not use all the bufs. If I am the last one
			 * I can recover the slots, otherwise I must
			 * fill them with 0 to mark empty packets.
| 1369 | */ |
| 1370 | ND("leftover %d bufs", howmany); |
| 1371 | if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) { |
| 1372 | /* yes i am the last one */ |
| 1373 | ND("roll back nkr_hwlease to %d", j); |
| 1374 | kring->nkr_hwlease = j; |
| 1375 | } else { |
| 1376 | while (howmany-- > 0) { |
| 1377 | ring->slot[j].len = 0; |
| 1378 | ring->slot[j].flags = 0; |
| 1379 | j = nm_next(j, lim); |
| 1380 | } |
| 1381 | } |
| 1382 | } |
| 1383 | p[lease_idx] = j; /* report I am done */ |
| 1384 | |
| 1385 | update_pos = nm_kr_rxpos(kring); |
| 1386 | |
| 1387 | if (my_start == update_pos) { |
| 1388 | /* all slots before my_start have been reported, |
| 1389 | * so scan subsequent leases to see if other ranges |
			 * have been completed, and do a selwakeup or txsync.
| 1391 | */ |
| 1392 | while (lease_idx != kring->nkr_lease_idx && |
| 1393 | p[lease_idx] != NR_NOSLOT) { |
| 1394 | j = p[lease_idx]; |
| 1395 | p[lease_idx] = NR_NOSLOT; |
| 1396 | lease_idx = nm_next(lease_idx, lim); |
| 1397 | } |
| 1398 | /* j is the new 'write' position. j != my_start |
| 1399 | * means there are new buffers to report |
| 1400 | */ |
| 1401 | if (likely(j != my_start)) { |
| 1402 | uint32_t old_avail = kring->nr_hwavail; |
| 1403 | |
| 1404 | kring->nr_hwavail = (j >= kring->nr_hwcur) ? |
| 1405 | j - kring->nr_hwcur : |
| 1406 | j + lim + 1 - kring->nr_hwcur; |
| 1407 | if (kring->nr_hwavail < old_avail) { |
| 1408 | D("avail shrink %d -> %d", |
| 1409 | old_avail, kring->nr_hwavail); |
| 1410 | } |
| 1411 | dst_na->up.nm_notify(&dst_na->up, dst_nr, NR_RX, 0); |
| 1412 | still_locked = 0; |
| 1413 | lockmgr(&kring->q_lock, LK_RELEASE); |
| 1414 | if (dst_na->retry && retry--) |
| 1415 | goto retry; |
| 1416 | } |
| 1417 | } |
| 1418 | if (still_locked) |
| 1419 | lockmgr(&kring->q_lock, LK_RELEASE); |
| 1420 | } |
| 1421 | cleanup: |
| 1422 | d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */ |
| 1423 | d->bq_len = 0; |
| 1424 | } |
| 1425 | brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */ |
| 1426 | brddst->bq_len = 0; |
| 1427 | return 0; |
| 1428 | } |
| 1429 | |
| 1430 | static int |
| 1431 | netmap_vp_txsync(struct netmap_vp_adapter *na, u_int ring_nr, int flags) |
| 1432 | { |
| 1433 | struct netmap_kring *kring = &na->up.tx_rings[ring_nr]; |
| 1434 | struct netmap_ring *ring = kring->ring; |
| 1435 | u_int j, k, lim = kring->nkr_num_slots - 1; |
| 1436 | |
| 1437 | k = ring->cur; |
| 1438 | if (k > lim) |
| 1439 | return netmap_ring_reinit(kring); |
| 1440 | |
| 1441 | if (bridge_batch <= 0) { /* testing only */ |
| 1442 | j = k; // used all |
| 1443 | goto done; |
| 1444 | } |
| 1445 | if (bridge_batch > NM_BDG_BATCH) |
| 1446 | bridge_batch = NM_BDG_BATCH; |
| 1447 | |
| 1448 | j = nm_bdg_preflush(na, ring_nr, kring, k); |
| 1449 | if (j != k) |
| 1450 | D("early break at %d/ %d, avail %d", j, k, kring->nr_hwavail); |
| 1451 | /* k-j modulo ring size is the number of slots processed */ |
| 1452 | if (k < j) |
| 1453 | k += kring->nkr_num_slots; |
| 1454 | kring->nr_hwavail = lim - (k - j); |
| 1455 | |
| 1456 | done: |
| 1457 | kring->nr_hwcur = j; |
| 1458 | ring->avail = kring->nr_hwavail; |
| 1459 | if (netmap_verbose) |
| 1460 | D("%s ring %d flags %d", NM_IFPNAME(na->up.ifp), ring_nr, flags); |
| 1461 | return 0; |
| 1462 | } |
| 1463 | |
| 1464 | |
| 1465 | /* |
| 1466 | * main dispatch routine for the bridge. |
| 1467 | * We already know that only one thread is running this. |
 * We must run nm_bdg_preflush() without holding the lock.
| 1469 | */ |
| 1470 | static int |
| 1471 | bdg_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) |
| 1472 | { |
| 1473 | struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na; |
| 1474 | return netmap_vp_txsync(vpna, ring_nr, flags); |
| 1475 | } |
| 1476 | |
| 1477 | |
| 1478 | /* |
| 1479 | * user process reading from a VALE switch. |
| 1480 | * Already protected against concurrent calls from userspace, |
| 1481 | * but we must acquire the queue's lock to protect against |
| 1482 | * writers on the same queue. |
| 1483 | */ |
| 1484 | static int |
| 1485 | bdg_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) |
| 1486 | { |
| 1487 | struct netmap_kring *kring = &na->rx_rings[ring_nr]; |
| 1488 | struct netmap_ring *ring = kring->ring; |
| 1489 | u_int j, lim = kring->nkr_num_slots - 1; |
| 1490 | u_int k = ring->cur, resvd = ring->reserved; |
| 1491 | int n; |
| 1492 | |
| 1493 | lockmgr(&kring->q_lock, LK_EXCLUSIVE); |
| 1494 | if (k > lim) { |
| 1495 | D("ouch dangerous reset!!!"); |
| 1496 | n = netmap_ring_reinit(kring); |
| 1497 | goto done; |
| 1498 | } |
| 1499 | |
| 1500 | /* skip past packets that userspace has released */ |
| 1501 | j = kring->nr_hwcur; /* netmap ring index */ |
| 1502 | if (resvd > 0) { |
| 1503 | if (resvd + ring->avail >= lim + 1) { |
| 1504 | D("XXX invalid reserve/avail %d %d", resvd, ring->avail); |
| 1505 | ring->reserved = resvd = 0; // XXX panic... |
| 1506 | } |
| 1507 | k = (k >= resvd) ? k - resvd : k + lim + 1 - resvd; |
| 1508 | } |
| 1509 | |
| 1510 | if (j != k) { /* userspace has released some packets. */ |
| 1511 | n = k - j; |
| 1512 | if (n < 0) |
| 1513 | n += kring->nkr_num_slots; |
| 1514 | ND("userspace releases %d packets", n); |
| 1515 | for (n = 0; likely(j != k); n++) { |
| 1516 | struct netmap_slot *slot = &ring->slot[j]; |
| 1517 | void *addr = BDG_NMB(na, slot); |
| 1518 | |
| 1519 | if (addr == netmap_buffer_base) { /* bad buf */ |
| 1520 | D("bad buffer index %d, ignore ?", |
| 1521 | slot->buf_idx); |
| 1522 | } |
| 1523 | slot->flags &= ~NS_BUF_CHANGED; |
| 1524 | j = nm_next(j, lim); |
| 1525 | } |
| 1526 | kring->nr_hwavail -= n; |
| 1527 | kring->nr_hwcur = k; |
| 1528 | } |
| 1529 | /* tell userspace that there are new packets */ |
| 1530 | ring->avail = kring->nr_hwavail - resvd; |
| 1531 | n = 0; |
| 1532 | done: |
| 1533 | lockmgr(&kring->q_lock, LK_RELEASE); |
| 1534 | return n; |
| 1535 | } |
| 1536 | |
| 1537 | static int |
| 1538 | bdg_netmap_attach(struct netmap_adapter *arg) |
| 1539 | { |
| 1540 | struct netmap_vp_adapter *vpna; |
| 1541 | struct netmap_adapter *na; |
| 1542 | int error; |
| 1543 | |
| 1544 | vpna = kmalloc(sizeof(*vpna), M_DEVBUF, M_NOWAIT | M_ZERO); |
| 1545 | if (vpna == NULL) |
| 1546 | return ENOMEM; |
| 1547 | na = &vpna->up; |
| 1548 | *na = *arg; |
| 1549 | na->na_flags |= NAF_BDG_MAYSLEEP | NAF_MEM_OWNER; |
| 1550 | na->nm_txsync = bdg_netmap_txsync; |
| 1551 | na->nm_rxsync = bdg_netmap_rxsync; |
| 1552 | na->nm_register = bdg_netmap_reg; |
| 1553 | na->nm_dtor = netmap_adapter_vp_dtor; |
| 1554 | na->nm_krings_create = netmap_vp_krings_create; |
| 1555 | na->nm_krings_delete = netmap_vp_krings_delete; |
| 1556 | na->nm_mem = netmap_mem_private_new(NM_IFPNAME(arg->ifp), |
| 1557 | na->num_tx_rings, na->num_tx_desc, |
| 1558 | na->num_rx_rings, na->num_rx_desc); |
| 1559 | /* other nmd fields are set in the common routine */ |
| 1560 | error = netmap_attach_common(na); |
| 1561 | if (error) { |
| 1562 | kfree(vpna, M_DEVBUF); |
| 1563 | return error; |
| 1564 | } |
| 1565 | return 0; |
| 1566 | } |
| 1567 | |
| 1568 | static void |
| 1569 | netmap_bwrap_dtor(struct netmap_adapter *na) |
| 1570 | { |
| 1571 | struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na; |
| 1572 | struct netmap_adapter *hwna = bna->hwna; |
| 1573 | struct nm_bridge *b = bna->up.na_bdg, |
| 1574 | *bh = bna->host.na_bdg; |
| 1575 | struct ifnet *ifp = na->ifp; |
| 1576 | |
| 1577 | ND("na %p", na); |
| 1578 | |
| 1579 | if (b) { |
| 1580 | netmap_bdg_detach_common(b, bna->up.bdg_port, |
| 1581 | (bh ? bna->host.bdg_port : -1)); |
| 1582 | } |
| 1583 | |
| 1584 | hwna->na_private = NULL; |
| 1585 | netmap_adapter_put(hwna); |
| 1586 | |
| 1587 | bzero(ifp, sizeof(*ifp)); |
| 1588 | kfree(ifp, M_DEVBUF); |
| 1589 | na->ifp = NULL; |
| 1591 | } |
| 1592 | |
| 1593 | /* |
| 1594 | * Pass packets from the NIC to the bridge. This callback replaces |
| 1595 | * the hwna notify callback: packets coming from the outside or from |
| 1596 | * the host stack are placed on an hwna rx ring, and the bridge |
| 1597 | * wrapper then forwards them through the bridge. |
| 1598 | * |
| 1599 | * XXX TODO check locking: this is called from the interrupt |
| 1600 | * handler, so we should make sure that the interface is not |
| 1601 | * disconnected while the interrupt is being serviced. |
| 1602 | * |
| 1603 | * Note: no user process can access this NIC directly, so we can |
| 1604 | * ignore the info in the 'ring'. |
| 1605 | */ |
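| | /* |
| | * Sketch of the normal (non host-ring) path below: |
| | * 1. nm_rxsync() on the hwna ring collects the newly received buffers; |
| | * 2. ring->cur is advanced and netmap_vp_txsync() pushes those buffers |
| | * through the bridge to the destination ports; |
| | * 3. a second nm_rxsync() returns the consumed buffers to the NIC. |
| | */ |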
| 1606 | static int |
| 1607 | netmap_bwrap_intr_notify(struct netmap_adapter *na, u_int ring_nr, enum txrx tx, int flags) |
| 1608 | { |
| 1609 | struct ifnet *ifp = na->ifp; |
| 1610 | struct netmap_bwrap_adapter *bna = na->na_private; |
| 1611 | struct netmap_vp_adapter *hostna = &bna->host; |
| 1612 | struct netmap_kring *kring, *bkring; |
| 1613 | struct netmap_ring *ring; |
| 1614 | int is_host_ring = ring_nr == na->num_rx_rings; |
| 1615 | struct netmap_vp_adapter *vpna = &bna->up; |
| 1616 | int error = 0; |
| 1617 | |
| 1618 | ND("%s[%d] %s %x", NM_IFPNAME(ifp), ring_nr, (tx == NR_TX ? "TX" : "RX"), flags); |
| 1619 | |
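| | /* a stop/start request on the hw ring is propagated to the |
| | * cross-linked (tx<->rx swapped) ring on the bridge port. |
| | */ |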
| 1620 | if (flags & NAF_DISABLE_NOTIFY) { |
| 1621 | kring = tx == NR_TX ? na->tx_rings : na->rx_rings; |
| 1622 | bkring = tx == NR_TX ? vpna->up.rx_rings : vpna->up.tx_rings; |
| 1623 | if (kring->nkr_stopped) |
| 1624 | netmap_disable_ring(bkring); |
| 1625 | else |
| 1626 | bkring->nkr_stopped = 0; |
| 1627 | return 0; |
| 1628 | } |
| 1629 | |
| 1630 | if (ifp == NULL || !(ifp->if_capenable & IFCAP_NETMAP)) |
| 1631 | return 0; |
| 1632 | |
| 1633 | if (tx == NR_TX) |
| 1634 | return 0; |
| 1635 | |
| 1636 | kring = &na->rx_rings[ring_nr]; |
| 1637 | ring = kring->ring; |
| 1638 | |
| 1639 | /* make sure the ring is not disabled */ |
| 1640 | if (nm_kr_tryget(kring)) |
| 1641 | return 0; |
| 1642 | |
| 1643 | if (is_host_ring && hostna->na_bdg == NULL) { |
| 1644 | error = bna->save_notify(na, ring_nr, tx, flags); |
| 1645 | goto put_out; |
| 1646 | } |
| 1647 | |
| 1648 | if (is_host_ring) { |
| 1649 | vpna = hostna; |
| 1650 | ring_nr = 0; |
| 1651 | } else { |
| 1652 | /* fetch packets that have arrived. |
| 1653 | * XXX maybe do this in a loop ? |
| 1654 | */ |
| 1655 | error = na->nm_rxsync(na, ring_nr, 0); |
| 1656 | if (error) |
| 1657 | goto put_out; |
| 1658 | } |
| 1659 | if (kring->nr_hwavail == 0 && netmap_verbose) { |
| 1660 | D("how strange, interrupt with no packets on %s", |
| 1661 | NM_IFPNAME(ifp)); |
| 1662 | goto put_out; |
| 1663 | } |
| 1664 | /* XXX avail ? */ |
| 1665 | ring->cur = nm_kr_rxpos(kring); |
| 1666 | netmap_vp_txsync(vpna, ring_nr, flags); |
| 1667 | |
| 1668 | if (!is_host_ring) |
| 1669 | error = na->nm_rxsync(na, ring_nr, 0); |
| 1670 | |
| 1671 | put_out: |
| 1672 | nm_kr_put(kring); |
| 1673 | return error; |
| 1674 | } |
| 1675 | |
| 1676 | static int |
| 1677 | netmap_bwrap_register(struct netmap_adapter *na, int onoff) |
| 1678 | { |
| 1679 | struct netmap_bwrap_adapter *bna = |
| 1680 | (struct netmap_bwrap_adapter *)na; |
| 1681 | struct netmap_adapter *hwna = bna->hwna; |
| 1682 | struct netmap_vp_adapter *hostna = &bna->host; |
| 1683 | int error; |
| 1684 | |
| 1685 | ND("%s %d", NM_IFPNAME(na->ifp), onoff); |
| 1686 | |
| 1687 | if (onoff) { |
| 1688 | int i; |
| 1689 | |
| 1690 | hwna->na_lut = na->na_lut; |
| 1691 | hwna->na_lut_objtotal = na->na_lut_objtotal; |
| 1692 | |
| 1693 | if (hostna->na_bdg) { |
| 1694 | hostna->up.na_lut = na->na_lut; |
| 1695 | hostna->up.na_lut_objtotal = na->na_lut_objtotal; |
| 1696 | } |
| 1697 | |
| 1698 | /* cross-link the netmap rings */ |
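| | /* each hwna tx ring shares its netmap_ring (and slot count) with |
| | * the matching bwrap rx ring, and vice versa, so the wrapper and |
| | * the hardware adapter operate on the same slots. |
| | */ |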
| 1699 | for (i = 0; i <= na->num_tx_rings; i++) { |
| 1700 | hwna->tx_rings[i].nkr_num_slots = na->rx_rings[i].nkr_num_slots; |
| 1701 | hwna->tx_rings[i].ring = na->rx_rings[i].ring; |
| 1702 | } |
| 1703 | for (i = 0; i <= na->num_rx_rings; i++) { |
| 1704 | hwna->rx_rings[i].nkr_num_slots = na->tx_rings[i].nkr_num_slots; |
| 1705 | hwna->rx_rings[i].ring = na->tx_rings[i].ring; |
| 1706 | } |
| 1707 | } |
| 1708 | |
| 1709 | if (hwna->ifp) { |
| 1710 | error = hwna->nm_register(hwna, onoff); |
| 1711 | if (error) |
| 1712 | return error; |
| 1713 | } |
| 1714 | |
| 1715 | bdg_netmap_reg(na, onoff); |
| 1716 | |
| 1717 | if (onoff) { |
| 1718 | bna->save_notify = hwna->nm_notify; |
| 1719 | hwna->nm_notify = netmap_bwrap_intr_notify; |
| 1720 | } else { |
| 1721 | hwna->nm_notify = bna->save_notify; |
| 1722 | hwna->na_lut = NULL; |
| 1723 | hwna->na_lut_objtotal = 0; |
| 1724 | } |
| 1725 | |
| 1726 | return 0; |
| 1727 | } |
| 1728 | |
| 1729 | static int |
| 1730 | netmap_bwrap_config(struct netmap_adapter *na, u_int *txr, u_int *txd, |
| 1731 | u_int *rxr, u_int *rxd) |
| 1732 | { |
| 1733 | struct netmap_bwrap_adapter *bna = |
| 1734 | (struct netmap_bwrap_adapter *)na; |
| 1735 | struct netmap_adapter *hwna = bna->hwna; |
| 1736 | |
| 1737 | /* forward the request */ |
| 1738 | netmap_update_config(hwna); |
| 1739 | /* swap the results */ |
| 1740 | *txr = hwna->num_rx_rings; |
| 1741 | *txd = hwna->num_rx_desc; |
| 1742 | *rxr = hwna->num_tx_rings; |
| 1743 | *rxd = hwna->num_tx_desc; |
| 1744 | |
| 1745 | return 0; |
| 1746 | } |
| 1747 | |
| 1748 | static int |
| 1749 | netmap_bwrap_krings_create(struct netmap_adapter *na) |
| 1750 | { |
| 1751 | struct netmap_bwrap_adapter *bna = |
| 1752 | (struct netmap_bwrap_adapter *)na; |
| 1753 | struct netmap_adapter *hwna = bna->hwna; |
| 1754 | struct netmap_adapter *hostna = &bna->host.up; |
| 1755 | int error; |
| 1756 | |
| 1757 | ND("%s", NM_IFPNAME(na->ifp)); |
| 1758 | |
| 1759 | error = netmap_vp_krings_create(na); |
| 1760 | if (error) |
| 1761 | return error; |
| 1762 | |
| 1763 | error = hwna->nm_krings_create(hwna); |
| 1764 | if (error) { |
| 1765 | netmap_vp_krings_delete(na); |
| 1766 | return error; |
| 1767 | } |
| 1768 | |
| 1769 | hostna->tx_rings = na->tx_rings + na->num_tx_rings; |
| 1770 | hostna->rx_rings = na->rx_rings + na->num_rx_rings; |
| 1771 | |
| 1772 | return 0; |
| 1773 | } |
| 1774 | |
| 1775 | static void |
| 1776 | netmap_bwrap_krings_delete(struct netmap_adapter *na) |
| 1777 | { |
| 1778 | struct netmap_bwrap_adapter *bna = |
| 1779 | (struct netmap_bwrap_adapter *)na; |
| 1780 | struct netmap_adapter *hwna = bna->hwna; |
| 1781 | |
| 1782 | ND("%s", NM_IFPNAME(na->ifp)); |
| 1783 | |
| 1784 | hwna->nm_krings_delete(hwna); |
| 1785 | netmap_vp_krings_delete(na); |
| 1786 | } |
| 1787 | |
| 1788 | /* notify method for the bridge-->hwna direction */ |
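| | /* |
| | * Called after the bridge has placed packets on one of our rx rings |
| | * (cross-linked with an hwna tx ring): expose the new slots by |
| | * advancing ring->cur, then invoke the hardware txsync (or the |
| | * host-stack txsync for the host ring) to transmit them. |
| | */ |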
| 1789 | static int |
| 1790 | netmap_bwrap_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int flags) |
| 1791 | { |
| 1792 | struct netmap_bwrap_adapter *bna = |
| 1793 | (struct netmap_bwrap_adapter *)na; |
| 1794 | struct netmap_adapter *hwna = bna->hwna; |
| 1795 | struct netmap_kring *kring, *hw_kring; |
| 1796 | struct netmap_ring *ring; |
| 1797 | u_int lim, k; |
| 1798 | int error = 0; |
| 1799 | |
| 1800 | if (tx == NR_TX) |
| 1801 | return ENXIO; |
| 1802 | |
| 1803 | kring = &na->rx_rings[ring_n]; |
| 1804 | hw_kring = &hwna->tx_rings[ring_n]; |
| 1805 | ring = kring->ring; |
| 1806 | |
| 1807 | lim = kring->nkr_num_slots - 1; |
| 1808 | k = nm_kr_rxpos(kring); |
| 1809 | |
| 1810 | if (hwna->ifp == NULL || !(hwna->ifp->if_capenable & IFCAP_NETMAP)) |
| 1811 | return 0; |
| 1812 | ring->cur = k; |
| 1813 | ND("%s[%d] PRE rx(%d, %d, %d, %d) ring(%d, %d, %d) tx(%d, %d)", |
| 1814 | NM_IFPNAME(na->ifp), ring_n, |
| 1815 | kring->nr_hwcur, kring->nr_hwavail, kring->nkr_hwlease, kring->nr_hwreserved, |
| 1816 | ring->cur, ring->avail, ring->reserved, |
| 1817 | hw_kring->nr_hwcur, hw_kring->nr_hwavail); |
| 1818 | if (ring_n == na->num_rx_rings) { |
| 1819 | netmap_txsync_to_host(hwna); |
| 1820 | } else { |
| 1821 | error = hwna->nm_txsync(hwna, ring_n, flags); |
| 1822 | } |
| 1823 | kring->nr_hwcur = ring->cur; |
| 1824 | kring->nr_hwavail = 0; |
| 1825 | kring->nr_hwreserved = lim - ring->avail; |
| 1826 | ND("%s[%d] PST rx(%d, %d, %d, %d) ring(%d, %d, %d) tx(%d, %d)", |
| 1827 | NM_IFPNAME(na->ifp), ring_n, |
| 1828 | kring->nr_hwcur, kring->nr_hwavail, kring->nkr_hwlease, kring->nr_hwreserved, |
| 1829 | ring->cur, ring->avail, ring->reserved, |
| 1830 | hw_kring->nr_hwcur, hw_kring->nr_hwavail); |
| 1831 | |
| 1832 | return error; |
| 1833 | } |
| 1834 | |
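| | /* notify method for the host-stack ring of the wrapped adapter: |
| | * redirect to netmap_bwrap_notify() on the last (host) rx ring. |
| | */ |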
| 1835 | static int |
| 1836 | netmap_bwrap_host_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int flags) |
| 1837 | { |
| 1838 | struct netmap_bwrap_adapter *bna = na->na_private; |
| 1839 | struct netmap_adapter *port_na = &bna->up.up; |
| 1840 | if (tx == NR_TX || ring_n != 0) |
| 1841 | return ENXIO; |
| 1842 | return netmap_bwrap_notify(port_na, port_na->num_rx_rings, NR_RX, flags); |
| 1843 | } |
| 1844 | |
| 1845 | /* attach a bridge wrapper to the 'real' device */ |
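| | /* |
| | * The wrapper uses 'fake' as its own ifp and takes a reference on the |
| | * adapter of 'real' (hwna). A second, single-ring adapter (bna->host) |
| | * represents the host stack side of the NIC; its krings are carved out |
| | * of the wrapper's kring array in netmap_bwrap_krings_create(). |
| | */ |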
| 1846 | static int |
| 1847 | netmap_bwrap_attach(struct ifnet *fake, struct ifnet *real) |
| 1848 | { |
| 1849 | struct netmap_bwrap_adapter *bna; |
| 1850 | struct netmap_adapter *na; |
| 1851 | struct netmap_adapter *hwna = NA(real); |
| 1852 | struct netmap_adapter *hostna; |
| 1853 | int error; |
| 1854 | |
| 1856 | bna = kmalloc(sizeof(*bna), M_DEVBUF, M_NOWAIT | M_ZERO); |
| 1857 | if (bna == NULL) |
| 1858 | return ENOMEM; |
| 1859 | |
| 1860 | na = &bna->up.up; |
| 1861 | na->ifp = fake; |
| 1862 | /* fill the ring data for the bwrap adapter with rx/tx meanings |
| 1863 | * swapped. The real cross-linking will be done during register, |
| 1864 | * when all the krings will have been created. |
| 1865 | */ |
| 1866 | na->num_rx_rings = hwna->num_tx_rings; |
| 1867 | na->num_tx_rings = hwna->num_rx_rings; |
| 1868 | na->num_tx_desc = hwna->num_rx_desc; |
| 1869 | na->num_rx_desc = hwna->num_tx_desc; |
| 1870 | na->nm_dtor = netmap_bwrap_dtor; |
| 1871 | na->nm_register = netmap_bwrap_register; |
| 1872 | // na->nm_txsync = netmap_bwrap_txsync; |
| 1873 | // na->nm_rxsync = netmap_bwrap_rxsync; |
| 1874 | na->nm_config = netmap_bwrap_config; |
| 1875 | na->nm_krings_create = netmap_bwrap_krings_create; |
| 1876 | na->nm_krings_delete = netmap_bwrap_krings_delete; |
| 1877 | na->nm_notify = netmap_bwrap_notify; |
| 1878 | na->nm_mem = hwna->nm_mem; |
| 1879 | na->na_private = na; /* prevent NIOCREGIF */ |
| 1880 | bna->up.retry = 1; /* XXX maybe this should depend on the hwna */ |
| 1881 | |
| 1882 | bna->hwna = hwna; |
| 1883 | netmap_adapter_get(hwna); |
| 1884 | hwna->na_private = bna; /* weak reference */ |
| 1885 | |
| 1886 | hostna = &bna->host.up; |
| 1887 | hostna->ifp = hwna->ifp; |
| 1888 | hostna->num_tx_rings = 1; |
| 1889 | hostna->num_tx_desc = hwna->num_rx_desc; |
| 1890 | hostna->num_rx_rings = 1; |
| 1891 | hostna->num_rx_desc = hwna->num_tx_desc; |
| 1892 | // hostna->nm_txsync = netmap_bwrap_host_txsync; |
| 1893 | // hostna->nm_rxsync = netmap_bwrap_host_rxsync; |
| 1894 | hostna->nm_notify = netmap_bwrap_host_notify; |
| 1895 | hostna->nm_mem = na->nm_mem; |
| 1896 | hostna->na_private = bna; |
| 1897 | |
| 1898 | D("%s<->%s txr %d txd %d rxr %d rxd %d", fake->if_xname, real->if_xname, |
| 1899 | na->num_tx_rings, na->num_tx_desc, |
| 1900 | na->num_rx_rings, na->num_rx_desc); |
| 1901 | |
| 1902 | error = netmap_attach_common(na); |
| 1903 | if (error) { |
| 1904 | netmap_adapter_put(hwna); |
| 1905 | kfree(bna, M_DEVBUF); |
| 1906 | return error; |
| 1907 | } |
| 1908 | return 0; |
| 1909 | } |
| 1910 | |
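| | /* zero the nm_bridges array and initialize the per-bridge lock |
| | * of each entry. |
| | */ |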
| 1911 | void |
| 1912 | netmap_init_bridges(void) |
| 1913 | { |
| 1914 | int i; |
| 1915 | bzero(nm_bridges, sizeof(struct nm_bridge) * NM_BRIDGES); /* safety */ |
| 1916 | for (i = 0; i < NM_BRIDGES; i++) |
| 1917 | BDG_RWINIT(&nm_bridges[i]); |
| 1918 | } |
| 1919 | #endif /* WITH_VALE */ |