From: Markus Pfeiffer
Date: Fri, 22 Nov 2013 22:19:07 +0000 (+0000)
Subject: net: import FreeBSD's if_lagg
X-Git-Tag: v4.1.0~103
X-Git-Url: https://gitweb.dragonflybsd.org/dragonfly.git/commitdiff_plain/50b1e2350d52fc5300f07dbf383941d8c619b348

net: import FreeBSD's if_lagg
---

diff --git a/etc/mtree/BSD.include.dist b/etc/mtree/BSD.include.dist
index f713412a33..8fc352add6 100644
--- a/etc/mtree/BSD.include.dist
+++ b/etc/mtree/BSD.include.dist
@@ -210,6 +210,8 @@
         ..
         tun
         ..
+        lagg
+        ..
     ..
     netbt
     ..
diff --git a/include/Makefile b/include/Makefile
index 18243269b8..1414821b21 100644
--- a/include/Makefile
+++ b/include/Makefile
@@ -70,6 +70,7 @@ LSUBDIRS= \
 	net/bridge net/tap net/tun net/ppp net/ppp_layer net/sl \
 	net/pf net/altq \
 	net/vlan net/ipfw net/ip6fw net/dummynet net/sppp net/ip_mroute \
+	net/lagg \
 	netgraph/UI netgraph/async netgraph/bpf netgraph/bridge \
 	netgraph/cisco netgraph/echo netgraph/eiface netgraph/etf \
 	netgraph/ether netgraph/fec netgraph/frame_relay netgraph/hole \
diff --git a/sbin/ifconfig/Makefile b/sbin/ifconfig/Makefile
index 9e65730a7e..879f8ce87f 100644
--- a/sbin/ifconfig/Makefile
+++ b/sbin/ifconfig/Makefile
@@ -17,6 +17,7 @@ SRCS+=	af_inet.c		# IPv4 support
 SRCS+=	af_inet6.c		# IPv6 support
 SRCS+=	ifclone.c		# clone device support
+SRCS+=	iflagg.c		# lagg(4) support
 #SRCS+=	ifmac.c			# MAC support
 SRCS+=	ifmedia.c		# SIOC[GS]IFMEDIA support
 SRCS+=	ifvlan.c		# SIOC[GS]ETVLAN support
diff --git a/sbin/ifconfig/iflagg.c b/sbin/ifconfig/iflagg.c
new file mode 100644
index 0000000000..b1b0793c7b
--- /dev/null
+++ b/sbin/ifconfig/iflagg.c
@@ -0,0 +1,246 @@
+/*-
+ */
+
+#ifndef lint
+static const char rcsid[] =
+	"$FreeBSD$";
+#endif /* not lint */
+
+#include <sys/param.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/sockio.h>
+
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <net/ethernet.h>
+#include <net/if.h>
+#include <net/lagg/if_lagg.h>
+#include <net/route.h>
+
+#include <ctype.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <err.h>
+#include <errno.h>
+
+#include "ifconfig.h"
+
+char lacpbuf[120];	/* LACP peer '[(a,a,a),(p,p,p)]' */
+
+static void
+setlaggport(const char *val, int d, int s, const struct afswtch *afp)
+{
+	struct lagg_reqport rp;
+
+	bzero(&rp, sizeof(rp));
+	strlcpy(rp.rp_ifname, name, sizeof(rp.rp_ifname));
+	strlcpy(rp.rp_portname, val, sizeof(rp.rp_portname));
+
+	/* Don't choke if the port is already in this lagg. */
+	if (ioctl(s, SIOCSLAGGPORT, &rp) && errno != EEXIST)
+		err(1, "SIOCSLAGGPORT");
+}
+
+static void
+unsetlaggport(const char *val, int d, int s, const struct afswtch *afp)
+{
+	struct lagg_reqport rp;
+
+	bzero(&rp, sizeof(rp));
+	strlcpy(rp.rp_ifname, name, sizeof(rp.rp_ifname));
+	strlcpy(rp.rp_portname, val, sizeof(rp.rp_portname));
+
+	if (ioctl(s, SIOCSLAGGDELPORT, &rp))
+		err(1, "SIOCSLAGGDELPORT");
+}
+
+static void
+setlaggproto(const char *val, int d, int s, const struct afswtch *afp)
+{
+	struct lagg_protos lpr[] = LAGG_PROTOS;
+	struct lagg_reqall ra;
+	int i;
+
+	bzero(&ra, sizeof(ra));
+	ra.ra_proto = LAGG_PROTO_MAX;
+
+	for (i = 0; i < (sizeof(lpr) / sizeof(lpr[0])); i++) {
+		if (strcmp(val, lpr[i].lpr_name) == 0) {
+			ra.ra_proto = lpr[i].lpr_proto;
+			break;
+		}
+	}
+	if (ra.ra_proto == LAGG_PROTO_MAX)
+		errx(1, "Invalid aggregation protocol: %s", val);
+
+	strlcpy(ra.ra_ifname, name, sizeof(ra.ra_ifname));
+	if (ioctl(s, SIOCSLAGG, &ra) != 0)
+		err(1, "SIOCSLAGG");
+}
+
+static void
+setlagghash(const char *val, int d, int s, const struct afswtch *afp)
+{
+	struct lagg_reqflags rf;
+	char *str, *tmp, *tok;
+
+	rf.rf_flags = 0;
+	str = tmp = strdup(val);
+	while ((tok = strsep(&tmp, ",")) != NULL) {
+		if (strcmp(tok, "l2") == 0)
+			rf.rf_flags |= LAGG_F_HASHL2;
+		else if (strcmp(tok, "l3") == 0)
+			rf.rf_flags |= LAGG_F_HASHL3;
+		else if (strcmp(tok, "l4") == 0)
+			rf.rf_flags |= LAGG_F_HASHL4;
+		else
+			errx(1, "Invalid lagghash option: %s", tok);
+	}
+	free(str);
+	if (rf.rf_flags == 0)
+		errx(1, "No lagghash options supplied");
+
+	strlcpy(rf.rf_ifname, name, sizeof(rf.rf_ifname));
+	if (ioctl(s, SIOCSLAGGHASH, &rf))
+		err(1, "SIOCSLAGGHASH");
+}
+
+static char *
+lacp_format_mac(const uint8_t *mac, char *buf, size_t buflen)
+{
+	snprintf(buf, buflen, "%02X-%02X-%02X-%02X-%02X-%02X",
+	    (int)mac[0], (int)mac[1], (int)mac[2], (int)mac[3],
+	    (int)mac[4], (int)mac[5]);
+
+	return (buf);
+}
+
+static char *
+lacp_format_peer(struct lacp_opreq *req, const char *sep)
+{
+	char macbuf1[20];
+	char macbuf2[20];
+
+	snprintf(lacpbuf, sizeof(lacpbuf),
+	    "[(%04X,%s,%04X,%04X,%04X),%s(%04X,%s,%04X,%04X,%04X)]",
+	    req->actor_prio,
+	    lacp_format_mac(req->actor_mac, macbuf1, sizeof(macbuf1)),
+	    req->actor_key, req->actor_portprio, req->actor_portno, sep,
+	    req->partner_prio,
+	    lacp_format_mac(req->partner_mac, macbuf2, sizeof(macbuf2)),
+	    req->partner_key, req->partner_portprio, req->partner_portno);
+
+	return(lacpbuf);
+}
+
+static void
+lagg_status(int s)
+{
+	struct lagg_protos lpr[] = LAGG_PROTOS;
+	struct lagg_reqport rp, rpbuf[LAGG_MAX_PORTS];
+	struct lagg_reqall ra;
+	struct lagg_reqflags rf;
+	struct lacp_opreq *lp;
+	const char *proto = "<unknown>";
+	int i, isport = 0;
+
+	bzero(&rp, sizeof(rp));
+	bzero(&ra, sizeof(ra));
+
+	strlcpy(rp.rp_ifname, name, sizeof(rp.rp_ifname));
+	strlcpy(rp.rp_portname, name, sizeof(rp.rp_portname));
+
+	if (ioctl(s, SIOCGLAGGPORT, &rp) == 0)
+		isport = 1;
+
+	strlcpy(ra.ra_ifname, name, sizeof(ra.ra_ifname));
+	ra.ra_size = sizeof(rpbuf);
+	ra.ra_port = rpbuf;
+
+	strlcpy(rf.rf_ifname, name, sizeof(rf.rf_ifname));
+	if (ioctl(s, SIOCGLAGGFLAGS, &rf) != 0)
+		rf.rf_flags = 0;
+
+	if (ioctl(s, SIOCGLAGG, &ra) == 0) {
+		lp = (struct lacp_opreq *)&ra.ra_lacpreq;
+
+		for (i = 0; i < (sizeof(lpr) / sizeof(lpr[0])); i++) {
+			if (ra.ra_proto == lpr[i].lpr_proto) {
+				proto = lpr[i].lpr_name;
+				break;
+			}
+		}
+
+		printf("\tlaggproto %s", proto);
+		if (rf.rf_flags & LAGG_F_HASHMASK) {
+			const char *sep = "";
+
+			printf(" lagghash ");
+			if (rf.rf_flags & LAGG_F_HASHL2) {
+				printf("%sl2", sep);
+				sep = ",";
+			}
+			if (rf.rf_flags & LAGG_F_HASHL3) {
+				printf("%sl3", sep);
+				sep = ",";
+			}
+			if (rf.rf_flags & LAGG_F_HASHL4) {
+				printf("%sl4", sep);
+				sep = ",";
+			}
+		}
+		if (isport)
+			printf(" laggdev %s", rp.rp_ifname);
+		putchar('\n');
+		if (verbose && ra.ra_proto == LAGG_PROTO_LACP)
+			printf("\tlag id: %s\n",
+			    lacp_format_peer(lp, "\n\t\t "));
+
+		for (i = 0; i < ra.ra_ports; i++) {
+			lp = (struct lacp_opreq *)&rpbuf[i].rp_lacpreq;
+			printf("\tlaggport: %s ", rpbuf[i].rp_portname);
+			printb("flags", rpbuf[i].rp_flags, LAGG_PORT_BITS);
+			if (verbose && ra.ra_proto == LAGG_PROTO_LACP)
+				printf(" state=%X", lp->actor_state);
+			putchar('\n');
+			if (verbose && ra.ra_proto == LAGG_PROTO_LACP)
+				printf("\t\t%s\n",
+				    lacp_format_peer(lp, "\n\t\t "));
+		}
+
+		if (0 /* XXX */) {
+			printf("\tsupported aggregation protocols:\n");
+			for (i = 0; i < (sizeof(lpr) / sizeof(lpr[0])); i++)
+				printf("\t\tlaggproto %s\n", lpr[i].lpr_name);
+		}
+	}
+}
+
+static struct cmd lagg_cmds[] = {
+	DEF_CMD_ARG("laggport",		setlaggport),
+	DEF_CMD_ARG("-laggport",	unsetlaggport),
+	DEF_CMD_ARG("laggproto",	setlaggproto),
+	DEF_CMD_ARG("lagghash",		setlagghash),
+};
+static struct afswtch af_lagg = {
+	.af_name	= "af_lagg",
+	.af_af		= AF_UNSPEC,
+	.af_other_status = lagg_status,
+};
+
+static __constructor(101) void
+lagg_ctor(void)
+{
+#define	N(a)	(sizeof(a) / sizeof(a[0]))
+	int i;
+
+	for (i = 0; i < N(lagg_cmds); i++)
+		cmd_register(&lagg_cmds[i]);
+	af_register(&af_lagg);
+#undef N
+}
diff --git a/share/man/man4/Makefile b/share/man/man4/Makefile
index ed134efc7f..82e360dfdb 100644
--- a/share/man/man4/Makefile
+++ b/share/man/man4/Makefile
@@ -151,6 +151,7 @@ MAN=	aac.4 \
 	km.4 \
 	ktr.4 \
 	kue.4 \
+	lagg.4 \
 	lge.4 \
 	lgue.4 \
 	lm.4 \
@@ -455,6 +456,7 @@ MLINKS+=ix.4 if_ix.4 \
 	ix.4 if_ixgbe.4
 MLINKS+=jme.4 if_jme.4
 MLINKS+=kue.4 if_kue.4
+MLINKS+=lagg.4 if_lagg.4
 MLINKS+=lge.4 if_lge.4
 MLINKS+=lgue.4 if_lgue.4
 MLINKS+=lo.4 loop.4
diff --git a/share/man/man4/lagg.4 b/share/man/man4/lagg.4
new file mode 100644
index 0000000000..bb2cfc6c01
--- /dev/null
+++ b/share/man/man4/lagg.4
@@ -0,0 +1,202 @@
+.\" $OpenBSD: trunk.4,v 1.18 2006/06/09 13:53:34 jmc Exp $
+.\"
+.\" Copyright (c) 2005, 2006 Reyk Floeter <reyk@openbsd.org>
+.\"
+.\" Permission to use, copy, modify, and distribute this software for any
+.\" purpose with or without fee is hereby granted, provided that the above
+.\" copyright notice and this permission notice appear in all copies.
+.\"
+.\" THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+.\" WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+.\" MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+.\" ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+.\" WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+.\"
+.\" $FreeBSD$
+.\"
+.Dd February 23, 2012
+.Dt LAGG 4
+.Os
+.Sh NAME
+.Nm lagg
+.Nd link aggregation and link failover interface
+.Sh SYNOPSIS
+To compile this driver into the kernel,
+place the following line in your
+kernel configuration file:
+.Bd -ragged -offset indent
+.Cd "device lagg"
+.Ed
+.Pp
+Alternatively, to load the driver as a
+module at boot time, place the following line in
+.Xr loader.conf 5 :
+.Bd -literal -offset indent
+if_lagg_load="YES"
+.Ed
+.Sh DESCRIPTION
+The
+.Nm
+interface allows aggregation of multiple network interfaces as one virtual
+.Nm
+interface for the purpose of providing fault-tolerance and high-speed links.
+.Pp
+A
+.Nm
+interface can be created using the
+.Ic ifconfig lagg Ns Ar N Ic create
+command.
+It can use different link aggregation protocols specified
+using the
+.Ic laggproto Ar proto
+option.
+Child interfaces can be added using the
+.Ic laggport Ar child-iface
+option and removed using the
+.Ic -laggport Ar child-iface
+option.
+.Pp
+The driver currently supports the aggregation protocols
+.Ic failover
+(the default),
+.Ic fec ,
+.Ic lacp ,
+.Ic loadbalance ,
+.Ic roundrobin ,
+and
+.Ic none .
+The protocols determine which ports are used for outgoing traffic
+and whether a specific port accepts incoming traffic.
+The interface link state is used to determine whether the port is active or
+not.
+.Bl -tag -width loadbalance
+.It Ic failover
+Sends traffic only through the active port.
+If the master port becomes unavailable,
+the next active port is used.
+The first interface added is the master port;
+any interfaces added after that are used as failover devices.
+.Pp
+By default, received traffic is only accepted when it is received
+through the active port.
+This constraint can be relaxed by setting the
+.Va net.link.lagg.failover_rx_all
+.Xr sysctl 8
+variable to a nonzero value,
+which is useful for certain bridged network setups.
+.It Ic fec
+Supports Cisco EtherChannel.
+This is an alias for
+.Ic loadbalance
+mode.
+.It Ic lacp
+Supports the IEEE 802.1AX (formerly 802.3ad) Link Aggregation Control Protocol
+(LACP) and the Marker Protocol.
+LACP will negotiate a set of aggregable links with the peer into one or more
+Link Aggregated Groups (LAGs).
+Each LAG is composed of ports of the same speed, set to full-duplex operation.
+The traffic will be balanced across the ports in the LAG with the greatest
+total speed; in most cases there will only be one LAG which contains all ports.
+In the event of changes in physical connectivity, Link Aggregation will quickly
+converge to a new configuration.
+.It Ic loadbalance
+Balances outgoing traffic across the active ports based on hashed
+protocol header information and accepts incoming traffic from
+any active port.
+This is a static setup and does not negotiate aggregation with the peer or
+exchange frames to monitor the link.
+The hash includes the Ethernet source and destination address, and, if
+available, the VLAN tag, and the IP source and destination address.
+.It Ic roundrobin
+Distributes outgoing traffic using a round-robin scheduler
+through all active ports and accepts incoming traffic from
+any active port.
+.It Ic none
+This protocol is intended to do nothing: it disables any traffic without
+disabling the
+.Nm
+interface itself.
+.El
+.Pp
+Each
+.Nm
+interface is created at runtime using interface cloning.
+This is
+most easily done with the
+.Xr ifconfig 8
+.Cm create
+command or using the
+.Va cloned_interfaces
+variable in
+.Xr rc.conf 5 .
+.Pp
+The MTU of the first interface to be added is used as the lagg MTU.
+All additional interfaces are required to have exactly the same value.
+.Pp
+The
+.Ic loadbalance
+and
+.Ic lacp
+modes will use the RSS hash from the network card if available to avoid
+computing one; this may give poor traffic distribution if the hash is invalid
+or covers too little of the protocol header information.
+Local hash computation can be forced per interface by setting the
+.Va net.link.lagg.X.use_flowid
+.Xr sysctl 8
+variable to zero, where X is the interface number.
+The default for new interfaces is set via the
+.Va net.link.lagg.default_use_flowid
+.Xr sysctl 8 .
+.Sh EXAMPLES
+Create a link aggregation using LACP with two
+.Xr bge 4
+Gigabit Ethernet interfaces:
+.Bd -literal -offset indent
+# ifconfig bge0 up
+# ifconfig bge1 up
+# ifconfig lagg0 laggproto lacp laggport bge0 laggport bge1 \e
+    192.168.1.1 netmask 255.255.255.0
+.Ed
+.Pp
+The following example uses an active failover interface to set up roaming
+between wired and wireless networks using two network devices.
+Whenever the wired master interface is unplugged, the wireless failover
+device will be used:
+.Bd -literal -offset indent
+# ifconfig em0 up
+# ifconfig ath0 ether 00:11:22:33:44:55
+# ifconfig wlan0 create wlandev ath0 ssid my_net up
+# ifconfig lagg0 laggproto failover laggport em0 laggport wlan0 \e
+    192.168.1.1 netmask 255.255.255.0
+.Ed
+.Pp
+(Note that the MAC address of the wireless device is forced to match the
+wired device as a workaround.)
+.Sh SEE ALSO
+.Xr ng_one2many 4 ,
+.Xr ifconfig 8 ,
+.Xr sysctl 8
+.Sh HISTORY
+The
+.Nm
+device first appeared in
+.Fx 6.3 .
+.Sh AUTHORS
+.An -nosplit
+The
+.Nm
+driver was written under the name
+.Nm trunk
+by
+.An Reyk Floeter Aq reyk@openbsd.org .
+The LACP implementation was written by
+.An YAMAMOTO Takashi
+for
+.Nx .
+.Sh BUGS
+There is no way to configure LACP administrative variables, including system
+and port priorities.
+The current implementation always performs active-mode LACP and uses 0x8000 as
+system and port priorities.
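Before the kernel sources below, two illustrative sketches of the policies the manual page just described. First, failover: the first port added is the master, and transmission falls over to the next port that still has link. This is a minimal stand-alone sketch of that selection rule only; the struct and function names are invented for the example and are not the driver's real lagg_port structures.

	/*
	 * Sketch of failover transmit-port selection as described in
	 * lagg.4: walk the ports in the order they were added and pick
	 * the first one whose link is up.  Types are invented for the
	 * example and are not the driver's.
	 */
	#include <stddef.h>

	struct port {
		const char	*name;
		int		link_up;	/* 1 if carrier present */
		struct port	*next;		/* ordered: master first */
	};

	static struct port *
	failover_tx_port(struct port *head)
	{
		struct port *p;

		for (p = head; p != NULL; p = p->next) {
			if (p->link_up)
				return (p);	/* master preferred */
		}
		return (NULL);			/* no usable port */
	}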
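Second, the loadbalance hash. The manual page says the hash layers Ethernet addresses, the VLAN tag, and IP addresses, and iflagg.c above shows the matching LAGG_F_HASHL2/L3/L4 user flags; the kernel side then reduces the hash modulo the number of active ports (lacp_select_tx_port() below does hash %= pm->pm_count). The sketch here is illustrative only, not the driver's lagg_hashmbuf(): the flow struct and the FNV-style mixer are stand-ins for the real mbuf parsing and kernel hash function.

	/* Illustrative layered L2/L3/L4 flow hash; not the driver's code. */
	#include <stdint.h>
	#include <stdio.h>

	static uint32_t
	hash32(const void *buf, size_t len, uint32_t hash)
	{
		const uint8_t *p = buf;

		while (len--)			/* FNV-1a style mixing */
			hash = (hash ^ *p++) * 0x01000193;
		return (hash);
	}

	struct flow {				/* simplified header summary */
		uint8_t		eh_dst[6], eh_src[6];	/* Ethernet (l2) */
		uint16_t	vlan_tag;		/* 0 if untagged */
		uint32_t	ip_src, ip_dst;		/* IPv4 (l3) */
		uint16_t	sport, dport;		/* TCP/UDP (l4) */
	};

	#define	HASHL2	0x1
	#define	HASHL3	0x2
	#define	HASHL4	0x4

	static uint32_t
	flow_hash(const struct flow *f, int flags, uint32_t key)
	{
		uint32_t h = key;

		if (flags & HASHL2) {
			h = hash32(f->eh_dst, sizeof(f->eh_dst), h);
			h = hash32(f->eh_src, sizeof(f->eh_src), h);
			if (f->vlan_tag != 0)
				h = hash32(&f->vlan_tag, sizeof(f->vlan_tag), h);
		}
		if (flags & HASHL3) {
			h = hash32(&f->ip_src, sizeof(f->ip_src), h);
			h = hash32(&f->ip_dst, sizeof(f->ip_dst), h);
		}
		if (flags & HASHL4) {
			h = hash32(&f->sport, sizeof(f->sport), h);
			h = hash32(&f->dport, sizeof(f->dport), h);
		}
		return (h);
	}

	int
	main(void)
	{
		struct flow f = { .ip_src = 0xc0a80101, .ip_dst = 0xc0a80102,
		    .sport = 49152, .dport = 80 };
		int nports = 2;		/* active ports in the aggregate */

		/* port selection: hash modulo active port count */
		printf("port %u\n", flow_hash(&f, HASHL3 | HASHL4, 0) % nports);
		return (0);
	}

The same flow always hashes to the same port, which preserves per-flow packet ordering; distribution across ports is only as good as the variety of flows offered.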
diff --git a/sys/conf/files b/sys/conf/files
index f5c6f95072..caa82fe995 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -1130,6 +1130,8 @@ net/if_ethersubr.c		optional ether
 net/faith/if_faith.c		optional faith
 net/gif/if_gif.c		optional gif
 net/gre/if_gre.c		optional gre inet
+net/lagg/ieee8023ad_lacp.c	optional lagg
+net/lagg/if_lagg.c		optional lagg
 net/pfil.c			standard
 net/pf/if_pflog.c		optional pflog
 net/pf/if_pfsync.c		optional pf
diff --git a/sys/config/VKERNEL64 b/sys/config/VKERNEL64
index 76bba85a7d..5b373344be 100644
--- a/sys/config/VKERNEL64
+++ b/sys/config/VKERNEL64
@@ -97,3 +97,5 @@ pseudo-device	bpf		#Berkeley packet filter
 device		vkd
 device		vke
 device		vcd
+
+device		lagg
diff --git a/sys/config/X86_64_GENERIC b/sys/config/X86_64_GENERIC
index 5b07176a9f..1190e541b5 100644
--- a/sys/config/X86_64_GENERIC
+++ b/sys/config/X86_64_GENERIC
@@ -299,6 +299,7 @@ pseudo-device	md		# Memory "disks"
 pseudo-device	vn		# File image "disks"
 pseudo-device	gif		# IPv6 and IPv4 tunneling
 pseudo-device	faith	1	# IPv6-to-IPv4 relaying (translation)
+pseudo-device	lagg
 
 # CARP support
 options	CARP
diff --git a/sys/net/Makefile b/sys/net/Makefile
index 9794e247de..c31a4111d0 100644
--- a/sys/net/Makefile
+++ b/sys/net/Makefile
@@ -1,5 +1,5 @@
 SUBDIR=accf_data accf_http disc faith gif gre sl stf tap tun \
-	vlan zlib bridge dummynet ipfw ip6fw ip_mroute \
+	vlan zlib bridge lagg dummynet ipfw ip6fw ip_mroute \
 	sppp ppp_layer pf
 
 .include <bsd.subdir.mk>
diff --git a/sys/net/ethernet.h b/sys/net/ethernet.h
index 7668ef2dbe..dfdfa6eb08 100644
--- a/sys/net/ethernet.h
+++ b/sys/net/ethernet.h
@@ -330,6 +330,7 @@ extern const uint8_t etherbroadcastaddr[ETHER_ADDR_LEN];
 #define	ETHERTYPE_IPAS		0x876C	/* IP Autonomous Systems (RFC1701) */
 #define	ETHERTYPE_SECUREDATA	0x876D	/* Secure Data (RFC1701) */
 #define	ETHERTYPE_FLOWCONTROL	0x8808	/* 802.3x flow control packet */
+#define	ETHERTYPE_SLOW		0x8809	/* 802.3ad link aggregation (LACP) */
 #define	ETHERTYPE_PPP		0x880B	/* PPP (obsolete by PPPOE) */
 #define	ETHERTYPE_HITACHI	0x8820	/* Hitachi Cable (Optoelectronic Systems Laboratory) */
 #define	ETHERTYPE_MPLS		0x8847	/* MPLS Unicast */
diff --git a/sys/net/if.c b/sys/net/if.c
index f8b77ad021..cf659a134f 100644
--- a/sys/net/if.c
+++ b/sys/net/if.c
@@ -2375,6 +2375,7 @@ if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len)
 	case IFT_ETHER:			/* these types use struct arpcom */
 	case IFT_XETHER:
 	case IFT_L2VLAN:
+	case IFT_IEEE8023ADLAG:
 		bcopy(lladdr, ((struct arpcom *)ifp->if_softc)->ac_enaddr, len);
 		bcopy(lladdr, LLADDR(sdl), len);
 		break;
diff --git a/sys/net/if_ethersubr.c b/sys/net/if_ethersubr.c
index c7a77bec2a..c415dbf1b7 100644
--- a/sys/net/if_ethersubr.c
+++ b/sys/net/if_ethersubr.c
@@ -116,6 +116,12 @@ struct ifnet *(*bridge_interface_p)(void *if_bridge);
 static int ether_resolvemulti(struct ifnet *, struct sockaddr **,
 			      struct sockaddr *);
 
+/*
+ * if_lagg(4) support
+ */
+void	(*lagg_input_p)(struct ifnet *, struct mbuf *);
+int	(*lagg_output_p)(struct ifnet *, struct mbuf *);
+
 const uint8_t etherbroadcastaddr[ETHER_ADDR_LEN] = {
 	0xff, 0xff, 0xff, 0xff, 0xff, 0xff
 };
@@ -278,6 +284,13 @@ ether_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst,
 		    ("%s: if_bridge not loaded!", __func__));
 		return bridge_output_p(ifp, m);
 	}
+#if XXX
+	if (ifp->if_lagg) {
+		KASSERT(lagg_output_p != NULL,
+		    ("%s: if_lagg not loaded!", __func__));
+		return lagg_output_p(ifp, m);
+	}
+#endif
 
 	/*
 	 * If a simplex interface, and the packet is being sent to our
@@ -1000,6 +1013,14 @@ post_stats:
 	ether_type = ntohs(eh->ether_type);
 	KKASSERT(ether_type != ETHERTYPE_VLAN);
 
+	/* Handle input from a lagg(4) port */
+	if (ifp->if_type == IFT_IEEE8023ADLAG) {
+		KASSERT(lagg_input_p != NULL,
+		    ("%s: if_lagg not loaded!", __func__));
+		(*lagg_input_p)(ifp, m);
+		return;
+	}
+
 	if (m->m_flags & M_VLANTAG) {
 		void (*vlan_input_func)(struct mbuf *);
 
diff --git a/sys/net/if_media.h b/sys/net/if_media.h
index 91ae8ce10d..f4c9320802 100644
--- a/sys/net/if_media.h
+++ b/sys/net/if_media.h
@@ -160,6 +160,10 @@ uint64_t	ifmedia_baudrate(int);
 #define	IFM_10G_LRM	24		/* 10GBase-LRM 850nm Multi-mode */
 #define	IFM_UNKNOWN	25		/* media types not defined yet */
 #define	IFM_10G_T	26		/* 10GBase-T - RJ45 */
+#define	IFM_40G_CR4	27		/* 40GBase-CR4 */
+#define	IFM_40G_SR4	28		/* 40GBase-SR4 */
+#define	IFM_40G_LR4	29		/* 40GBase-LR4 */
+
 
 #define	IFM_ETH_MASTER	0x00000100	/* master mode (1000baseT) */
 #define	IFM_ETH_RXPAUSE	0x00000200	/* receive PAUSE frames */
@@ -340,6 +344,9 @@ struct ifmedia_description {
 	{ IFM_10G_TWINAX_LONG,	"10Gbase-Twinax-Long" },		\
 	{ IFM_UNKNOWN,		"Unknown" },				\
 	{ IFM_10G_T,		"10Gbase-T" },				\
+	{ IFM_40G_CR4,		"40Gbase-CR4" },			\
+	{ IFM_40G_SR4,		"40Gbase-SR4" },			\
+	{ IFM_40G_LR4,		"40Gbase-LR4" },			\
 	{ 0, NULL },							\
 }
 
@@ -550,6 +557,9 @@ struct ifmedia_baudrate {
 	{ IFM_ETHER|IFM_10G_TWINAX_LONG,IF_Gbps(10ULL) },		\
 	{ IFM_ETHER|IFM_10G_LRM,	IF_Gbps(10ULL) },		\
 	{ IFM_ETHER|IFM_10G_T,		IF_Gbps(10ULL) },		\
+	{ IFM_ETHER | IFM_40G_CR4,	IF_Gbps(40ULL) },		\
+	{ IFM_ETHER | IFM_40G_SR4,	IF_Gbps(40ULL) },		\
+	{ IFM_ETHER | IFM_40G_LR4,	IF_Gbps(40ULL) },		\
 	\
 	{ IFM_IEEE80211|IFM_IEEE80211_FH1,	IF_Mbps(1) },		\
 	{ IFM_IEEE80211|IFM_IEEE80211_FH2,	IF_Mbps(2) },		\
diff --git a/sys/net/if_var.h b/sys/net/if_var.h
index 305810e3b8..840d99dd69 100644
--- a/sys/net/if_var.h
+++ b/sys/net/if_var.h
@@ -395,6 +395,7 @@ struct ifnet {
 	struct ifprefixhead if_prefixhead; /* list of prefixes per if */
 	const uint8_t	*if_broadcastaddr;
 	void	*if_bridge;		/* bridge glue */
+	void	*if_lagg;		/* lagg glue */
 	void	*if_afdata[AF_MAX];
 	struct ifaddr	*if_lladdr;
diff --git a/sys/net/lagg/Makefile b/sys/net/lagg/Makefile
new file mode 100644
index 0000000000..f1b3853513
--- /dev/null
+++ b/sys/net/lagg/Makefile
@@ -0,0 +1,16 @@
+# $DragonFly: src/sys/net/bridge/Makefile,v 1.4 2005/12/21 16:40:25 corecode Exp $
+#
+
+KMOD=	if_lagg
+SRCS=	if_lagg.c ieee8023ad_lacp.c
+SRCS+=	opt_inet.h opt_inet6.h
+
+.if !defined(BUILDING_WITH_KERNEL)
+opt_inet.h:
+	echo "#define INET 1" > ${.TARGET}
+
+opt_inet6.h:
+	echo "#define INET6 1" > ${.TARGET}
+.endif
+
+.include <bsd.kmod.mk>
diff --git a/sys/net/lagg/ieee8023ad_lacp.c b/sys/net/lagg/ieee8023ad_lacp.c
new file mode 100644
index 0000000000..bebfbe3387
--- /dev/null
+++ b/sys/net/lagg/ieee8023ad_lacp.c
@@ -0,0 +1,2080 @@
+/*	$NetBSD: ieee8023ad_lacp.c,v 1.3 2005/12/11 12:24:54 christos Exp $	*/
+
+/*-
+ * Copyright (c)2005 YAMAMOTO Takashi,
+ * Copyright (c)2008 Andrew Thompson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include /* hz */ +#include /* for net/if.h */ +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/* + * actor system priority and port priority. + * XXX should be configurable. + */ + +#define LACP_SYSTEM_PRIO 0x8000 +#define LACP_PORT_PRIO 0x8000 + +const uint8_t ethermulticastaddr_slowprotocols[ETHER_ADDR_LEN] = + { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x02 }; + +static const struct tlv_template lacp_info_tlv_template[] = { + { LACP_TYPE_ACTORINFO, + sizeof(struct tlvhdr) + sizeof(struct lacp_peerinfo) }, + { LACP_TYPE_PARTNERINFO, + sizeof(struct tlvhdr) + sizeof(struct lacp_peerinfo) }, + { LACP_TYPE_COLLECTORINFO, + sizeof(struct tlvhdr) + sizeof(struct lacp_collectorinfo) }, + { 0, 0 }, +}; + +static const struct tlv_template marker_info_tlv_template[] = { + { MARKER_TYPE_INFO, + sizeof(struct tlvhdr) + sizeof(struct lacp_markerinfo) }, + { 0, 0 }, +}; + +static const struct tlv_template marker_response_tlv_template[] = { + { MARKER_TYPE_RESPONSE, + sizeof(struct tlvhdr) + sizeof(struct lacp_markerinfo) }, + { 0, 0 }, +}; + +typedef void (*lacp_timer_func_t)(struct lacp_port *); + +static void lacp_fill_actorinfo(struct lacp_port *, struct lacp_peerinfo *); +static void lacp_fill_markerinfo(struct lacp_port *, + struct lacp_markerinfo *); + +static uint64_t lacp_aggregator_bandwidth(struct lacp_aggregator *); +static void lacp_suppress_distributing(struct lacp_softc *, + struct lacp_aggregator *); +static void lacp_transit_expire(void *); +static void lacp_update_portmap(struct lacp_softc *); +static void lacp_select_active_aggregator(struct lacp_softc *); +static uint16_t lacp_compose_key(struct lacp_port *); +static int tlv_check(const void *, size_t, const struct tlvhdr *, + const struct tlv_template *, boolean_t); +static void lacp_tick(void *); + +static void lacp_fill_aggregator_id(struct lacp_aggregator *, + const struct lacp_port *); +static void lacp_fill_aggregator_id_peer(struct lacp_peerinfo *, + const struct lacp_peerinfo *); +static int lacp_aggregator_is_compatible(const struct lacp_aggregator *, + const struct lacp_port *); +static int lacp_peerinfo_is_compatible(const struct lacp_peerinfo *, + const struct lacp_peerinfo *); + +static struct lacp_aggregator *lacp_aggregator_get(struct lacp_softc *, + struct lacp_port *); +static void lacp_aggregator_addref(struct lacp_softc *, + struct lacp_aggregator *); +static void lacp_aggregator_delref(struct lacp_softc *, + struct lacp_aggregator *); + +/* receive machine */ + +static int lacp_pdu_input(struct lacp_port *, struct mbuf *); +static int lacp_marker_input(struct lacp_port *, 
struct mbuf *); +static void lacp_sm_rx(struct lacp_port *, const struct lacpdu *); +static void lacp_sm_rx_timer(struct lacp_port *); +static void lacp_sm_rx_set_expired(struct lacp_port *); +static void lacp_sm_rx_update_ntt(struct lacp_port *, + const struct lacpdu *); +static void lacp_sm_rx_record_pdu(struct lacp_port *, + const struct lacpdu *); +static void lacp_sm_rx_update_selected(struct lacp_port *, + const struct lacpdu *); +static void lacp_sm_rx_record_default(struct lacp_port *); +static void lacp_sm_rx_update_default_selected(struct lacp_port *); +static void lacp_sm_rx_update_selected_from_peerinfo(struct lacp_port *, + const struct lacp_peerinfo *); + +/* mux machine */ + +static void lacp_sm_mux(struct lacp_port *); +static void lacp_set_mux(struct lacp_port *, enum lacp_mux_state); +static void lacp_sm_mux_timer(struct lacp_port *); + +/* periodic transmit machine */ + +static void lacp_sm_ptx_update_timeout(struct lacp_port *, uint8_t); +static void lacp_sm_ptx_tx_schedule(struct lacp_port *); +static void lacp_sm_ptx_timer(struct lacp_port *); + +/* transmit machine */ + +static void lacp_sm_tx(struct lacp_port *); +static void lacp_sm_assert_ntt(struct lacp_port *); + +static void lacp_run_timers(struct lacp_port *); +static int lacp_compare_peerinfo(const struct lacp_peerinfo *, + const struct lacp_peerinfo *); +static int lacp_compare_systemid(const struct lacp_systemid *, + const struct lacp_systemid *); +static void lacp_port_enable(struct lacp_port *); +static void lacp_port_disable(struct lacp_port *); +static void lacp_select(struct lacp_port *); +static void lacp_unselect(struct lacp_port *); +static void lacp_disable_collecting(struct lacp_port *); +static void lacp_enable_collecting(struct lacp_port *); +static void lacp_disable_distributing(struct lacp_port *); +static void lacp_enable_distributing(struct lacp_port *); +static int lacp_xmit_lacpdu(struct lacp_port *); +static int lacp_xmit_marker(struct lacp_port *); + +/* Debugging */ + +static void lacp_dump_lacpdu(const struct lacpdu *); +static const char *lacp_format_partner(const struct lacp_peerinfo *, char *, + size_t); +static const char *lacp_format_lagid(const struct lacp_peerinfo *, + const struct lacp_peerinfo *, char *, size_t); +static const char *lacp_format_lagid_aggregator(const struct lacp_aggregator *, + char *, size_t); +static const char *lacp_format_state(uint8_t, char *, size_t); +static const char *lacp_format_mac(const uint8_t *, char *, size_t); +static const char *lacp_format_systemid(const struct lacp_systemid *, char *, + size_t); +static const char *lacp_format_portid(const struct lacp_portid *, char *, + size_t); +static void lacp_dprintf(const struct lacp_port *, const char *, ...) + __attribute__((__format__(__printf__, 2, 3))); + +static int lacp_debug = 0; +SYSCTL_NODE(_net_link_lagg, OID_AUTO, lacp, CTLFLAG_RD, 0, "ieee802.3ad"); +SYSCTL_INT(_net_link_lagg_lacp, OID_AUTO, debug, CTLFLAG_RW, + &lacp_debug, 0, "Enable LACP debug logging (1=debug, 2=trace)"); +TUNABLE_INT("net.link.lagg.lacp.debug", &lacp_debug); + +#define LACP_DPRINTF(a) if (lacp_debug & 0x01) { lacp_dprintf a ; } +#define LACP_TRACE(a) if (lacp_debug & 0x02) { lacp_dprintf(a,"%s\n",__func__); } +#define LACP_TPRINTF(a) if (lacp_debug & 0x04) { lacp_dprintf a ; } + +/* + * partner administration variables. + * XXX should be configurable. 
+ */
+
+static const struct lacp_peerinfo lacp_partner_admin_optimistic = {
+	.lip_systemid = { .lsi_prio = 0xffff },
+	.lip_portid = { .lpi_prio = 0xffff },
+	.lip_state = LACP_STATE_SYNC | LACP_STATE_AGGREGATION |
+	    LACP_STATE_COLLECTING | LACP_STATE_DISTRIBUTING,
+};
+
+static const struct lacp_peerinfo lacp_partner_admin_strict = {
+	.lip_systemid = { .lsi_prio = 0xffff },
+	.lip_portid = { .lpi_prio = 0xffff },
+	.lip_state = 0,
+};
+
+static const lacp_timer_func_t lacp_timer_funcs[LACP_NTIMER] = {
+	[LACP_TIMER_CURRENT_WHILE] = lacp_sm_rx_timer,
+	[LACP_TIMER_PERIODIC] = lacp_sm_ptx_timer,
+	[LACP_TIMER_WAIT_WHILE] = lacp_sm_mux_timer,
+};
+
+struct mbuf *
+lacp_input(struct lagg_port *lgp, struct mbuf *m)
+{
+	struct lacp_port *lp = LACP_PORT(lgp);
+	uint8_t subtype;
+
+	if (m->m_pkthdr.len < sizeof(struct ether_header) + sizeof(subtype)) {
+		m_freem(m);
+		return (NULL);
+	}
+
+	m_copydata(m, sizeof(struct ether_header), sizeof(subtype), &subtype);
+	switch (subtype) {
+	case SLOWPROTOCOLS_SUBTYPE_LACP:
+		lacp_pdu_input(lp, m);
+		return (NULL);
+
+	case SLOWPROTOCOLS_SUBTYPE_MARKER:
+		lacp_marker_input(lp, m);
+		return (NULL);
+	}
+
+	/* Not a subtype we are interested in */
+	return (m);
+}
+
+/*
+ * lacp_pdu_input: process lacpdu
+ */
+static int
+lacp_pdu_input(struct lacp_port *lp, struct mbuf *m)
+{
+	struct lacp_softc *lsc = lp->lp_lsc;
+	struct lacpdu *du;
+	int error = 0;
+
+	if (m->m_pkthdr.len != sizeof(*du)) {
+		goto bad;
+	}
+
+	if ((m->m_flags & M_MCAST) == 0) {
+		goto bad;
+	}
+
+	if (m->m_len < sizeof(*du)) {
+		m = m_pullup(m, sizeof(*du));
+		if (m == NULL) {
+			return (ENOMEM);
+		}
+	}
+
+	du = mtod(m, struct lacpdu *);
+
+	if (memcmp(&du->ldu_eh.ether_dhost,
+	    &ethermulticastaddr_slowprotocols, ETHER_ADDR_LEN)) {
+		goto bad;
+	}
+
+	/*
+	 * ignore the version for compatibility with
+	 * the future protocol revisions.
+	 */
+#if 0
+	if (du->ldu_sph.sph_version != 1) {
+		goto bad;
+	}
+#endif
+
+	/*
+	 * ignore tlv types for compatibility with
+	 * the future protocol revisions.
+ */ + if (tlv_check(du, sizeof(*du), &du->ldu_tlv_actor, + lacp_info_tlv_template, FALSE)) { + goto bad; + } + + if (lacp_debug > 0) { + lacp_dprintf(lp, "lacpdu receive\n"); + lacp_dump_lacpdu(du); + } + + if ((1 << lp->lp_ifp->if_dunit) & lp->lp_lsc->lsc_debug.lsc_rx_test) { + LACP_TPRINTF((lp, "Dropping RX PDU\n")); + goto bad; + } + + LACP_LOCK(lsc); + lacp_sm_rx(lp, du); + LACP_UNLOCK(lsc); + + m_freem(m); + return (error); + +bad: + m_freem(m); + return (EINVAL); +} + +static void +lacp_fill_actorinfo(struct lacp_port *lp, struct lacp_peerinfo *info) +{ + struct lagg_port *lgp = lp->lp_lagg; + struct lagg_softc *sc = lgp->lp_softc; + + info->lip_systemid.lsi_prio = htons(LACP_SYSTEM_PRIO); + memcpy(&info->lip_systemid.lsi_mac, + IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN); + info->lip_portid.lpi_prio = htons(LACP_PORT_PRIO); + info->lip_portid.lpi_portno = htons(lp->lp_ifp->if_index); + info->lip_state = lp->lp_state; +} + +static void +lacp_fill_markerinfo(struct lacp_port *lp, struct lacp_markerinfo *info) +{ + struct ifnet *ifp = lp->lp_ifp; + + /* Fill in the port index and system id (encoded as the MAC) */ + info->mi_rq_port = htons(ifp->if_index); + memcpy(&info->mi_rq_system, lp->lp_systemid.lsi_mac, ETHER_ADDR_LEN); + info->mi_rq_xid = htonl(0); +} + +static int +lacp_xmit_lacpdu(struct lacp_port *lp) +{ + struct lagg_port *lgp = lp->lp_lagg; + struct mbuf *m; + struct lacpdu *du; + int error; + + LACP_LOCK_ASSERT(lp->lp_lsc); + + m = m_gethdr(M_NOWAIT, MT_DATA); + if (m == NULL) { + return (ENOMEM); + } + m->m_len = m->m_pkthdr.len = sizeof(*du); + + du = mtod(m, struct lacpdu *); + memset(du, 0, sizeof(*du)); + + memcpy(&du->ldu_eh.ether_dhost, ethermulticastaddr_slowprotocols, + ETHER_ADDR_LEN); + memcpy(&du->ldu_eh.ether_shost, lgp->lp_lladdr, ETHER_ADDR_LEN); + du->ldu_eh.ether_type = htons(ETHERTYPE_SLOW); + + du->ldu_sph.sph_subtype = SLOWPROTOCOLS_SUBTYPE_LACP; + du->ldu_sph.sph_version = 1; + + TLV_SET(&du->ldu_tlv_actor, LACP_TYPE_ACTORINFO, sizeof(du->ldu_actor)); + du->ldu_actor = lp->lp_actor; + + TLV_SET(&du->ldu_tlv_partner, LACP_TYPE_PARTNERINFO, + sizeof(du->ldu_partner)); + du->ldu_partner = lp->lp_partner; + + TLV_SET(&du->ldu_tlv_collector, LACP_TYPE_COLLECTORINFO, + sizeof(du->ldu_collector)); + du->ldu_collector.lci_maxdelay = 0; + + if (lacp_debug > 0) { + lacp_dprintf(lp, "lacpdu transmit\n"); + lacp_dump_lacpdu(du); + } + + m->m_flags |= M_MCAST; + + /* + * XXX should use higher priority queue. + * otherwise network congestion can break aggregation. 
+ */ + + error = lagg_enqueue(lp->lp_ifp, m); + return (error); +} + +static int +lacp_xmit_marker(struct lacp_port *lp) +{ + struct lagg_port *lgp = lp->lp_lagg; + struct mbuf *m; + struct markerdu *mdu; + int error; + + LACP_LOCK_ASSERT(lp->lp_lsc); + + m = m_gethdr(M_NOWAIT, MT_DATA); + if (m == NULL) { + return (ENOMEM); + } + m->m_len = m->m_pkthdr.len = sizeof(*mdu); + + mdu = mtod(m, struct markerdu *); + memset(mdu, 0, sizeof(*mdu)); + + memcpy(&mdu->mdu_eh.ether_dhost, ethermulticastaddr_slowprotocols, + ETHER_ADDR_LEN); + memcpy(&mdu->mdu_eh.ether_shost, lgp->lp_lladdr, ETHER_ADDR_LEN); + mdu->mdu_eh.ether_type = htons(ETHERTYPE_SLOW); + + mdu->mdu_sph.sph_subtype = SLOWPROTOCOLS_SUBTYPE_MARKER; + mdu->mdu_sph.sph_version = 1; + + /* Bump the transaction id and copy over the marker info */ + lp->lp_marker.mi_rq_xid = htonl(ntohl(lp->lp_marker.mi_rq_xid) + 1); + TLV_SET(&mdu->mdu_tlv, MARKER_TYPE_INFO, sizeof(mdu->mdu_info)); + mdu->mdu_info = lp->lp_marker; + + LACP_DPRINTF((lp, "marker transmit, port=%u, sys=%6d, id=%u\n", + ntohs(mdu->mdu_info.mi_rq_port), *mdu->mdu_info.mi_rq_system, + ntohl(mdu->mdu_info.mi_rq_xid))); + + m->m_flags |= M_MCAST; + error = lagg_enqueue(lp->lp_ifp, m); + return (error); +} + +void +lacp_linkstate(struct lagg_port *lgp) +{ + struct lacp_port *lp = LACP_PORT(lgp); + struct lacp_softc *lsc = lp->lp_lsc; + struct ifnet *ifp = lgp->lp_ifp; + struct ifmediareq ifmr; + int error = 0; + u_int media; + uint8_t old_state; + uint16_t old_key; + + bzero((char *)&ifmr, sizeof(ifmr)); + /* ifnet_deserialize_all(ifp); */ + ifnet_serialize_all(ifp); + error = (*ifp->if_ioctl)(ifp, SIOCGIFMEDIA, (caddr_t)&ifmr, NULL); + ifnet_deserialize_all(ifp); + /* ifnet_serialize_all(ifp); */ + + if (error != 0) + return; + + LACP_LOCK(lsc); + media = ifmr.ifm_active; + LACP_DPRINTF((lp, "media changed 0x%x -> 0x%x, ether = %d, fdx = %d, " + "link = %d\n", lp->lp_media, media, IFM_TYPE(media) == IFM_ETHER, + (media & IFM_FDX) != 0, ifp->if_link_state == LINK_STATE_UP)); + old_state = lp->lp_state; + old_key = lp->lp_key; + + + lp->lp_media = media; + /* + * If the port is not an active full duplex Ethernet link then it can + * not be aggregated. 
+	 */
+	if (IFM_TYPE(media) != IFM_ETHER || (media & IFM_FDX) == 0 ||
+	    ifp->if_link_state != LINK_STATE_UP) {
+		lacp_port_disable(lp);
+	} else {
+		lacp_port_enable(lp);
+	}
+	lp->lp_key = lacp_compose_key(lp);
+
+	if (old_state != lp->lp_state || old_key != lp->lp_key) {
+		LACP_DPRINTF((lp, "-> UNSELECTED\n"));
+		lp->lp_selected = LACP_UNSELECTED;
+	}
+	LACP_UNLOCK(lsc);
+}
+
+static void
+lacp_tick(void *arg)
+{
+	struct lacp_softc *lsc = arg;
+	struct lacp_port *lp;
+
+	LACP_LOCK(lsc);
+	LIST_FOREACH(lp, &lsc->lsc_ports, lp_next) {
+		if ((lp->lp_state & LACP_STATE_AGGREGATION) == 0)
+			continue;
+
+		lacp_run_timers(lp);
+
+		lacp_select(lp);
+		lacp_sm_mux(lp);
+		lacp_sm_tx(lp);
+		lacp_sm_ptx_tx_schedule(lp);
+	}
+	LACP_UNLOCK(lsc);
+	callout_reset(&lsc->lsc_callout, hz, lacp_tick, lsc);
+}
+
+int
+lacp_port_create(struct lagg_port *lgp)
+{
+	struct lagg_softc *sc = lgp->lp_softc;
+	struct lacp_softc *lsc = LACP_SOFTC(sc);
+	struct lacp_port *lp;
+	struct ifnet *ifp = lgp->lp_ifp;
+	struct sockaddr_dl sdl;
+	struct ifmultiaddr *rifma = NULL;
+	int error;
+
+	boolean_t active = TRUE; /* XXX should be configurable */
+	boolean_t fast = FALSE; /* XXX should be configurable */
+
+	bzero((char *)&sdl, sizeof(sdl));
+	sdl.sdl_len = sizeof(sdl);
+	sdl.sdl_family = AF_LINK;
+	sdl.sdl_index = ifp->if_index;
+	sdl.sdl_type = IFT_ETHER;
+	sdl.sdl_alen = ETHER_ADDR_LEN;
+
+	bcopy(&ethermulticastaddr_slowprotocols,
+	    LLADDR(&sdl), ETHER_ADDR_LEN);
+
+	error = if_addmulti(ifp, (struct sockaddr *)&sdl, &rifma);
+	if (error) {
+		kprintf("%s: ADDMULTI failed on %s\n", __func__, lgp->lp_ifname);
+		return (error);
+	}
+	lp = kmalloc(sizeof(struct lacp_port),
+	    M_DEVBUF, M_NOWAIT|M_ZERO);
+	if (lp == NULL)
+		return (ENOMEM);
+
+	LACP_LOCK(lsc);
+	lgp->lp_psc = (caddr_t)lp;
+	lp->lp_ifp = ifp;
+	lp->lp_lagg = lgp;
+	lp->lp_lsc = lsc;
+	lp->lp_ifma = rifma;
+
+	LIST_INSERT_HEAD(&lsc->lsc_ports, lp, lp_next);
+
+	lacp_fill_actorinfo(lp, &lp->lp_actor);
+	lacp_fill_markerinfo(lp, &lp->lp_marker);
+	lp->lp_state =
+	    (active ? LACP_STATE_ACTIVITY : 0) |
+	    (fast ?
LACP_STATE_TIMEOUT : 0); + lp->lp_aggregator = NULL; + lacp_sm_rx_set_expired(lp); + LACP_UNLOCK(lsc); + lacp_linkstate(lgp); + + return (0); +} + +void +lacp_port_destroy(struct lagg_port *lgp) +{ + struct lacp_port *lp = LACP_PORT(lgp); + struct lacp_softc *lsc = lp->lp_lsc; + int i; + + LACP_LOCK(lsc); + for (i = 0; i < LACP_NTIMER; i++) { + LACP_TIMER_DISARM(lp, i); + } + + lacp_disable_collecting(lp); + lacp_disable_distributing(lp); + lacp_unselect(lp); + + /* The address may have already been removed by if_purgemaddrs() */ +#if XXX + if (!lgp->lp_detaching) + if_delmulti_ifma(lp->lp_ifma); +#endif + LIST_REMOVE(lp, lp_next); + LACP_UNLOCK(lsc); + kfree(lp, M_DEVBUF); +} + +void +lacp_req(struct lagg_softc *sc, caddr_t data) +{ + struct lacp_opreq *req = (struct lacp_opreq *)data; + struct lacp_softc *lsc = LACP_SOFTC(sc); + struct lacp_aggregator *la = lsc->lsc_active_aggregator; + + LACP_LOCK(lsc); + bzero(req, sizeof(struct lacp_opreq)); + if (la != NULL) { + req->actor_prio = ntohs(la->la_actor.lip_systemid.lsi_prio); + memcpy(&req->actor_mac, &la->la_actor.lip_systemid.lsi_mac, + ETHER_ADDR_LEN); + req->actor_key = ntohs(la->la_actor.lip_key); + req->actor_portprio = ntohs(la->la_actor.lip_portid.lpi_prio); + req->actor_portno = ntohs(la->la_actor.lip_portid.lpi_portno); + req->actor_state = la->la_actor.lip_state; + + req->partner_prio = ntohs(la->la_partner.lip_systemid.lsi_prio); + memcpy(&req->partner_mac, &la->la_partner.lip_systemid.lsi_mac, + ETHER_ADDR_LEN); + req->partner_key = ntohs(la->la_partner.lip_key); + req->partner_portprio = ntohs(la->la_partner.lip_portid.lpi_prio); + req->partner_portno = ntohs(la->la_partner.lip_portid.lpi_portno); + req->partner_state = la->la_partner.lip_state; + } + LACP_UNLOCK(lsc); +} + +void +lacp_portreq(struct lagg_port *lgp, caddr_t data) +{ + struct lacp_opreq *req = (struct lacp_opreq *)data; + struct lacp_port *lp = LACP_PORT(lgp); + struct lacp_softc *lsc = lp->lp_lsc; + + LACP_LOCK(lsc); + req->actor_prio = ntohs(lp->lp_actor.lip_systemid.lsi_prio); + memcpy(&req->actor_mac, &lp->lp_actor.lip_systemid.lsi_mac, + ETHER_ADDR_LEN); + req->actor_key = ntohs(lp->lp_actor.lip_key); + req->actor_portprio = ntohs(lp->lp_actor.lip_portid.lpi_prio); + req->actor_portno = ntohs(lp->lp_actor.lip_portid.lpi_portno); + req->actor_state = lp->lp_actor.lip_state; + + req->partner_prio = ntohs(lp->lp_partner.lip_systemid.lsi_prio); + memcpy(&req->partner_mac, &lp->lp_partner.lip_systemid.lsi_mac, + ETHER_ADDR_LEN); + req->partner_key = ntohs(lp->lp_partner.lip_key); + req->partner_portprio = ntohs(lp->lp_partner.lip_portid.lpi_prio); + req->partner_portno = ntohs(lp->lp_partner.lip_portid.lpi_portno); + req->partner_state = lp->lp_partner.lip_state; + LACP_UNLOCK(lsc); +} + +static void +lacp_disable_collecting(struct lacp_port *lp) +{ + LACP_DPRINTF((lp, "collecting disabled\n")); + lp->lp_state &= ~LACP_STATE_COLLECTING; +} + +static void +lacp_enable_collecting(struct lacp_port *lp) +{ + LACP_DPRINTF((lp, "collecting enabled\n")); + lp->lp_state |= LACP_STATE_COLLECTING; +} + +static void +lacp_disable_distributing(struct lacp_port *lp) +{ + struct lacp_aggregator *la = lp->lp_aggregator; + struct lacp_softc *lsc = lp->lp_lsc; + struct lagg_softc *sc = lsc->lsc_softc; + char buf[LACP_LAGIDSTR_MAX+1]; + + LACP_LOCK_ASSERT(lsc); + + if (la == NULL || (lp->lp_state & LACP_STATE_DISTRIBUTING) == 0) { + return; + } + + KASSERT(!TAILQ_EMPTY(&la->la_ports), ("no aggregator ports")); + KASSERT(la->la_nports > 0, ("nports invalid (%d)", 
la->la_nports)); + KASSERT(la->la_refcnt >= la->la_nports, ("aggregator refcnt invalid")); + + LACP_DPRINTF((lp, "disable distributing on aggregator %s, " + "nports %d -> %d\n", + lacp_format_lagid_aggregator(la, buf, sizeof(buf)), + la->la_nports, la->la_nports - 1)); + + TAILQ_REMOVE(&la->la_ports, lp, lp_dist_q); + la->la_nports--; + sc->sc_active = la->la_nports; + + if (lsc->lsc_active_aggregator == la) { + lacp_suppress_distributing(lsc, la); + lacp_select_active_aggregator(lsc); + /* regenerate the port map, the active aggregator has changed */ + lacp_update_portmap(lsc); + } + + lp->lp_state &= ~LACP_STATE_DISTRIBUTING; +} + +static void +lacp_enable_distributing(struct lacp_port *lp) +{ + struct lacp_aggregator *la = lp->lp_aggregator; + struct lacp_softc *lsc = lp->lp_lsc; + struct lagg_softc *sc = lsc->lsc_softc; + char buf[LACP_LAGIDSTR_MAX+1]; + + LACP_LOCK_ASSERT(lsc); + + if ((lp->lp_state & LACP_STATE_DISTRIBUTING) != 0) { + return; + } + + LACP_DPRINTF((lp, "enable distributing on aggregator %s, " + "nports %d -> %d\n", + lacp_format_lagid_aggregator(la, buf, sizeof(buf)), + la->la_nports, la->la_nports + 1)); + + KASSERT(la->la_refcnt > la->la_nports, ("aggregator refcnt invalid")); + TAILQ_INSERT_HEAD(&la->la_ports, lp, lp_dist_q); + la->la_nports++; + sc->sc_active = la->la_nports; + + lp->lp_state |= LACP_STATE_DISTRIBUTING; + + if (lsc->lsc_active_aggregator == la) { + lacp_suppress_distributing(lsc, la); + lacp_update_portmap(lsc); + } else + /* try to become the active aggregator */ + lacp_select_active_aggregator(lsc); +} + +static void +lacp_transit_expire(void *vp) +{ + struct lacp_softc *lsc = vp; + + LACP_LOCK(lsc); + LACP_LOCK_ASSERT(lsc); + + LACP_TRACE(NULL); + + lsc->lsc_suppress_distributing = FALSE; + + LACP_UNLOCK(lsc); +} + +static void +lacp_attach_sysctl(struct lacp_softc *lsc, struct sysctl_oid *p_oid) +{ + struct lagg_softc *sc = lsc->lsc_softc; + + SYSCTL_ADD_UINT(&sc->ctx, SYSCTL_CHILDREN(p_oid), OID_AUTO, + "lacp_strict_mode", + CTLFLAG_RW, + &lsc->lsc_strict_mode, + lsc->lsc_strict_mode, + "Enable LACP strict mode"); +} + +static void +lacp_attach_sysctl_debug(struct lacp_softc *lsc, struct sysctl_oid *p_oid) +{ + struct lagg_softc *sc = lsc->lsc_softc; + struct sysctl_oid *oid; + + /* Create a child of the parent lagg interface */ + oid = SYSCTL_ADD_NODE(&sc->ctx, SYSCTL_CHILDREN(p_oid), + OID_AUTO, "debug", CTLFLAG_RD, NULL, "DEBUG"); + + SYSCTL_ADD_UINT(&sc->ctx, SYSCTL_CHILDREN(oid), OID_AUTO, + "rx_test", + CTLFLAG_RW, + &lsc->lsc_debug.lsc_rx_test, + lsc->lsc_debug.lsc_rx_test, + "Bitmap of if_dunit entries to drop RX frames for"); + SYSCTL_ADD_UINT(&sc->ctx, SYSCTL_CHILDREN(oid), OID_AUTO, + "tx_test", + CTLFLAG_RW, + &lsc->lsc_debug.lsc_tx_test, + lsc->lsc_debug.lsc_tx_test, + "Bitmap of if_dunit entries to drop TX frames for"); +} + +int +lacp_attach(struct lagg_softc *sc) +{ + struct lacp_softc *lsc; + struct sysctl_oid *oid; + + lsc = kmalloc(sizeof(struct lacp_softc), + M_DEVBUF, M_NOWAIT|M_ZERO); + if (lsc == NULL) + return (ENOMEM); + + sc->sc_psc = (caddr_t)lsc; + lsc->lsc_softc = sc; + + lsc->lsc_hashkey = karc4random(); + lsc->lsc_active_aggregator = NULL; + lsc->lsc_strict_mode = 1; + LACP_LOCK_INIT(lsc); + TAILQ_INIT(&lsc->lsc_aggregators); + LIST_INIT(&lsc->lsc_ports); + + /* Create a child of the parent lagg interface */ + oid = SYSCTL_ADD_NODE(&sc->ctx, SYSCTL_CHILDREN(sc->sc_oid), + OID_AUTO, "lacp", CTLFLAG_RD, NULL, "LACP"); + + /* Attach sysctl nodes */ + lacp_attach_sysctl(lsc, oid); + lacp_attach_sysctl_debug(lsc, 
oid); + +#if XXX + callout_init_mtx(&lsc->lsc_transit_callout, &lsc->lsc_lock, 0); + callout_init_mtx(&lsc->lsc_callout, &lsc->lsc_lock, 0); +#endif + + callout_init(&lsc->lsc_transit_callout); + callout_init(&lsc->lsc_callout); + + /* if the lagg is already up then do the same */ + if (sc->sc_ifp->if_flags & IFF_RUNNING) + lacp_init(sc); + + return (0); +} + +int +lacp_detach(struct lagg_softc *sc) +{ + struct lacp_softc *lsc = LACP_SOFTC(sc); + + KASSERT(TAILQ_EMPTY(&lsc->lsc_aggregators), + ("aggregators still active")); + KASSERT(lsc->lsc_active_aggregator == NULL, + ("aggregator still attached")); + + sc->sc_psc = NULL; + callout_drain(&lsc->lsc_transit_callout); + callout_drain(&lsc->lsc_callout); + + LACP_LOCK_DESTROY(lsc); + kfree(lsc, M_DEVBUF); + return (0); +} + +void +lacp_init(struct lagg_softc *sc) +{ + struct lacp_softc *lsc = LACP_SOFTC(sc); + + LACP_LOCK(lsc); + callout_reset(&lsc->lsc_callout, hz, lacp_tick, lsc); + LACP_UNLOCK(lsc); +} + +void +lacp_stop(struct lagg_softc *sc) +{ + struct lacp_softc *lsc = LACP_SOFTC(sc); + + LACP_LOCK(lsc); + callout_stop(&lsc->lsc_transit_callout); + callout_stop(&lsc->lsc_callout); + LACP_UNLOCK(lsc); +} + +struct lagg_port * +lacp_select_tx_port(struct lagg_softc *sc, struct mbuf *m) +{ + struct lacp_softc *lsc = LACP_SOFTC(sc); + struct lacp_portmap *pm; + struct lacp_port *lp; + uint32_t hash; + + if (__predict_false(lsc->lsc_suppress_distributing)) { + LACP_DPRINTF((NULL, "%s: waiting transit\n", __func__)); + return (NULL); + } + + pm = &lsc->lsc_pmap[lsc->lsc_activemap]; + if (pm->pm_count == 0) { + LACP_DPRINTF((NULL, "%s: no active aggregator\n", __func__)); + return (NULL); + } + +#if XXX + if (sc->use_flowid && (m->m_flags & M_FLOWID)) + hash = m->m_pkthdr.flowid; + else +#endif + hash = lagg_hashmbuf(sc, m, lsc->lsc_hashkey); + hash %= pm->pm_count; + lp = pm->pm_map[hash]; + + KASSERT((lp->lp_state & LACP_STATE_DISTRIBUTING) != 0, + ("aggregated port is not distributing")); + + return (lp->lp_lagg); +} +/* + * lacp_suppress_distributing: drop transmit packets for a while + * to preserve packet ordering. + */ + +static void +lacp_suppress_distributing(struct lacp_softc *lsc, struct lacp_aggregator *la) +{ + struct lacp_port *lp; + + if (lsc->lsc_active_aggregator != la) { + return; + } + + LACP_TRACE(NULL); + + lsc->lsc_suppress_distributing = TRUE; + + /* send a marker frame down each port to verify the queues are empty */ + LIST_FOREACH(lp, &lsc->lsc_ports, lp_next) { + lp->lp_flags |= LACP_PORT_MARK; + lacp_xmit_marker(lp); + } + + /* set a timeout for the marker frames */ + callout_reset(&lsc->lsc_transit_callout, + LACP_TRANSIT_DELAY * hz / 1000, lacp_transit_expire, lsc); +} + +static int +lacp_compare_peerinfo(const struct lacp_peerinfo *a, + const struct lacp_peerinfo *b) +{ + return (memcmp(a, b, offsetof(struct lacp_peerinfo, lip_state))); +} + +static int +lacp_compare_systemid(const struct lacp_systemid *a, + const struct lacp_systemid *b) +{ + return (memcmp(a, b, sizeof(*a))); +} + +#if 0 /* unused */ +static int +lacp_compare_portid(const struct lacp_portid *a, + const struct lacp_portid *b) +{ + return (memcmp(a, b, sizeof(*a))); +} +#endif + +static uint64_t +lacp_aggregator_bandwidth(struct lacp_aggregator *la) +{ + struct lacp_port *lp; + uint64_t speed; + + lp = TAILQ_FIRST(&la->la_ports); + if (lp == NULL) { + return (0); + } + + speed = ifmedia_baudrate(lp->lp_media); + speed *= la->la_nports; + if (speed == 0) { + LACP_DPRINTF((lp, "speed 0? 
media=0x%x nports=%d\n", + lp->lp_media, la->la_nports)); + } + + return (speed); +} + +/* + * lacp_select_active_aggregator: select an aggregator to be used to transmit + * packets from lagg(4) interface. + */ + +static void +lacp_select_active_aggregator(struct lacp_softc *lsc) +{ + struct lacp_aggregator *la; + struct lacp_aggregator *best_la = NULL; + uint64_t best_speed = 0; + char buf[LACP_LAGIDSTR_MAX+1]; + + LACP_TRACE(NULL); + + TAILQ_FOREACH(la, &lsc->lsc_aggregators, la_q) { + uint64_t speed; + + if (la->la_nports == 0) { + continue; + } + + speed = lacp_aggregator_bandwidth(la); + LACP_DPRINTF((NULL, "%s, speed=%jd, nports=%d\n", + lacp_format_lagid_aggregator(la, buf, sizeof(buf)), + speed, la->la_nports)); + + /* This aggregator is chosen if + * the partner has a better system priority + * or, the total aggregated speed is higher + * or, it is already the chosen aggregator + */ + if ((best_la != NULL && LACP_SYS_PRI(la->la_partner) < + LACP_SYS_PRI(best_la->la_partner)) || + speed > best_speed || + (speed == best_speed && + la == lsc->lsc_active_aggregator)) { + best_la = la; + best_speed = speed; + } + } + + KASSERT(best_la == NULL || best_la->la_nports > 0, + ("invalid aggregator refcnt")); + KASSERT(best_la == NULL || !TAILQ_EMPTY(&best_la->la_ports), + ("invalid aggregator list")); + + if (lsc->lsc_active_aggregator != best_la) { + LACP_DPRINTF((NULL, "active aggregator changed\n")); + LACP_DPRINTF((NULL, "old %s\n", + lacp_format_lagid_aggregator(lsc->lsc_active_aggregator, + buf, sizeof(buf)))); + } else { + LACP_DPRINTF((NULL, "active aggregator not changed\n")); + } + LACP_DPRINTF((NULL, "new %s\n", + lacp_format_lagid_aggregator(best_la, buf, sizeof(buf)))); + + if (lsc->lsc_active_aggregator != best_la) { + lsc->lsc_active_aggregator = best_la; + lacp_update_portmap(lsc); + if (best_la) { + lacp_suppress_distributing(lsc, best_la); + } + } +} + +/* + * Updated the inactive portmap array with the new list of ports and + * make it live. + */ +static void +lacp_update_portmap(struct lacp_softc *lsc) +{ + struct lagg_softc *sc = lsc->lsc_softc; + struct lacp_aggregator *la; + struct lacp_portmap *p; + struct lacp_port *lp; + uint64_t speed; + u_int newmap; + int i; + + newmap = lsc->lsc_activemap == 0 ? 1 : 0; + p = &lsc->lsc_pmap[newmap]; + la = lsc->lsc_active_aggregator; + speed = 0; + bzero(p, sizeof(struct lacp_portmap)); + + if (la != NULL && la->la_nports > 0) { + p->pm_count = la->la_nports; + i = 0; + TAILQ_FOREACH(lp, &la->la_ports, lp_dist_q) + p->pm_map[i++] = lp; + KASSERT(i == p->pm_count, ("Invalid port count")); + speed = lacp_aggregator_bandwidth(la); + } + sc->sc_ifp->if_baudrate = speed; + + /* switch the active portmap over */ + atomic_store_rel_int(&lsc->lsc_activemap, newmap); + LACP_DPRINTF((NULL, "Set table %d with %d ports\n", + lsc->lsc_activemap, + lsc->lsc_pmap[lsc->lsc_activemap].pm_count)); +} + +static uint16_t +lacp_compose_key(struct lacp_port *lp) +{ + struct lagg_port *lgp = lp->lp_lagg; + struct lagg_softc *sc = lgp->lp_softc; + u_int media = lp->lp_media; + uint16_t key; + + if ((lp->lp_state & LACP_STATE_AGGREGATION) == 0) { + + /* + * non-aggregatable links should have unique keys. + * + * XXX this isn't really unique as if_index is 16 bit. 
+ */ + + /* bit 0..14: (some bits of) if_index of this port */ + key = lp->lp_ifp->if_index; + /* bit 15: 1 */ + key |= 0x8000; + } else { + u_int subtype = IFM_SUBTYPE(media); + + KASSERT(IFM_TYPE(media) == IFM_ETHER, ("invalid media type")); + KASSERT((media & IFM_FDX) != 0, ("aggregating HDX interface")); + + /* bit 0..4: IFM_SUBTYPE modulo speed */ + switch (subtype) { + case IFM_10_T: + case IFM_10_2: + case IFM_10_5: + case IFM_10_STP: + case IFM_10_FL: + key = IFM_10_T; + break; + case IFM_100_TX: + case IFM_100_FX: + case IFM_100_T4: + case IFM_100_VG: + case IFM_100_T2: + key = IFM_100_TX; + break; + case IFM_1000_SX: + case IFM_1000_LX: + case IFM_1000_CX: + case IFM_1000_T: + key = IFM_1000_SX; + break; + case IFM_10G_LR: + case IFM_10G_SR: + case IFM_10G_CX4: + case IFM_10G_TWINAX: + case IFM_10G_TWINAX_LONG: + case IFM_10G_LRM: + case IFM_10G_T: + key = IFM_10G_LR; + break; + case IFM_40G_CR4: + case IFM_40G_SR4: + case IFM_40G_LR4: + key = IFM_40G_CR4; + break; + default: + key = subtype; + } + /* bit 5..14: (some bits of) if_index of lagg device */ + key |= 0x7fe0 & ((sc->sc_ifp->if_index) << 5); + /* bit 15: 0 */ + } + return (htons(key)); +} + +static void +lacp_aggregator_addref(struct lacp_softc *lsc, struct lacp_aggregator *la) +{ + char buf[LACP_LAGIDSTR_MAX+1]; + + LACP_DPRINTF((NULL, "%s: lagid=%s, refcnt %d -> %d\n", + __func__, + lacp_format_lagid(&la->la_actor, &la->la_partner, + buf, sizeof(buf)), + la->la_refcnt, la->la_refcnt + 1)); + + KASSERT(la->la_refcnt > 0, ("refcount <= 0")); + la->la_refcnt++; + KASSERT(la->la_refcnt > la->la_nports, ("invalid refcount")); +} + +static void +lacp_aggregator_delref(struct lacp_softc *lsc, struct lacp_aggregator *la) +{ + char buf[LACP_LAGIDSTR_MAX+1]; + + LACP_DPRINTF((NULL, "%s: lagid=%s, refcnt %d -> %d\n", + __func__, + lacp_format_lagid(&la->la_actor, &la->la_partner, + buf, sizeof(buf)), + la->la_refcnt, la->la_refcnt - 1)); + + KASSERT(la->la_refcnt > la->la_nports, ("invalid refcnt")); + la->la_refcnt--; + if (la->la_refcnt > 0) { + return; + } + + KASSERT(la->la_refcnt == 0, ("refcount not zero")); + KASSERT(lsc->lsc_active_aggregator != la, ("aggregator active")); + + TAILQ_REMOVE(&lsc->lsc_aggregators, la, la_q); + + kfree(la, M_DEVBUF); +} + +/* + * lacp_aggregator_get: allocate an aggregator. + */ + +static struct lacp_aggregator * +lacp_aggregator_get(struct lacp_softc *lsc, struct lacp_port *lp) +{ + struct lacp_aggregator *la; + + la = kmalloc(sizeof(*la), M_DEVBUF, M_NOWAIT); + if (la) { + la->la_refcnt = 1; + la->la_nports = 0; + TAILQ_INIT(&la->la_ports); + la->la_pending = 0; + TAILQ_INSERT_TAIL(&lsc->lsc_aggregators, la, la_q); + } + + return (la); +} + +/* + * lacp_fill_aggregator_id: setup a newly allocated aggregator from a port. + */ + +static void +lacp_fill_aggregator_id(struct lacp_aggregator *la, const struct lacp_port *lp) +{ + lacp_fill_aggregator_id_peer(&la->la_partner, &lp->lp_partner); + lacp_fill_aggregator_id_peer(&la->la_actor, &lp->lp_actor); + + la->la_actor.lip_state = lp->lp_state & LACP_STATE_AGGREGATION; +} + +static void +lacp_fill_aggregator_id_peer(struct lacp_peerinfo *lpi_aggr, + const struct lacp_peerinfo *lpi_port) +{ + memset(lpi_aggr, 0, sizeof(*lpi_aggr)); + lpi_aggr->lip_systemid = lpi_port->lip_systemid; + lpi_aggr->lip_key = lpi_port->lip_key; +} + +/* + * lacp_aggregator_is_compatible: check if a port can join to an aggregator. 
+ */ + +static int +lacp_aggregator_is_compatible(const struct lacp_aggregator *la, + const struct lacp_port *lp) +{ + if (!(lp->lp_state & LACP_STATE_AGGREGATION) || + !(lp->lp_partner.lip_state & LACP_STATE_AGGREGATION)) { + return (0); + } + + if (!(la->la_actor.lip_state & LACP_STATE_AGGREGATION)) { + return (0); + } + + if (!lacp_peerinfo_is_compatible(&la->la_partner, &lp->lp_partner)) { + return (0); + } + + if (!lacp_peerinfo_is_compatible(&la->la_actor, &lp->lp_actor)) { + return (0); + } + + return (1); +} + +static int +lacp_peerinfo_is_compatible(const struct lacp_peerinfo *a, + const struct lacp_peerinfo *b) +{ + if (memcmp(&a->lip_systemid, &b->lip_systemid, + sizeof(a->lip_systemid))) { + return (0); + } + + if (memcmp(&a->lip_key, &b->lip_key, sizeof(a->lip_key))) { + return (0); + } + + return (1); +} + +static void +lacp_port_enable(struct lacp_port *lp) +{ + lp->lp_state |= LACP_STATE_AGGREGATION; +} + +static void +lacp_port_disable(struct lacp_port *lp) +{ + lacp_set_mux(lp, LACP_MUX_DETACHED); + + lp->lp_state &= ~LACP_STATE_AGGREGATION; + lp->lp_selected = LACP_UNSELECTED; + lacp_sm_rx_record_default(lp); + lp->lp_partner.lip_state &= ~LACP_STATE_AGGREGATION; + lp->lp_state &= ~LACP_STATE_EXPIRED; +} + +/* + * lacp_select: select an aggregator. create one if necessary. + */ +static void +lacp_select(struct lacp_port *lp) +{ + struct lacp_softc *lsc = lp->lp_lsc; + struct lacp_aggregator *la; + char buf[LACP_LAGIDSTR_MAX+1]; + + if (lp->lp_aggregator) { + return; + } + + KASSERT(!LACP_TIMER_ISARMED(lp, LACP_TIMER_WAIT_WHILE), + ("timer_wait_while still active")); + + LACP_DPRINTF((lp, "port lagid=%s\n", + lacp_format_lagid(&lp->lp_actor, &lp->lp_partner, + buf, sizeof(buf)))); + + TAILQ_FOREACH(la, &lsc->lsc_aggregators, la_q) { + if (lacp_aggregator_is_compatible(la, lp)) { + break; + } + } + + if (la == NULL) { + la = lacp_aggregator_get(lsc, lp); + if (la == NULL) { + LACP_DPRINTF((lp, "aggregator creation failed\n")); + + /* + * will retry on the next tick. + */ + + return; + } + lacp_fill_aggregator_id(la, lp); + LACP_DPRINTF((lp, "aggregator created\n")); + } else { + LACP_DPRINTF((lp, "compatible aggregator found\n")); + if (la->la_refcnt == LACP_MAX_PORTS) + return; + lacp_aggregator_addref(lsc, la); + } + + LACP_DPRINTF((lp, "aggregator lagid=%s\n", + lacp_format_lagid(&la->la_actor, &la->la_partner, + buf, sizeof(buf)))); + + lp->lp_aggregator = la; + lp->lp_selected = LACP_SELECTED; +} + +/* + * lacp_unselect: finish unselect/detach process. 
+ */ + +static void +lacp_unselect(struct lacp_port *lp) +{ + struct lacp_softc *lsc = lp->lp_lsc; + struct lacp_aggregator *la = lp->lp_aggregator; + + KASSERT(!LACP_TIMER_ISARMED(lp, LACP_TIMER_WAIT_WHILE), + ("timer_wait_while still active")); + + if (la == NULL) { + return; + } + + lp->lp_aggregator = NULL; + lacp_aggregator_delref(lsc, la); +} + +/* mux machine */ + +static void +lacp_sm_mux(struct lacp_port *lp) +{ + struct lagg_port *lgp = lp->lp_lagg; + struct lagg_softc *sc = lgp->lp_softc; + enum lacp_mux_state new_state; + boolean_t p_sync = + (lp->lp_partner.lip_state & LACP_STATE_SYNC) != 0; + boolean_t p_collecting = + (lp->lp_partner.lip_state & LACP_STATE_COLLECTING) != 0; + enum lacp_selected selected = lp->lp_selected; + struct lacp_aggregator *la; + + if (lacp_debug > 1) + lacp_dprintf(lp, "%s: state= 0x%x, selected= 0x%x, " + "p_sync= 0x%x, p_collecting= 0x%x\n", __func__, + lp->lp_mux_state, selected, p_sync, p_collecting); + +re_eval: + la = lp->lp_aggregator; + KASSERT(lp->lp_mux_state == LACP_MUX_DETACHED || la != NULL, + ("MUX not detached")); + new_state = lp->lp_mux_state; + switch (lp->lp_mux_state) { + case LACP_MUX_DETACHED: + if (selected != LACP_UNSELECTED) { + new_state = LACP_MUX_WAITING; + } + break; + case LACP_MUX_WAITING: + KASSERT(la->la_pending > 0 || + !LACP_TIMER_ISARMED(lp, LACP_TIMER_WAIT_WHILE), + ("timer_wait_while still active")); + if (selected == LACP_SELECTED && la->la_pending == 0) { + new_state = LACP_MUX_ATTACHED; + } else if (selected == LACP_UNSELECTED) { + new_state = LACP_MUX_DETACHED; + } + break; + case LACP_MUX_ATTACHED: + if (selected == LACP_SELECTED && p_sync) { + new_state = LACP_MUX_COLLECTING; + } else if (selected != LACP_SELECTED) { + new_state = LACP_MUX_DETACHED; + } + break; + case LACP_MUX_COLLECTING: + if (selected == LACP_SELECTED && p_sync && p_collecting) { + new_state = LACP_MUX_DISTRIBUTING; + } else if (selected != LACP_SELECTED || !p_sync) { + new_state = LACP_MUX_ATTACHED; + } + break; + case LACP_MUX_DISTRIBUTING: + if (selected != LACP_SELECTED || !p_sync || !p_collecting) { + new_state = LACP_MUX_COLLECTING; + lacp_dprintf(lp, "Interface stopped DISTRIBUTING, possible flapping\n"); + sc->sc_flapping++; + } + break; + default: + panic("%s: unknown state", __func__); + } + + if (lp->lp_mux_state == new_state) { + return; + } + + lacp_set_mux(lp, new_state); + goto re_eval; +} + +static void +lacp_set_mux(struct lacp_port *lp, enum lacp_mux_state new_state) +{ + struct lacp_aggregator *la = lp->lp_aggregator; + + if (lp->lp_mux_state == new_state) { + return; + } + + switch (new_state) { + case LACP_MUX_DETACHED: + lp->lp_state &= ~LACP_STATE_SYNC; + lacp_disable_distributing(lp); + lacp_disable_collecting(lp); + lacp_sm_assert_ntt(lp); + /* cancel timer */ + if (LACP_TIMER_ISARMED(lp, LACP_TIMER_WAIT_WHILE)) { + KASSERT(la->la_pending > 0, + ("timer_wait_while not active")); + la->la_pending--; + } + LACP_TIMER_DISARM(lp, LACP_TIMER_WAIT_WHILE); + lacp_unselect(lp); + break; + case LACP_MUX_WAITING: + LACP_TIMER_ARM(lp, LACP_TIMER_WAIT_WHILE, + LACP_AGGREGATE_WAIT_TIME); + la->la_pending++; + break; + case LACP_MUX_ATTACHED: + lp->lp_state |= LACP_STATE_SYNC; + lacp_disable_collecting(lp); + lacp_sm_assert_ntt(lp); + break; + case LACP_MUX_COLLECTING: + lacp_enable_collecting(lp); + lacp_disable_distributing(lp); + lacp_sm_assert_ntt(lp); + break; + case LACP_MUX_DISTRIBUTING: + lacp_enable_distributing(lp); + break; + default: + panic("%s: unknown state", __func__); + } + + LACP_DPRINTF((lp, "mux_state %d -> 
%d\n", lp->lp_mux_state, new_state)); + + lp->lp_mux_state = new_state; +} + +static void +lacp_sm_mux_timer(struct lacp_port *lp) +{ + struct lacp_aggregator *la = lp->lp_aggregator; + char buf[LACP_LAGIDSTR_MAX+1]; + + KASSERT(la->la_pending > 0, ("no pending event")); + + LACP_DPRINTF((lp, "%s: aggregator %s, pending %d -> %d\n", __func__, + lacp_format_lagid(&la->la_actor, &la->la_partner, + buf, sizeof(buf)), + la->la_pending, la->la_pending - 1)); + + la->la_pending--; +} + +/* periodic transmit machine */ + +static void +lacp_sm_ptx_update_timeout(struct lacp_port *lp, uint8_t oldpstate) +{ + if (LACP_STATE_EQ(oldpstate, lp->lp_partner.lip_state, + LACP_STATE_TIMEOUT)) { + return; + } + + LACP_DPRINTF((lp, "partner timeout changed\n")); + + /* + * FAST_PERIODIC -> SLOW_PERIODIC + * or + * SLOW_PERIODIC (-> PERIODIC_TX) -> FAST_PERIODIC + * + * let lacp_sm_ptx_tx_schedule to update timeout. + */ + + LACP_TIMER_DISARM(lp, LACP_TIMER_PERIODIC); + + /* + * if timeout has been shortened, assert NTT. + */ + + if ((lp->lp_partner.lip_state & LACP_STATE_TIMEOUT)) { + lacp_sm_assert_ntt(lp); + } +} + +static void +lacp_sm_ptx_tx_schedule(struct lacp_port *lp) +{ + int timeout; + + if (!(lp->lp_state & LACP_STATE_ACTIVITY) && + !(lp->lp_partner.lip_state & LACP_STATE_ACTIVITY)) { + + /* + * NO_PERIODIC + */ + + LACP_TIMER_DISARM(lp, LACP_TIMER_PERIODIC); + return; + } + + if (LACP_TIMER_ISARMED(lp, LACP_TIMER_PERIODIC)) { + return; + } + + timeout = (lp->lp_partner.lip_state & LACP_STATE_TIMEOUT) ? + LACP_FAST_PERIODIC_TIME : LACP_SLOW_PERIODIC_TIME; + + LACP_TIMER_ARM(lp, LACP_TIMER_PERIODIC, timeout); +} + +static void +lacp_sm_ptx_timer(struct lacp_port *lp) +{ + lacp_sm_assert_ntt(lp); +} + +static void +lacp_sm_rx(struct lacp_port *lp, const struct lacpdu *du) +{ + int timeout; + + /* + * check LACP_DISABLED first + */ + + if (!(lp->lp_state & LACP_STATE_AGGREGATION)) { + return; + } + + /* + * check loopback condition. + */ + + if (!lacp_compare_systemid(&du->ldu_actor.lip_systemid, + &lp->lp_actor.lip_systemid)) { + return; + } + + /* + * EXPIRED, DEFAULTED, CURRENT -> CURRENT + */ + + lacp_sm_rx_update_selected(lp, du); + lacp_sm_rx_update_ntt(lp, du); + lacp_sm_rx_record_pdu(lp, du); + + timeout = (lp->lp_state & LACP_STATE_TIMEOUT) ? + LACP_SHORT_TIMEOUT_TIME : LACP_LONG_TIMEOUT_TIME; + LACP_TIMER_ARM(lp, LACP_TIMER_CURRENT_WHILE, timeout); + + lp->lp_state &= ~LACP_STATE_EXPIRED; + + /* + * kick transmit machine without waiting the next tick. 
+	 */
+
+	lacp_sm_tx(lp);
+}
+
+static void
+lacp_sm_rx_set_expired(struct lacp_port *lp)
+{
+	lp->lp_partner.lip_state &= ~LACP_STATE_SYNC;
+	lp->lp_partner.lip_state |= LACP_STATE_TIMEOUT;
+	LACP_TIMER_ARM(lp, LACP_TIMER_CURRENT_WHILE, LACP_SHORT_TIMEOUT_TIME);
+	lp->lp_state |= LACP_STATE_EXPIRED;
+}
+
+static void
+lacp_sm_rx_timer(struct lacp_port *lp)
+{
+	if ((lp->lp_state & LACP_STATE_EXPIRED) == 0) {
+		/* CURRENT -> EXPIRED */
+		LACP_DPRINTF((lp, "%s: CURRENT -> EXPIRED\n", __func__));
+		lacp_sm_rx_set_expired(lp);
+	} else {
+		/* EXPIRED -> DEFAULTED */
+		LACP_DPRINTF((lp, "%s: EXPIRED -> DEFAULTED\n", __func__));
+		lacp_sm_rx_update_default_selected(lp);
+		lacp_sm_rx_record_default(lp);
+		lp->lp_state &= ~LACP_STATE_EXPIRED;
+	}
+}
+
+static void
+lacp_sm_rx_record_pdu(struct lacp_port *lp, const struct lacpdu *du)
+{
+	boolean_t active;
+	uint8_t oldpstate;
+	char buf[LACP_STATESTR_MAX+1];
+
+	LACP_TRACE(lp);
+
+	oldpstate = lp->lp_partner.lip_state;
+
+	active = (du->ldu_actor.lip_state & LACP_STATE_ACTIVITY)
+	    || ((lp->lp_state & LACP_STATE_ACTIVITY) &&
+	    (du->ldu_partner.lip_state & LACP_STATE_ACTIVITY));
+
+	lp->lp_partner = du->ldu_actor;
+	if (active &&
+	    ((LACP_STATE_EQ(lp->lp_state, du->ldu_partner.lip_state,
+	    LACP_STATE_AGGREGATION) &&
+	    !lacp_compare_peerinfo(&lp->lp_actor, &du->ldu_partner))
+	    || (du->ldu_partner.lip_state & LACP_STATE_AGGREGATION) == 0)) {
+		/* XXX nothing? */
+	} else {
+		lp->lp_partner.lip_state &= ~LACP_STATE_SYNC;
+	}
+
+	lp->lp_state &= ~LACP_STATE_DEFAULTED;
+
+	if (oldpstate != lp->lp_partner.lip_state) {
+		LACP_DPRINTF((lp, "old pstate %s\n",
+		    lacp_format_state(oldpstate, buf, sizeof(buf))));
+		LACP_DPRINTF((lp, "new pstate %s\n",
+		    lacp_format_state(lp->lp_partner.lip_state, buf,
+		    sizeof(buf))));
+	}
+
+	/* XXX Hack, still need to implement 5.4.9 para 2,3,4 */
+	if (lp->lp_lsc->lsc_strict_mode)
+		lp->lp_partner.lip_state |= LACP_STATE_SYNC;
+
+	lacp_sm_ptx_update_timeout(lp, oldpstate);
+}
+
+static void
+lacp_sm_rx_update_ntt(struct lacp_port *lp, const struct lacpdu *du)
+{
+
+	LACP_TRACE(lp);
+
+	if (lacp_compare_peerinfo(&lp->lp_actor, &du->ldu_partner) ||
+	    !LACP_STATE_EQ(lp->lp_state, du->ldu_partner.lip_state,
+	    LACP_STATE_ACTIVITY | LACP_STATE_SYNC | LACP_STATE_AGGREGATION)) {
+		LACP_DPRINTF((lp, "%s: assert ntt\n", __func__));
+		lacp_sm_assert_ntt(lp);
+	}
+}
+
+static void
+lacp_sm_rx_record_default(struct lacp_port *lp)
+{
+	uint8_t oldpstate;
+
+	LACP_TRACE(lp);
+
+	oldpstate = lp->lp_partner.lip_state;
+	if (lp->lp_lsc->lsc_strict_mode)
+		lp->lp_partner = lacp_partner_admin_strict;
+	else
+		lp->lp_partner = lacp_partner_admin_optimistic;
+	lp->lp_state |= LACP_STATE_DEFAULTED;
+	lacp_sm_ptx_update_timeout(lp, oldpstate);
+}
+
+static void
+lacp_sm_rx_update_selected_from_peerinfo(struct lacp_port *lp,
+    const struct lacp_peerinfo *info)
+{
+
+	LACP_TRACE(lp);
+
+	if (lacp_compare_peerinfo(&lp->lp_partner, info) ||
+	    !LACP_STATE_EQ(lp->lp_partner.lip_state, info->lip_state,
+	    LACP_STATE_AGGREGATION)) {
+		lp->lp_selected = LACP_UNSELECTED;
+		/* mux machine will clean up lp->lp_aggregator */
+	}
+}
+
+static void
+lacp_sm_rx_update_selected(struct lacp_port *lp, const struct lacpdu *du)
+{
+
+	LACP_TRACE(lp);
+
+	lacp_sm_rx_update_selected_from_peerinfo(lp, &du->ldu_actor);
+}
+
+static void
+lacp_sm_rx_update_default_selected(struct lacp_port *lp)
+{
+
+	LACP_TRACE(lp);
+
+	if (lp->lp_lsc->lsc_strict_mode)
+		lacp_sm_rx_update_selected_from_peerinfo(lp,
+		    &lacp_partner_admin_strict);
+	else
+		lacp_sm_rx_update_selected_from_peerinfo(lp,
+		    &lacp_partner_admin_optimistic);
+}
+
+/* transmit machine */
+
+static void
+lacp_sm_tx(struct lacp_port *lp)
+{
+	int error = 0;
+
+	if (!(lp->lp_state & LACP_STATE_AGGREGATION)
+#if 1
+	    || (!(lp->lp_state & LACP_STATE_ACTIVITY)
+	    && !(lp->lp_partner.lip_state & LACP_STATE_ACTIVITY))
+#endif
+	    ) {
+		lp->lp_flags &= ~LACP_PORT_NTT;
+	}
+
+	if (!(lp->lp_flags & LACP_PORT_NTT)) {
+		return;
+	}
+
+	/* Rate limit to 3 PDUs per LACP_FAST_PERIODIC_TIME */
+	if (ppsratecheck(&lp->lp_last_lacpdu, &lp->lp_lacpdu_sent,
+	    (3 / LACP_FAST_PERIODIC_TIME)) == 0) {
+		LACP_DPRINTF((lp, "rate limited pdu\n"));
+		return;
+	}
+
+	if (((1 << lp->lp_ifp->if_dunit) & lp->lp_lsc->lsc_debug.lsc_tx_test) == 0) {
+		error = lacp_xmit_lacpdu(lp);
+	} else {
+		LACP_TPRINTF((lp, "Dropping TX PDU\n"));
+	}
+
+	if (error == 0) {
+		lp->lp_flags &= ~LACP_PORT_NTT;
+	} else {
+		LACP_DPRINTF((lp, "lacpdu transmit failure, error %d\n",
+		    error));
+	}
+}
+
+static void
+lacp_sm_assert_ntt(struct lacp_port *lp)
+{
+
+	lp->lp_flags |= LACP_PORT_NTT;
+}
+
+static void
+lacp_run_timers(struct lacp_port *lp)
+{
+	int i;
+
+	for (i = 0; i < LACP_NTIMER; i++) {
+		KASSERT(lp->lp_timer[i] >= 0,
+		    ("invalid timer value %d", lp->lp_timer[i]));
+		if (lp->lp_timer[i] == 0) {
+			continue;
+		} else if (--lp->lp_timer[i] <= 0) {
+			if (lacp_timer_funcs[i]) {
+				(*lacp_timer_funcs[i])(lp);
+			}
+		}
+	}
+}
+
+int
+lacp_marker_input(struct lacp_port *lp, struct mbuf *m)
+{
+	struct lacp_softc *lsc = lp->lp_lsc;
+	struct lagg_port *lgp = lp->lp_lagg;
+	struct lacp_port *lp2;
+	struct markerdu *mdu;
+	int error = 0;
+	int pending = 0;
+
+	if (m->m_pkthdr.len != sizeof(*mdu)) {
+		goto bad;
+	}
+
+	if ((m->m_flags & M_MCAST) == 0) {
+		goto bad;
+	}
+
+	if (m->m_len < sizeof(*mdu)) {
+		m = m_pullup(m, sizeof(*mdu));
+		if (m == NULL) {
+			return (ENOMEM);
+		}
+	}
+
+	mdu = mtod(m, struct markerdu *);
+
+	if (memcmp(&mdu->mdu_eh.ether_dhost,
+	    &ethermulticastaddr_slowprotocols, ETHER_ADDR_LEN)) {
+		goto bad;
+	}
+
+	if (mdu->mdu_sph.sph_version != 1) {
+		goto bad;
+	}
+
+	switch (mdu->mdu_tlv.tlv_type) {
+	case MARKER_TYPE_INFO:
+		if (tlv_check(mdu, sizeof(*mdu), &mdu->mdu_tlv,
+		    marker_info_tlv_template, TRUE)) {
+			goto bad;
+		}
+		mdu->mdu_tlv.tlv_type = MARKER_TYPE_RESPONSE;
+		memcpy(&mdu->mdu_eh.ether_dhost,
+		    &ethermulticastaddr_slowprotocols, ETHER_ADDR_LEN);
+		memcpy(&mdu->mdu_eh.ether_shost,
+		    lgp->lp_lladdr, ETHER_ADDR_LEN);
+		error = lagg_enqueue(lp->lp_ifp, m);
+		break;
+
+	case MARKER_TYPE_RESPONSE:
+		if (tlv_check(mdu, sizeof(*mdu), &mdu->mdu_tlv,
+		    marker_response_tlv_template, TRUE)) {
+			goto bad;
+		}
+		LACP_DPRINTF((lp, "marker response, port=%u, sys=%6d, id=%u\n",
+		    ntohs(mdu->mdu_info.mi_rq_port), *mdu->mdu_info.mi_rq_system,
+		    ntohl(mdu->mdu_info.mi_rq_xid)));
+
+		/* Verify that it is the last marker we sent out */
+		if (memcmp(&mdu->mdu_info, &lp->lp_marker,
+		    sizeof(struct lacp_markerinfo)))
+			goto bad;
+
+		LACP_LOCK(lsc);
+		lp->lp_flags &= ~LACP_PORT_MARK;
+
+		if (lsc->lsc_suppress_distributing) {
+			/* Check if any ports are waiting for a response */
+			LIST_FOREACH(lp2, &lsc->lsc_ports, lp_next) {
+				if (lp2->lp_flags & LACP_PORT_MARK) {
+					pending = 1;
+					break;
+				}
+			}
+
+			if (pending == 0) {
+				/* All interface queues are clear */
+				LACP_DPRINTF((NULL, "queue flush complete\n"));
+				lsc->lsc_suppress_distributing = FALSE;
+			}
+		}
+		LACP_UNLOCK(lsc);
+		m_freem(m);
+		break;
+
+	default:
+		goto bad;
+	}
+
+	return (error);
+
+bad:
+	LACP_DPRINTF((lp, "bad marker frame\n"));
+	m_freem(m);
+	return
(EINVAL); +} + +static int +tlv_check(const void *p, size_t size, const struct tlvhdr *tlv, + const struct tlv_template *tmpl, boolean_t check_type) +{ + while (/* CONSTCOND */ 1) { + if ((const char *)tlv - (const char *)p + sizeof(*tlv) > size) { + return (EINVAL); + } + if ((check_type && tlv->tlv_type != tmpl->tmpl_type) || + tlv->tlv_length != tmpl->tmpl_length) { + return (EINVAL); + } + if (tmpl->tmpl_type == 0) { + break; + } + tlv = (const struct tlvhdr *) + ((const char *)tlv + tlv->tlv_length); + tmpl++; + } + + return (0); +} + +/* Debugging */ +const char * +lacp_format_mac(const uint8_t *mac, char *buf, size_t buflen) +{ + ksnprintf(buf, buflen, "%02X-%02X-%02X-%02X-%02X-%02X", + (int)mac[0], + (int)mac[1], + (int)mac[2], + (int)mac[3], + (int)mac[4], + (int)mac[5]); + + return (buf); +} + +const char * +lacp_format_systemid(const struct lacp_systemid *sysid, + char *buf, size_t buflen) +{ + char macbuf[LACP_MACSTR_MAX+1]; + + ksnprintf(buf, buflen, "%04X,%s", + ntohs(sysid->lsi_prio), + lacp_format_mac(sysid->lsi_mac, macbuf, sizeof(macbuf))); + + return (buf); +} + +const char * +lacp_format_portid(const struct lacp_portid *portid, char *buf, size_t buflen) +{ + ksnprintf(buf, buflen, "%04X,%04X", + ntohs(portid->lpi_prio), + ntohs(portid->lpi_portno)); + + return (buf); +} + +const char * +lacp_format_partner(const struct lacp_peerinfo *peer, char *buf, size_t buflen) +{ + char sysid[LACP_SYSTEMIDSTR_MAX+1]; + char portid[LACP_PORTIDSTR_MAX+1]; + + ksnprintf(buf, buflen, "(%s,%04X,%s)", + lacp_format_systemid(&peer->lip_systemid, sysid, sizeof(sysid)), + ntohs(peer->lip_key), + lacp_format_portid(&peer->lip_portid, portid, sizeof(portid))); + + return (buf); +} + +const char * +lacp_format_lagid(const struct lacp_peerinfo *a, + const struct lacp_peerinfo *b, char *buf, size_t buflen) +{ + char astr[LACP_PARTNERSTR_MAX+1]; + char bstr[LACP_PARTNERSTR_MAX+1]; + +#if 0 + /* + * there's a convention to display small numbered peer + * in the left. + */ + + if (lacp_compare_peerinfo(a, b) > 0) { + const struct lacp_peerinfo *t; + + t = a; + a = b; + b = t; + } +#endif + + ksnprintf(buf, buflen, "[%s,%s]", + lacp_format_partner(a, astr, sizeof(astr)), + lacp_format_partner(b, bstr, sizeof(bstr))); + + return (buf); +} + +const char * +lacp_format_lagid_aggregator(const struct lacp_aggregator *la, + char *buf, size_t buflen) +{ + if (la == NULL) { + return ("(none)"); + } + + return (lacp_format_lagid(&la->la_actor, &la->la_partner, buf, buflen)); +} + +const char * +lacp_format_state(uint8_t state, char *buf, size_t buflen) +{ + ksnprintf(buf, buflen, "%b", state, LACP_STATE_BITS); + return (buf); +} + +static void +lacp_dump_lacpdu(const struct lacpdu *du) +{ + char buf[LACP_PARTNERSTR_MAX+1]; + char buf2[LACP_STATESTR_MAX+1]; + + kprintf("actor=%s\n", + lacp_format_partner(&du->ldu_actor, buf, sizeof(buf))); + kprintf("actor.state=%s\n", + lacp_format_state(du->ldu_actor.lip_state, buf2, sizeof(buf2))); + kprintf("partner=%s\n", + lacp_format_partner(&du->ldu_partner, buf, sizeof(buf))); + kprintf("partner.state=%s\n", + lacp_format_state(du->ldu_partner.lip_state, buf2, sizeof(buf2))); + + kprintf("maxdelay=%d\n", ntohs(du->ldu_collector.lci_maxdelay)); +} + +static void +lacp_dprintf(const struct lacp_port *lp, const char *fmt, ...) 
+{ + __va_list va; + + if (lp) { + kprintf("%s: ", lp->lp_ifp->if_xname); + } + + __va_start(va, fmt); + kvprintf(fmt, va); + __va_end(va); +} diff --git a/sys/net/lagg/ieee8023ad_lacp.h b/sys/net/lagg/ieee8023ad_lacp.h new file mode 100644 index 0000000000..5f99b39df7 --- /dev/null +++ b/sys/net/lagg/ieee8023ad_lacp.h @@ -0,0 +1,338 @@ +/* $NetBSD: ieee8023ad_impl.h,v 1.2 2005/12/10 23:21:39 elad Exp $ */ + +/*- + * Copyright (c)2005 YAMAMOTO Takashi, + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * IEEE802.3ad LACP + * + * implementation details. + */ + +#define LACP_TIMER_CURRENT_WHILE 0 +#define LACP_TIMER_PERIODIC 1 +#define LACP_TIMER_WAIT_WHILE 2 +#define LACP_NTIMER 3 + +#define LACP_TIMER_ARM(port, timer, val) \ + (port)->lp_timer[(timer)] = (val) +#define LACP_TIMER_DISARM(port, timer) \ + (port)->lp_timer[(timer)] = 0 +#define LACP_TIMER_ISARMED(port, timer) \ + ((port)->lp_timer[(timer)] > 0) + +/* + * IEEE802.3ad LACP + * + * protocol definitions. + */ + +#define LACP_STATE_ACTIVITY (1<<0) +#define LACP_STATE_TIMEOUT (1<<1) +#define LACP_STATE_AGGREGATION (1<<2) +#define LACP_STATE_SYNC (1<<3) +#define LACP_STATE_COLLECTING (1<<4) +#define LACP_STATE_DISTRIBUTING (1<<5) +#define LACP_STATE_DEFAULTED (1<<6) +#define LACP_STATE_EXPIRED (1<<7) + +#define LACP_PORT_NTT 0x00000001 +#define LACP_PORT_MARK 0x00000002 + +#define LACP_STATE_BITS \ + "\020" \ + "\001ACTIVITY" \ + "\002TIMEOUT" \ + "\003AGGREGATION" \ + "\004SYNC" \ + "\005COLLECTING" \ + "\006DISTRIBUTING" \ + "\007DEFAULTED" \ + "\010EXPIRED" + +/* + * IEEE802.3 slow protocols + * + * protocol (on-wire) definitions. + * + * XXX should be elsewhere. + */ + +#define SLOWPROTOCOLS_SUBTYPE_LACP 1 +#define SLOWPROTOCOLS_SUBTYPE_MARKER 2 + +struct slowprothdr { + uint8_t sph_subtype; + uint8_t sph_version; +} __packed; + +/* + * TLV on-wire structure. + */ + +struct tlvhdr { + uint8_t tlv_type; + uint8_t tlv_length; + /* uint8_t tlv_value[]; */ +} __packed; + +/* + * ... and our implementation. 
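+ *
+ * Worked size check for the structures below: TLV_SET() records
+ * sizeof(struct tlvhdr) plus the payload, so an actor or partner info
+ * TLV comes out as 2 + sizeof(struct lacp_peerinfo) = 2 + 18 = 20
+ * octets, the fixed length IEEE 802.3ad assigns to those TLVs.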
+ */ + +#define TLV_SET(tlv, type, length) \ + do { \ + (tlv)->tlv_type = (type); \ + (tlv)->tlv_length = sizeof(*tlv) + (length); \ + } while (/*CONSTCOND*/0) + +struct tlv_template { + uint8_t tmpl_type; + uint8_t tmpl_length; +}; + +struct lacp_systemid { + uint16_t lsi_prio; + uint8_t lsi_mac[6]; +} __packed; + +struct lacp_portid { + uint16_t lpi_prio; + uint16_t lpi_portno; +} __packed; + +struct lacp_peerinfo { + struct lacp_systemid lip_systemid; + uint16_t lip_key; + struct lacp_portid lip_portid; + uint8_t lip_state; + uint8_t lip_resv[3]; +} __packed; + +struct lacp_collectorinfo { + uint16_t lci_maxdelay; + uint8_t lci_resv[12]; +} __packed; + +struct lacpdu { + struct ether_header ldu_eh; + struct slowprothdr ldu_sph; + + struct tlvhdr ldu_tlv_actor; + struct lacp_peerinfo ldu_actor; + struct tlvhdr ldu_tlv_partner; + struct lacp_peerinfo ldu_partner; + struct tlvhdr ldu_tlv_collector; + struct lacp_collectorinfo ldu_collector; + struct tlvhdr ldu_tlv_term; + uint8_t ldu_resv[50]; +} __packed; + +/* + * IEEE802.3ad marker protocol + * + * protocol (on-wire) definitions. + */ +struct lacp_markerinfo { + uint16_t mi_rq_port; + uint8_t mi_rq_system[ETHER_ADDR_LEN]; + uint32_t mi_rq_xid; + uint8_t mi_pad[2]; +} __packed; + +struct markerdu { + struct ether_header mdu_eh; + struct slowprothdr mdu_sph; + + struct tlvhdr mdu_tlv; + struct lacp_markerinfo mdu_info; + struct tlvhdr mdu_tlv_term; + uint8_t mdu_resv[90]; +} __packed; + +#define MARKER_TYPE_INFO 0x01 +#define MARKER_TYPE_RESPONSE 0x02 + +enum lacp_selected { + LACP_UNSELECTED, + LACP_STANDBY, /* not used in this implementation */ + LACP_SELECTED, +}; + +enum lacp_mux_state { + LACP_MUX_DETACHED, + LACP_MUX_WAITING, + LACP_MUX_ATTACHED, + LACP_MUX_COLLECTING, + LACP_MUX_DISTRIBUTING, +}; + +#define LACP_MAX_PORTS 32 + +struct lacp_portmap { + int pm_count; + struct lacp_port *pm_map[LACP_MAX_PORTS]; +}; + +struct lacp_port { + TAILQ_ENTRY(lacp_port) lp_dist_q; + LIST_ENTRY(lacp_port) lp_next; + struct lacp_softc *lp_lsc; + struct lagg_port *lp_lagg; + struct ifnet *lp_ifp; + struct lacp_peerinfo lp_partner; + struct lacp_peerinfo lp_actor; + struct lacp_markerinfo lp_marker; +#define lp_state lp_actor.lip_state +#define lp_key lp_actor.lip_key +#define lp_systemid lp_actor.lip_systemid + struct timeval lp_last_lacpdu; + int lp_lacpdu_sent; + enum lacp_mux_state lp_mux_state; + enum lacp_selected lp_selected; + int lp_flags; + u_int lp_media; /* XXX redundant */ + int lp_timer[LACP_NTIMER]; + struct ifmultiaddr *lp_ifma; + + struct lacp_aggregator *lp_aggregator; +}; + +struct lacp_aggregator { + TAILQ_ENTRY(lacp_aggregator) la_q; + int la_refcnt; /* num of ports which selected us */ + int la_nports; /* num of distributing ports */ + TAILQ_HEAD(, lacp_port) la_ports; /* distributing ports */ + struct lacp_peerinfo la_partner; + struct lacp_peerinfo la_actor; + int la_pending; /* number of ports in wait_while */ +}; + +struct lacp_softc { + struct lagg_softc *lsc_softc; + struct lock lsc_lock; + struct lacp_aggregator *lsc_active_aggregator; + TAILQ_HEAD(, lacp_aggregator) lsc_aggregators; + boolean_t lsc_suppress_distributing; + struct callout lsc_transit_callout; + struct callout lsc_callout; + LIST_HEAD(, lacp_port) lsc_ports; + struct lacp_portmap lsc_pmap[2]; + volatile u_int lsc_activemap; + u_int32_t lsc_hashkey; + struct { + u_int32_t lsc_rx_test; + u_int32_t lsc_tx_test; + } lsc_debug; + u_int32_t lsc_strict_mode; +}; + +#define LACP_TYPE_ACTORINFO 1 +#define LACP_TYPE_PARTNERINFO 2 +#define LACP_TYPE_COLLECTORINFO 
3 + +/* timeout values (in sec) */ +#define LACP_FAST_PERIODIC_TIME (1) +#define LACP_SLOW_PERIODIC_TIME (30) +#define LACP_SHORT_TIMEOUT_TIME (3 * LACP_FAST_PERIODIC_TIME) +#define LACP_LONG_TIMEOUT_TIME (3 * LACP_SLOW_PERIODIC_TIME) +#define LACP_CHURN_DETECTION_TIME (60) +#define LACP_AGGREGATE_WAIT_TIME (2) +#define LACP_TRANSIT_DELAY 3000 /* in msec */ + +#define LACP_STATE_EQ(s1, s2, mask) \ + ((((s1) ^ (s2)) & (mask)) == 0) + +#define LACP_SYS_PRI(peer) (peer).lip_systemid.lsi_prio + +#define LACP_PORT(_lp) ((struct lacp_port *)(_lp)->lp_psc) +#define LACP_SOFTC(_sc) ((struct lacp_softc *)(_sc)->sc_psc) + +#define LACP_LOCK_INIT(_lsc) lockinit(&(_lsc)->lsc_lock, \ + "lacp mtx", 0, 0) +#define LACP_LOCK_DESTROY(_lsc) lockuninit(&(_lsc)->lsc_lock) +#define LACP_LOCK(_lsc) lockmgr(&(_lsc)->lsc_lock, LK_EXCLUSIVE) +#define LACP_UNLOCK(_lsc) lockmgr(&(_lsc)->lsc_lock, LK_RELEASE) +#define LACP_LOCK_ASSERT(_lsc) KKASSERT(lockstatus(&(_lsc)->lsc_lock, curthread)==LK_EXCLUSIVE) + +struct mbuf *lacp_input(struct lagg_port *, struct mbuf *); +struct lagg_port *lacp_select_tx_port(struct lagg_softc *, struct mbuf *); +int lacp_attach(struct lagg_softc *); +int lacp_detach(struct lagg_softc *); +void lacp_init(struct lagg_softc *); +void lacp_stop(struct lagg_softc *); +int lacp_port_create(struct lagg_port *); +void lacp_port_destroy(struct lagg_port *); +void lacp_linkstate(struct lagg_port *); +void lacp_req(struct lagg_softc *, caddr_t); +void lacp_portreq(struct lagg_port *, caddr_t); + +static __inline int +lacp_isactive(struct lagg_port *lgp) +{ + struct lacp_port *lp = LACP_PORT(lgp); + struct lacp_softc *lsc = lp->lp_lsc; + struct lacp_aggregator *la = lp->lp_aggregator; + + /* This port is joined to the active aggregator */ + if (la != NULL && la == lsc->lsc_active_aggregator) + return (1); + + return (0); +} + +static __inline int +lacp_iscollecting(struct lagg_port *lgp) +{ + struct lacp_port *lp = LACP_PORT(lgp); + + return ((lp->lp_state & LACP_STATE_COLLECTING) != 0); +} + +static __inline int +lacp_isdistributing(struct lagg_port *lgp) +{ + struct lacp_port *lp = LACP_PORT(lgp); + + return ((lp->lp_state & LACP_STATE_DISTRIBUTING) != 0); +} + +/* following constants don't include terminating NUL */ +#define LACP_MACSTR_MAX (2*6 + 5) +#define LACP_SYSTEMPRIOSTR_MAX (4) +#define LACP_SYSTEMIDSTR_MAX (LACP_SYSTEMPRIOSTR_MAX + 1 + LACP_MACSTR_MAX) +#define LACP_PORTPRIOSTR_MAX (4) +#define LACP_PORTNOSTR_MAX (4) +#define LACP_PORTIDSTR_MAX (LACP_PORTPRIOSTR_MAX + 1 + LACP_PORTNOSTR_MAX) +#define LACP_KEYSTR_MAX (4) +#define LACP_PARTNERSTR_MAX \ + (1 + LACP_SYSTEMIDSTR_MAX + 1 + LACP_KEYSTR_MAX + 1 \ + + LACP_PORTIDSTR_MAX + 1) +#define LACP_LAGIDSTR_MAX \ + (1 + LACP_PARTNERSTR_MAX + 1 + LACP_PARTNERSTR_MAX + 1) +#define LACP_STATESTR_MAX (255) /* XXX */ diff --git a/sys/net/lagg/if_lagg.c b/sys/net/lagg/if_lagg.c new file mode 100644 index 0000000000..379aa7ca0e --- /dev/null +++ b/sys/net/lagg/if_lagg.c @@ -0,0 +1,2128 @@ +/* $OpenBSD: if_trunk.c,v 1.30 2007/01/31 06:20:19 reyk Exp $ */ + +/* + * Copyright (c) 2005, 2006 Reyk Floeter + * Copyright (c) 2007 Andrew Thompson + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include + +#include "opt_inet.h" +#include "opt_inet6.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#if defined(INET) || defined(INET6) +#include +#endif +#ifdef INET +#include +#include +#include +#endif + +#ifdef INET6 +#include +#include +#include +#endif + +#include +#include +#include + +/* Special flags we should propagate to the lagg ports. */ +static struct { + int flag; + int (*func)(struct ifnet *, int); +} lagg_pflags[] = { + {IFF_PROMISC, ifpromisc}, + {IFF_ALLMULTI, if_allmulti}, + {0, NULL} +}; + +SLIST_HEAD(, lagg_softc) lagg_list; /* list of laggs */ +static struct lock lagg_list_lock; +eventhandler_tag lagg_detach_cookie = NULL; + +static int lagg_clone_create(struct if_clone *, int, caddr_t); +static int lagg_clone_destroy(struct ifnet *); +static const char * laggname = "lagg"; +struct if_clone lagg_cloner = IF_CLONE_INITIALIZER("lagg", + lagg_clone_create, + lagg_clone_destroy, + 0, IF_MAXUNIT); + +static void lagg_lladdr(struct lagg_softc *, uint8_t *); +static void lagg_capabilities(struct lagg_softc *); +static void lagg_port_lladdr(struct lagg_port *, uint8_t *); +static void lagg_port_setlladdr(void *, int); +static int lagg_port_create(struct lagg_softc *, struct ifnet *); +static int lagg_port_destroy(struct lagg_port *, int); +static void lagg_input(struct ifnet *, struct mbuf *); +static void lagg_linkstate(struct lagg_softc *); +#if XXX +static void lagg_port_state(struct ifnet *, int); +#endif +static int lagg_port_ioctl(struct ifnet *, u_long, caddr_t, struct ucred *cr); +static int lagg_port_output(struct ifnet *, struct mbuf *, + struct sockaddr *, struct rtentry *); +static void lagg_port_ifdetach(void *arg __unused, struct ifnet *); +#ifdef LAGG_PORT_STACKING +static int lagg_port_checkstacking(struct lagg_softc *); +#endif +static void lagg_port2req(struct lagg_port *, struct lagg_reqport *); +static void lagg_init(void *); +static void lagg_stop(struct lagg_softc *); +static int lagg_ioctl(struct ifnet *, u_long, caddr_t, struct ucred *cr); +static int lagg_ether_setmulti(struct lagg_softc *); +static int lagg_ether_cmdmulti(struct lagg_port *, int); +static int lagg_setflag(struct lagg_port *, int, int, + int (*func)(struct ifnet *, int)); +static int lagg_setflags(struct lagg_port *, int status); +static void lagg_start(struct ifnet *, struct ifaltq_subque *ifsq); +static void lagg_start_dispatch(netmsg_t msg); +/* Not needed? 
+static int lagg_output(struct ifnet *ifp, struct mbuf *m); +*/ +#if XXX +static int lagg_transmit(struct ifnet *, struct mbuf *); +static void lagg_qflush(struct ifnet *); +#endif +static int lagg_media_change(struct ifnet *); +static void lagg_media_status(struct ifnet *, struct ifmediareq *); +static struct lagg_port *lagg_link_active(struct lagg_softc *, + struct lagg_port *); +static const void *lagg_gethdr(struct mbuf *, u_int, u_int, void *); +static int lagg_sysctl_active(SYSCTL_HANDLER_ARGS); + +/* Simple round robin */ +static int lagg_rr_attach(struct lagg_softc *); +static int lagg_rr_detach(struct lagg_softc *); +static struct ifnet *lagg_rr_select_tx_port(struct lagg_softc *sc, + struct mbuf *m); +static struct mbuf *lagg_rr_input(struct lagg_softc *, struct lagg_port *, + struct mbuf *); + +/* Active failover */ +static int lagg_fail_attach(struct lagg_softc *); +static int lagg_fail_detach(struct lagg_softc *); +static struct ifnet *lagg_fail_select_tx_port(struct lagg_softc *, + struct mbuf *); +static struct mbuf *lagg_fail_input(struct lagg_softc *, struct lagg_port *, + struct mbuf *); + +/* Loadbalancing */ +static int lagg_lb_attach(struct lagg_softc *); +static int lagg_lb_detach(struct lagg_softc *); +static int lagg_lb_port_create(struct lagg_port *); +static void lagg_lb_port_destroy(struct lagg_port *); +static struct ifnet *lagg_lb_select_tx_port(struct lagg_softc *, + struct mbuf *); +static struct mbuf *lagg_lb_input(struct lagg_softc *, struct lagg_port *, + struct mbuf *); +static int lagg_lb_porttable(struct lagg_softc *, struct lagg_port *); + +/* 802.3ad LACP */ +static int lagg_lacp_attach(struct lagg_softc *); +static int lagg_lacp_detach(struct lagg_softc *); +static struct ifnet *lagg_lacp_select_tx_port(struct lagg_softc *, + struct mbuf *); +static struct mbuf *lagg_lacp_input(struct lagg_softc *, struct lagg_port *, + struct mbuf *); +static void lagg_lacp_lladdr(struct lagg_softc *); + +static void lagg_callout(void *); + +/* lagg protocol table */ +static const struct { + int ti_proto; + int (*ti_attach)(struct lagg_softc *); +} lagg_protos[] = { + { LAGG_PROTO_ROUNDROBIN, lagg_rr_attach }, + { LAGG_PROTO_FAILOVER, lagg_fail_attach }, + { LAGG_PROTO_LOADBALANCE, lagg_lb_attach }, + { LAGG_PROTO_ETHERCHANNEL, lagg_lb_attach }, + { LAGG_PROTO_LACP, lagg_lacp_attach }, + { LAGG_PROTO_NONE, NULL } +}; + +SYSCTL_DECL(_net_link); +SYSCTL_NODE(_net_link, OID_AUTO, lagg, CTLFLAG_RW, 0, + "Link Aggregation"); + +static int lagg_failover_rx_all = 0; /* Allow input on any failover links */ +SYSCTL_INT(_net_link_lagg, OID_AUTO, failover_rx_all, CTLFLAG_RW, + &lagg_failover_rx_all, 0, + "Accept input from any interface in a failover lagg"); +static int def_use_flowid = 1; /* Default value for using M_FLOWID */ +TUNABLE_INT("net.link.lagg.default_use_flowid", &def_use_flowid); +SYSCTL_INT(_net_link_lagg, OID_AUTO, default_use_flowid, CTLFLAG_RW, + &def_use_flowid, 0, + "Default setting for using flow id for load sharing"); + +static int +lagg_modevent(module_t mod, int type, void *data) +{ + + switch (type) { + case MOD_LOAD: + lockinit(&lagg_list_lock, "if_lagg list", 0, 0); + SLIST_INIT(&lagg_list); + if_clone_attach(&lagg_cloner); + lagg_input_p = lagg_input; + lagg_detach_cookie = EVENTHANDLER_REGISTER( + ifnet_departure_event, lagg_port_ifdetach, NULL, + EVENTHANDLER_PRI_ANY); + break; + case MOD_UNLOAD: + EVENTHANDLER_DEREGISTER(ifnet_departure_event, + lagg_detach_cookie); + if_clone_detach(&lagg_cloner); + lagg_input_p = NULL; + 
+		lockuninit(&lagg_list_lock);
+		break;
+	default:
+		return (EOPNOTSUPP);
+	}
+	return (0);
+}
+
+static moduledata_t lagg_mod = {
+	"if_lagg",
+	lagg_modevent,
+	0
+};
+
+DECLARE_MODULE(if_lagg, lagg_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
+MODULE_VERSION(if_lagg, 1);
+
+/*
+ * This routine is run via a vlan
+ * config EVENT
+ */
+static void
+lagg_register_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag)
+{
+#if XXX
+	struct lagg_softc *sc = ifp->if_softc;
+	struct lagg_port *lp;
+
+	if (ifp->if_softc != arg) /* Not our event */
+		return;
+
+	LAGG_RLOCK(sc);
+	if (!SLIST_EMPTY(&sc->sc_ports)) {
+		SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
+			EVENTHANDLER_INVOKE(vlan_config, lp->lp_ifp);
+	}
+	LAGG_RUNLOCK(sc);
+#endif
+
+}
+
+/*
+ * This routine is run via a vlan
+ * unconfig EVENT
+ */
+static void
+lagg_unregister_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag)
+{
+	struct lagg_softc *sc = ifp->if_softc;
+	/*
+	struct lagg_port *lp;
+	*/
+
+	if (ifp->if_softc != arg) /* Not our event */
+		return;
+
+	LAGG_RLOCK(sc);
+	if (!SLIST_EMPTY(&sc->sc_ports)) {
+#if XXX
+		SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
+			EVENTHANDLER_INVOKE(vlan_unconfig, lp->lp_ifp);
+#endif
+	}
+	LAGG_RUNLOCK(sc);
+}
+
+static int
+lagg_clone_create(struct if_clone *ifc, int unit, caddr_t params __unused)
+{
+	struct lagg_softc *sc;
+	struct ifnet *ifp;
+	int i, error = 0;
+	static const u_char eaddr[6];	/* 00:00:00:00:00:00 */
+	struct sysctl_oid *oid;
+	char num[14];			/* sufficient for 32 bits */
+
+	sc = kmalloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO);
+	ifp = sc->sc_ifp = &sc->sc_if;	/* XXX if_alloc(IFT_ETHER); */
+/*
+	if (ifp == NULL) {
+		kfree(sc, M_DEVBUF);
+		return (ENOSPC);
+	}
+*/
+/*
+	sc->sc_ipackets = counter_u64_alloc(M_WAITOK);
+	sc->sc_opackets = counter_u64_alloc(M_WAITOK);
+	sc->sc_ibytes = counter_u64_alloc(M_WAITOK);
+	sc->sc_obytes = counter_u64_alloc(M_WAITOK);
+*/
+	sysctl_ctx_init(&sc->ctx);
+	ksnprintf(num, sizeof(num), "%u", unit);
+	sc->use_flowid = def_use_flowid;
+	sc->sc_oid = oid = SYSCTL_ADD_NODE(&sc->ctx,
+	    &SYSCTL_NODE_CHILDREN(_net_link, lagg),
+	    OID_AUTO, num, CTLFLAG_RD, NULL, "");
+	SYSCTL_ADD_INT(&sc->ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
+	    "use_flowid", CTLTYPE_INT|CTLFLAG_RW, &sc->use_flowid, sc->use_flowid,
+	    "Use flow id for load sharing");
+	SYSCTL_ADD_INT(&sc->ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
+	    "count", CTLTYPE_INT|CTLFLAG_RD, &sc->sc_count, sc->sc_count,
+	    "Total number of ports");
+	SYSCTL_ADD_PROC(&sc->ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
+	    "active", CTLTYPE_INT|CTLFLAG_RD, sc, 0, lagg_sysctl_active,
+	    "I", "Total number of active ports");
+	SYSCTL_ADD_INT(&sc->ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
+	    "flapping", CTLTYPE_INT|CTLFLAG_RD, &sc->sc_flapping,
+	    sc->sc_flapping, "Total number of port change events");
+	/* Hash all layers by default */
+	sc->sc_flags = LAGG_F_HASHL2|LAGG_F_HASHL3|LAGG_F_HASHL4;
+
+	sc->sc_proto = LAGG_PROTO_NONE;
+	for (i = 0; lagg_protos[i].ti_proto != LAGG_PROTO_NONE; i++) {
+		if (lagg_protos[i].ti_proto == LAGG_PROTO_DEFAULT) {
+			sc->sc_proto = lagg_protos[i].ti_proto;
+			if ((error = lagg_protos[i].ti_attach(sc)) != 0) {
+				if_free(ifp);
+				kfree(sc, M_DEVBUF);
+				return (error);
+			}
+			break;
+		}
+	}
+	LAGG_LOCK_INIT(sc);
+	LAGG_CALLOUT_LOCK_INIT(sc);
+	SLIST_INIT(&sc->sc_ports);
+	TASK_INIT(&sc->sc_lladdr_task, 0, lagg_port_setlladdr, sc);
+
+	/*
+	 * This uses the callout lock rather than the rmlock; one can't
+	 * hold said rmlock during SWI.
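+	 * In this port the callout is therefore initialised without an
+	 * associated lock and is simply rearmed once per second; see
+	 * callout_reset(&sc->sc_callout, hz, lagg_callout, sc) below.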
+ */ + callout_init(&sc->sc_callout); + /*, &sc->sc_call_lock, 0); */ + + /* Initialise pseudo media types */ + ifmedia_init(&sc->sc_media, 0, lagg_media_change, + lagg_media_status); + ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_AUTO, 0, NULL); + ifmedia_set(&sc->sc_media, IFM_ETHER | IFM_AUTO); + + if_initname(ifp, laggname, unit); + ifp->if_softc = sc; +#if XXX + ifp->if_transmit = lagg_transmit; + ifp->if_qflush = lagg_qflush; +#endif + ifp->if_mtu = ETHERMTU; + ifp->if_init = lagg_init; + ifp->if_ioctl = lagg_ioctl; + ifp->if_flags = IFF_SIMPLEX | IFF_BROADCAST | IFF_MULTICAST; + ifp->if_start = lagg_start; + ifp->if_type = IFT_ETHER; + ifq_set_maxlen(&ifp->if_snd, ifqmaxlen); + ifq_set_ready(&ifp->if_snd); + ifp->if_hdrlen = ETHER_HDR_LEN; + +#if XXX + ifp->if_capenable = ifp->if_capabilities = IFCAP_HWSTATS; +#endif + /* + * Attach as an ordinary ethernet device, children will be attached + * as special device IFT_IEEE8023ADLAG. + */ + + ether_ifattach(ifp, eaddr, NULL); + + sc->vlan_attach = EVENTHANDLER_REGISTER(vlan_config, + lagg_register_vlan, sc, EVENTHANDLER_PRI_FIRST); + sc->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig, + lagg_unregister_vlan, sc, EVENTHANDLER_PRI_FIRST); + + /* Insert into the global list of laggs */ + lockmgr(&lagg_list_lock, LK_EXCLUSIVE); + SLIST_INSERT_HEAD(&lagg_list, sc, sc_entries); + lockmgr(&lagg_list_lock, LK_RELEASE); + + callout_reset(&sc->sc_callout, hz, lagg_callout, sc); + + return (0); +} + +static int +lagg_clone_destroy(struct ifnet *ifp) +{ + struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; + struct lagg_port *lp; + + LAGG_WLOCK(sc); + + lagg_stop(sc); + ifp->if_flags &= ~IFF_UP; + + EVENTHANDLER_DEREGISTER(vlan_config, sc->vlan_attach); + EVENTHANDLER_DEREGISTER(vlan_unconfig, sc->vlan_detach); + + /* Shutdown and remove lagg ports */ + while ((lp = SLIST_FIRST(&sc->sc_ports)) != NULL) + lagg_port_destroy(lp, 1); + /* Unhook the aggregation protocol */ + if (sc->sc_detach != NULL) + (*sc->sc_detach)(sc); + + LAGG_WUNLOCK(sc); + + sysctl_ctx_free(&sc->ctx); + ifmedia_removeall(&sc->sc_media); + ether_ifdetach(ifp); + if_free(ifp); + + /* This grabs sc_callout_mtx, serialising it correctly */ + callout_drain(&sc->sc_callout); + +#if 0 + /* At this point it's drained; we can free this */ + counter_u64_free(sc->sc_ipackets); + counter_u64_free(sc->sc_opackets); + counter_u64_free(sc->sc_ibytes); + counter_u64_free(sc->sc_obytes); +#endif + + lockmgr(&lagg_list_lock, LK_EXCLUSIVE); + SLIST_REMOVE(&lagg_list, sc, lagg_softc, sc_entries); + lockmgr(&lagg_list_lock, LK_RELEASE); + + taskqueue_drain(taskqueue_swi, &sc->sc_lladdr_task); + LAGG_LOCK_DESTROY(sc); + LAGG_CALLOUT_LOCK_DESTROY(sc); + kfree(sc, M_DEVBUF); + + return 0; +} + +static void +lagg_lladdr(struct lagg_softc *sc, uint8_t *lladdr) +{ + struct ifnet *ifp = sc->sc_ifp; + + if (memcmp(lladdr, IF_LLADDR(ifp), ETHER_ADDR_LEN) == 0) + return; + + bcopy(lladdr, IF_LLADDR(ifp), ETHER_ADDR_LEN); + /* Let the protocol know the MAC has changed */ + if (sc->sc_lladdr != NULL) + (*sc->sc_lladdr)(sc); + EVENTHANDLER_INVOKE(iflladdr_event, ifp); +} + +static void +lagg_capabilities(struct lagg_softc *sc) +{ + struct lagg_port *lp; + int cap = ~0, ena = ~0; + u_long hwa = ~0UL; + + LAGG_WLOCK_ASSERT(sc); + + /* Get capabilities from the lagg ports */ + SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { + cap &= lp->lp_ifp->if_capabilities; + ena &= lp->lp_ifp->if_capenable; + hwa &= lp->lp_ifp->if_hwassist; + } + cap = (cap == ~0 ? 0 : cap); + ena = (ena == ~0 ? 
0 : ena); + hwa = (hwa == ~0 ? 0 : hwa); + + if (sc->sc_ifp->if_capabilities != cap || + sc->sc_ifp->if_capenable != ena || + sc->sc_ifp->if_hwassist != hwa) { + sc->sc_ifp->if_capabilities = cap; + sc->sc_ifp->if_capenable = ena; + sc->sc_ifp->if_hwassist = hwa; + getmicrotime(&sc->sc_ifp->if_lastchange); + + if (sc->sc_ifflags & IFF_DEBUG) + if_printf(sc->sc_ifp, + "capabilities 0x%08x enabled 0x%08x\n", cap, ena); + } +} + +static void +lagg_port_lladdr(struct lagg_port *lp, uint8_t *lladdr) +{ + struct lagg_softc *sc = lp->lp_softc; + struct ifnet *ifp = lp->lp_ifp; + struct lagg_llq *llq; + int pending = 0; + + LAGG_WLOCK_ASSERT(sc); + + if (lp->lp_detaching || + memcmp(lladdr, IF_LLADDR(ifp), ETHER_ADDR_LEN) == 0) + return; + + /* Check to make sure its not already queued to be changed */ + SLIST_FOREACH(llq, &sc->sc_llq_head, llq_entries) { + if (llq->llq_ifp == ifp) { + pending = 1; + break; + } + } + + if (!pending) { + llq = kmalloc(sizeof(struct lagg_llq), M_DEVBUF, M_NOWAIT); + if (llq == NULL) /* XXX what to do */ + return; + } + + /* Update the lladdr even if pending, it may have changed */ + llq->llq_ifp = ifp; + bcopy(lladdr, llq->llq_lladdr, ETHER_ADDR_LEN); + + if (!pending) + SLIST_INSERT_HEAD(&sc->sc_llq_head, llq, llq_entries); + + taskqueue_enqueue(taskqueue_swi, &sc->sc_lladdr_task); +} + +/* + * Set the interface MAC address from a taskqueue to avoid a LOR. + */ +static void +lagg_port_setlladdr(void *arg, int pending) +{ + struct lagg_softc *sc = (struct lagg_softc *)arg; + struct lagg_llq *llq, *head; + struct ifnet *ifp; + int error; + + /* Grab a local reference of the queue and remove it from the softc */ + LAGG_WLOCK(sc); + head = SLIST_FIRST(&sc->sc_llq_head); + SLIST_FIRST(&sc->sc_llq_head) = NULL; + LAGG_WUNLOCK(sc); + + /* + * Traverse the queue and set the lladdr on each ifp. It is safe to do + * unlocked as we have the only reference to it. + */ + for (llq = head; llq != NULL; llq = head) { + ifp = llq->llq_ifp; + + /* Set the link layer address */ + /* CURVNET_SET(ifp->if_vnet); */ + error = if_setlladdr(ifp, llq->llq_lladdr, ETHER_ADDR_LEN); + /* CURVNET_RESTORE(); */ + if (error) + kprintf("%s: setlladdr failed on %s\n", __func__, + ifp->if_xname); + + head = SLIST_NEXT(llq, llq_entries); + kfree(llq, M_DEVBUF); + } +} + +static int +lagg_port_create(struct lagg_softc *sc, struct ifnet *ifp) +{ + struct lagg_softc *sc_ptr; + struct lagg_port *lp; + int error = 0; + + LAGG_WLOCK_ASSERT(sc); + + /* Limit the maximal number of lagg ports */ + if (sc->sc_count >= LAGG_MAX_PORTS) + return (ENOSPC); + + /* Check if port has already been associated to a lagg */ + if (ifp->if_lagg != NULL) { + /* Port is already in the current lagg? */ + lp = (struct lagg_port *)ifp->if_lagg; + if (lp->lp_softc == sc) + return (EEXIST); + return (EBUSY); + } + + /* XXX Disallow non-ethernet interfaces (this should be any of 802) */ + if (ifp->if_type != IFT_ETHER) + return (EPROTONOSUPPORT); + +#ifdef INET6 + /* + * The member interface should not have inet6 address because + * two interfaces with a valid link-local scope zone must not be + * merged in any form. This restriction is needed to + * prevent violation of link-local scope zone. Attempts to + * add a member interface which has inet6 addresses triggers + * removal of all inet6 addresses on the member interface. 
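+	 * Illustrative example: adding a port that still carries an
+	 * fe80:: link-local address triggers in6_ifdetach() on it and
+	 * logs the message below; the addresses have to be configured
+	 * again by hand if the port later leaves the lagg.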
+ */ + SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { + if (in6ifa_llaonifp(lp->lp_ifp)) { + in6_ifdetach(lp->lp_ifp); + if_printf(sc->sc_ifp, + "IPv6 addresses on %s have been removed " + "before adding it as a member to prevent " + "IPv6 address scope violation.\n", + lp->lp_ifp->if_xname); + } + } + if (in6ifa_llaonifp(ifp)) { + in6_ifdetach(ifp); + if_printf(sc->sc_ifp, + "IPv6 addresses on %s have been removed " + "before adding it as a member to prevent " + "IPv6 address scope violation.\n", + ifp->if_xname); + } +#endif + /* Allow the first Ethernet member to define the MTU */ + if (SLIST_EMPTY(&sc->sc_ports)) + sc->sc_ifp->if_mtu = ifp->if_mtu; + else if (sc->sc_ifp->if_mtu != ifp->if_mtu) { + if_printf(sc->sc_ifp, "invalid MTU for %s\n", + ifp->if_xname); + return (EINVAL); + } + + if ((lp = kmalloc(sizeof(struct lagg_port), + M_DEVBUF, M_NOWAIT|M_ZERO)) == NULL) + return (ENOMEM); + + /* Check if port is a stacked lagg */ + lockmgr(&lagg_list_lock, LK_EXCLUSIVE); + SLIST_FOREACH(sc_ptr, &lagg_list, sc_entries) { + if (ifp == sc_ptr->sc_ifp) { + lockmgr(&lagg_list_lock, LK_RELEASE); + kfree(lp, M_DEVBUF); + return (EINVAL); + /* XXX disable stacking for the moment, its untested */ +#ifdef LAGG_PORT_STACKING + lp->lp_flags |= LAGG_PORT_STACK; + if (lagg_port_checkstacking(sc_ptr) >= + LAGG_MAX_STACKING) { + lockmgr(&lagg_list_lock, LK_RELEASE); + kfree(lp, M_DEVBUF); + return (E2BIG); + } +#endif + } + } + lockmgr(&lagg_list_lock, LK_RELEASE); + + /* Change the interface type */ + lp->lp_iftype = ifp->if_type; + ifp->if_type = IFT_IEEE8023ADLAG; + ifp->if_lagg = lp; + lp->lp_ioctl = ifp->if_ioctl; + ifp->if_ioctl = lagg_port_ioctl; + lp->lp_output = ifp->if_output; + ifp->if_output = lagg_port_output; + + lp->lp_ifp = ifp; + lp->lp_softc = sc; + + /* Save port link layer address */ + bcopy(IF_LLADDR(ifp), lp->lp_lladdr, ETHER_ADDR_LEN); + + if (SLIST_EMPTY(&sc->sc_ports)) { + sc->sc_primary = lp; + lagg_lladdr(sc, IF_LLADDR(ifp)); + } else { + /* Update link layer address for this port */ + lagg_port_lladdr(lp, IF_LLADDR(sc->sc_ifp)); + } + + /* Insert into the list of ports */ + SLIST_INSERT_HEAD(&sc->sc_ports, lp, lp_entries); + sc->sc_count++; + + /* Update lagg capabilities */ + lagg_capabilities(sc); + lagg_linkstate(sc); + + /* Add multicast addresses and interface flags to this port */ + lagg_ether_cmdmulti(lp, 1); + lagg_setflags(lp, 1); + + if (sc->sc_port_create != NULL) + error = (*sc->sc_port_create)(lp); + if (error) { + /* remove the port again, without calling sc_port_destroy */ + lagg_port_destroy(lp, 0); + return (error); + } + + return (error); +} + +#ifdef LAGG_PORT_STACKING +static int +lagg_port_checkstacking(struct lagg_softc *sc) +{ + struct lagg_softc *sc_ptr; + struct lagg_port *lp; + int m = 0; + + LAGG_WLOCK_ASSERT(sc); + + SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { + if (lp->lp_flags & LAGG_PORT_STACK) { + sc_ptr = (struct lagg_softc *)lp->lp_ifp->if_softc; + m = MAX(m, lagg_port_checkstacking(sc_ptr)); + } + } + + return (m + 1); +} +#endif + +static int +lagg_port_destroy(struct lagg_port *lp, int runpd) +{ + struct lagg_softc *sc = lp->lp_softc; + struct lagg_port *lp_ptr; + struct lagg_llq *llq; + struct ifnet *ifp = lp->lp_ifp; + + LAGG_WLOCK_ASSERT(sc); + + if (runpd && sc->sc_port_destroy != NULL) + (*sc->sc_port_destroy)(lp); + + /* + * Remove multicast addresses and interface flags from this port and + * reset the MAC address, skip if the interface is being detached. 
+ */ + if (!lp->lp_detaching) { + lagg_ether_cmdmulti(lp, 0); + lagg_setflags(lp, 0); + lagg_port_lladdr(lp, lp->lp_lladdr); + } + + /* Restore interface */ + ifp->if_type = lp->lp_iftype; + ifp->if_ioctl = lp->lp_ioctl; + ifp->if_output = lp->lp_output; + ifp->if_lagg = NULL; + + /* Finally, remove the port from the lagg */ + SLIST_REMOVE(&sc->sc_ports, lp, lagg_port, lp_entries); + sc->sc_count--; + + /* Update the primary interface */ + if (lp == sc->sc_primary) { + uint8_t lladdr[ETHER_ADDR_LEN]; + + if ((lp_ptr = SLIST_FIRST(&sc->sc_ports)) == NULL) { + bzero(&lladdr, ETHER_ADDR_LEN); + } else { + bcopy(lp_ptr->lp_lladdr, + lladdr, ETHER_ADDR_LEN); + } + lagg_lladdr(sc, lladdr); + sc->sc_primary = lp_ptr; + + /* Update link layer address for each port */ + SLIST_FOREACH(lp_ptr, &sc->sc_ports, lp_entries) + lagg_port_lladdr(lp_ptr, lladdr); + } + + /* Remove any pending lladdr changes from the queue */ + if (lp->lp_detaching) { + SLIST_FOREACH(llq, &sc->sc_llq_head, llq_entries) { + if (llq->llq_ifp == ifp) { + SLIST_REMOVE(&sc->sc_llq_head, llq, lagg_llq, + llq_entries); + kfree(llq, M_DEVBUF); + break; /* Only appears once */ + } + } + } + + if (lp->lp_ifflags) + if_printf(ifp, "%s: lp_ifflags unclean\n", __func__); + + kfree(lp, M_DEVBUF); + + /* Update lagg capabilities */ + lagg_capabilities(sc); + lagg_linkstate(sc); + + return (0); +} + +static int +lagg_port_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data, struct ucred *cr) +{ + struct lagg_reqport *rp = (struct lagg_reqport *)data; + struct lagg_softc *sc; + struct lagg_port *lp = NULL; + int error = 0; + + /* Should be checked by the caller */ + if (ifp->if_type != IFT_IEEE8023ADLAG || + (lp = ifp->if_lagg) == NULL || (sc = lp->lp_softc) == NULL) + goto fallback; + + switch (cmd) { + case SIOCGLAGGPORT: + if (rp->rp_portname[0] == '\0' || + ifunit(rp->rp_portname) != ifp) { + error = EINVAL; + break; + } + + LAGG_RLOCK(sc); + if ((lp = ifp->if_lagg) == NULL || lp->lp_softc != sc) { + error = ENOENT; + LAGG_RUNLOCK(sc); + break; + } + + lagg_port2req(lp, rp); + LAGG_RUNLOCK(sc); + break; + + case SIOCSIFCAP: + if (lp->lp_ioctl == NULL) { + error = EINVAL; + break; + } + + error = (*lp->lp_ioctl)(ifp, cmd, data, cr); + if (error) + break; + + /* Update lagg interface capabilities */ + LAGG_WLOCK(sc); + lagg_capabilities(sc); + LAGG_WUNLOCK(sc); + break; + case SIOCGIFMEDIA: + if (lp->lp_ioctl == NULL) { + error = EINVAL; + break; + } + + error = (*lp->lp_ioctl)(ifp, cmd, data, cr); + break; + case SIOCSIFMTU: + /* Do not allow the MTU to be changed once joined */ + error = EINVAL; + break; + + default: + goto fallback; + } + + return (error); + +fallback: + if (lp->lp_ioctl != NULL) { + int result; + result = ((*lp->lp_ioctl)(ifp, cmd, data, cr)); + } + return (EINVAL); +} + +/* + * For direct output to child ports. + */ +static int +lagg_port_output(struct ifnet *ifp, struct mbuf *m, + struct sockaddr *dst, struct rtentry *ro) +{ + struct lagg_port *lp = ifp->if_lagg; + + switch (dst->sa_family) { + case pseudo_AF_HDRCMPLT: + case AF_UNSPEC: + return ((*lp->lp_output)(ifp, m, dst, ro)); + } + + /* drop any other frames */ + m_freem(m); + return (ENETDOWN); +} + +static void +lagg_port_ifdetach(void *arg __unused, struct ifnet *ifp) +{ + struct lagg_port *lp; + struct lagg_softc *sc; + + if ((lp = ifp->if_lagg) == NULL) + return; +#if XXX + /* If the ifnet is just being renamed, don't do anything. 
*/ + if (ifp->if_flags & IFF_RENAMING) + return; +#endif + sc = lp->lp_softc; + + LAGG_WLOCK(sc); + lp->lp_detaching = 1; + lagg_port_destroy(lp, 1); + LAGG_WUNLOCK(sc); +} + +static void +lagg_port2req(struct lagg_port *lp, struct lagg_reqport *rp) +{ + struct lagg_softc *sc = lp->lp_softc; + + strlcpy(rp->rp_ifname, sc->sc_ifname, sizeof(rp->rp_ifname)); + strlcpy(rp->rp_portname, lp->lp_ifp->if_xname, sizeof(rp->rp_portname)); + rp->rp_prio = lp->lp_prio; + rp->rp_flags = lp->lp_flags; + if (sc->sc_portreq != NULL) + (*sc->sc_portreq)(lp, (caddr_t)&rp->rp_psc); + + /* Add protocol specific flags */ + switch (sc->sc_proto) { + case LAGG_PROTO_FAILOVER: + if (lp == sc->sc_primary) + rp->rp_flags |= LAGG_PORT_MASTER; + if (lp == lagg_link_active(sc, sc->sc_primary)) + rp->rp_flags |= LAGG_PORT_ACTIVE; + break; + + case LAGG_PROTO_ROUNDROBIN: + case LAGG_PROTO_LOADBALANCE: + case LAGG_PROTO_ETHERCHANNEL: + if (LAGG_PORTACTIVE(lp)) + rp->rp_flags |= LAGG_PORT_ACTIVE; + break; + + case LAGG_PROTO_LACP: + /* LACP has a different definition of active */ + if (lacp_isactive(lp)) + rp->rp_flags |= LAGG_PORT_ACTIVE; + if (lacp_iscollecting(lp)) + rp->rp_flags |= LAGG_PORT_COLLECTING; + if (lacp_isdistributing(lp)) + rp->rp_flags |= LAGG_PORT_DISTRIBUTING; + break; + } + +} + +static void +lagg_init(void *xsc) +{ + struct lagg_softc *sc = (struct lagg_softc *)xsc; + struct lagg_port *lp; + struct ifnet *ifp = sc->sc_ifp; + + if (ifp->if_flags & IFF_RUNNING) + return; + + LAGG_WLOCK(sc); + + ifp->if_flags |= IFF_RUNNING; + /* Update the port lladdrs */ + SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) + lagg_port_lladdr(lp, IF_LLADDR(ifp)); + + if (sc->sc_init != NULL) + (*sc->sc_init)(sc); + + LAGG_WUNLOCK(sc); +} + +static void +lagg_stop(struct lagg_softc *sc) +{ + struct ifnet *ifp = sc->sc_ifp; + + LAGG_WLOCK_ASSERT(sc); + + if ((ifp->if_flags & IFF_RUNNING) == 0) + return; + + ifp->if_flags &= ~IFF_RUNNING; + + if (sc->sc_stop != NULL) + (*sc->sc_stop)(sc); +} + +static int +lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data, struct ucred *cr) +{ + struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; + struct lagg_reqall *ra = (struct lagg_reqall *)data; + struct lagg_reqport *rp = (struct lagg_reqport *)data, rpbuf; + struct lagg_reqflags *rf = (struct lagg_reqflags *)data; + struct ifreq *ifr = (struct ifreq *)data; + struct lagg_port *lp; + struct ifnet *tpif; + struct thread *td = curthread; + char *buf, *outbuf; + int count, buflen, len, error = 0; + + ASSERT_IFNET_SERIALIZED_ALL(ifp); + + bzero(&rpbuf, sizeof(rpbuf)); + + switch (cmd) { + case SIOCGLAGG: + LAGG_RLOCK(sc); + count = 0; + SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) + count++; + buflen = count * sizeof(struct lagg_reqport); + LAGG_RUNLOCK(sc); + + outbuf = kmalloc(buflen, M_TEMP, M_WAITOK | M_ZERO); + + LAGG_RLOCK(sc); + ra->ra_proto = sc->sc_proto; + if (sc->sc_req != NULL) + (*sc->sc_req)(sc, (caddr_t)&ra->ra_psc); + + count = 0; + buf = outbuf; + len = min(ra->ra_size, buflen); + SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { + if (len < sizeof(rpbuf)) + break; + + lagg_port2req(lp, &rpbuf); + memcpy(buf, &rpbuf, sizeof(rpbuf)); + count++; + buf += sizeof(rpbuf); + len -= sizeof(rpbuf); + } + LAGG_RUNLOCK(sc); + ra->ra_ports = count; + ra->ra_size = count * sizeof(rpbuf); + error = copyout(outbuf, ra->ra_port, ra->ra_size); + kfree(outbuf, M_TEMP); + break; + case SIOCSLAGG: + error = priv_check(td, PRIV_NET_LAGG); + if (error) + break; + if (ra->ra_proto >= LAGG_PROTO_MAX) { + error = EPROTONOSUPPORT; + break; 
+ } + LAGG_WLOCK(sc); + if (sc->sc_proto != LAGG_PROTO_NONE) { + /* Reset protocol first in case detach unlocks */ + sc->sc_proto = LAGG_PROTO_NONE; + error = sc->sc_detach(sc); + sc->sc_detach = NULL; + sc->sc_start = NULL; + sc->sc_input = NULL; + sc->sc_port_create = NULL; + sc->sc_port_destroy = NULL; + sc->sc_linkstate = NULL; + sc->sc_init = NULL; + sc->sc_stop = NULL; + sc->sc_lladdr = NULL; + sc->sc_req = NULL; + sc->sc_portreq = NULL; + } else if (sc->sc_input != NULL) { + /* Still detaching */ + error = EBUSY; + } + if (error != 0) { + LAGG_WUNLOCK(sc); + break; + } + for (int i = 0; i < (sizeof(lagg_protos) / + sizeof(lagg_protos[0])); i++) { + if (lagg_protos[i].ti_proto == ra->ra_proto) { + if (sc->sc_ifflags & IFF_DEBUG) + kprintf("%s: using proto %u\n", + sc->sc_ifname, + lagg_protos[i].ti_proto); + sc->sc_proto = lagg_protos[i].ti_proto; + if (sc->sc_proto != LAGG_PROTO_NONE) + error = lagg_protos[i].ti_attach(sc); + LAGG_WUNLOCK(sc); + return (error); + } + } + LAGG_WUNLOCK(sc); + error = EPROTONOSUPPORT; + break; + case SIOCGLAGGFLAGS: + rf->rf_flags = sc->sc_flags; + break; + case SIOCSLAGGHASH: + error = priv_check(td, PRIV_NET_LAGG); + if (error) + break; + if ((rf->rf_flags & LAGG_F_HASHMASK) == 0) { + error = EINVAL; + break; + } + LAGG_WLOCK(sc); + sc->sc_flags &= ~LAGG_F_HASHMASK; + sc->sc_flags |= rf->rf_flags & LAGG_F_HASHMASK; + LAGG_WUNLOCK(sc); + break; + case SIOCGLAGGPORT: + if (rp->rp_portname[0] == '\0' || + (tpif = ifunit(rp->rp_portname)) == NULL) { + error = EINVAL; + break; + } + + LAGG_RLOCK(sc); + if ((lp = (struct lagg_port *)tpif->if_lagg) == NULL || + lp->lp_softc != sc) { + error = ENOENT; + LAGG_RUNLOCK(sc); + break; + } + + lagg_port2req(lp, rp); + LAGG_RUNLOCK(sc); + break; + case SIOCSLAGGPORT: + error = priv_check(td, PRIV_NET_LAGG); + if (error) + break; + if (rp->rp_portname[0] == '\0' || + (tpif = ifunit(rp->rp_portname)) == NULL) { + error = EINVAL; + break; + } + LAGG_WLOCK(sc); + error = lagg_port_create(sc, tpif); + LAGG_WUNLOCK(sc); + break; + case SIOCSLAGGDELPORT: + error = priv_check(td, PRIV_NET_LAGG); + if (error) + break; + if (rp->rp_portname[0] == '\0' || + (tpif = ifunit(rp->rp_portname)) == NULL) { + error = EINVAL; + break; + } + + LAGG_WLOCK(sc); + if ((lp = (struct lagg_port *)tpif->if_lagg) == NULL || + lp->lp_softc != sc) { + error = ENOENT; + LAGG_WUNLOCK(sc); + break; + } + + error = lagg_port_destroy(lp, 1); + LAGG_WUNLOCK(sc); + break; + case SIOCSIFFLAGS: + /* Set flags on ports too */ + LAGG_WLOCK(sc); + SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { + lagg_setflags(lp, 1); + } + LAGG_WUNLOCK(sc); + + if (!(ifp->if_flags & IFF_UP) && + (ifp->if_flags & IFF_RUNNING)) { + /* + * If interface is marked down and it is running, + * then stop and disable it. + */ + LAGG_WLOCK(sc); + lagg_stop(sc); + LAGG_WUNLOCK(sc); + } else if ((ifp->if_flags & IFF_UP) && + !(ifp->if_flags & IFF_RUNNING)) { + /* + * If interface is marked up and it is stopped, then + * start it. 
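+			 * if_init was pointed at lagg_init() in
+			 * lagg_clone_create(), so this marks the lagg
+			 * IFF_RUNNING and pushes its MAC to every port.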
+			 */
+			(*ifp->if_init)(sc);
+		}
+		break;
+	case SIOCADDMULTI:
+	case SIOCDELMULTI:
+		LAGG_WLOCK(sc);
+		error = lagg_ether_setmulti(sc);
+		LAGG_WUNLOCK(sc);
+		break;
+	case SIOCSIFMEDIA:
+	case SIOCGIFMEDIA:
+		error = ifmedia_ioctl(ifp, ifr, &sc->sc_media, cmd);
+		break;
+
+	case SIOCSIFCAP:
+	case SIOCSIFMTU:
+		/* Do not allow the MTU or caps to be directly changed */
+		error = EINVAL;
+		break;
+
+	default:
+		error = ether_ioctl(ifp, cmd, data);
+		break;
+	}
+	return (error);
+}
+
+static int
+lagg_ether_setmulti(struct lagg_softc *sc)
+{
+	struct lagg_port *lp;
+
+	LAGG_WLOCK_ASSERT(sc);
+
+	SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
+		/* First, remove any existing filter entries. */
+		lagg_ether_cmdmulti(lp, 0);
+		/* Then copy all addresses from the lagg interface to the port. */
+		lagg_ether_cmdmulti(lp, 1);
+	}
+	return (0);
+}
+
+static int
+lagg_ether_cmdmulti(struct lagg_port *lp, int set)
+{
+	struct lagg_softc *sc = lp->lp_softc;
+	struct ifnet *ifp = lp->lp_ifp;
+	struct ifnet *scifp = sc->sc_ifp;
+	struct lagg_mc *mc;
+	struct ifmultiaddr *ifma, *rifma = NULL;
+	struct sockaddr_dl sdl;
+	int error;
+
+	ASSERT_IFNET_NOT_SERIALIZED_ALL(ifp);
+	LAGG_WLOCK_ASSERT(sc);
+
+	bzero((char *)&sdl, sizeof(sdl));
+	sdl.sdl_len = sizeof(sdl);
+	sdl.sdl_family = AF_LINK;
+	sdl.sdl_type = IFT_ETHER;
+	sdl.sdl_alen = ETHER_ADDR_LEN;
+	sdl.sdl_index = ifp->if_index;
+
+	if (set) {
+		TAILQ_FOREACH(ifma, &scifp->if_multiaddrs, ifma_link) {
+			if (ifma->ifma_addr->sa_family != AF_LINK)
+				continue;
+			bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
+			    LLADDR(&sdl), ETHER_ADDR_LEN);
+
+			error = if_addmulti(ifp, (struct sockaddr *)&sdl, &rifma);
+			if (error)
+				return (error);
+			mc = kmalloc(sizeof(struct lagg_mc), M_DEVBUF, M_NOWAIT);
+			if (mc == NULL)
+				return (ENOMEM);
+			mc->mc_ifma = rifma;
+			SLIST_INSERT_HEAD(&lp->lp_mc_head, mc, mc_entries);
+		}
+	} else {
+		while ((mc = SLIST_FIRST(&lp->lp_mc_head)) != NULL) {
+			SLIST_REMOVE(&lp->lp_mc_head, mc, lagg_mc, mc_entries);
+			/* Release the reference taken by if_addmulti() above. */
+			if_delmulti(ifp, mc->mc_ifma->ifma_addr);
+			kfree(mc, M_DEVBUF);
+		}
+	}
+	return (0);
+}
+
+/* Handle a ref counted flag that should be set on the lagg port as well */
+static int
+lagg_setflag(struct lagg_port *lp, int flag, int status,
+    int (*func)(struct ifnet *, int))
+{
+	struct lagg_softc *sc = lp->lp_softc;
+	struct ifnet *scifp = sc->sc_ifp;
+	struct ifnet *ifp = lp->lp_ifp;
+	int error;
+
+	LAGG_WLOCK_ASSERT(sc);
+	ASSERT_IFNET_NOT_SERIALIZED_ALL(ifp);
+
+	status = status ? (scifp->if_flags & flag) : 0;
+	/* Now "status" contains the flag value or 0 */
+
+	/*
+	 * See if the recorded port status differs from what we want it
+	 * to be.  If it does, flip it.  We record the port's status in
+	 * lp_ifflags so that we won't clear a flag we haven't set.  In
+	 * fact, we don't clear or set port flags directly, but get or
+	 * release references to them, which is why the recorded flags
+	 * stay in accord with the actual port flags.
+	 */
+	if (status != (lp->lp_ifflags & flag)) {
+		error = (*func)(ifp, status);
+		if (error)
+			return (error);
+		lp->lp_ifflags &= ~flag;
+		lp->lp_ifflags |= status;
+	}
+	return (0);
+}
+
+/*
+ * Handle IFF_* flags that require certain changes on the lagg port:
+ * if "status" is true, update the port's flags to match the lagg's;
+ * if "status" is false, forcibly clear the flags set on the port.
+ */
+static int
+lagg_setflags(struct lagg_port *lp, int status)
+{
+	int error, i;
+
+	ASSERT_IFNET_NOT_SERIALIZED_ALL(lp->lp_ifp);
+
+	for (i = 0; lagg_pflags[i].flag; i++) {
+		error = lagg_setflag(lp, lagg_pflags[i].flag,
+		    status, lagg_pflags[i].func);
+		if (error)
+			return (error);
+	}
+	return (0);
+}
+
+#ifdef XXX	/* not needed? */
+static int
+lagg_output(struct ifnet *ifp, struct mbuf *m)
+{
+	struct lagg_softc *sc = ifp->if_softc;
+	int error, len, mcast;
+
+	len = m->m_pkthdr.len;
+	mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1 : 0;
+
+	LAGG_RLOCK(sc);
+	/* We need a Tx algorithm and at least one port */
+	if (sc->sc_proto == LAGG_PROTO_NONE || sc->sc_count == 0) {
+		LAGG_RUNLOCK(sc);
+		m_freem(m);
+		ifp->if_oerrors++;
+		return (ENXIO);
+	}
+
+	BPF_MTAP(ifp, m);
+
+	error = (*sc->sc_start)(sc, m);
+	LAGG_RUNLOCK(sc);
+
+	if (error == 0) {
+		IFNET_STAT_INC(ifp, opackets, 1);
+		IFNET_STAT_INC(ifp, obytes, len);
+		ifp->if_omcasts += mcast;
+	} else
+		ifp->if_oerrors++;
+
+	return error;
+}
+#endif
+
+#ifdef XXX
+/*
+ * The ifp->if_qflush entry point for lagg(4) is a no-op.
+ */
+static void
+lagg_qflush(struct ifnet *ifp __unused)
+{
+}
+#endif
+
+static void
+lagg_input(struct ifnet *ifp, struct mbuf *m)
+{
+	struct lagg_port *lp = ifp->if_lagg;
+	struct lagg_softc *sc = lp->lp_softc;
+	struct ifnet *scifp = sc->sc_ifp;
+
+	LAGG_RLOCK(sc);
+	if ((scifp->if_flags & IFF_RUNNING) == 0 ||
+	    (lp->lp_flags & LAGG_PORT_DISABLED) ||
+	    sc->sc_proto == LAGG_PROTO_NONE) {
+		LAGG_RUNLOCK(sc);
+		m_freem(m);
+		return;
+	}
+
+	BPF_MTAP(scifp, m);
+	m = (*sc->sc_input)(sc, lp, m);
+
+	LAGG_RUNLOCK(sc);
+
+	if (m != NULL) {
+		IFNET_STAT_INC(ifp, ipackets, 1);
+		IFNET_STAT_INC(ifp, ibytes, m->m_pkthdr.len);
+
+		if (scifp->if_flags & IFF_MONITOR) {
+			/* In monitor mode we only count and drop the frame */
+			m_freem(m);
+			return;
+		}
+		ether_reinput_oncpu(scifp, m, REINPUT_RUNBPF);
+	}
+}
+
+static int
+lagg_media_change(struct ifnet *ifp)
+{
+	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
+
+	if (sc->sc_ifflags & IFF_DEBUG)
+		kprintf("%s\n", __func__);
+	/* Ignore */
+	return (0);
+}
+
+static void
+lagg_media_status(struct ifnet *ifp, struct ifmediareq *imr)
+{
+	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
+	struct lagg_port *lp;
+
+	imr->ifm_status = IFM_AVALID;
+	imr->ifm_active = IFM_ETHER | IFM_AUTO;
+
+	LAGG_RLOCK(sc);
+	SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
+		if (LAGG_PORTACTIVE(lp))
+			imr->ifm_status |= IFM_ACTIVE;
+	}
+	LAGG_RUNLOCK(sc);
+}
+
+static void
+lagg_linkstate(struct lagg_softc *sc)
+{
+	struct lagg_port *lp;
+	int new_link = LINK_STATE_DOWN;
+	uint64_t speed;
+
+	/* Our link is considered up if at least one of our ports is active */
+	SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
+		if (lp->lp_link_state == LINK_STATE_UP) {
+			new_link = LINK_STATE_UP;
+			break;
+		}
+	}
+	sc->sc_ifp->if_link_state = new_link;
+	if_link_state_change(sc->sc_ifp);
+
+	/* Update if_baudrate to reflect the max possible speed */
+	switch (sc->sc_proto) {
+	case LAGG_PROTO_FAILOVER:
+		sc->sc_ifp->if_baudrate = sc->sc_primary != NULL ?
+ sc->sc_primary->lp_ifp->if_baudrate : 0; + break; + case LAGG_PROTO_ROUNDROBIN: + case LAGG_PROTO_LOADBALANCE: + case LAGG_PROTO_ETHERCHANNEL: + speed = 0; + SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) + speed += lp->lp_ifp->if_baudrate; + sc->sc_ifp->if_baudrate = speed; + break; + case LAGG_PROTO_LACP: + /* LACP updates if_baudrate itself */ + break; + } +} + +#if XXX +static void +lagg_port_state(struct ifnet *ifp, int state) +{ + struct lagg_port *lp = (struct lagg_port *)ifp->if_lagg; + struct lagg_softc *sc = NULL; + + if (lp != NULL) + sc = lp->lp_softc; + if (sc == NULL) + return; + + LAGG_WLOCK(sc); + lagg_linkstate(sc); + if (sc->sc_linkstate != NULL) + (*sc->sc_linkstate)(lp); + LAGG_WUNLOCK(sc); +} +#endif + +struct lagg_port * +lagg_link_active(struct lagg_softc *sc, struct lagg_port *lp) +{ + struct lagg_port *lp_next, *rval = NULL; + // int new_link = LINK_STATE_DOWN; + + LAGG_RLOCK_ASSERT(sc); + /* + * Search a port which reports an active link state. + */ + + if (lp == NULL) + goto search; + if (LAGG_PORTACTIVE(lp)) { + rval = lp; + goto found; + } + if ((lp_next = SLIST_NEXT(lp, lp_entries)) != NULL && + LAGG_PORTACTIVE(lp_next)) { + rval = lp_next; + goto found; + } + +search: + SLIST_FOREACH(lp_next, &sc->sc_ports, lp_entries) { + if (LAGG_PORTACTIVE(lp_next)) { + rval = lp_next; + goto found; + } + } + +found: + if (rval != NULL) { + /* + * The IEEE 802.1D standard assumes that a lagg with + * multiple ports is always full duplex. This is valid + * for load sharing laggs and if at least two links + * are active. Unfortunately, checking the latter would + * be too expensive at this point. + XXX + if ((sc->sc_capabilities & IFCAP_LAGG_FULLDUPLEX) && + (sc->sc_count > 1)) + new_link = LINK_STATE_FULL_DUPLEX; + else + new_link = rval->lp_link_state; + */ + } + + return (rval); +} + +static const void * +lagg_gethdr(struct mbuf *m, u_int off, u_int len, void *buf) +{ + if (m->m_pkthdr.len < (off + len)) { + return (NULL); + } else if (m->m_len < (off + len)) { + m_copydata(m, off, len, buf); + return (buf); + } + return (mtod(m, char *) + off); +} + +static int +lagg_sysctl_active(SYSCTL_HANDLER_ARGS) +{ + struct lagg_softc *sc = (struct lagg_softc *)arg1; + struct lagg_port *lp; + int error; + + /* LACP tracks active links automatically, the others do not */ + if (sc->sc_proto != LAGG_PROTO_LACP) { + sc->sc_active = 0; + SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) + sc->sc_active += LAGG_PORTACTIVE(lp); + } + + error = sysctl_handle_int(oidp, &sc->sc_active, 0, req); + if ((error) || (req->newptr == NULL)) + return (error); + + return (0); +} + +uint32_t +lagg_hashmbuf(struct lagg_softc *sc, struct mbuf *m, uint32_t key) +{ + uint16_t etype; + uint32_t p = key; + int off; + struct ether_header *eh; + const struct ether_vlan_header *vlan; +#ifdef INET + const struct ip *ip; + const uint32_t *ports; + int iphlen; +#endif +#ifdef INET6 + const struct ip6_hdr *ip6; + uint32_t flow; +#endif + union { +#ifdef INET + struct ip ip; +#endif +#ifdef INET6 + struct ip6_hdr ip6; +#endif + struct ether_vlan_header vlan; + uint32_t port; + } buf; + + + off = sizeof(*eh); + if (m->m_len < off) + goto out; + eh = mtod(m, struct ether_header *); + etype = ntohs(eh->ether_type); + if (sc->sc_flags & LAGG_F_HASHL2) { + p = hash32_buf(&eh->ether_shost, ETHER_ADDR_LEN, p); + p = hash32_buf(&eh->ether_dhost, ETHER_ADDR_LEN, p); + } + + /* Special handling for encapsulating VLAN frames */ +#if XXX + if ((m->m_flags & M_VLANTAG) && (sc->sc_flags & LAGG_F_HASHL2)) { + p = 
hash32_buf(&m->m_pkthdr.ether_vtag, + sizeof(m->m_pkthdr.ether_vtag), p); + } else +#endif + if (etype == ETHERTYPE_VLAN) { + vlan = lagg_gethdr(m, off, sizeof(*vlan), &buf); + if (vlan == NULL) + goto out; + + if (sc->sc_flags & LAGG_F_HASHL2) + p = hash32_buf(&vlan->evl_tag, sizeof(vlan->evl_tag), p); + etype = ntohs(vlan->evl_proto); + off += sizeof(*vlan) - sizeof(*eh); + } + + switch (etype) { +#ifdef INET + case ETHERTYPE_IP: + ip = lagg_gethdr(m, off, sizeof(*ip), &buf); + if (ip == NULL) + goto out; + + if (sc->sc_flags & LAGG_F_HASHL3) { + p = hash32_buf(&ip->ip_src, sizeof(struct in_addr), p); + p = hash32_buf(&ip->ip_dst, sizeof(struct in_addr), p); + } + if (!(sc->sc_flags & LAGG_F_HASHL4)) + break; + switch (ip->ip_p) { + case IPPROTO_TCP: + case IPPROTO_UDP: + case IPPROTO_SCTP: + iphlen = ip->ip_hl << 2; + if (iphlen < sizeof(*ip)) + break; + off += iphlen; + ports = lagg_gethdr(m, off, sizeof(*ports), &buf); + if (ports == NULL) + break; + p = hash32_buf(ports, sizeof(*ports), p); + break; + } + break; +#endif +#ifdef INET6 + case ETHERTYPE_IPV6: + if (!(sc->sc_flags & LAGG_F_HASHL3)) + break; + ip6 = lagg_gethdr(m, off, sizeof(*ip6), &buf); + if (ip6 == NULL) + goto out; + + p = hash32_buf(&ip6->ip6_src, sizeof(struct in6_addr), p); + p = hash32_buf(&ip6->ip6_dst, sizeof(struct in6_addr), p); + flow = ip6->ip6_flow & IPV6_FLOWLABEL_MASK; + p = hash32_buf(&flow, sizeof(flow), p); /* IPv6 flow label */ + break; +#endif + } +out: + return (p); +} + +static void +lagg_start(struct ifnet *ifp, struct ifaltq_subque *ifsq) +{ + struct lagg_softc *sc = ifp->if_softc; + struct mbuf *m; + struct ifnet *ifp_p; + struct netmsg_packet *nmp; + lwkt_port_t p_port; + + ASSERT_ALTQ_SQ_DEFAULT(ifp, ifsq); + ASSERT_ALTQ_SQ_SERIALIZED_HW(ifsq); + + if (((ifp->if_flags & IFF_RUNNING) == 0) + || (sc->sc_proto == LAGG_PROTO_NONE) + || (sc->sc_count == 0)) { + ifsq_purge(ifsq); + return; + } + + + LAGG_RLOCK(sc); + for (;;) { + m = ifsq_dequeue(ifsq); + if (m == NULL){ + break; + } + // Choose output port + ifp_p = (*sc->sc_select_tx_port)(sc, m); + + if (ifp_p == NULL) { + ifsq_purge(ifsq); + break; + } + p_port = netisr_cpuport( + ifsq_get_cpuid(ifq_get_subq_default(&ifp_p->if_snd))); + + BPF_MTAP(ifp, m); + + nmp = &m->m_hdr.mh_netmsg; + + netmsg_init(&nmp->base, NULL, &netisr_apanic_rport, + 0, lagg_start_dispatch); + nmp->nm_packet = m; + nmp->base.lmsg.u.ms_resultp = ifp_p; + + lwkt_sendmsg(p_port, &nmp->base.lmsg); + IFNET_STAT_INC(ifp, opackets, 1); + } + LAGG_RUNLOCK(sc); +} + + +static void +lagg_start_dispatch(netmsg_t msg) +{ + struct netmsg_packet *nmp = &msg->packet; + struct mbuf *m; + struct ifnet *ifp; + struct altq_pktattr pktattr; + + m = nmp->nm_packet; + ifp = msg->lmsg.u.ms_resultp; + + M_ASSERTPKTHDR(m); + + /* Does altq mix with lacp? 
+	 */
+	if (ifq_is_enabled(&ifp->if_snd))
+		altq_etherclassify(&ifp->if_snd, m, &pktattr);
+
+	ifq_dispatch(ifp, m, &pktattr);
+}
+
+int
+lagg_enqueue(struct ifnet *ifp, struct mbuf *m)
+{
+	struct altq_pktattr pktattr;
+
+	if (ifq_is_enabled(&ifp->if_snd))
+		altq_etherclassify(&ifp->if_snd, m, &pktattr);
+
+	ifq_dispatch(ifp, m, &pktattr);
+	return 0;
+}
+
+/*
+ * Simple round robin aggregation
+ */
+static int
+lagg_rr_attach(struct lagg_softc *sc)
+{
+	sc->sc_detach = lagg_rr_detach;
+	sc->sc_input = lagg_rr_input;
+	sc->sc_select_tx_port = lagg_rr_select_tx_port;
+	sc->sc_port_create = NULL;
+	sc->sc_capabilities = IFCAP_LAGG_FULLDUPLEX;
+	sc->sc_seq = 0;
+
+	return (0);
+}
+
+static int
+lagg_rr_detach(struct lagg_softc *sc)
+{
+	return (0);
+}
+
+static struct ifnet *
+lagg_rr_select_tx_port(struct lagg_softc *sc, struct mbuf *m)
+{
+	struct lagg_port *lp;
+	uint32_t p;
+
+	p = atomic_fetchadd_32(&sc->sc_seq, 1);
+	p %= sc->sc_count;
+	lp = SLIST_FIRST(&sc->sc_ports);
+	while (p--)
+		lp = SLIST_NEXT(lp, lp_entries);
+
+	/*
+	 * Check the port's link state.  This will return the next active
+	 * port if the link is down or the port is NULL.
+	 */
+	if ((lp = lagg_link_active(sc, lp)) == NULL) {
+		return (NULL);
+	}
+
+	return (lp->lp_ifp);
+}
+
+static struct mbuf *
+lagg_rr_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
+{
+	struct ifnet *ifp = sc->sc_ifp;
+
+	/* Just pass in the packet to our lagg device */
+	m->m_pkthdr.rcvif = ifp;
+
+	return (m);
+}
+
+/*
+ * Active failover
+ */
+
+static int
+lagg_fail_attach(struct lagg_softc *sc)
+{
+	sc->sc_detach = lagg_fail_detach;
+	sc->sc_select_tx_port = lagg_fail_select_tx_port;
+	sc->sc_input = lagg_fail_input;
+	sc->sc_port_create = NULL;
+	sc->sc_port_destroy = NULL;
+
+	return (0);
+}
+
+static int
+lagg_fail_detach(struct lagg_softc *sc)
+{
+	return (0);
+}
+
+struct ifnet *
+lagg_fail_select_tx_port(struct lagg_softc *sc, struct mbuf *m)
+{
+	struct lagg_port *lp;
+
+	if ((lp = lagg_link_active(sc, sc->sc_primary)) == NULL)
+		return NULL;
+
+	return lp->lp_ifp;
+}
+
+static struct mbuf *
+lagg_fail_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
+{
+	struct ifnet *ifp = sc->sc_ifp;
+	struct lagg_port *tmp_tp;
+
+	if (lp == sc->sc_primary || lagg_failover_rx_all) {
+		m->m_pkthdr.rcvif = ifp;
+		return (m);
+	}
+
+	if (!LAGG_PORTACTIVE(sc->sc_primary)) {
+		tmp_tp = lagg_link_active(sc, sc->sc_primary);
+		/*
+		 * If tmp_tp is NULL, we have received a packet while all
+		 * of our links are down.  Weird, but process it anyway.
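+		 * (Note: when the lagg_failover_rx_all knob used in the
+		 * first test above is set, frames arriving on any port
+		 * have already been accepted at that point.)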
+ */ + if ((tmp_tp == NULL || tmp_tp == lp)) { + m->m_pkthdr.rcvif = ifp; + return (m); + } + } + + m_freem(m); + return (NULL); +} + +/* + * Loadbalancing + */ + +static int +lagg_lb_attach(struct lagg_softc *sc) +{ + struct lagg_port *lp; + struct lagg_lb *lb; + + if ((lb = (struct lagg_lb *)kmalloc(sizeof(struct lagg_lb), + M_DEVBUF, M_NOWAIT|M_ZERO)) == NULL) + return (ENOMEM); + + sc->sc_detach = lagg_lb_detach; + sc->sc_select_tx_port = lagg_lb_select_tx_port; + sc->sc_input = lagg_lb_input; + sc->sc_port_create = lagg_lb_port_create; + sc->sc_port_destroy = lagg_lb_port_destroy; + sc->sc_capabilities = IFCAP_LAGG_FULLDUPLEX; + + lb->lb_key = karc4random(); + sc->sc_psc = (caddr_t)lb; + + SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) + lagg_lb_port_create(lp); + + return (0); +} + +static int +lagg_lb_detach(struct lagg_softc *sc) +{ + struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc; + if (lb != NULL) + kfree(lb, M_DEVBUF); + return (0); +} + +static int +lagg_lb_porttable(struct lagg_softc *sc, struct lagg_port *lp) +{ + struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc; + struct lagg_port *lp_next; + int i = 0; + + bzero(&lb->lb_ports, sizeof(lb->lb_ports)); + SLIST_FOREACH(lp_next, &sc->sc_ports, lp_entries) { + if (lp_next == lp) + continue; + if (i >= LAGG_MAX_PORTS) + return (EINVAL); + if (sc->sc_ifflags & IFF_DEBUG) + kprintf("%s: port %s at index %d\n", + sc->sc_ifname, lp_next->lp_ifname, i); + lb->lb_ports[i++] = lp_next; + } + + return (0); +} + +static int +lagg_lb_port_create(struct lagg_port *lp) +{ + struct lagg_softc *sc = lp->lp_softc; + return (lagg_lb_porttable(sc, NULL)); +} + +static void +lagg_lb_port_destroy(struct lagg_port *lp) +{ + struct lagg_softc *sc = lp->lp_softc; + lagg_lb_porttable(sc, lp); +} + + +struct ifnet * +lagg_lb_select_tx_port(struct lagg_softc *sc, struct mbuf *m) +{ + struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc; + struct lagg_port *lp = NULL; + uint32_t p = 0; + + /* XXX + if (sc->use_flowid && (m->m_flags & M_FLOWID)) + p = m->m_pkthdr.flowid; + else + */ + p = lagg_hashmbuf(sc, m, lb->lb_key); + p %= sc->sc_count; + lp = lb->lb_ports[p]; + + /* + * Check the port's link state. This will return the next active + * port if the link is down or the port is NULL. 
+ */ + if ((lp = lagg_link_active(sc, lp)) == NULL) + return NULL; + + return lp->lp_ifp; +} + +static struct mbuf * +lagg_lb_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) +{ + struct ifnet *ifp = sc->sc_ifp; + + /* Just pass in the packet to our lagg device */ + m->m_pkthdr.rcvif = ifp; + + return (m); +} + +/* + * 802.3ad LACP + */ +static int +lagg_lacp_attach(struct lagg_softc *sc) +{ + struct lagg_port *lp; + int error; + + sc->sc_detach = lagg_lacp_detach; + sc->sc_port_create = lacp_port_create; + sc->sc_port_destroy = lacp_port_destroy; + sc->sc_linkstate = lacp_linkstate; + sc->sc_select_tx_port = lagg_lacp_select_tx_port; + sc->sc_input = lagg_lacp_input; + sc->sc_init = lacp_init; + sc->sc_stop = lacp_stop; + sc->sc_lladdr = lagg_lacp_lladdr; + sc->sc_req = lacp_req; + sc->sc_portreq = lacp_portreq; + + error = lacp_attach(sc); + if (error) + return (error); + + SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) + lacp_port_create(lp); + + return (error); +} + +static int +lagg_lacp_detach(struct lagg_softc *sc) +{ + struct lagg_port *lp; + int error; + + SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) + lacp_port_destroy(lp); + + /* unlocking is safe here */ + LAGG_WUNLOCK(sc); + error = lacp_detach(sc); + LAGG_WLOCK(sc); + + return (error); +} + +static void +lagg_lacp_lladdr(struct lagg_softc *sc) +{ + struct lagg_port *lp; + + /* purge all the lacp ports */ + SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) + lacp_port_destroy(lp); + + /* add them back in */ + SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) + lacp_port_create(lp); +} + +struct ifnet * +lagg_lacp_select_tx_port(struct lagg_softc *sc, struct mbuf *m) +{ + struct lagg_port *lp; + lp = lacp_select_tx_port(sc, m); + if (lp == NULL) + return NULL; + + return lp->lp_ifp; +} + +static struct mbuf * +lagg_lacp_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) +{ + struct ifnet *ifp = sc->sc_ifp; + struct ether_header *eh; + u_short etype; + + eh = mtod(m, struct ether_header *); + etype = ntohs(eh->ether_type); + + /* Tap off LACP control messages */ + if ((m->m_flags & M_VLANTAG) == 0 && etype == ETHERTYPE_SLOW) { + m = lacp_input(lp, m); + if (m == NULL) + return (NULL); + } + + /* + * If the port is not collecting or not in the active aggregator then + * free and return. + */ + if (lacp_iscollecting(lp) == 0 || lacp_isactive(lp) == 0) { + m_freem(m); + return (NULL); + } + + m->m_pkthdr.rcvif = ifp; + return (m); +} + +static void +lagg_callout(void *arg) +{ + struct lagg_softc *sc = (struct lagg_softc *)arg; +#if XXX + struct ifnet *ifp = sc->sc_ifp; + + ifp->if_ipackets = counter_u64_fetch(sc->sc_ipackets); + ifp->if_opackets = counter_u64_fetch(sc->sc_opackets); + ifp->if_ibytes = counter_u64_fetch(sc->sc_ibytes); + ifp->if_obytes = counter_u64_fetch(sc->sc_obytes); +#endif + + callout_reset(&sc->sc_callout, hz, lagg_callout, sc); +} diff --git a/sys/net/lagg/if_lagg.h b/sys/net/lagg/if_lagg.h new file mode 100644 index 0000000000..3707bb4d4c --- /dev/null +++ b/sys/net/lagg/if_lagg.h @@ -0,0 +1,295 @@ +/* $OpenBSD: if_trunk.h,v 1.11 2007/01/31 06:20:19 reyk Exp $ */ + +/* + * Copyright (c) 2005, 2006 Reyk Floeter + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * $FreeBSD$ + */ + +#ifndef _NET_LAGG_H +#define _NET_LAGG_H + +#include +#include + +/* + * Global definitions + */ + +#define LAGG_MAX_PORTS 32 /* logically */ +#define LAGG_MAX_NAMESIZE 32 /* name of a protocol */ +#define LAGG_MAX_STACKING 4 /* maximum number of stacked laggs */ + +/* Lagg flags */ +#define LAGG_F_HASHL2 0x00000001 /* hash layer 2 */ +#define LAGG_F_HASHL3 0x00000002 /* hash layer 3 */ +#define LAGG_F_HASHL4 0x00000004 /* hash layer 4 */ +#define LAGG_F_HASHMASK 0x00000007 + +/* Port flags */ +#define LAGG_PORT_SLAVE 0x00000000 /* normal enslaved port */ +#define LAGG_PORT_MASTER 0x00000001 /* primary port */ +#define LAGG_PORT_STACK 0x00000002 /* stacked lagg port */ +#define LAGG_PORT_ACTIVE 0x00000004 /* port is active */ +#define LAGG_PORT_COLLECTING 0x00000008 /* port is receiving frames */ +#define LAGG_PORT_DISTRIBUTING 0x00000010 /* port is sending frames */ +#define LAGG_PORT_DISABLED 0x00000020 /* port is disabled */ +#define LAGG_PORT_BITS "\20\01MASTER\02STACK\03ACTIVE\04COLLECTING" \ + "\05DISTRIBUTING\06DISABLED" + +/* Supported lagg PROTOs */ +#define LAGG_PROTO_NONE 0 /* no lagg protocol defined */ +#define LAGG_PROTO_ROUNDROBIN 1 /* simple round robin */ +#define LAGG_PROTO_FAILOVER 2 /* active failover */ +#define LAGG_PROTO_LOADBALANCE 3 /* loadbalance */ +#define LAGG_PROTO_LACP 4 /* 802.3ad lacp */ +#define LAGG_PROTO_ETHERCHANNEL 5 /* Cisco FEC */ +#define LAGG_PROTO_MAX 6 + +struct lagg_protos { + const char *lpr_name; + int lpr_proto; +}; + +#define LAGG_PROTO_DEFAULT LAGG_PROTO_FAILOVER +#define LAGG_PROTOS { \ + { "failover", LAGG_PROTO_FAILOVER }, \ + { "fec", LAGG_PROTO_ETHERCHANNEL }, \ + { "lacp", LAGG_PROTO_LACP }, \ + { "loadbalance", LAGG_PROTO_LOADBALANCE }, \ + { "roundrobin", LAGG_PROTO_ROUNDROBIN }, \ + { "none", LAGG_PROTO_NONE }, \ + { "default", LAGG_PROTO_DEFAULT } \ +} + +/* + * lagg ioctls. + */ + +/* + * LACP current operational parameters structure. 
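+ * The actor_* and partner_* fields below carry the system priority,
+ * MAC address, aggregation key, port priority, port number and state
+ * that LACP advertises in its PDUs.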
+ */ +struct lacp_opreq { + uint16_t actor_prio; + uint8_t actor_mac[ETHER_ADDR_LEN]; + uint16_t actor_key; + uint16_t actor_portprio; + uint16_t actor_portno; + uint8_t actor_state; + uint16_t partner_prio; + uint8_t partner_mac[ETHER_ADDR_LEN]; + uint16_t partner_key; + uint16_t partner_portprio; + uint16_t partner_portno; + uint8_t partner_state; +}; + +/* lagg port settings */ +struct lagg_reqport { + char rp_ifname[IFNAMSIZ]; /* name of the lagg */ + char rp_portname[IFNAMSIZ]; /* name of the port */ + u_int32_t rp_prio; /* port priority */ + u_int32_t rp_flags; /* port flags */ + union { + struct lacp_opreq rpsc_lacp; + } rp_psc; +#define rp_lacpreq rp_psc.rpsc_lacp +}; + +#define SIOCGLAGGPORT _IOWR('i', 140, struct lagg_reqport) +#define SIOCSLAGGPORT _IOW('i', 141, struct lagg_reqport) +#define SIOCSLAGGDELPORT _IOW('i', 142, struct lagg_reqport) + +/* lagg, ports and options */ +struct lagg_reqall { + char ra_ifname[IFNAMSIZ]; /* name of the lagg */ + u_int ra_proto; /* lagg protocol */ + + size_t ra_size; /* size of buffer */ + struct lagg_reqport *ra_port; /* allocated buffer */ + int ra_ports; /* total port count */ + union { + struct lacp_opreq rpsc_lacp; + } ra_psc; +#define ra_lacpreq ra_psc.rpsc_lacp +}; + +#define SIOCGLAGG _IOWR('i', 143, struct lagg_reqall) +#define SIOCSLAGG _IOW('i', 144, struct lagg_reqall) + +struct lagg_reqflags { + char rf_ifname[IFNAMSIZ]; /* name of the lagg */ + uint32_t rf_flags; /* lagg protocol */ +}; + +#define SIOCGLAGGFLAGS _IOWR('i', 145, struct lagg_reqflags) +#define SIOCSLAGGHASH _IOW('i', 146, struct lagg_reqflags) + +#ifdef _KERNEL + +#if 0 +#include +#endif + +/* + * Internal kernel part + */ + +#define lp_ifname lp_ifp->if_xname /* interface name */ +#define lp_link_state lp_ifp->if_link_state /* link state */ + +#define LAGG_PORTACTIVE(_tp) ( \ + ((_tp)->lp_link_state == LINK_STATE_UP) && \ + ((_tp)->lp_ifp->if_flags & IFF_UP) \ +) + +struct lagg_ifreq { + union { + struct ifreq ifreq; + struct { + char ifr_name[IFNAMSIZ]; + struct sockaddr_storage ifr_ss; + } ifreq_storage; + } ifreq; +}; + +#define sc_ifflags sc_ifp->if_flags /* flags */ +#define sc_ifname sc_ifp->if_xname /* name */ +#define sc_capabilities sc_ifp->if_capabilities /* capabilities */ + +#define IFCAP_LAGG_MASK 0xffff0000 /* private capabilities */ +#define IFCAP_LAGG_FULLDUPLEX 0x00010000 /* full duplex with >1 ports */ + +/* Private data used by the loadbalancing protocol */ +struct lagg_lb { + u_int32_t lb_key; + struct lagg_port *lb_ports[LAGG_MAX_PORTS]; +}; + +struct lagg_mc { + struct ifmultiaddr *mc_ifma; + SLIST_ENTRY(lagg_mc) mc_entries; +}; + +/* List of interfaces to have the MAC address modified */ +struct lagg_llq { + struct ifnet *llq_ifp; + uint8_t llq_lladdr[ETHER_ADDR_LEN]; + SLIST_ENTRY(lagg_llq) llq_entries; +}; + +struct lagg_softc { + struct arpcom sc_arp; + struct ifnet *sc_ifp; /* virtual interface */ + struct lock sc_lock; + struct lock sc_call_lock; + int sc_proto; /* lagg protocol */ + u_int sc_count; /* number of ports */ + u_int sc_active; /* active port count */ + u_int sc_flapping; /* number of flapping + * events */ + struct lagg_port *sc_primary; /* primary port */ + struct ifmedia sc_media; /* media config */ + caddr_t sc_psc; /* protocol data */ + uint32_t sc_seq; /* sequence counter */ + uint32_t sc_flags; + +#if 0 + counter_u64_t sc_ipackets; + counter_u64_t sc_opackets; + counter_u64_t sc_ibytes; + counter_u64_t sc_obytes; +#endif + + SLIST_HEAD(__tplhd, lagg_port) sc_ports; /* list of interfaces */ + SLIST_ENTRY(lagg_softc) 
	sc_entries;
+
+	struct task			sc_lladdr_task;
+	SLIST_HEAD(__llqhd, lagg_llq)	sc_llq_head;	/* interfaces to program
+							   the lladdr on */
+
+	/* lagg protocol callbacks */
+	int	(*sc_detach)(struct lagg_softc *);
+	int	(*sc_start)(struct lagg_softc *, struct mbuf *);
+	void	(*sc_start_dispatch)(netmsg_t msg);
+	struct ifnet *(*sc_select_tx_port)(struct lagg_softc *, struct mbuf *);
+	struct mbuf *(*sc_input)(struct lagg_softc *, struct lagg_port *,
+		    struct mbuf *);
+	int	(*sc_port_create)(struct lagg_port *);
+	void	(*sc_port_destroy)(struct lagg_port *);
+	void	(*sc_linkstate)(struct lagg_port *);
+	void	(*sc_init)(struct lagg_softc *);
+	void	(*sc_stop)(struct lagg_softc *);
+	void	(*sc_lladdr)(struct lagg_softc *);
+	void	(*sc_req)(struct lagg_softc *, caddr_t);
+	void	(*sc_portreq)(struct lagg_port *, caddr_t);
+	eventhandler_tag vlan_attach;
+	eventhandler_tag vlan_detach;
+	struct callout			sc_callout;
+	struct sysctl_ctx_list		ctx;		/* sysctl variables */
+	struct sysctl_oid		*sc_oid;	/* sysctl tree oid */
+	int				use_flowid;	/* use M_FLOWID */
+};
+#define	sc_if	sc_arp.ac_if
+
+struct lagg_port {
+	struct ifnet			*lp_ifp;	/* physical interface */
+	struct lagg_softc		*lp_softc;	/* parent lagg */
+	uint8_t				lp_lladdr[ETHER_ADDR_LEN];
+
+	u_char				lp_iftype;	/* interface type */
+	uint32_t			lp_prio;	/* port priority */
+	uint32_t			lp_flags;	/* port flags */
+	int				lp_ifflags;	/* saved ifp flags */
+	void				*lh_cookie;	/* if state hook */
+	caddr_t				lp_psc;		/* protocol data */
+	int				lp_detaching;	/* ifnet is detaching */
+
+	SLIST_HEAD(__mclhd, lagg_mc)	lp_mc_head;	/* multicast addresses */
+
+	/* Redirected callbacks */
+	int	(*lp_ioctl)(struct ifnet *, u_long, caddr_t, struct ucred *);
+	int	(*lp_output)(struct ifnet *, struct mbuf *,
+		    struct sockaddr *, struct rtentry *);
+
+	SLIST_ENTRY(lagg_port)		lp_entries;
+};
+
+#define	LAGG_LOCK_INIT(_sc)	lockinit(&(_sc)->sc_lock, "if_lagg rmlock", 0, LK_CANRECURSE)
+#define	LAGG_LOCK_DESTROY(_sc)	lockuninit(&(_sc)->sc_lock)
+#define	LAGG_RLOCK(_sc)		lockmgr(&(_sc)->sc_lock, LK_SHARED)
+#define	LAGG_WLOCK(_sc)		lockmgr(&(_sc)->sc_lock, LK_EXCLUSIVE)
+#define	LAGG_RUNLOCK(_sc)	lockmgr(&(_sc)->sc_lock, LK_RELEASE)
+#define	LAGG_WUNLOCK(_sc)	lockmgr(&(_sc)->sc_lock, LK_RELEASE)
+#define	LAGG_RLOCK_ASSERT(_sc)	KKASSERT(lockstatus(&(_sc)->sc_lock, curthread) == LK_SHARED)
+#define	LAGG_WLOCK_ASSERT(_sc)	KKASSERT(lockstatus(&(_sc)->sc_lock, curthread) == LK_EXCLUSIVE)
+
+#define	LAGG_CALLOUT_LOCK_INIT(_sc)	\
+	lockinit(&(_sc)->sc_call_lock, "if_lagg callout mutex", 0, 0)
+#define	LAGG_CALLOUT_LOCK_DESTROY(_sc)	lockuninit(&(_sc)->sc_call_lock)
+
+extern void	(*lagg_input_p)(struct ifnet *, struct mbuf *);
+extern int	(*lagg_output_p)(struct ifnet *, struct mbuf *);
+extern struct ifnet *(*lagg_interface_p)(void *if_lagg);
+extern void	(*lagg_linkstate_p)(struct ifnet *, int);
+
+int	lagg_enqueue(struct ifnet *, struct mbuf *);
+uint32_t lagg_hashmbuf(struct lagg_softc *, struct mbuf *, uint32_t);
+
+SYSCTL_DECL(_net_link_lagg);
+
+#endif /* _KERNEL */
+
+#endif /* _NET_LAGG_H */
diff --git a/sys/netinet6/in6.c b/sys/netinet6/in6.c
index cf5bc6c993..952b1616d2 100644
--- a/sys/netinet6/in6.c
+++ b/sys/netinet6/in6.c
@@ -1823,6 +1823,32 @@ in6ifa_ifpwithaddr(struct ifnet *ifp, struct in6_addr *addr)
 	return (NULL);
 }
 
+/*
+ * Find a link-local scoped address on ifp and return it if any.
+ */ +struct in6_ifaddr * +in6ifa_llaonifp(struct ifnet *ifp) +{ + struct sockaddr_in6 *sin6; + struct ifaddr_container *ifac; + + TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) { + struct ifaddr *ifa = ifac->ifa; + + if (ifa->ifa_addr->sa_family != AF_INET6) + continue; + sin6 = (struct sockaddr_in6 *)ifa->ifa_addr; + if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) || + IN6_IS_ADDR_MC_INTFACELOCAL(&sin6->sin6_addr) || + IN6_IS_ADDR_MC_NODELOCAL(&sin6->sin6_addr)) + break; + } + if (ifac != NULL) + return ((struct in6_ifaddr *)(ifac->ifa)); + else + return (NULL); +} + /* * find the internet address on a given interface corresponding to a neighbor's * address. diff --git a/sys/netinet6/in6_var.h b/sys/netinet6/in6_var.h index 7c7ae4c141..961a5fb8d3 100644 --- a/sys/netinet6/in6_var.h +++ b/sys/netinet6/in6_var.h @@ -630,6 +630,7 @@ struct in6_ifaddr *in6ifa_ifpwithaddr (struct ifnet *, struct in6_addr *); struct in6_ifaddr *in6ifa_ifplocaladdr(const struct ifnet *, const struct in6_addr *); +struct in6_ifaddr *in6ifa_llaonifp(struct ifnet *); char *ip6_sprintf (const struct in6_addr *); int in6_addr2scopeid (struct ifnet *, struct in6_addr *); int in6_matchlen (struct in6_addr *, struct in6_addr *);
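
As a quick userland sanity check of the interface imported above, the
standalone sketch below queries an existing lagg through the SIOCGLAGG
ioctl declared in if_lagg.h.  It is illustrative only and not part of this
commit; the interface name "lagg0" (assumed to have been created already,
e.g. with ifconfig) and the use of an AF_LOCAL datagram socket for the
ioctl are assumptions.

/*
 * Illustrative sketch, not part of this commit: dump the protocol and
 * member ports of an existing lagg ("lagg0" is an assumption).
 */
#include <sys/param.h>
#include <sys/ioctl.h>
#include <sys/socket.h>

#include <net/if.h>
#include <net/ethernet.h>
#include <net/lagg/if_lagg.h>

#include <err.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct lagg_reqport rpbuf[LAGG_MAX_PORTS];
	struct lagg_reqall ra;
	int i, s;

	if ((s = socket(AF_LOCAL, SOCK_DGRAM, 0)) < 0)
		err(1, "socket");

	memset(&ra, 0, sizeof(ra));
	memset(rpbuf, 0, sizeof(rpbuf));
	strlcpy(ra.ra_ifname, "lagg0", sizeof(ra.ra_ifname));
	ra.ra_size = sizeof(rpbuf);
	ra.ra_port = rpbuf;

	/*
	 * The kernel fills in ra_proto and ra_ports, and copies out up to
	 * ra_size bytes of lagg_reqport records (see the SIOCGLAGG case
	 * in lagg_ioctl above).
	 */
	if (ioctl(s, SIOCGLAGG, &ra) == -1)
		err(1, "SIOCGLAGG");

	printf("%s: proto %u, %d port(s)\n", ra.ra_ifname, ra.ra_proto,
	    ra.ra_ports);
	for (i = 0; i < ra.ra_ports; i++)
		printf("\tlaggport %s flags 0x%x\n", rpbuf[i].rp_portname,
		    rpbuf[i].rp_flags);

	close(s);
	return (0);
}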