From: Joerg Sonnenberger
Date: Fri, 11 Feb 2005 22:26:35 +0000 (+0000)
Subject: Import ALTQ support from KAME. This is based on the FreeBSD 4 snapshot.
X-Git-Tag: v2.0.1~8826
X-Git-Url: https://gitweb.dragonflybsd.org/dragonfly.git/commitdiff_plain/4d723e5a41b92a42a089a7782abac804f66a0134

Import ALTQ support from KAME. This is based on the FreeBSD 4 snapshot.

This includes neither the ALTQ3 compat code nor the !DragonFly defines.
The macros have been replaced with inline functions in net/ifq_var.h.
This also renames pkthdr.pf_flags to fw_flags, as it is intended as a
general flag field.

Currently supported are ppp(4), sppp(4), tun(4) and wi(4); more drivers
will follow.

Reviewed-by: corecode, dillon, hsu
Comments-from: hmp
---

diff --git a/sys/conf/files b/sys/conf/files
index a6717a6ed7..552b442051 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -1,5 +1,5 @@
 # $FreeBSD: src/sys/conf/files,v 1.340.2.137 2003/06/04 17:10:30 sam Exp $
-# $DragonFly: src/sys/conf/files,v 1.85 2005/01/31 23:44:34 joerg Exp $
+# $DragonFly: src/sys/conf/files,v 1.86 2005/02/11 22:25:56 joerg Exp $
 #
 # The long compile-with and dependency lines are required because of
 # limitations in config: backslash-newline doesn't work in strings, and
@@ -777,6 +777,13 @@ vfs/ntfs/ntfs_vnops.c	optional	ntfs
 vfs/ntfs/ntfs_subr.c	optional	ntfs
 vfs/ntfs/ntfs_compr.c	optional	ntfs
 vfs/ntfs/ntfs_ihash.c	optional	ntfs
+net/altq/altq_cbq.c	optional	altq
+net/altq/altq_hfsc.c	optional	altq
+net/altq/altq_priq.c	optional	altq
+net/altq/altq_red.c	optional	altq
+net/altq/altq_rio.c	optional	altq
+net/altq/altq_rmclass.c	optional	altq
+net/altq/altq_subr.c	optional	altq
 net/bpf.c		standard
 net/bpf_filter.c	optional	bpf
 net/bridge/bridge.c	optional	bridge
diff --git a/sys/conf/options b/sys/conf/options
index 072bf399e9..9d51423b5d 100644
--- a/sys/conf/options
+++ b/sys/conf/options
@@ -1,5 +1,5 @@
 # $FreeBSD: src/sys/conf/options,v 1.191.2.53 2003/06/04 17:56:58 sam Exp $
-# $DragonFly: src/sys/conf/options,v 1.29 2005/01/31 23:44:34 joerg Exp $
+# $DragonFly: src/sys/conf/options,v 1.30 2005/02/11 22:25:56 joerg Exp $
 #
 # On the handling of kernel options
 #
@@ -256,6 +256,16 @@ ISP_TARGET_MODE	opt_isp.h
 ATA_STATIC_ID	opt_ata.h
 
 # Net stuff.
+# altq stuff
+ALTQ		opt_global.h
+ALTQ_CBQ	opt_altq.h
+ALTQ_RED	opt_altq.h
+ALTQ_RIO	opt_altq.h
+ALTQ_HFSC	opt_altq.h
+ALTQ_PRIQ	opt_altq.h
+ALTQ_NOPCC	opt_altq.h
+ALTQ_DEBUG	opt_altq.h
+
 ACCEPT_FILTER_DATA
 ACCEPT_FILTER_HTTP
 BOOTP		opt_bootp.h
diff --git a/sys/config/LINT b/sys/config/LINT
index 07b04b8831..1a39e0bb18 100644
--- a/sys/config/LINT
+++ b/sys/config/LINT
@@ -3,7 +3,7 @@
 # as much of the source tree as it can.
 #
 # $FreeBSD: src/sys/i386/conf/LINT,v 1.749.2.144 2003/06/04 17:56:59 sam Exp $
-# $DragonFly: src/sys/config/LINT,v 1.46 2005/01/31 23:44:35 joerg Exp $
+# $DragonFly: src/sys/config/LINT,v 1.47 2005/02/11 22:25:56 joerg Exp $
 #
 # NB: You probably don't want to try running a kernel built from this
 # file.
Instead, you should start from GENERIC, and add options from @@ -2796,3 +2796,16 @@ options KTR_COMPILE=(KTR_ALL) # Every trace class, see sys/ktr.h for # the different class numbers options KTR_ENTRIES=1024 options KTR_VERBOSE=1 + +# ALTQ +options ALTQ #alternate queueing +options ALTQ_CBQ #class based queueing +options ALTQ_RED #random early detection +options ALTQ_RIO #triple red for diffserv (needs RED) +options ALTQ_HFSC #hierarchical fair service curve +options ALTQ_PRIQ #priority queue +#options ALTQ_NOPCC #don't use processor cycle counter +options ALTQ_DEBUG #for debugging +# you might want to set kernel timer to 1kHz if you use CBQ, +# especially with 100baseT +#options HZ=1000 diff --git a/sys/dev/netif/cm/smc90cx6.c b/sys/dev/netif/cm/smc90cx6.c index b73f2a57a4..20ac62b971 100644 --- a/sys/dev/netif/cm/smc90cx6.c +++ b/sys/dev/netif/cm/smc90cx6.c @@ -1,6 +1,6 @@ /* $NetBSD: smc90cx6.c,v 1.38 2001/07/07 15:57:53 thorpej Exp $ */ /* $FreeBSD: src/sys/dev/cm/smc90cx6.c,v 1.1.2.3 2003/02/05 18:42:14 fjoe Exp $ */ -/* $DragonFly: src/sys/dev/netif/cm/Attic/smc90cx6.c,v 1.10 2004/07/23 07:16:25 joerg Exp $ */ +/* $DragonFly: src/sys/dev/netif/cm/Attic/smc90cx6.c,v 1.11 2005/02/11 22:25:56 joerg Exp $ */ /*- * Copyright (c) 1994, 1995, 1998 The NetBSD Foundation, Inc. @@ -318,7 +318,6 @@ cm_attach(dev) ifp->if_ioctl = cm_ioctl; ifp->if_watchdog = cm_watchdog; ifp->if_init = cm_init; - /* XXX IFQ_SET_READY(&ifp->if_snd); */ ifp->if_snd.ifq_maxlen = IFQ_MAXLEN; ifp->if_timer = 0; ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX; diff --git a/sys/dev/netif/owi/owi_hostap.c b/sys/dev/netif/owi/owi_hostap.c index 5ffee953da..6b6fa171b1 100644 --- a/sys/dev/netif/owi/owi_hostap.c +++ b/sys/dev/netif/owi/owi_hostap.c @@ -30,7 +30,7 @@ * THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD: src/sys/dev/wi/wi_hostap.c,v 1.7.2.4 2002/08/02 07:11:34 imp Exp $ - * $DragonFly: src/sys/dev/netif/owi/Attic/owi_hostap.c,v 1.2 2004/09/15 00:21:09 joerg Exp $ + * $DragonFly: src/sys/dev/netif/owi/Attic/owi_hostap.c,v 1.3 2005/02/11 22:25:56 joerg Exp $ */ /* This is experimental Host AP software for Prism 2 802.11b interfaces. @@ -65,6 +65,7 @@ #include #include +#include #include #include #include @@ -1113,7 +1114,7 @@ owihap_data_input(struct wi_softc *sc, struct wi_frame *rxfrm, struct mbuf *m) /* Queue up for repeating. */ - IF_HANDOFF(&ifp->if_snd, m, ifp); + ifq_handoff(ifp, m, NULL); return (!mcast); } diff --git a/sys/dev/netif/pdq_layer/pdq_ifsubr.c b/sys/dev/netif/pdq_layer/pdq_ifsubr.c index 7e4105c6c1..c2b4a884df 100644 --- a/sys/dev/netif/pdq_layer/pdq_ifsubr.c +++ b/sys/dev/netif/pdq_layer/pdq_ifsubr.c @@ -22,7 +22,7 @@ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD: src/sys/dev/pdq/pdq_ifsubr.c,v 1.11.2.1 2000/08/02 22:39:30 peter Exp $ - * $DragonFly: src/sys/dev/netif/pdq_layer/Attic/pdq_ifsubr.c,v 1.9 2005/02/10 00:09:17 joerg Exp $ + * $DragonFly: src/sys/dev/netif/pdq_layer/Attic/pdq_ifsubr.c,v 1.10 2005/02/11 22:25:56 joerg Exp $ * */ @@ -364,7 +364,7 @@ pdq_ifattach( ifp->if_ioctl = pdq_ifioctl; ifp->if_start = pdq_ifstart; - IFQ_SET_MAXLEN(&ifp->if_snd, IFQ_MAXLEN); + ifp->if_snd.ifq_maxlen = IFQ_MAXLEN; fddi_ifattach(ifp); } diff --git a/sys/dev/netif/re/if_re.c b/sys/dev/netif/re/if_re.c index 878a27b0c7..1bc70a24d9 100644 --- a/sys/dev/netif/re/if_re.c +++ b/sys/dev/netif/re/if_re.c @@ -33,7 +33,7 @@ * THE POSSIBILITY OF SUCH DAMAGE. 
* * $FreeBSD: src/sys/dev/re/if_re.c,v 1.25 2004/06/09 14:34:01 naddy Exp $ - * $DragonFly: src/sys/dev/netif/re/if_re.c,v 1.7 2005/01/25 19:35:11 dillon Exp $ + * $DragonFly: src/sys/dev/netif/re/if_re.c,v 1.8 2005/02/11 22:25:56 joerg Exp $ */ /* @@ -122,6 +122,7 @@ #include #include +#include #include #include #include @@ -630,11 +631,15 @@ re_diag(struct re_softc *sc) /* * Queue the packet, start transmission. - * Note: IF_HANDOFF() ultimately calls re_start() for us. + * Note: ifq_handoff() ultimately calls re_start() for us. */ CSR_WRITE_2(sc, RE_ISR, 0xFFFF); - IF_HANDOFF(&ifp->if_snd, m0, ifp); + error = ifq_handoff(ifp, m0, NULL); + if (error) { + m0 = NULL; + goto done; + } m0 = NULL; /* Wait for it to propagate through the chip */ diff --git a/sys/dev/netif/wi/if_wi.c b/sys/dev/netif/wi/if_wi.c index 5c05db4020..bfdb18a1cb 100644 --- a/sys/dev/netif/wi/if_wi.c +++ b/sys/dev/netif/wi/if_wi.c @@ -32,7 +32,7 @@ * THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD: src/sys/dev/wi/if_wi.c,v 1.166 2004/04/01 00:38:45 sam Exp $ - * $DragonFly: src/sys/dev/netif/wi/if_wi.c,v 1.19 2005/02/02 14:14:20 joerg Exp $ + * $DragonFly: src/sys/dev/netif/wi/if_wi.c,v 1.20 2005/02/11 22:25:56 joerg Exp $ */ /* @@ -93,6 +93,7 @@ #include #include #include +#include #include #include @@ -291,7 +292,8 @@ wi_attach(device_t dev) ifp->if_start = wi_start; ifp->if_watchdog = wi_watchdog; ifp->if_init = wi_init; - ifp->if_snd.ifq_maxlen = IFQ_MAXLEN; + ifq_set_maxlen(&ifp->if_snd, IFQ_MAXLEN); + ifq_set_ready(&ifp->if_snd); #ifdef DEVICE_POLLING ifp->if_capabilities |= IFCAP_POLLING; #endif @@ -548,8 +550,7 @@ wi_poll(struct ifnet *ifp, enum poll_cmd cmd, int count) } if ((ifp->if_flags & IFF_OACTIVE) == 0 && - (sc->sc_flags & WI_FLAGS_OUTRANGE) == 0 && - IF_QLEN(&ifp->if_snd) != NULL) + (sc->sc_flags & WI_FLAGS_OUTRANGE) == 0 && !ifq_is_empty(&ifp->if_snd)) wi_start(ifp); } #endif /* DEVICE_POLLING */ @@ -595,7 +596,7 @@ wi_intr(void *arg) wi_info_intr(sc); if ((ifp->if_flags & IFF_OACTIVE) == 0 && (sc->sc_flags & WI_FLAGS_OUTRANGE) == 0 && - IF_QLEN(&ifp->if_snd) != 0) + !ifq_is_empty(&ifp->if_snd)) wi_start(ifp); /* Re-enable interrupts. */ @@ -891,14 +892,14 @@ wi_start(struct ifnet *ifp) } else { if (ic->ic_state != IEEE80211_S_RUN) break; - IFQ_POLL(&ifp->if_snd, m0); + m0 = ifq_poll(&ifp->if_snd); if (m0 == NULL) break; if (sc->sc_txd[cur].d_len != 0) { ifp->if_flags |= IFF_OACTIVE; break; } - IFQ_DEQUEUE(&ifp->if_snd, m0); + m0 = ifq_dequeue(&ifp->if_snd); ifp->if_opackets++; m_copydata(m0, 0, ETHER_HDR_LEN, (caddr_t)&frmhdr.wi_ehdr); diff --git a/sys/i386/conf/LINT b/sys/i386/conf/LINT index 5c2ac527fe..92c6010ee5 100644 --- a/sys/i386/conf/LINT +++ b/sys/i386/conf/LINT @@ -3,7 +3,7 @@ # as much of the source tree as it can. # # $FreeBSD: src/sys/i386/conf/LINT,v 1.749.2.144 2003/06/04 17:56:59 sam Exp $ -# $DragonFly: src/sys/i386/conf/Attic/LINT,v 1.46 2005/01/31 23:44:35 joerg Exp $ +# $DragonFly: src/sys/i386/conf/Attic/LINT,v 1.47 2005/02/11 22:25:56 joerg Exp $ # # NB: You probably don't want to try running a kernel built from this # file. 
Instead, you should start from GENERIC, and add options from @@ -2796,3 +2796,16 @@ options KTR_COMPILE=(KTR_ALL) # Every trace class, see sys/ktr.h for # the different class numbers options KTR_ENTRIES=1024 options KTR_VERBOSE=1 + +# ALTQ +options ALTQ #alternate queueing +options ALTQ_CBQ #class based queueing +options ALTQ_RED #random early detection +options ALTQ_RIO #triple red for diffserv (needs RED) +options ALTQ_HFSC #hierarchical fair service curve +options ALTQ_PRIQ #priority queue +#options ALTQ_NOPCC #don't use processor cycle counter +options ALTQ_DEBUG #for debugging +# you might want to set kernel timer to 1kHz if you use CBQ, +# especially with 100baseT +#options HZ=1000 diff --git a/sys/kern/uipc_mbuf.c b/sys/kern/uipc_mbuf.c index 56ecf03562..b86355f66b 100644 --- a/sys/kern/uipc_mbuf.c +++ b/sys/kern/uipc_mbuf.c @@ -82,7 +82,7 @@ * * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94 * $FreeBSD: src/sys/kern/uipc_mbuf.c,v 1.51.2.24 2003/04/15 06:59:29 silby Exp $ - * $DragonFly: src/sys/kern/uipc_mbuf.c,v 1.31 2005/02/04 19:16:00 dillon Exp $ + * $DragonFly: src/sys/kern/uipc_mbuf.c,v 1.32 2005/02/11 22:25:57 joerg Exp $ */ #include "opt_param.h" @@ -770,7 +770,7 @@ m_gethdr(int how, int type) m->m_pkthdr.rcvif = NULL; SLIST_INIT(&m->m_pkthdr.tags); m->m_pkthdr.csum_flags = 0; - m->m_pkthdr.pf_flags = 0; + m->m_pkthdr.fw_flags = 0; return (m); } diff --git a/sys/net/altq/altq.h b/sys/net/altq/altq.h new file mode 100644 index 0000000000..2216303137 --- /dev/null +++ b/sys/net/altq/altq.h @@ -0,0 +1,63 @@ +/* $KAME: altq.h,v 1.10 2003/07/10 12:07:47 kjc Exp $ */ +/* $DragonFly: src/sys/net/altq/altq.h,v 1.1 2005/02/11 22:25:57 joerg Exp $ */ + +/* + * Copyright (C) 1998-2003 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
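+ */
+
+/*
+ * Illustrative sketch, not part of the KAME import: the driver-side
+ * conversion pattern visible in the cm(4), pdq, owi, re(4) and wi(4)
+ * hunks above, shown on a hypothetical foo(4) driver.  The ifq_*
+ * inline functions live in net/ifq_var.h, per the commit message.
+ *
+ *	foo_attach():
+ *		ifq_set_maxlen(&ifp->if_snd, IFQ_MAXLEN);  was IFQ_SET_MAXLEN()
+ *		ifq_set_ready(&ifp->if_snd);               was IFQ_SET_READY()
+ *
+ *	foo_transmit():
+ *		error = ifq_handoff(ifp, m, NULL);         was IF_HANDOFF()
+ *		if (error)
+ *			m = NULL;	the mbuf is consumed either way,
+ *					cf. the re_diag() hunk above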
+ */ +#ifndef _ALTQ_ALTQ_H_ +#define _ALTQ_ALTQ_H_ + +/* altq discipline type */ +#define ALTQT_NONE 0 /* reserved */ +#define ALTQT_CBQ 1 /* cbq */ +#define ALTQT_RED 2 /* red */ +#define ALTQT_RIO 3 /* rio */ +#define ALTQT_HFSC 4 /* hfsc */ +#define ALTQT_PRIQ 5 /* priority queue */ +#define ALTQT_MAX 6 /* should be max discipline type + 1 */ + +/* simple token packet meter profile */ +struct tb_profile { + u_int rate; /* rate in bit-per-sec */ + u_int depth; /* depth in bytes */ +}; + +/* + * generic packet counter + */ +struct pktcntr { + uint64_t packets; + uint64_t bytes; +}; + +#define PKTCNTR_ADD(cntr, len) do { \ + (cntr)->packets++; (cntr)->bytes += len; \ +} while (0) + +#ifdef _KERNEL +#include +#endif + +#endif /* _ALTQ_ALTQ_H_ */ diff --git a/sys/net/altq/altq_cbq.c b/sys/net/altq/altq_cbq.c new file mode 100644 index 0000000000..5e237c04f7 --- /dev/null +++ b/sys/net/altq/altq_cbq.c @@ -0,0 +1,542 @@ +/* $KAME: altq_cbq.c,v 1.20 2004/04/17 10:54:48 kjc Exp $ */ +/* $DragonFly: src/sys/net/altq/altq_cbq.c,v 1.1 2005/02/11 22:25:57 joerg Exp $ */ + +/* + * Copyright (c) Sun Microsystems, Inc. 1993-1998 All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the SMCC Technology + * Development Group at Sun Microsystems, Inc. + * + * 4. The name of the Sun Microsystems, Inc nor may not be used to endorse or + * promote products derived from this software without specific prior + * written permission. + * + * SUN MICROSYSTEMS DOES NOT CLAIM MERCHANTABILITY OF THIS SOFTWARE OR THE + * SUITABILITY OF THIS SOFTWARE FOR ANY PARTICULAR PURPOSE. The software is + * provided "as is" without express or implied warranty of any kind. + * + * These notices must be retained in any copies of any part of this software. + */ + +#include "opt_altq.h" +#include "opt_inet.h" +#include "opt_inet6.h" + +#ifdef ALTQ_CBQ /* cbq is enabled by ALTQ_CBQ option in opt_altq.h */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +/* + * Forward Declarations. + */ +static int cbq_class_destroy(cbq_state_t *, struct rm_class *); +static struct rm_class *clh_to_clp(cbq_state_t *, uint32_t); +static int cbq_clear_interface(cbq_state_t *); +static int cbq_request(struct ifaltq *, int, void *); +static int cbq_enqueue(struct ifaltq *, struct mbuf *, + struct altq_pktattr *); +static struct mbuf *cbq_dequeue(struct ifaltq *, int); +static void cbqrestart(struct ifaltq *); +static void get_class_stats(class_stats_t *, struct rm_class *); +static void cbq_purge(cbq_state_t *); + +/* + * int + * cbq_class_destroy(cbq_mod_state_t *, struct rm_class *) - This + * function destroys a given traffic class. Before destroying + * the class, all traffic for that class is released. 
+ */ +static int +cbq_class_destroy(cbq_state_t *cbqp, struct rm_class *cl) +{ + int i; + + /* delete the class */ + rmc_delete_class(&cbqp->ifnp, cl); + + /* + * free the class handle + */ + for (i = 0; i < CBQ_MAX_CLASSES; i++) + if (cbqp->cbq_class_tbl[i] == cl) + cbqp->cbq_class_tbl[i] = NULL; + + if (cl == cbqp->ifnp.root_) + cbqp->ifnp.root_ = NULL; + if (cl == cbqp->ifnp.default_) + cbqp->ifnp.default_ = NULL; + return (0); +} + +/* convert class handle to class pointer */ +static struct rm_class * +clh_to_clp(cbq_state_t *cbqp, uint32_t chandle) +{ + int i; + struct rm_class *cl; + + if (chandle == 0) + return (NULL); + /* + * first, try optimistically the slot matching the lower bits of + * the handle. if it fails, do the linear table search. + */ + i = chandle % CBQ_MAX_CLASSES; + if ((cl = cbqp->cbq_class_tbl[i]) != NULL && + cl->stats_.handle == chandle) + return (cl); + for (i = 0; i < CBQ_MAX_CLASSES; i++) + if ((cl = cbqp->cbq_class_tbl[i]) != NULL && + cl->stats_.handle == chandle) + return (cl); + return (NULL); +} + +static int +cbq_clear_interface(cbq_state_t *cbqp) +{ + int again, i; + struct rm_class *cl; + + /* clear out the classes now */ + do { + again = 0; + for (i = 0; i < CBQ_MAX_CLASSES; i++) { + if ((cl = cbqp->cbq_class_tbl[i]) != NULL) { + if (is_a_parent_class(cl)) + again++; + else { + cbq_class_destroy(cbqp, cl); + cbqp->cbq_class_tbl[i] = NULL; + if (cl == cbqp->ifnp.root_) + cbqp->ifnp.root_ = NULL; + if (cl == cbqp->ifnp.default_) + cbqp->ifnp.default_ = NULL; + } + } + } + } while (again); + + return (0); +} + +static int +cbq_request(struct ifaltq *ifq, int req, void *arg) +{ + cbq_state_t *cbqp = (cbq_state_t *)ifq->altq_disc; + + switch (req) { + case ALTRQ_PURGE: + cbq_purge(cbqp); + break; + } + return (0); +} + +/* copy the stats info in rm_class to class_states_t */ +static void +get_class_stats(class_stats_t *statsp, struct rm_class *cl) +{ + statsp->xmit_cnt = cl->stats_.xmit_cnt; + statsp->drop_cnt = cl->stats_.drop_cnt; + statsp->over = cl->stats_.over; + statsp->borrows = cl->stats_.borrows; + statsp->overactions = cl->stats_.overactions; + statsp->delays = cl->stats_.delays; + + statsp->depth = cl->depth_; + statsp->priority = cl->pri_; + statsp->maxidle = cl->maxidle_; + statsp->minidle = cl->minidle_; + statsp->offtime = cl->offtime_; + statsp->qmax = qlimit(cl->q_); + statsp->ns_per_byte = cl->ns_per_byte_; + statsp->wrr_allot = cl->w_allotment_; + statsp->qcnt = qlen(cl->q_); + statsp->avgidle = cl->avgidle_; + + statsp->qtype = qtype(cl->q_); +#ifdef ALTQ_RED + if (q_is_red(cl->q_)) + red_getstats(cl->red_, &statsp->red[0]); +#endif +#ifdef ALTQ_RIO + if (q_is_rio(cl->q_)) + rio_getstats((rio_t *)cl->red_, &statsp->red[0]); +#endif +} + +int +cbq_pfattach(struct pf_altq *a) +{ + struct ifnet *ifp; + int s, error; + + if ((ifp = ifunit(a->ifname)) == NULL || a->altq_disc == NULL) + return (EINVAL); + s = splimp(); + error = altq_attach(&ifp->if_snd, ALTQT_CBQ, a->altq_disc, + cbq_enqueue, cbq_dequeue, cbq_request, NULL, NULL); + splx(s); + return (error); +} + +int +cbq_add_altq(struct pf_altq *a) +{ + cbq_state_t *cbqp; + struct ifnet *ifp; + + if ((ifp = ifunit(a->ifname)) == NULL) + return (EINVAL); + if (!ifq_is_ready(&ifp->if_snd)) + return (ENODEV); + + /* allocate and initialize cbq_state_t */ + cbqp = malloc(sizeof(*cbqp), M_ALTQ, M_WAITOK | M_ZERO); + callout_init(&cbqp->cbq_callout); + cbqp->cbq_qlen = 0; + cbqp->ifnp.ifq_ = &ifp->if_snd; /* keep the ifq */ + + /* keep the state in pf_altq */ + a->altq_disc = cbqp; + + return 
(0); +} + +int +cbq_remove_altq(struct pf_altq *a) +{ + cbq_state_t *cbqp; + + if ((cbqp = a->altq_disc) == NULL) + return (EINVAL); + a->altq_disc = NULL; + + cbq_clear_interface(cbqp); + + if (cbqp->ifnp.default_) + cbq_class_destroy(cbqp, cbqp->ifnp.default_); + if (cbqp->ifnp.root_) + cbq_class_destroy(cbqp, cbqp->ifnp.root_); + + /* deallocate cbq_state_t */ + free(cbqp, M_ALTQ); + + return (0); +} + +int +cbq_add_queue(struct pf_altq *a) +{ + struct rm_class *borrow, *parent; + cbq_state_t *cbqp; + struct rm_class *cl; + struct cbq_opts *opts; + int i; + + if ((cbqp = a->altq_disc) == NULL) + return (EINVAL); + if (a->qid == 0) + return (EINVAL); + + /* + * find a free slot in the class table. if the slot matching + * the lower bits of qid is free, use this slot. otherwise, + * use the first free slot. + */ + i = a->qid % CBQ_MAX_CLASSES; + if (cbqp->cbq_class_tbl[i] != NULL) { + for (i = 0; i < CBQ_MAX_CLASSES; i++) + if (cbqp->cbq_class_tbl[i] == NULL) + break; + if (i == CBQ_MAX_CLASSES) + return (EINVAL); + } + + opts = &a->pq_u.cbq_opts; + /* check parameters */ + if (a->priority >= CBQ_MAXPRI) + return (EINVAL); + + /* Get pointers to parent and borrow classes. */ + parent = clh_to_clp(cbqp, a->parent_qid); + if (opts->flags & CBQCLF_BORROW) + borrow = parent; + else + borrow = NULL; + + /* + * A class must borrow from it's parent or it can not + * borrow at all. Hence, borrow can be null. + */ + if (parent == NULL && (opts->flags & CBQCLF_ROOTCLASS) == 0) { + printf("cbq_add_queue: no parent class!\n"); + return (EINVAL); + } + + if ((borrow != parent) && (borrow != NULL)) { + printf("cbq_add_class: borrow class != parent\n"); + return (EINVAL); + } + + /* + * check parameters + */ + switch (opts->flags & CBQCLF_CLASSMASK) { + case CBQCLF_ROOTCLASS: + if (parent != NULL) + return (EINVAL); + if (cbqp->ifnp.root_) + return (EINVAL); + break; + case CBQCLF_DEFCLASS: + if (cbqp->ifnp.default_) + return (EINVAL); + break; + case 0: + if (a->qid == 0) + return (EINVAL); + break; + default: + /* more than two flags bits set */ + return (EINVAL); + } + + /* + * create a class. if this is a root class, initialize the + * interface. + */ + if ((opts->flags & CBQCLF_CLASSMASK) == CBQCLF_ROOTCLASS) { + rmc_init(cbqp->ifnp.ifq_, &cbqp->ifnp, opts->ns_per_byte, + cbqrestart, a->qlimit, RM_MAXQUEUED, + opts->maxidle, opts->minidle, opts->offtime, + opts->flags); + cl = cbqp->ifnp.root_; + } else { + cl = rmc_newclass(a->priority, + &cbqp->ifnp, opts->ns_per_byte, + rmc_delay_action, a->qlimit, parent, borrow, + opts->maxidle, opts->minidle, opts->offtime, + opts->pktsize, opts->flags); + } + if (cl == NULL) + return (ENOMEM); + + /* return handle to user space. */ + cl->stats_.handle = a->qid; + cl->stats_.depth = cl->depth_; + + /* save the allocated class */ + cbqp->cbq_class_tbl[i] = cl; + + if ((opts->flags & CBQCLF_CLASSMASK) == CBQCLF_DEFCLASS) + cbqp->ifnp.default_ = cl; + + return (0); +} + +int +cbq_remove_queue(struct pf_altq *a) +{ + struct rm_class *cl; + cbq_state_t *cbqp; + int i; + + if ((cbqp = a->altq_disc) == NULL) + return (EINVAL); + + if ((cl = clh_to_clp(cbqp, a->qid)) == NULL) + return (EINVAL); + + /* if we are a parent class, then return an error. 
*/ + if (is_a_parent_class(cl)) + return (EINVAL); + + /* delete the class */ + rmc_delete_class(&cbqp->ifnp, cl); + + /* + * free the class handle + */ + for (i = 0; i < CBQ_MAX_CLASSES; i++) + if (cbqp->cbq_class_tbl[i] == cl) { + cbqp->cbq_class_tbl[i] = NULL; + if (cl == cbqp->ifnp.root_) + cbqp->ifnp.root_ = NULL; + if (cl == cbqp->ifnp.default_) + cbqp->ifnp.default_ = NULL; + break; + } + + return (0); +} + +int +cbq_getqstats(struct pf_altq *a, void *ubuf, int *nbytes) +{ + cbq_state_t *cbqp; + struct rm_class *cl; + class_stats_t stats; + int error = 0; + + if ((cbqp = altq_lookup(a->ifname, ALTQT_CBQ)) == NULL) + return (EBADF); + + if ((cl = clh_to_clp(cbqp, a->qid)) == NULL) + return (EINVAL); + + if (*nbytes < sizeof(stats)) + return (EINVAL); + + get_class_stats(&stats, cl); + + if ((error = copyout((caddr_t)&stats, ubuf, sizeof(stats))) != 0) + return (error); + *nbytes = sizeof(stats); + return (0); +} + +/* + * int + * cbq_enqueue(struct ifaltq *ifq, struct mbuf *m, struct altq_pktattr *pattr) + * - Queue data packets. + * + * cbq_enqueue is set to ifp->if_altqenqueue and called by an upper + * layer (e.g. ether_output). cbq_enqueue queues the given packet + * to the cbq, then invokes the driver's start routine. + * + * Assumptions: called in splimp + * Returns: 0 if the queueing is successful. + * ENOBUFS if a packet dropping occurred as a result of + * the queueing. + */ + +static int +cbq_enqueue(struct ifaltq *ifq, struct mbuf *m, struct altq_pktattr *pktattr) +{ + cbq_state_t *cbqp = (cbq_state_t *)ifq->altq_disc; + struct rm_class *cl; + int len; + + /* grab class set by classifier */ + if ((m->m_flags & M_PKTHDR) == 0) { + /* should not happen */ + if_printf(ifq->altq_ifp, "altq: packet does not have pkthdr\n"); + m_freem(m); + return (ENOBUFS); + } + if (m->m_pkthdr.fw_flags & ALTQ_MBUF_TAGGED) + cl = clh_to_clp(cbqp, m->m_pkthdr.altq_qid); + else + cl = NULL; + if (cl == NULL) { + cl = cbqp->ifnp.default_; + if (cl == NULL) { + m_freem(m); + return (ENOBUFS); + } + } + cl->pktattr_ = NULL; + len = m_pktlen(m); + if (rmc_queue_packet(cl, m) != 0) { + /* drop occurred. some mbuf was freed in rmc_queue_packet. */ + PKTCNTR_ADD(&cl->stats_.drop_cnt, len); + return (ENOBUFS); + } + + /* successfully queued. */ + ++cbqp->cbq_qlen; + ++ifq->ifq_len; + return (0); +} + +static struct mbuf * +cbq_dequeue(struct ifaltq *ifq, int op) +{ + cbq_state_t *cbqp = (cbq_state_t *)ifq->altq_disc; + struct mbuf *m; + + m = rmc_dequeue_next(&cbqp->ifnp, op); + + if (m && op == ALTDQ_REMOVE) { + --cbqp->cbq_qlen; /* decrement # of packets in cbq */ + --ifq->ifq_len; + + /* Update the class. */ + rmc_update_class_util(&cbqp->ifnp); + } + return (m); +} + +/* + * void + * cbqrestart(queue_t *) - Restart sending of data. + * called from rmc_restart in splimp via timeout after waking up + * a suspended class. 
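+ * In this import cbqrestart is the restart callback handed to rmc_init()
+ * for the root class (see cbq_add_queue() above); it kicks the driver's
+ * if_start routine once the suspension expires, provided the cbq still
+ * holds packets and the interface is not marked IFF_OACTIVE.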
+ * Returns: NONE + */ + +static void +cbqrestart(struct ifaltq *ifq) +{ + cbq_state_t *cbqp; + struct ifnet *ifp; + + if (!ifq_is_enabled(ifq)) + /* cbq must have been detached */ + return; + + if ((cbqp = (cbq_state_t *)ifq->altq_disc) == NULL) + /* should not happen */ + return; + + ifp = ifq->altq_ifp; + if (ifp->if_start && + cbqp->cbq_qlen > 0 && (ifp->if_flags & IFF_OACTIVE) == 0) + (*ifp->if_start)(ifp); +} + +static void +cbq_purge(cbq_state_t *cbqp) +{ + struct rm_class *cl; + int i; + + for (i = 0; i < CBQ_MAX_CLASSES; i++) + if ((cl = cbqp->cbq_class_tbl[i]) != NULL) + rmc_dropall(cl); + if (ifq_is_enabled(cbqp->ifnp.ifq_)) + cbqp->ifnp.ifq_->ifq_len = 0; +} + +#endif /* ALTQ_CBQ */ diff --git a/sys/net/altq/altq_cbq.h b/sys/net/altq/altq_cbq.h new file mode 100644 index 0000000000..ef28af2f86 --- /dev/null +++ b/sys/net/altq/altq_cbq.h @@ -0,0 +1,114 @@ +/* $KAME: altq_cbq.h,v 1.12 2003/10/03 05:05:15 kjc Exp $ */ +/* $DragonFly: src/sys/net/altq/altq_cbq.h,v 1.1 2005/02/11 22:25:57 joerg Exp $ */ + +/* + * Copyright (c) Sun Microsystems, Inc. 1993-1998 All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the SMCC Technology + * Development Group at Sun Microsystems, Inc. + * + * 4. The name of the Sun Microsystems, Inc nor may not be used to endorse or + * promote products derived from this software without specific prior + * written permission. + * + * SUN MICROSYSTEMS DOES NOT CLAIM MERCHANTABILITY OF THIS SOFTWARE OR THE + * SUITABILITY OF THIS SOFTWARE FOR ANY PARTICULAR PURPOSE. The software is + * provided "as is" without express or implied warranty of any kind. + * + * These notices must be retained in any copies of any part of this software. 
+ */ + +#ifndef _ALTQ_ALTQ_CBQ_H_ +#define _ALTQ_ALTQ_CBQ_H_ + +#include +#include +#include +#include + +#define NULL_CLASS_HANDLE 0 + +/* class flags should be same as class flags in rm_class.h */ +#define CBQCLF_RED 0x0001 /* use RED */ +#define CBQCLF_ECN 0x0002 /* use RED/ECN */ +#define CBQCLF_RIO 0x0004 /* use RIO */ +#define CBQCLF_CLEARDSCP 0x0008 /* clear diffserv codepoint */ +#define CBQCLF_BORROW 0x0010 /* borrow from parent */ + +/* class flags only for root class */ +#define CBQCLF_WRR 0x0100 /* weighted-round robin */ +#define CBQCLF_EFFICIENT 0x0200 /* work-conserving */ + +/* class flags for special classes */ +#define CBQCLF_ROOTCLASS 0x1000 /* root class */ +#define CBQCLF_DEFCLASS 0x2000 /* default class */ +#define CBQCLF_CLASSMASK 0xf000 /* class mask */ + +#define CBQ_MAXQSIZE 200 +#define CBQ_MAXPRI RM_MAXPRIO + +typedef struct _cbq_class_stats_ { + uint32_t handle; + u_int depth; + + struct pktcntr xmit_cnt; /* packets sent in this class */ + struct pktcntr drop_cnt; /* dropped packets */ + u_int over; /* # times went over limit */ + u_int borrows; /* # times tried to borrow */ + u_int overactions; /* # times invoked overlimit action */ + u_int delays; /* # times invoked delay actions */ + + /* other static class parameters useful for debugging */ + int priority; + int maxidle; + int minidle; + int offtime; + int qmax; + int ns_per_byte; + int wrr_allot; + + int qcnt; /* # packets in queue */ + int avgidle; + + /* red and rio related info */ + int qtype; + struct redstats red[3]; +} class_stats_t; + +#ifdef _KERNEL +/* + * Define macros only good for kernel drivers and modules. + */ +#define CBQ_WATCHDOG (hz / 20) +#define CBQ_TIMEOUT 10 +#define CBQ_LS_TIMEOUT (20 * hz / 1000) + +#define CBQ_MAX_CLASSES 256 + +/* + * Define State structures. + */ +typedef struct cbqstate { + int cbq_qlen; /* # of packets in cbq */ + struct rm_class *cbq_class_tbl[CBQ_MAX_CLASSES]; + + struct rm_ifdat ifnp; + struct callout cbq_callout; /* for timeouts */ +} cbq_state_t; + +#endif /* _KERNEL */ + +#endif /* !_ALTQ_ALTQ_CBQ_H_ */ diff --git a/sys/net/altq/altq_classq.h b/sys/net/altq/altq_classq.h new file mode 100644 index 0000000000..3924c79420 --- /dev/null +++ b/sys/net/altq/altq_classq.h @@ -0,0 +1,184 @@ +/* $KAME: altq_classq.h,v 1.6 2003/01/07 07:33:38 kjc Exp $ */ +/* $DragonFly: src/sys/net/altq/altq_classq.h,v 1.1 2005/02/11 22:25:57 joerg Exp $ */ + +/* + * Copyright (c) 1991-1997 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the Network Research + * Group at Lawrence Berkeley Laboratory. + * 4. Neither the name of the University nor of the Laboratory may be used + * to endorse or promote products derived from this software without + * specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * class queue definitions extracted from rm_class.h. + */ +#ifndef _ALTQ_ALTQ_CLASSQ_H_ +#define _ALTQ_ALTQ_CLASSQ_H_ + +/* + * Packet Queue types: RED or DROPHEAD. + */ +#define Q_DROPHEAD 0x00 +#define Q_RED 0x01 +#define Q_RIO 0x02 +#define Q_DROPTAIL 0x03 + +#ifdef _KERNEL + +/* + * Packet Queue structures and macros to manipulate them. + */ +struct _class_queue_ { + struct mbuf *tail_; /* Tail of packet queue */ + int qlen_; /* Queue length (in number of packets) */ + int qlim_; /* Queue limit (in number of packets*) */ + int qtype_; /* Queue type */ +}; + +typedef struct _class_queue_ class_queue_t; + +#define qtype(q) (q)->qtype_ /* Get queue type */ +#define qlimit(q) (q)->qlim_ /* Max packets to be queued */ +#define qlen(q) (q)->qlen_ /* Current queue length. */ +#define qtail(q) (q)->tail_ /* Tail of the queue */ +#define qhead(q) ((q)->tail_ ? (q)->tail_->m_nextpkt : NULL) + +#define qempty(q) ((q)->qlen_ == 0) /* Is the queue empty?? */ +#define q_is_red(q) ((q)->qtype_ == Q_RED) /* Is the queue a red queue */ +#define q_is_rio(q) ((q)->qtype_ == Q_RIO) /* Is the queue a rio queue */ +#define q_is_red_or_rio(q) ((q)->qtype_ == Q_RED || (q)->qtype_ == Q_RIO) + +static __inline void +_addq(class_queue_t *q, struct mbuf *m) +{ + struct mbuf *m0; + + if ((m0 = qtail(q)) != NULL) + m->m_nextpkt = m0->m_nextpkt; + else + m0 = m; + m0->m_nextpkt = m; + qtail(q) = m; + qlen(q)++; +} + +static __inline struct mbuf * +_getq(class_queue_t *q) +{ + struct mbuf *m, *m0; + + if ((m = qtail(q)) == NULL) + return (NULL); + if ((m0 = m->m_nextpkt) != m) + m->m_nextpkt = m0->m_nextpkt; + else + qtail(q) = NULL; + qlen(q)--; + m0->m_nextpkt = NULL; + return (m0); +} + +/* drop a packet at the tail of the queue */ +static __inline struct mbuf * +_getq_tail(class_queue_t *q) +{ + struct mbuf *m, *m0, *prev; + + if ((m = m0 = qtail(q)) == NULL) + return NULL; + do { + prev = m0; + m0 = m0->m_nextpkt; + } while (m0 != m); + prev->m_nextpkt = m->m_nextpkt; + if (prev == m) + qtail(q) = NULL; + else + qtail(q) = prev; + qlen(q)--; + m->m_nextpkt = NULL; + return (m); +} + +/* randomly select a packet in the queue */ +static __inline struct mbuf * +_getq_random(class_queue_t *q) +{ + struct mbuf *m; + int i, n; + + if ((m = qtail(q)) == NULL) + return NULL; + if (m->m_nextpkt == m) + qtail(q) = NULL; + else { + struct mbuf *prev = NULL; + + n = random() % qlen(q) + 1; + for (i = 0; i < n; i++) { + prev = m; + m = m->m_nextpkt; + } + prev->m_nextpkt = m->m_nextpkt; + if (m == qtail(q)) + qtail(q) = prev; + } + qlen(q)--; + m->m_nextpkt = NULL; + return (m); +} + +static __inline void +_removeq(class_queue_t *q, struct mbuf *m) +{ + struct mbuf *m0, *prev; + + m0 = qtail(q); + do { + prev = m0; + m0 = m0->m_nextpkt; + } 
while (m0 != m); + prev->m_nextpkt = m->m_nextpkt; + if (prev == m) + qtail(q) = NULL; + else if (qtail(q) == m) + qtail(q) = prev; + qlen(q)--; +} + +static __inline void +_flushq(class_queue_t *q) +{ + struct mbuf *m; + + while ((m = _getq(q)) != NULL) + m_freem(m); +} + +#endif /* _KERNEL */ + +#endif /* _ALTQ_ALTQ_CLASSQ_H_ */ diff --git a/sys/net/altq/altq_hfsc.c b/sys/net/altq/altq_hfsc.c new file mode 100644 index 0000000000..0efcbe413a --- /dev/null +++ b/sys/net/altq/altq_hfsc.c @@ -0,0 +1,1624 @@ +/* $KAME: altq_hfsc.c,v 1.25 2004/04/17 10:54:48 kjc Exp $ */ +/* $DragonFly: src/sys/net/altq/altq_hfsc.c,v 1.1 2005/02/11 22:25:57 joerg Exp $ */ + +/* + * Copyright (c) 1997-1999 Carnegie Mellon University. All Rights Reserved. + * + * Permission to use, copy, modify, and distribute this software and + * its documentation is hereby granted (including for commercial or + * for-profit use), provided that both the copyright notice and this + * permission notice appear in all copies of the software, derivative + * works, or modified versions, and any portions thereof. + * + * THIS SOFTWARE IS EXPERIMENTAL AND IS KNOWN TO HAVE BUGS, SOME OF + * WHICH MAY HAVE SERIOUS CONSEQUENCES. CARNEGIE MELLON PROVIDES THIS + * SOFTWARE IN ITS ``AS IS'' CONDITION, AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + * Carnegie Mellon encourages (but does not require) users of this + * software to return any improvements or extensions that they make, + * and to grant Carnegie Mellon the rights to redistribute these + * changes without encumbrance. + */ +/* + * H-FSC is described in Proceedings of SIGCOMM'97, + * "A Hierarchical Fair Service Curve Algorithm for Link-Sharing, + * Real-Time and Priority Service" + * by Ion Stoica, Hui Zhang, and T. S. Eugene Ng. + * + * Oleg Cherevko added the upperlimit for link-sharing. + * when a class has an upperlimit, the fit-time is computed from the + * upperlimit service curve. the link-sharing scheduler does not schedule + * a class whose fit-time exceeds the current time. 
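+ *
+ * Note that the upperlimit is a hard cap: a class created with, say,
+ * ulsc_m2 of 1Mbps (an illustrative value) stays at or below 1Mbps even
+ * while the link has idle bandwidth, because actlist_firstfit() below
+ * skips any class whose fit-time (cl_f) still lies in the future.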
+ */ + +#include "opt_altq.h" +#include "opt_inet.h" +#include "opt_inet6.h" + +#ifdef ALTQ_HFSC /* hfsc is enabled by ALTQ_HFSC option in opt_altq.h */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +/* + * function prototypes + */ +static int hfsc_clear_interface(struct hfsc_if *); +static int hfsc_request(struct ifaltq *, int, void *); +static void hfsc_purge(struct hfsc_if *); +static struct hfsc_class *hfsc_class_create(struct hfsc_if *, + struct service_curve *, + struct service_curve *, + struct service_curve *, + struct hfsc_class *, int, int, int); +static int hfsc_class_destroy(struct hfsc_class *); +static struct hfsc_class *hfsc_nextclass(struct hfsc_class *); +static int hfsc_enqueue(struct ifaltq *, struct mbuf *, + struct altq_pktattr *); +static struct mbuf *hfsc_dequeue(struct ifaltq *, int); + +static int hfsc_addq(struct hfsc_class *, struct mbuf *); +static struct mbuf *hfsc_getq(struct hfsc_class *); +static struct mbuf *hfsc_pollq(struct hfsc_class *); +static void hfsc_purgeq(struct hfsc_class *); + +static void update_cfmin(struct hfsc_class *); +static void set_active(struct hfsc_class *, int); +static void set_passive(struct hfsc_class *); + +static void init_ed(struct hfsc_class *, int); +static void update_ed(struct hfsc_class *, int); +static void update_d(struct hfsc_class *, int); +static void init_vf(struct hfsc_class *, int); +static void update_vf(struct hfsc_class *, int, uint64_t); +static ellist_t *ellist_alloc(void); +static void ellist_destroy(ellist_t *); +static void ellist_insert(struct hfsc_class *); +static void ellist_remove(struct hfsc_class *); +static void ellist_update(struct hfsc_class *); +struct hfsc_class *ellist_get_mindl(ellist_t *, uint64_t); +static actlist_t *actlist_alloc(void); +static void actlist_destroy(actlist_t *); +static void actlist_insert(struct hfsc_class *); +static void actlist_remove(struct hfsc_class *); +static void actlist_update(struct hfsc_class *); + +static struct hfsc_class *actlist_firstfit(struct hfsc_class *, uint64_t); + +static __inline uint64_t seg_x2y(uint64_t, uint64_t); +static __inline uint64_t seg_y2x(uint64_t, uint64_t); +static __inline uint64_t m2sm(u_int); +static __inline uint64_t m2ism(u_int); +static __inline uint64_t d2dx(u_int); +static u_int sm2m(uint64_t); +static u_int dx2d(uint64_t); + +static void sc2isc(struct service_curve *, struct internal_sc *); +static void rtsc_init(struct runtime_sc *, struct internal_sc *, + uint64_t, uint64_t); +static uint64_t rtsc_y2x(struct runtime_sc *, uint64_t); +static uint64_t rtsc_x2y(struct runtime_sc *, uint64_t); +static void rtsc_min(struct runtime_sc *, struct internal_sc *, + uint64_t, uint64_t); + +static void get_class_stats(struct hfsc_classstats *, struct hfsc_class *); +static struct hfsc_class *clh_to_clp(struct hfsc_if *, uint32_t); + +/* + * macros + */ +#define is_a_parent_class(cl) ((cl)->cl_children != NULL) + +#define HT_INFINITY 0xffffffffffffffffLL /* infinite time value */ + +int +hfsc_pfattach(struct pf_altq *a) +{ + struct ifnet *ifp; + int s, error; + + if ((ifp = ifunit(a->ifname)) == NULL || a->altq_disc == NULL) + return (EINVAL); + s = splimp(); + error = altq_attach(&ifp->if_snd, ALTQT_HFSC, a->altq_disc, + hfsc_enqueue, hfsc_dequeue, hfsc_request, NULL, NULL); + splx(s); + return (error); +} + +int +hfsc_add_altq(struct pf_altq *a) +{ + struct hfsc_if *hif; + struct ifnet *ifp; + + if ((ifp = ifunit(a->ifname)) == NULL) + return 
(EINVAL); + if (!ifq_is_ready(&ifp->if_snd)) + return (ENODEV); + + hif = malloc(sizeof(struct hfsc_if), M_ALTQ, M_WAITOK | M_ZERO); + + hif->hif_eligible = ellist_alloc(); + hif->hif_ifq = &ifp->if_snd; + + /* keep the state in pf_altq */ + a->altq_disc = hif; + + return (0); +} + +int +hfsc_remove_altq(struct pf_altq *a) +{ + struct hfsc_if *hif; + + if ((hif = a->altq_disc) == NULL) + return (EINVAL); + a->altq_disc = NULL; + + hfsc_clear_interface(hif); + hfsc_class_destroy(hif->hif_rootclass); + + ellist_destroy(hif->hif_eligible); + + free(hif, M_ALTQ); + + return (0); +} + +int +hfsc_add_queue(struct pf_altq *a) +{ + struct hfsc_if *hif; + struct hfsc_class *cl, *parent; + struct hfsc_opts *opts; + struct service_curve rtsc, lssc, ulsc; + + if ((hif = a->altq_disc) == NULL) + return (EINVAL); + + opts = &a->pq_u.hfsc_opts; + + if (a->parent_qid == HFSC_NULLCLASS_HANDLE && hif->hif_rootclass == NULL) + parent = NULL; + else if ((parent = clh_to_clp(hif, a->parent_qid)) == NULL) + return (EINVAL); + + if (a->qid == 0) + return (EINVAL); + + if (clh_to_clp(hif, a->qid) != NULL) + return (EBUSY); + + rtsc.m1 = opts->rtsc_m1; + rtsc.d = opts->rtsc_d; + rtsc.m2 = opts->rtsc_m2; + lssc.m1 = opts->lssc_m1; + lssc.d = opts->lssc_d; + lssc.m2 = opts->lssc_m2; + ulsc.m1 = opts->ulsc_m1; + ulsc.d = opts->ulsc_d; + ulsc.m2 = opts->ulsc_m2; + + cl = hfsc_class_create(hif, &rtsc, &lssc, &ulsc, parent, a->qlimit, + opts->flags, a->qid); + if (cl == NULL) + return (ENOMEM); + + return (0); +} + +int +hfsc_remove_queue(struct pf_altq *a) +{ + struct hfsc_if *hif; + struct hfsc_class *cl; + + if ((hif = a->altq_disc) == NULL) + return (EINVAL); + + if ((cl = clh_to_clp(hif, a->qid)) == NULL) + return (EINVAL); + + return (hfsc_class_destroy(cl)); +} + +int +hfsc_getqstats(struct pf_altq *a, void *ubuf, int *nbytes) +{ + struct hfsc_if *hif; + struct hfsc_class *cl; + struct hfsc_classstats stats; + int error = 0; + + if ((hif = altq_lookup(a->ifname, ALTQT_HFSC)) == NULL) + return (EBADF); + + if ((cl = clh_to_clp(hif, a->qid)) == NULL) + return (EINVAL); + + if (*nbytes < sizeof(stats)) + return (EINVAL); + + get_class_stats(&stats, cl); + + if ((error = copyout((caddr_t)&stats, ubuf, sizeof(stats))) != 0) + return (error); + *nbytes = sizeof(stats); + return (0); +} + +/* + * bring the interface back to the initial state by discarding + * all the filters and classes except the root class. 
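+ * The loop below repeatedly removes the first leaf it finds and then
+ * rescans, so the tree is torn down bottom-up; this matches
+ * hfsc_class_destroy(), which refuses to destroy a class that still
+ * has children (EBUSY).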
+ */ +static int +hfsc_clear_interface(struct hfsc_if *hif) +{ + struct hfsc_class *cl; + + if (hif->hif_rootclass == NULL) + return (0); + + + /* clear out the classes */ + while ((cl = hif->hif_rootclass->cl_children) != NULL) { + /* + * remove the first leaf class found in the hierarchy + * then start over + */ + for (; cl != NULL; cl = hfsc_nextclass(cl)) { + if (!is_a_parent_class(cl)) { + hfsc_class_destroy(cl); + break; + } + } + } + + return (0); +} + +static int +hfsc_request(struct ifaltq *ifq, int req, void *arg) +{ + struct hfsc_if *hif = (struct hfsc_if *)ifq->altq_disc; + + switch (req) { + case ALTRQ_PURGE: + hfsc_purge(hif); + break; + } + return (0); +} + +/* discard all the queued packets on the interface */ +static void +hfsc_purge(struct hfsc_if *hif) +{ + struct hfsc_class *cl; + + for (cl = hif->hif_rootclass; cl != NULL; cl = hfsc_nextclass(cl)) { + if (!qempty(cl->cl_q)) + hfsc_purgeq(cl); + } + if (ifq_is_enabled(hif->hif_ifq)) + hif->hif_ifq->ifq_len = 0; +} + +struct hfsc_class * +hfsc_class_create(struct hfsc_if *hif, struct service_curve *rsc, + struct service_curve *fsc, struct service_curve *usc, + struct hfsc_class *parent, int qlimit, int flags, int qid) +{ + struct hfsc_class *cl, *p; + int i, s; + + if (hif->hif_classes >= HFSC_MAX_CLASSES) + return (NULL); + +#ifndef ALTQ_RED + if (flags & HFCF_RED) { +#ifdef ALTQ_DEBUG + printf("hfsc_class_create: RED not configured for HFSC!\n"); +#endif + return (NULL); + } +#endif + + cl = malloc(sizeof(*cl), M_ALTQ, M_WAITOK | M_ZERO); + cl->cl_q = malloc(sizeof(*cl->cl_q), M_ALTQ, M_WAITOK | M_ZERO); + cl->cl_actc = actlist_alloc(); + + if (qlimit == 0) + qlimit = 50; /* use default */ + qlimit(cl->cl_q) = qlimit; + qtype(cl->cl_q) = Q_DROPTAIL; + qlen(cl->cl_q) = 0; + cl->cl_flags = flags; +#ifdef ALTQ_RED + if (flags & (HFCF_RED|HFCF_RIO)) { + int red_flags, red_pkttime; + u_int m2; + + m2 = 0; + if (rsc != NULL && rsc->m2 > m2) + m2 = rsc->m2; + if (fsc != NULL && fsc->m2 > m2) + m2 = fsc->m2; + if (usc != NULL && usc->m2 > m2) + m2 = usc->m2; + + red_flags = 0; + if (flags & HFCF_ECN) + red_flags |= REDF_ECN; +#ifdef ALTQ_RIO + if (flags & HFCF_CLEARDSCP) + red_flags |= RIOF_CLEARDSCP; +#endif + if (m2 < 8) + red_pkttime = 1000 * 1000 * 1000; /* 1 sec */ + else + red_pkttime = (int64_t)hif->hif_ifq->altq_ifp->if_mtu + * 1000 * 1000 * 1000 / (m2 / 8); + if (flags & HFCF_RED) { + cl->cl_red = red_alloc(0, 0, + qlimit(cl->cl_q) * 10/100, + qlimit(cl->cl_q) * 30/100, + red_flags, red_pkttime); + if (cl->cl_red != NULL) + qtype(cl->cl_q) = Q_RED; + } +#ifdef ALTQ_RIO + else { + cl->cl_red = (red_t *)rio_alloc(0, NULL, + red_flags, red_pkttime); + if (cl->cl_red != NULL) + qtype(cl->cl_q) = Q_RIO; + } +#endif + } +#endif /* ALTQ_RED */ + + if (rsc != NULL && (rsc->m1 != 0 || rsc->m2 != 0)) { + cl->cl_rsc = malloc(sizeof(*cl->cl_rsc), M_ALTQ, M_WAITOK); + sc2isc(rsc, cl->cl_rsc); + rtsc_init(&cl->cl_deadline, cl->cl_rsc, 0, 0); + rtsc_init(&cl->cl_eligible, cl->cl_rsc, 0, 0); + } + if (fsc != NULL && (fsc->m1 != 0 || fsc->m2 != 0)) { + cl->cl_fsc = malloc(sizeof(*cl->cl_fsc), M_ALTQ, M_WAITOK); + if (cl->cl_fsc == NULL) + goto err_ret; + sc2isc(fsc, cl->cl_fsc); + rtsc_init(&cl->cl_virtual, cl->cl_fsc, 0, 0); + } + if (usc != NULL && (usc->m1 != 0 || usc->m2 != 0)) { + cl->cl_usc = malloc(sizeof(*cl->cl_usc), M_ALTQ, M_WAITOK); + if (cl->cl_usc == NULL) + goto err_ret; + sc2isc(usc, cl->cl_usc); + rtsc_init(&cl->cl_ulimit, cl->cl_usc, 0, 0); + } + + cl->cl_id = hif->hif_classid++; + cl->cl_handle = qid; + cl->cl_hif = 
hif; + cl->cl_parent = parent; + + s = splimp(); + hif->hif_classes++; + + /* + * find a free slot in the class table. if the slot matching + * the lower bits of qid is free, use this slot. otherwise, + * use the first free slot. + */ + i = qid % HFSC_MAX_CLASSES; + if (hif->hif_class_tbl[i] == NULL) + hif->hif_class_tbl[i] = cl; + else { + for (i = 0; i < HFSC_MAX_CLASSES; i++) { + if (hif->hif_class_tbl[i] == NULL) { + hif->hif_class_tbl[i] = cl; + break; + } + } + if (i == HFSC_MAX_CLASSES) { + splx(s); + goto err_ret; + } + } + + if (flags & HFCF_DEFAULTCLASS) + hif->hif_defaultclass = cl; + + if (parent == NULL) { + /* this is root class */ + hif->hif_rootclass = cl; + } else if (parent->cl_children == NULL) { + /* add this class to the children list of the parent */ + parent->cl_children = cl; + } else { + p = parent->cl_children; + while (p->cl_siblings != NULL) + p = p->cl_siblings; + p->cl_siblings = cl; + } + splx(s); + + return (cl); + + err_ret: + if (cl->cl_actc != NULL) + actlist_destroy(cl->cl_actc); + if (cl->cl_red != NULL) { +#ifdef ALTQ_RIO + if (q_is_rio(cl->cl_q)) + rio_destroy((rio_t *)cl->cl_red); +#endif +#ifdef ALTQ_RED + if (q_is_red(cl->cl_q)) + red_destroy(cl->cl_red); +#endif + } + if (cl->cl_fsc != NULL) + free(cl->cl_fsc, M_ALTQ); + if (cl->cl_rsc != NULL) + free(cl->cl_rsc, M_ALTQ); + if (cl->cl_usc != NULL) + free(cl->cl_usc, M_ALTQ); + if (cl->cl_q != NULL) + free(cl->cl_q, M_ALTQ); + free(cl, M_ALTQ); + return (NULL); +} + +static int +hfsc_class_destroy(struct hfsc_class *cl) +{ + int i, s; + + if (cl == NULL) + return (0); + + if (is_a_parent_class(cl)) + return (EBUSY); + + s = splimp(); + + if (!qempty(cl->cl_q)) + hfsc_purgeq(cl); + + if (cl->cl_parent == NULL) { + /* this is root class */ + } else { + struct hfsc_class *p = cl->cl_parent->cl_children; + + if (p == cl) { + cl->cl_parent->cl_children = cl->cl_siblings; + } else { + do { + if (p->cl_siblings == cl) { + p->cl_siblings = cl->cl_siblings; + break; + } + } while ((p = p->cl_siblings) != NULL); + } + KKASSERT(p != NULL); + } + + for (i = 0; i < HFSC_MAX_CLASSES; i++) { + if (cl->cl_hif->hif_class_tbl[i] == cl) { + cl->cl_hif->hif_class_tbl[i] = NULL; + break; + } + } + + cl->cl_hif->hif_classes--; + splx(s); + + actlist_destroy(cl->cl_actc); + + if (cl->cl_red != NULL) { +#ifdef ALTQ_RIO + if (q_is_rio(cl->cl_q)) + rio_destroy((rio_t *)cl->cl_red); +#endif +#ifdef ALTQ_RED + if (q_is_red(cl->cl_q)) + red_destroy(cl->cl_red); +#endif + } + + if (cl == cl->cl_hif->hif_rootclass) + cl->cl_hif->hif_rootclass = NULL; + if (cl == cl->cl_hif->hif_defaultclass) + cl->cl_hif->hif_defaultclass = NULL; + + if (cl->cl_usc != NULL) + free(cl->cl_usc, M_ALTQ); + if (cl->cl_fsc != NULL) + free(cl->cl_fsc, M_ALTQ); + if (cl->cl_rsc != NULL) + free(cl->cl_rsc, M_ALTQ); + free(cl->cl_q, M_ALTQ); + free(cl, M_ALTQ); + + return (0); +} + +/* + * hfsc_nextclass returns the next class in the tree. + * usage: + * for (cl = hif->hif_rootclass; cl != NULL; cl = hfsc_nextclass(cl)) + * do_something; + */ +static struct hfsc_class * +hfsc_nextclass(struct hfsc_class *cl) +{ + if (cl->cl_children != NULL) { + cl = cl->cl_children; + } else if (cl->cl_siblings != NULL) { + cl = cl->cl_siblings; + } else { + while ((cl = cl->cl_parent) != NULL) { + if (cl->cl_siblings != NULL) { + cl = cl->cl_siblings; + break; + } + } + } + + return (cl); +} + +/* + * hfsc_enqueue is an enqueue function to be registered to + * (*altq_enqueue) in struct ifaltq. 
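+ * The contract mirrors cbq_enqueue() above: the classifier's verdict is
+ * carried in the packet header (fw_flags/altq_qid), unclassified traffic
+ * (or a match on a parent class) falls back to the default class, a drop
+ * returns ENOBUFS with the mbuf already freed, and ifq_len is bumped on
+ * success.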
+ */ +static int +hfsc_enqueue(struct ifaltq *ifq, struct mbuf *m, struct altq_pktattr *pktattr) +{ + struct hfsc_if *hif = (struct hfsc_if *)ifq->altq_disc; + struct hfsc_class *cl; + int len; + + /* grab class set by classifier */ + if ((m->m_flags & M_PKTHDR) == 0) { + /* should not happen */ + if_printf(ifq->altq_ifp, "altq: packet does not have pkthdr\n"); + m_freem(m); + return (ENOBUFS); + } + if (m->m_pkthdr.fw_flags & ALTQ_MBUF_TAGGED) + cl = clh_to_clp(hif, m->m_pkthdr.altq_qid); + else + cl = NULL; + if (cl == NULL || is_a_parent_class(cl)) { + cl = hif->hif_defaultclass; + if (cl == NULL) { + m_freem(m); + return (ENOBUFS); + } + } + cl->cl_pktattr = NULL; + len = m_pktlen(m); + if (hfsc_addq(cl, m) != 0) { + /* drop occurred. mbuf was freed in hfsc_addq. */ + PKTCNTR_ADD(&cl->cl_stats.drop_cnt, len); + return (ENOBUFS); + } + ifq->ifq_len++; + cl->cl_hif->hif_packets++; + + /* successfully queued. */ + if (qlen(cl->cl_q) == 1) + set_active(cl, m_pktlen(m)); + + return (0); +} + +/* + * hfsc_dequeue is a dequeue function to be registered to + * (*altq_dequeue) in struct ifaltq. + * + * note: ALTDQ_POLL returns the next packet without removing the packet + * from the queue. ALTDQ_REMOVE is a normal dequeue operation. + * ALTDQ_REMOVE must return the same packet if called immediately + * after ALTDQ_POLL. + */ +static struct mbuf * +hfsc_dequeue(struct ifaltq *ifq, int op) +{ + struct hfsc_if *hif = (struct hfsc_if *)ifq->altq_disc; + struct hfsc_class *cl; + struct mbuf *m; + int len, next_len; + int realtime = 0; + uint64_t cur_time; + + if (hif->hif_packets == 0) { + /* no packet in the tree */ + return (NULL); + } + + cur_time = read_machclk(); + + if (op == ALTDQ_REMOVE && hif->hif_pollcache != NULL) { + cl = hif->hif_pollcache; + hif->hif_pollcache = NULL; + /* check if the class was scheduled by real-time criteria */ + if (cl->cl_rsc != NULL) + realtime = (cl->cl_e <= cur_time); + } else { + /* + * if there are eligible classes, use real-time criteria. + * find the class with the minimum deadline among + * the eligible classes. + */ + if ((cl = ellist_get_mindl(hif->hif_eligible, cur_time)) != NULL) { + realtime = 1; + } else { +#ifdef ALTQ_DEBUG + int fits = 0; +#endif + /* + * use link-sharing criteria + * get the class with the minimum vt in the hierarchy + */ + cl = hif->hif_rootclass; + while (is_a_parent_class(cl)) { + + cl = actlist_firstfit(cl, cur_time); + if (cl == NULL) { +#ifdef ALTQ_DEBUG + if (fits > 0) + printf("%d fit but none found\n",fits); +#endif + return (NULL); + } + /* + * update parent's cl_cvtmin. + * don't update if the new vt is smaller. 
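+ * (cl_cvtmin tracks the smallest virtual time among the parent's
+ * active children in the current backlog period; it only ratchets
+ * upward here, and update_vf() uses it to pull a skipped class's
+ * vt forward via cl_vtadj.)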
+ */ + if (cl->cl_parent->cl_cvtmin < cl->cl_vt) + cl->cl_parent->cl_cvtmin = cl->cl_vt; +#ifdef ALTQ_DEBUG + fits++; +#endif + } + } + + if (op == ALTDQ_POLL) { + hif->hif_pollcache = cl; + m = hfsc_pollq(cl); + return (m); + } + } + + m = hfsc_getq(cl); + if (m == NULL) + panic("hfsc_dequeue:"); + len = m_pktlen(m); + cl->cl_hif->hif_packets--; + ifq->ifq_len--; + PKTCNTR_ADD(&cl->cl_stats.xmit_cnt, len); + + update_vf(cl, len, cur_time); + if (realtime) + cl->cl_cumul += len; + + if (!qempty(cl->cl_q)) { + if (cl->cl_rsc != NULL) { + /* update ed */ + next_len = m_pktlen(qhead(cl->cl_q)); + + if (realtime) + update_ed(cl, next_len); + else + update_d(cl, next_len); + } + } else { + /* the class becomes passive */ + set_passive(cl); + } + + return (m); +} + +static int +hfsc_addq(struct hfsc_class *cl, struct mbuf *m) +{ + +#ifdef ALTQ_RIO + if (q_is_rio(cl->cl_q)) + return rio_addq((rio_t *)cl->cl_red, cl->cl_q, + m, cl->cl_pktattr); +#endif +#ifdef ALTQ_RED + if (q_is_red(cl->cl_q)) + return red_addq(cl->cl_red, cl->cl_q, m, cl->cl_pktattr); +#endif + if (qlen(cl->cl_q) >= qlimit(cl->cl_q)) { + m_freem(m); + return (-1); + } + + if (cl->cl_flags & HFCF_CLEARDSCP) + write_dsfield(m, cl->cl_pktattr, 0); + + _addq(cl->cl_q, m); + + return (0); +} + +static struct mbuf * +hfsc_getq(struct hfsc_class *cl) +{ +#ifdef ALTQ_RIO + if (q_is_rio(cl->cl_q)) + return rio_getq((rio_t *)cl->cl_red, cl->cl_q); +#endif +#ifdef ALTQ_RED + if (q_is_red(cl->cl_q)) + return red_getq(cl->cl_red, cl->cl_q); +#endif + return _getq(cl->cl_q); +} + +static struct mbuf * +hfsc_pollq(struct hfsc_class *cl) +{ + return qhead(cl->cl_q); +} + +static void +hfsc_purgeq(struct hfsc_class *cl) +{ + struct mbuf *m; + + if (qempty(cl->cl_q)) + return; + + while ((m = _getq(cl->cl_q)) != NULL) { + PKTCNTR_ADD(&cl->cl_stats.drop_cnt, m_pktlen(m)); + m_freem(m); + cl->cl_hif->hif_packets--; + cl->cl_hif->hif_ifq->ifq_len--; + } + KKASSERT(qlen(cl->cl_q) == 0); + + update_vf(cl, 0, 0); /* remove cl from the actlist */ + set_passive(cl); +} + +static void +set_active(struct hfsc_class *cl, int len) +{ + if (cl->cl_rsc != NULL) + init_ed(cl, len); + if (cl->cl_fsc != NULL) + init_vf(cl, len); + + cl->cl_stats.period++; +} + +static void +set_passive(struct hfsc_class *cl) +{ + if (cl->cl_rsc != NULL) + ellist_remove(cl); + + /* + * actlist is now handled in update_vf() so that update_vf(cl, 0, 0) + * needs to be called explicitly to remove a class from actlist + */ +} + +static void +init_ed(struct hfsc_class *cl, int next_len) +{ + uint64_t cur_time; + + cur_time = read_machclk(); + + /* update the deadline curve */ + rtsc_min(&cl->cl_deadline, cl->cl_rsc, cur_time, cl->cl_cumul); + + /* + * update the eligible curve. + * for concave, it is equal to the deadline curve. + * for convex, it is a linear curve with slope m2. 
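+ * ("concave" here means sm1 > sm2, i.e. a burst segment followed by a
+ * slower sustained rate; in the convex case (sm1 <= sm2) the code below
+ * zeroes dx/dy so only the second segment, with slope m2, remains.)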
+ */ + cl->cl_eligible = cl->cl_deadline; + if (cl->cl_rsc->sm1 <= cl->cl_rsc->sm2) { + cl->cl_eligible.dx = 0; + cl->cl_eligible.dy = 0; + } + + /* compute e and d */ + cl->cl_e = rtsc_y2x(&cl->cl_eligible, cl->cl_cumul); + cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len); + + ellist_insert(cl); +} + +static void +update_ed(struct hfsc_class *cl, int next_len) +{ + cl->cl_e = rtsc_y2x(&cl->cl_eligible, cl->cl_cumul); + cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len); + + ellist_update(cl); +} + +static void +update_d(struct hfsc_class *cl, int next_len) +{ + cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len); +} + +static void +init_vf(struct hfsc_class *cl, int len) +{ + struct hfsc_class *max_cl, *p; + uint64_t vt, f, cur_time; + int go_active; + + cur_time = 0; + go_active = 1; + for ( ; cl->cl_parent != NULL; cl = cl->cl_parent) { + if (go_active && cl->cl_nactive++ == 0) + go_active = 1; + else + go_active = 0; + + if (go_active) { + max_cl = actlist_last(cl->cl_parent->cl_actc); + if (max_cl != NULL) { + /* + * set vt to the average of the min and max + * classes. if the parent's period didn't + * change, don't decrease vt of the class. + */ + vt = max_cl->cl_vt; + if (cl->cl_parent->cl_cvtmin != 0) + vt = (cl->cl_parent->cl_cvtmin + vt)/2; + + if (cl->cl_parent->cl_vtperiod != + cl->cl_parentperiod || vt > cl->cl_vt) + cl->cl_vt = vt; + } else { + /* + * first child for a new parent backlog period. + * add parent's cvtmax to vtoff of children + * to make a new vt (vtoff + vt) larger than + * the vt in the last period for all children. + */ + vt = cl->cl_parent->cl_cvtmax; + for (p = cl->cl_parent->cl_children; p != NULL; + p = p->cl_siblings) + p->cl_vtoff += vt; + cl->cl_vt = 0; + cl->cl_parent->cl_cvtmax = 0; + cl->cl_parent->cl_cvtmin = 0; + } + cl->cl_initvt = cl->cl_vt; + + /* update the virtual curve */ + vt = cl->cl_vt + cl->cl_vtoff; + rtsc_min(&cl->cl_virtual, cl->cl_fsc, vt, cl->cl_total); + if (cl->cl_virtual.x == vt) { + cl->cl_virtual.x -= cl->cl_vtoff; + cl->cl_vtoff = 0; + } + cl->cl_vtadj = 0; + + cl->cl_vtperiod++; /* increment vt period */ + cl->cl_parentperiod = cl->cl_parent->cl_vtperiod; + if (cl->cl_parent->cl_nactive == 0) + cl->cl_parentperiod++; + cl->cl_f = 0; + + actlist_insert(cl); + + if (cl->cl_usc != NULL) { + /* class has upper limit curve */ + if (cur_time == 0) + cur_time = read_machclk(); + + /* update the ulimit curve */ + rtsc_min(&cl->cl_ulimit, cl->cl_usc, cur_time, + cl->cl_total); + /* compute myf */ + cl->cl_myf = rtsc_y2x(&cl->cl_ulimit, + cl->cl_total); + cl->cl_myfadj = 0; + } + } + + if (cl->cl_myf > cl->cl_cfmin) + f = cl->cl_myf; + else + f = cl->cl_cfmin; + if (f != cl->cl_f) { + cl->cl_f = f; + update_cfmin(cl->cl_parent); + } + } +} + +static void +update_vf(struct hfsc_class *cl, int len, uint64_t cur_time) +{ + uint64_t f, myf_bound, delta; + int go_passive; + + go_passive = qempty(cl->cl_q); + + for (; cl->cl_parent != NULL; cl = cl->cl_parent) { + cl->cl_total += len; + + if (cl->cl_fsc == NULL || cl->cl_nactive == 0) + continue; + + if (go_passive && --cl->cl_nactive == 0) + go_passive = 1; + else + go_passive = 0; + + if (go_passive) { + /* no more active child, going passive */ + + /* update cvtmax of the parent class */ + if (cl->cl_vt > cl->cl_parent->cl_cvtmax) + cl->cl_parent->cl_cvtmax = cl->cl_vt; + + /* remove this class from the vt list */ + actlist_remove(cl); + + update_cfmin(cl->cl_parent); + + continue; + } + + /* + * update vt and f + */ + cl->cl_vt = 
rtsc_y2x(&cl->cl_virtual, cl->cl_total)
+		    - cl->cl_vtoff + cl->cl_vtadj;
+
+		/*
+		 * if vt of the class is smaller than cvtmin,
+		 * the class was skipped in the past due to non-fit.
+		 * if so, we need to adjust vtadj.
+		 */
+		if (cl->cl_vt < cl->cl_parent->cl_cvtmin) {
+			cl->cl_vtadj += cl->cl_parent->cl_cvtmin - cl->cl_vt;
+			cl->cl_vt = cl->cl_parent->cl_cvtmin;
+		}
+
+		/* update the vt list */
+		actlist_update(cl);
+
+		if (cl->cl_usc != NULL) {
+			cl->cl_myf = cl->cl_myfadj +
+			    rtsc_y2x(&cl->cl_ulimit, cl->cl_total);
+
+			/*
+			 * if myf lags behind by more than one clock tick
+			 * from the current time, adjust myfadj to prevent
+			 * a rate-limited class from going greedy.
+			 * in a steady state under rate-limiting, myf
+			 * fluctuates within one clock tick.
+			 */
+			myf_bound = cur_time - machclk_per_tick;
+			if (cl->cl_myf < myf_bound) {
+				delta = cur_time - cl->cl_myf;
+				cl->cl_myfadj += delta;
+				cl->cl_myf += delta;
+			}
+		}
+
+		/* cl_f is max(cl_myf, cl_cfmin) */
+		if (cl->cl_myf > cl->cl_cfmin)
+			f = cl->cl_myf;
+		else
+			f = cl->cl_cfmin;
+		if (f != cl->cl_f) {
+			cl->cl_f = f;
+			update_cfmin(cl->cl_parent);
+		}
+	}
+}
+
+static void
+update_cfmin(struct hfsc_class *cl)
+{
+	struct hfsc_class *p;
+	uint64_t cfmin;
+
+	if (TAILQ_EMPTY(cl->cl_actc)) {
+		cl->cl_cfmin = 0;
+		return;
+	}
+	cfmin = HT_INFINITY;
+	TAILQ_FOREACH(p, cl->cl_actc, cl_actlist) {
+		if (p->cl_f == 0) {
+			cl->cl_cfmin = 0;
+			return;
+		}
+		if (p->cl_f < cfmin)
+			cfmin = p->cl_f;
+	}
+	cl->cl_cfmin = cfmin;
+}
+
+/*
+ * TAILQ based ellist and actlist implementation
+ * (ion wanted to make a calendar queue based implementation)
+ */
+/*
+ * eligible list holds backlogged classes, sorted by their eligible times.
+ * there is one eligible list per interface.
+ */
+
+static ellist_t *
+ellist_alloc(void)
+{
+	ellist_t *head;
+
+	head = malloc(sizeof(*head), M_ALTQ, M_WAITOK);
+	TAILQ_INIT(head);
+	return (head);
+}
+
+static void
+ellist_destroy(ellist_t *head)
+{
+	free(head, M_ALTQ);
+}
+
+static void
+ellist_insert(struct hfsc_class *cl)
+{
+	struct hfsc_if *hif = cl->cl_hif;
+	struct hfsc_class *p;
+
+	/* check the last entry first */
+	if ((p = TAILQ_LAST(hif->hif_eligible, _eligible)) == NULL ||
+	    p->cl_e <= cl->cl_e) {
+		TAILQ_INSERT_TAIL(hif->hif_eligible, cl, cl_ellist);
+		return;
+	}
+
+	TAILQ_FOREACH(p, hif->hif_eligible, cl_ellist) {
+		if (cl->cl_e < p->cl_e) {
+			TAILQ_INSERT_BEFORE(p, cl, cl_ellist);
+			return;
+		}
+	}
+	KKASSERT(0); /* should not reach here */
+}
+
+static void
+ellist_remove(struct hfsc_class *cl)
+{
+	struct hfsc_if *hif = cl->cl_hif;
+
+	TAILQ_REMOVE(hif->hif_eligible, cl, cl_ellist);
+}
+
+static void
+ellist_update(struct hfsc_class *cl)
+{
+	struct hfsc_if *hif = cl->cl_hif;
+	struct hfsc_class *p, *last;
+
+	/*
+	 * the eligible time of a class increases monotonically.
+	 * if the next entry has a larger eligible time, nothing to do.
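+	 * (the list therefore stays sorted with a short forward scan in
+	 * the common case; actlist_update() below relies on the same
+	 * monotonicity of virtual times for the active-children lists.)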
+ */ + p = TAILQ_NEXT(cl, cl_ellist); + if (p == NULL || cl->cl_e <= p->cl_e) + return; + + /* check the last entry */ + last = TAILQ_LAST(hif->hif_eligible, _eligible); + KKASSERT(last != NULL); + if (last->cl_e <= cl->cl_e) { + TAILQ_REMOVE(hif->hif_eligible, cl, cl_ellist); + TAILQ_INSERT_TAIL(hif->hif_eligible, cl, cl_ellist); + return; + } + + /* + * the new position must be between the next entry + * and the last entry + */ + while ((p = TAILQ_NEXT(p, cl_ellist)) != NULL) { + if (cl->cl_e < p->cl_e) { + TAILQ_REMOVE(hif->hif_eligible, cl, cl_ellist); + TAILQ_INSERT_BEFORE(p, cl, cl_ellist); + return; + } + } + KKASSERT(0); /* should not reach here */ +} + +/* find the class with the minimum deadline among the eligible classes */ +struct hfsc_class * +ellist_get_mindl(ellist_t *head, uint64_t cur_time) +{ + struct hfsc_class *p, *cl = NULL; + + TAILQ_FOREACH(p, head, cl_ellist) { + if (p->cl_e > cur_time) + break; + if (cl == NULL || p->cl_d < cl->cl_d) + cl = p; + } + return (cl); +} + +/* + * active children list holds backlogged child classes being sorted + * by their virtual time. + * each intermediate class has one active children list. + */ +static actlist_t * +actlist_alloc(void) +{ + actlist_t *head; + + head = malloc(sizeof(*head), M_ALTQ, M_WAITOK); + TAILQ_INIT(head); + return (head); +} + +static void +actlist_destroy(actlist_t *head) +{ + free(head, M_ALTQ); +} +static void +actlist_insert(struct hfsc_class *cl) +{ + struct hfsc_class *p; + + /* check the last entry first */ + if ((p = TAILQ_LAST(cl->cl_parent->cl_actc, _active)) == NULL + || p->cl_vt <= cl->cl_vt) { + TAILQ_INSERT_TAIL(cl->cl_parent->cl_actc, cl, cl_actlist); + return; + } + + TAILQ_FOREACH(p, cl->cl_parent->cl_actc, cl_actlist) { + if (cl->cl_vt < p->cl_vt) { + TAILQ_INSERT_BEFORE(p, cl, cl_actlist); + return; + } + } + KKASSERT(0); /* should not reach here */ +} + +static void +actlist_remove(struct hfsc_class *cl) +{ + TAILQ_REMOVE(cl->cl_parent->cl_actc, cl, cl_actlist); +} + +static void +actlist_update(struct hfsc_class *cl) +{ + struct hfsc_class *p, *last; + + /* + * the virtual time of a class increases monotonically during its + * backlogged period. + * if the next entry has a larger virtual time, nothing to do. + */ + p = TAILQ_NEXT(cl, cl_actlist); + if (p == NULL || cl->cl_vt < p->cl_vt) + return; + + /* check the last entry */ + last = TAILQ_LAST(cl->cl_parent->cl_actc, _active); + KKASSERT(last != NULL); + if (last->cl_vt <= cl->cl_vt) { + TAILQ_REMOVE(cl->cl_parent->cl_actc, cl, cl_actlist); + TAILQ_INSERT_TAIL(cl->cl_parent->cl_actc, cl, cl_actlist); + return; + } + + /* + * the new position must be between the next entry + * and the last entry + */ + while ((p = TAILQ_NEXT(p, cl_actlist)) != NULL) { + if (cl->cl_vt < p->cl_vt) { + TAILQ_REMOVE(cl->cl_parent->cl_actc, cl, cl_actlist); + TAILQ_INSERT_BEFORE(p, cl, cl_actlist); + return; + } + } + KKASSERT(0); /* should not reach here */ +} + +static struct hfsc_class * +actlist_firstfit(struct hfsc_class *cl, uint64_t cur_time) +{ + struct hfsc_class *p; + + TAILQ_FOREACH(p, cl->cl_actc, cl_actlist) { + if (p->cl_f <= cur_time) + return (p); + } + return (NULL); +} + +/* + * service curve support functions + * + * external service curve parameters + * m: bits/sec + * d: msec + * internal service curve parameters + * sm: (bytes/tsc_interval) << SM_SHIFT + * ism: (tsc_count/byte) << ISM_SHIFT + * dx: tsc_count + * + * SM_SHIFT and ISM_SHIFT are scaled in order to keep effective digits. 
+ * we should be able to handle 100K-1Gbps linkspeed with 200MHz-1GHz CPU
+ * speed.  SM_SHIFT and ISM_SHIFT are selected to have at least 3 effective
+ * digits in decimal using the following table.
+ *
+ *  bits/sec    100Kbps     1Mbps     10Mbps     100Mbps    1Gbps
+ *  ----------+-------------------------------------------------------
+ *  bytes/nsec  12.5e-6    125e-6     1250e-6    12500e-6   125000e-6
+ *  sm(500MHz)  25.0e-6    250e-6     2500e-6    25000e-6   250000e-6
+ *  sm(200MHz)  62.5e-6    625e-6     6250e-6    62500e-6   625000e-6
+ *
+ *  nsec/byte   80000      8000       800        80         8
+ *  ism(500MHz) 40000      4000       400        40         4
+ *  ism(200MHz) 16000      1600       160        16         1.6
+ */
+#define	SM_SHIFT	24
+#define	ISM_SHIFT	10
+
+#define	SM_MASK		((1LL << SM_SHIFT) - 1)
+#define	ISM_MASK	((1LL << ISM_SHIFT) - 1)
+
+static __inline uint64_t
+seg_x2y(uint64_t x, uint64_t sm)
+{
+	uint64_t y;
+
+	/*
+	 * compute
+	 *	y = x * sm >> SM_SHIFT
+	 * but split the multiplication into the upper and lower bits
+	 * to avoid overflow
+	 */
+	y = (x >> SM_SHIFT) * sm + (((x & SM_MASK) * sm) >> SM_SHIFT);
+	return (y);
+}
+
+static __inline uint64_t
+seg_y2x(uint64_t y, uint64_t ism)
+{
+	uint64_t x;
+
+	if (y == 0)
+		x = 0;
+	else if (ism == HT_INFINITY)
+		x = HT_INFINITY;
+	else
+		x = (y >> ISM_SHIFT) * ism + (((y & ISM_MASK) * ism) >> ISM_SHIFT);
+
+	return (x);
+}
+
+static __inline uint64_t
+m2sm(u_int m)
+{
+	uint64_t sm;
+
+	sm = ((uint64_t)m << SM_SHIFT) / 8 / machclk_freq;
+	return (sm);
+}
+
+static __inline uint64_t
+m2ism(u_int m)
+{
+	uint64_t ism;
+
+	if (m == 0)
+		ism = HT_INFINITY;
+	else
+		ism = ((uint64_t)machclk_freq << ISM_SHIFT) * 8 / m;
+	return (ism);
+}
+
+static __inline uint64_t
+d2dx(u_int d)
+{
+	uint64_t dx;
+
+	dx = ((uint64_t)d * machclk_freq) / 1000;
+	return (dx);
+}
+
+static u_int
+sm2m(uint64_t sm)
+{
+	uint64_t m;
+
+	m = (sm * 8 * machclk_freq) >> SM_SHIFT;
+	return ((u_int)m);
+}
+
+static u_int
+dx2d(uint64_t dx)
+{
+	uint64_t d;
+
+	d = dx * 1000 / machclk_freq;
+	return ((u_int)d);
+}
+
+static void
+sc2isc(struct service_curve *sc, struct internal_sc *isc)
+{
+	isc->sm1 = m2sm(sc->m1);
+	isc->ism1 = m2ism(sc->m1);
+	isc->dx = d2dx(sc->d);
+	isc->dy = seg_x2y(isc->dx, isc->sm1);
+	isc->sm2 = m2sm(sc->m2);
+	isc->ism2 = m2ism(sc->m2);
+}
+
+/*
+ * initialize the runtime service curve with the given internal
+ * service curve starting at (x, y).
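+ * (the resulting curve follows slope sm1 for dx clock counts, rising
+ * by dy bytes, and continues with slope sm2 from the point
+ * (x + dx, y + dy); rtsc_y2x() and rtsc_x2y() below walk these two
+ * segments.)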
+ */
+static void
+rtsc_init(struct runtime_sc *rtsc, struct internal_sc *isc, uint64_t x, uint64_t y)
+{
+	rtsc->x = x;
+	rtsc->y = y;
+	rtsc->sm1 = isc->sm1;
+	rtsc->ism1 = isc->ism1;
+	rtsc->dx = isc->dx;
+	rtsc->dy = isc->dy;
+	rtsc->sm2 = isc->sm2;
+	rtsc->ism2 = isc->ism2;
+}
+
+/*
+ * calculate the x-projection (time) of the runtime service curve
+ * for the given y-projection (amount of service) value
+ */
+static uint64_t
+rtsc_y2x(struct runtime_sc *rtsc, uint64_t y)
+{
+	uint64_t x;
+
+	if (y < rtsc->y) {
+		x = rtsc->x;
+	} else if (y <= rtsc->y + rtsc->dy) {
+		/* x belongs to the 1st segment */
+		if (rtsc->dy == 0)
+			x = rtsc->x + rtsc->dx;
+		else
+			x = rtsc->x + seg_y2x(y - rtsc->y, rtsc->ism1);
+	} else {
+		/* x belongs to the 2nd segment */
+		x = rtsc->x + rtsc->dx
+		    + seg_y2x(y - rtsc->y - rtsc->dy, rtsc->ism2);
+	}
+	return (x);
+}
+
+static uint64_t
+rtsc_x2y(struct runtime_sc *rtsc, uint64_t x)
+{
+	uint64_t y;
+
+	if (x <= rtsc->x) {
+		y = rtsc->y;
+	} else if (x <= rtsc->x + rtsc->dx) {
+		/* y belongs to the 1st segment */
+		y = rtsc->y + seg_x2y(x - rtsc->x, rtsc->sm1);
+	} else {
+		/* y belongs to the 2nd segment */
+		y = rtsc->y + rtsc->dy
+		    + seg_x2y(x - rtsc->x - rtsc->dx, rtsc->sm2);
+	}
+	return (y);
+}
+
+/*
+ * update the runtime service curve by taking the minimum of the current
+ * runtime service curve and the service curve starting at (x, y).
+ */
+static void
+rtsc_min(struct runtime_sc *rtsc, struct internal_sc *isc, uint64_t x, uint64_t y)
+{
+	uint64_t y1, y2, dx, dy;
+
+	if (isc->sm1 <= isc->sm2) {
+		/* service curve is convex */
+		y1 = rtsc_x2y(rtsc, x);
+		if (y1 < y)
+			/* the current rtsc is smaller */
+			return;
+		rtsc->x = x;
+		rtsc->y = y;
+		return;
+	}
+
+	/*
+	 * service curve is concave
+	 * compute the two y values of the current rtsc
+	 *	y1: at x
+	 *	y2: at (x + dx)
+	 */
+	y1 = rtsc_x2y(rtsc, x);
+	if (y1 <= y) {
+		/* rtsc is below isc, no change to rtsc */
+		return;
+	}
+
+	y2 = rtsc_x2y(rtsc, x + isc->dx);
+	if (y2 >= y + isc->dy) {
+		/* rtsc is above isc, replace rtsc by isc */
+		rtsc->x = x;
+		rtsc->y = y;
+		rtsc->dx = isc->dx;
+		rtsc->dy = isc->dy;
+		return;
+	}
+
+	/*
+	 * the two curves intersect
+	 * compute the offsets (dx, dy) using the reverse
+	 * function of seg_x2y()
+	 *	seg_x2y(dx, sm1) == seg_x2y(dx, sm2) + (y1 - y)
+	 */
+	dx = ((y1 - y) << SM_SHIFT) / (isc->sm1 - isc->sm2);
+	/*
+	 * check if (x, y1) belongs to the 1st segment of rtsc.
+	 * if so, add the offset.
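+	 * (the division above follows from expanding
+	 * seg_x2y(dx, sm) == dx * sm >> SM_SHIFT:
+	 * dx*sm1 == dx*sm2 + ((y1 - y) << SM_SHIFT), hence
+	 * dx == ((y1 - y) << SM_SHIFT) / (sm1 - sm2).)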
+ */ + if (rtsc->x + rtsc->dx > x) + dx += rtsc->x + rtsc->dx - x; + dy = seg_x2y(dx, isc->sm1); + + rtsc->x = x; + rtsc->y = y; + rtsc->dx = dx; + rtsc->dy = dy; +} + +static void +get_class_stats(struct hfsc_classstats *sp, struct hfsc_class *cl) +{ + sp->class_id = cl->cl_id; + sp->class_handle = cl->cl_handle; + + if (cl->cl_rsc != NULL) { + sp->rsc.m1 = sm2m(cl->cl_rsc->sm1); + sp->rsc.d = dx2d(cl->cl_rsc->dx); + sp->rsc.m2 = sm2m(cl->cl_rsc->sm2); + } else { + sp->rsc.m1 = 0; + sp->rsc.d = 0; + sp->rsc.m2 = 0; + } + if (cl->cl_fsc != NULL) { + sp->fsc.m1 = sm2m(cl->cl_fsc->sm1); + sp->fsc.d = dx2d(cl->cl_fsc->dx); + sp->fsc.m2 = sm2m(cl->cl_fsc->sm2); + } else { + sp->fsc.m1 = 0; + sp->fsc.d = 0; + sp->fsc.m2 = 0; + } + if (cl->cl_usc != NULL) { + sp->usc.m1 = sm2m(cl->cl_usc->sm1); + sp->usc.d = dx2d(cl->cl_usc->dx); + sp->usc.m2 = sm2m(cl->cl_usc->sm2); + } else { + sp->usc.m1 = 0; + sp->usc.d = 0; + sp->usc.m2 = 0; + } + + sp->total = cl->cl_total; + sp->cumul = cl->cl_cumul; + + sp->d = cl->cl_d; + sp->e = cl->cl_e; + sp->vt = cl->cl_vt; + sp->f = cl->cl_f; + + sp->initvt = cl->cl_initvt; + sp->vtperiod = cl->cl_vtperiod; + sp->parentperiod = cl->cl_parentperiod; + sp->nactive = cl->cl_nactive; + sp->vtoff = cl->cl_vtoff; + sp->cvtmax = cl->cl_cvtmax; + sp->myf = cl->cl_myf; + sp->cfmin = cl->cl_cfmin; + sp->cvtmin = cl->cl_cvtmin; + sp->myfadj = cl->cl_myfadj; + sp->vtadj = cl->cl_vtadj; + + sp->cur_time = read_machclk(); + sp->machclk_freq = machclk_freq; + + sp->qlength = qlen(cl->cl_q); + sp->qlimit = qlimit(cl->cl_q); + sp->xmit_cnt = cl->cl_stats.xmit_cnt; + sp->drop_cnt = cl->cl_stats.drop_cnt; + sp->period = cl->cl_stats.period; + + sp->qtype = qtype(cl->cl_q); +#ifdef ALTQ_RED + if (q_is_red(cl->cl_q)) + red_getstats(cl->cl_red, &sp->red[0]); +#endif +#ifdef ALTQ_RIO + if (q_is_rio(cl->cl_q)) + rio_getstats((rio_t *)cl->cl_red, &sp->red[0]); +#endif +} + +/* convert a class handle to the corresponding class pointer */ +static struct hfsc_class * +clh_to_clp(struct hfsc_if *hif, uint32_t chandle) +{ + int i; + struct hfsc_class *cl; + + if (chandle == 0) + return (NULL); + /* + * first, try optimistically the slot matching the lower bits of + * the handle. if it fails, do the linear table search. + */ + i = chandle % HFSC_MAX_CLASSES; + if ((cl = hif->hif_class_tbl[i]) != NULL && cl->cl_handle == chandle) + return (cl); + for (i = 0; i < HFSC_MAX_CLASSES; i++) + if ((cl = hif->hif_class_tbl[i]) != NULL && + cl->cl_handle == chandle) + return (cl); + return (NULL); +} + +#endif /* ALTQ_HFSC */ diff --git a/sys/net/altq/altq_hfsc.h b/sys/net/altq/altq_hfsc.h new file mode 100644 index 0000000000..256ae60adb --- /dev/null +++ b/sys/net/altq/altq_hfsc.h @@ -0,0 +1,240 @@ +/* $KAME: altq_hfsc.h,v 1.12 2003/12/05 05:40:46 kjc Exp $ */ +/* $DragonFly: src/sys/net/altq/altq_hfsc.h,v 1.1 2005/02/11 22:25:57 joerg Exp $ */ + +/* + * Copyright (c) 1997-1999 Carnegie Mellon University. All Rights Reserved. + * + * Permission to use, copy, modify, and distribute this software and + * its documentation is hereby granted (including for commercial or + * for-profit use), provided that both the copyright notice and this + * permission notice appear in all copies of the software, derivative + * works, or modified versions, and any portions thereof. + * + * THIS SOFTWARE IS EXPERIMENTAL AND IS KNOWN TO HAVE BUGS, SOME OF + * WHICH MAY HAVE SERIOUS CONSEQUENCES. 
CARNEGIE MELLON PROVIDES THIS + * SOFTWARE IN ITS ``AS IS'' CONDITION, AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + * Carnegie Mellon encourages (but does not require) users of this + * software to return any improvements or extensions that they make, + * and to grant Carnegie Mellon the rights to redistribute these + * changes without encumbrance. + */ +#ifndef _ALTQ_ALTQ_HFSC_H_ +#define _ALTQ_ALTQ_HFSC_H_ + +#include +#include +#include +#include + +struct service_curve { + u_int m1; /* slope of the first segment in bits/sec */ + u_int d; /* the x-projection of the first segment in msec */ + u_int m2; /* slope of the second segment in bits/sec */ +}; + +/* special class handles */ +#define HFSC_NULLCLASS_HANDLE 0 +#define HFSC_MAX_CLASSES 64 + +/* hfsc class flags */ +#define HFCF_RED 0x0001 /* use RED */ +#define HFCF_ECN 0x0002 /* use RED/ECN */ +#define HFCF_RIO 0x0004 /* use RIO */ +#define HFCF_CLEARDSCP 0x0010 /* clear diffserv codepoint */ +#define HFCF_DEFAULTCLASS 0x1000 /* default class */ + +/* service curve types */ +#define HFSC_REALTIMESC 1 +#define HFSC_LINKSHARINGSC 2 +#define HFSC_UPPERLIMITSC 4 +#define HFSC_DEFAULTSC (HFSC_REALTIMESC|HFSC_LINKSHARINGSC) + +struct hfsc_classstats { + u_int class_id; + uint32_t class_handle; + struct service_curve rsc; + struct service_curve fsc; + struct service_curve usc; /* upper limit service curve */ + + uint64_t total; /* total work in bytes */ + uint64_t cumul; /* cumulative work in bytes + done by real-time criteria */ + uint64_t d; /* deadline */ + uint64_t e; /* eligible time */ + uint64_t vt; /* virtual time */ + uint64_t f; /* fit time for upper-limit */ + + /* info helpful for debugging */ + uint64_t initvt; /* init virtual time */ + uint64_t vtoff; /* cl_vt_ipoff */ + uint64_t cvtmax; /* cl_maxvt */ + uint64_t myf; /* cl_myf */ + uint64_t cfmin; /* cl_mincf */ + uint64_t cvtmin; /* cl_mincvt */ + uint64_t myfadj; /* cl_myfadj */ + uint64_t vtadj; /* cl_vtadj */ + uint64_t cur_time; + uint32_t machclk_freq; + + u_int qlength; + u_int qlimit; + struct pktcntr xmit_cnt; + struct pktcntr drop_cnt; + u_int period; + + u_int vtperiod; /* vt period sequence no */ + u_int parentperiod; /* parent's vt period seqno */ + int nactive; /* number of active children */ + + /* red and rio related info */ + int qtype; + struct redstats red[3]; +}; + +#ifdef _KERNEL +/* + * kernel internal service curve representation + * coordinates are given by 64 bit unsigned integers. + * x-axis: unit is clock count. for the intel x86 architecture, + * the raw Pentium TSC (Timestamp Counter) value is used. + * virtual time is also calculated in this time scale. + * y-axis: unit is byte. + * + * the service curve parameters are converted to the internal + * representation. + * the slope values are scaled to avoid overflow. 
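+ * (for example, a slope of m bits/sec is converted by m2sm() in
+ * altq_hfsc.c to sm = ((uint64_t)m << SM_SHIFT) / 8 / machclk_freq,
+ * i.e. bytes per clock count scaled up by 2^SM_SHIFT.)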
+ * the inverse slope values as well as the y-projection of the 1st
+ * segment are kept in order to avoid 64-bit divide operations
+ * that are expensive on 32-bit architectures.
+ *
+ * note: the Intel Pentium TSC does not wrap around for several
+ * thousand years.
+ * x-axis doesn't wrap around for 1089 years with 1GHz clock.
+ * y-axis doesn't wrap around for 4358 years with 1Gbps bandwidth.
+ */
+
+/* kernel internal representation of a service curve */
+struct internal_sc {
+	uint64_t	sm1;	/* scaled slope of the 1st segment */
+	uint64_t	ism1;	/* scaled inverse-slope of the 1st segment */
+	uint64_t	dx;	/* the x-projection of the 1st segment */
+	uint64_t	dy;	/* the y-projection of the 1st segment */
+	uint64_t	sm2;	/* scaled slope of the 2nd segment */
+	uint64_t	ism2;	/* scaled inverse-slope of the 2nd segment */
+};
+
+/* runtime service curve */
+struct runtime_sc {
+	uint64_t	x;	/* current starting position on x-axis */
+	uint64_t	y;	/* current starting position on y-axis */
+	uint64_t	sm1;	/* scaled slope of the 1st segment */
+	uint64_t	ism1;	/* scaled inverse-slope of the 1st segment */
+	uint64_t	dx;	/* the x-projection of the 1st segment */
+	uint64_t	dy;	/* the y-projection of the 1st segment */
+	uint64_t	sm2;	/* scaled slope of the 2nd segment */
+	uint64_t	ism2;	/* scaled inverse-slope of the 2nd segment */
+};
+
+/* for TAILQ based ellist and actlist implementation */
+struct hfsc_class;
+typedef TAILQ_HEAD(_eligible, hfsc_class) ellist_t;
+typedef TAILQ_ENTRY(hfsc_class) elentry_t;
+typedef TAILQ_HEAD(_active, hfsc_class) actlist_t;
+typedef TAILQ_ENTRY(hfsc_class) actentry_t;
+#define	ellist_first(s)		TAILQ_FIRST(s)
+#define	actlist_first(s)	TAILQ_FIRST(s)
+#define	actlist_last(s)		TAILQ_LAST(s, _active)
+
+struct hfsc_class {
+	u_int		cl_id;		/* class id (just for debug) */
+	uint32_t	cl_handle;	/* class handle */
+	struct hfsc_if	*cl_hif;	/* back pointer to struct hfsc_if */
+	int		cl_flags;	/* misc flags */
+
+	struct hfsc_class *cl_parent;	/* parent class */
+	struct hfsc_class *cl_siblings;	/* sibling classes */
+	struct hfsc_class *cl_children;	/* child classes */
+
+	class_queue_t	*cl_q;		/* class queue structure */
+	struct red	*cl_red;	/* RED state */
+	struct altq_pktattr *cl_pktattr; /* saved header used by ECN */
+
+	uint64_t	cl_total;	/* total work in bytes */
+	uint64_t	cl_cumul;	/* cumulative work in bytes
+					   done by real-time criteria */
+	uint64_t	cl_d;		/* deadline */
+	uint64_t	cl_e;		/* eligible time */
+	uint64_t	cl_vt;		/* virtual time */
+	uint64_t	cl_f;		/* time when this class will fit for
+					   link-sharing, max(myf, cfmin) */
+	uint64_t	cl_myf;		/* my fit-time (as calculated from this
+					   class's own upperlimit curve) */
+	uint64_t	cl_myfadj;	/* my fit-time adjustment
+					   (to cancel history dependence) */
+	uint64_t	cl_cfmin;	/* earliest children's fit-time (used
+					   with cl_myf to obtain cl_f) */
+	uint64_t	cl_cvtmin;	/* minimal virtual time among the
+					   children fit for link-sharing
+					   (monotonic within a period) */
+	uint64_t	cl_vtadj;	/* intra-period cumulative vt
+					   adjustment */
+	uint64_t	cl_vtoff;	/* inter-period cumulative vt offset */
+	uint64_t	cl_cvtmax;	/* max child's vt in the last period */
+
+	uint64_t	cl_initvt;	/* init virtual time (for debugging) */
+
+	struct internal_sc *cl_rsc;	/* internal real-time service curve */
+	struct internal_sc *cl_fsc;	/* internal fair service curve */
+	struct internal_sc *cl_usc;	/* internal upperlimit service curve */
+	struct runtime_sc cl_deadline;	/* deadline curve */
+	struct runtime_sc cl_eligible;	/* eligible
curve */ + struct runtime_sc cl_virtual; /* virtual curve */ + struct runtime_sc cl_ulimit; /* upperlimit curve */ + + u_int cl_vtperiod; /* vt period sequence no */ + u_int cl_parentperiod; /* parent's vt period seqno */ + int cl_nactive; /* number of active children */ + actlist_t *cl_actc; /* active children list */ + + actentry_t cl_actlist; /* active children list entry */ + elentry_t cl_ellist; /* eligible list entry */ + + struct { + struct pktcntr xmit_cnt; + struct pktcntr drop_cnt; + u_int period; + } cl_stats; +}; + +/* + * hfsc interface state + */ +struct hfsc_if { + struct hfsc_if *hif_next; /* interface state list */ + struct ifaltq *hif_ifq; /* backpointer to ifaltq */ + struct hfsc_class *hif_rootclass; /* root class */ + struct hfsc_class *hif_defaultclass; /* default class */ + struct hfsc_class *hif_class_tbl[HFSC_MAX_CLASSES]; + struct hfsc_class *hif_pollcache; /* cache for poll operation */ + + u_int hif_classes; /* # of classes in the tree */ + u_int hif_packets; /* # of packets in the tree */ + u_int hif_classid; /* class id sequence number */ + + ellist_t *hif_eligible; /* eligible list */ +}; + +#endif /* _KERNEL */ + +#endif /* _ALTQ_ALTQ_HFSC_H_ */ diff --git a/sys/net/altq/altq_priq.c b/sys/net/altq/altq_priq.c new file mode 100644 index 0000000000..b759ffbbc2 --- /dev/null +++ b/sys/net/altq/altq_priq.c @@ -0,0 +1,550 @@ +/* $KAME: altq_priq.c,v 1.12 2004/04/17 10:54:48 kjc Exp $ */ +/* $DragonFly: src/sys/net/altq/altq_priq.c,v 1.1 2005/02/11 22:25:57 joerg Exp $ */ + +/* + * Copyright (C) 2000-2003 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ +/* + * priority queue + */ + +#include "opt_altq.h" +#include "opt_inet.h" +#include "opt_inet6.h" + +#ifdef ALTQ_PRIQ /* priq is enabled by ALTQ_PRIQ option in opt_altq.h */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +/* + * function prototypes + */ +static int priq_clear_interface(struct priq_if *); +static int priq_request(struct ifaltq *, int, void *); +static void priq_purge(struct priq_if *); +static struct priq_class *priq_class_create(struct priq_if *, int, int, int, int); +static int priq_class_destroy(struct priq_class *); +static int priq_enqueue(struct ifaltq *, struct mbuf *, struct altq_pktattr *); +static struct mbuf *priq_dequeue(struct ifaltq *, int); + +static int priq_addq(struct priq_class *, struct mbuf *); +static struct mbuf *priq_getq(struct priq_class *); +static struct mbuf *priq_pollq(struct priq_class *); +static void priq_purgeq(struct priq_class *); + +static void get_class_stats(struct priq_classstats *, struct priq_class *); +static struct priq_class *clh_to_clp(struct priq_if *, uint32_t); + +int +priq_pfattach(struct pf_altq *a) +{ + struct ifnet *ifp; + int s, error; + + if ((ifp = ifunit(a->ifname)) == NULL || a->altq_disc == NULL) + return (EINVAL); + s = splimp(); + error = altq_attach(&ifp->if_snd, ALTQT_PRIQ, a->altq_disc, + priq_enqueue, priq_dequeue, priq_request, NULL, NULL); + splx(s); + return (error); +} + +int +priq_add_altq(struct pf_altq *a) +{ + struct priq_if *pif; + struct ifnet *ifp; + + if ((ifp = ifunit(a->ifname)) == NULL) + return (EINVAL); + if (!ifq_is_ready(&ifp->if_snd)) + return (ENODEV); + + pif = malloc(sizeof(*pif), M_ALTQ, M_WAITOK | M_ZERO); + pif->pif_bandwidth = a->ifbandwidth; + pif->pif_maxpri = -1; + pif->pif_ifq = &ifp->if_snd; + + /* keep the state in pf_altq */ + a->altq_disc = pif; + + return (0); +} + +int +priq_remove_altq(struct pf_altq *a) +{ + struct priq_if *pif; + + if ((pif = a->altq_disc) == NULL) + return (EINVAL); + a->altq_disc = NULL; + + priq_clear_interface(pif); + + free(pif, M_ALTQ); + return (0); +} + +int +priq_add_queue(struct pf_altq *a) +{ + struct priq_if *pif; + struct priq_class *cl; + + if ((pif = a->altq_disc) == NULL) + return (EINVAL); + + /* check parameters */ + if (a->priority >= PRIQ_MAXPRI) + return (EINVAL); + if (a->qid == 0) + return (EINVAL); + if (pif->pif_classes[a->priority] != NULL) + return (EBUSY); + if (clh_to_clp(pif, a->qid) != NULL) + return (EBUSY); + + cl = priq_class_create(pif, a->priority, a->qlimit, + a->pq_u.priq_opts.flags, a->qid); + if (cl == NULL) + return (ENOMEM); + + return (0); +} + +int +priq_remove_queue(struct pf_altq *a) +{ + struct priq_if *pif; + struct priq_class *cl; + + if ((pif = a->altq_disc) == NULL) + return (EINVAL); + + if ((cl = clh_to_clp(pif, a->qid)) == NULL) + return (EINVAL); + + return (priq_class_destroy(cl)); +} + +int +priq_getqstats(struct pf_altq *a, void *ubuf, int *nbytes) +{ + struct priq_if *pif; + struct priq_class *cl; + struct priq_classstats stats; + int error = 0; + + if ((pif = altq_lookup(a->ifname, ALTQT_PRIQ)) == NULL) + return (EBADF); + + if ((cl = clh_to_clp(pif, a->qid)) == NULL) + return (EINVAL); + + if (*nbytes < sizeof(stats)) + return (EINVAL); + + get_class_stats(&stats, cl); + + if ((error = copyout((caddr_t)&stats, ubuf, sizeof(stats))) != 0) + return (error); + *nbytes = sizeof(stats); + return (0); +} + +/* + * bring the interface back to the initial state by discarding + 
* all the filters and classes. + */ +static int +priq_clear_interface(struct priq_if *pif) +{ + struct priq_class *cl; + int pri; + + /* clear out the classes */ + for (pri = 0; pri <= pif->pif_maxpri; pri++) { + if ((cl = pif->pif_classes[pri]) != NULL) + priq_class_destroy(cl); + } + + return (0); +} + +static int +priq_request(struct ifaltq *ifq, int req, void *arg) +{ + struct priq_if *pif = (struct priq_if *)ifq->altq_disc; + + switch (req) { + case ALTRQ_PURGE: + priq_purge(pif); + break; + } + return (0); +} + +/* discard all the queued packets on the interface */ +static void +priq_purge(struct priq_if *pif) +{ + struct priq_class *cl; + int pri; + + for (pri = 0; pri <= pif->pif_maxpri; pri++) { + if ((cl = pif->pif_classes[pri]) != NULL && !qempty(cl->cl_q)) + priq_purgeq(cl); + } + if (ifq_is_enabled(pif->pif_ifq)) + pif->pif_ifq->ifq_len = 0; +} + +static struct priq_class * +priq_class_create(struct priq_if *pif, int pri, int qlimit, int flags, int qid) +{ + struct priq_class *cl; + int s; + +#ifndef ALTQ_RED + if (flags & PRCF_RED) { +#ifdef ALTQ_DEBUG + printf("priq_class_create: RED not configured for PRIQ!\n"); +#endif + return (NULL); + } +#endif + + if ((cl = pif->pif_classes[pri]) != NULL) { + /* modify the class instead of creating a new one */ + s = splimp(); + if (!qempty(cl->cl_q)) + priq_purgeq(cl); + splx(s); +#ifdef ALTQ_RIO + if (q_is_rio(cl->cl_q)) + rio_destroy((rio_t *)cl->cl_red); +#endif +#ifdef ALTQ_RED + if (q_is_red(cl->cl_q)) + red_destroy(cl->cl_red); +#endif + } else { + cl = malloc(sizeof(*cl), M_ALTQ, M_WAITOK | M_ZERO); + cl->cl_q = malloc(sizeof(*cl->cl_q), M_ALTQ, M_WAITOK | M_ZERO); + } + + pif->pif_classes[pri] = cl; + if (flags & PRCF_DEFAULTCLASS) + pif->pif_default = cl; + if (qlimit == 0) + qlimit = 50; /* use default */ + qlimit(cl->cl_q) = qlimit; + qtype(cl->cl_q) = Q_DROPTAIL; + qlen(cl->cl_q) = 0; + cl->cl_flags = flags; + cl->cl_pri = pri; + if (pri > pif->pif_maxpri) + pif->pif_maxpri = pri; + cl->cl_pif = pif; + cl->cl_handle = qid; + +#ifdef ALTQ_RED + if (flags & (PRCF_RED|PRCF_RIO)) { + int red_flags, red_pkttime; + + red_flags = 0; + if (flags & PRCF_ECN) + red_flags |= REDF_ECN; +#ifdef ALTQ_RIO + if (flags & PRCF_CLEARDSCP) + red_flags |= RIOF_CLEARDSCP; +#endif + if (pif->pif_bandwidth < 8) + red_pkttime = 1000 * 1000 * 1000; /* 1 sec */ + else + red_pkttime = (int64_t)pif->pif_ifq->altq_ifp->if_mtu + * 1000 * 1000 * 1000 / (pif->pif_bandwidth / 8); +#ifdef ALTQ_RIO + if (flags & PRCF_RIO) { + cl->cl_red = (red_t *)rio_alloc(0, NULL, + red_flags, red_pkttime); + if (cl->cl_red != NULL) + qtype(cl->cl_q) = Q_RIO; + } else +#endif + if (flags & PRCF_RED) { + cl->cl_red = red_alloc(0, 0, + qlimit(cl->cl_q) * 10/100, + qlimit(cl->cl_q) * 30/100, + red_flags, red_pkttime); + if (cl->cl_red != NULL) + qtype(cl->cl_q) = Q_RED; + } + } +#endif /* ALTQ_RED */ + + return (cl); +} + +static int +priq_class_destroy(struct priq_class *cl) +{ + struct priq_if *pif; + int s, pri; + + s = splimp(); + + if (!qempty(cl->cl_q)) + priq_purgeq(cl); + + pif = cl->cl_pif; + pif->pif_classes[cl->cl_pri] = NULL; + if (pif->pif_maxpri == cl->cl_pri) { + for (pri = cl->cl_pri; pri >= 0; pri--) + if (pif->pif_classes[pri] != NULL) { + pif->pif_maxpri = pri; + break; + } + if (pri < 0) + pif->pif_maxpri = -1; + } + splx(s); + + if (cl->cl_red != NULL) { +#ifdef ALTQ_RIO + if (q_is_rio(cl->cl_q)) + rio_destroy((rio_t *)cl->cl_red); +#endif +#ifdef ALTQ_RED + if (q_is_red(cl->cl_q)) + red_destroy(cl->cl_red); +#endif + } + free(cl->cl_q, M_ALTQ); + 
free(cl, M_ALTQ); + return (0); +} + +/* + * priq_enqueue is an enqueue function to be registered to + * (*altq_enqueue) in struct ifaltq. + */ +static int +priq_enqueue(struct ifaltq *ifq, struct mbuf *m, struct altq_pktattr *pktattr) +{ + struct priq_if *pif = (struct priq_if *)ifq->altq_disc; + struct priq_class *cl; + int len; + + /* grab class set by classifier */ + if ((m->m_flags & M_PKTHDR) == 0) { + /* should not happen */ + if_printf(ifq->altq_ifp, "altq: packet does not have pkthdr\n"); + m_freem(m); + return (ENOBUFS); + } + if (m->m_pkthdr.fw_flags & ALTQ_MBUF_TAGGED) + cl = clh_to_clp(pif, m->m_pkthdr.altq_qid); + else + cl = NULL; + if (cl == NULL) { + cl = pif->pif_default; + if (cl == NULL) { + m_freem(m); + return (ENOBUFS); + } + } + cl->cl_pktattr = NULL; + len = m_pktlen(m); + if (priq_addq(cl, m) != 0) { + /* drop occurred. mbuf was freed in priq_addq. */ + PKTCNTR_ADD(&cl->cl_dropcnt, len); + return (ENOBUFS); + } + ifq->ifq_len++; + + /* successfully queued. */ + return (0); +} + +/* + * priq_dequeue is a dequeue function to be registered to + * (*altq_dequeue) in struct ifaltq. + * + * note: ALTDQ_POLL returns the next packet without removing the packet + * from the queue. ALTDQ_REMOVE is a normal dequeue operation. + * ALTDQ_REMOVE must return the same packet if called immediately + * after ALTDQ_POLL. + */ +static struct mbuf * +priq_dequeue(struct ifaltq *ifq, int op) +{ + struct priq_if *pif = (struct priq_if *)ifq->altq_disc; + struct priq_class *cl; + struct mbuf *m; + int pri; + + if (ifq_is_empty(ifq)) { + /* no packet in the queue */ + return (NULL); + } + + for (pri = pif->pif_maxpri; pri >= 0; pri--) { + if ((cl = pif->pif_classes[pri]) != NULL && !qempty(cl->cl_q)) { + if (op == ALTDQ_POLL) + return (priq_pollq(cl)); + + m = priq_getq(cl); + if (m != NULL) { + ifq->ifq_len--; + if (qempty(cl->cl_q)) + cl->cl_period++; + PKTCNTR_ADD(&cl->cl_xmitcnt, m_pktlen(m)); + } + return (m); + } + } + return (NULL); +} + +static int +priq_addq(struct priq_class *cl, struct mbuf *m) +{ +#ifdef ALTQ_RIO + if (q_is_rio(cl->cl_q)) + return rio_addq((rio_t *)cl->cl_red, cl->cl_q, m, + cl->cl_pktattr); +#endif +#ifdef ALTQ_RED + if (q_is_red(cl->cl_q)) + return red_addq(cl->cl_red, cl->cl_q, m, cl->cl_pktattr); +#endif + if (qlen(cl->cl_q) >= qlimit(cl->cl_q)) { + m_freem(m); + return (-1); + } + + if (cl->cl_flags & PRCF_CLEARDSCP) + write_dsfield(m, cl->cl_pktattr, 0); + + _addq(cl->cl_q, m); + + return (0); +} + +static struct mbuf * +priq_getq(struct priq_class *cl) +{ +#ifdef ALTQ_RIO + if (q_is_rio(cl->cl_q)) + return rio_getq((rio_t *)cl->cl_red, cl->cl_q); +#endif +#ifdef ALTQ_RED + if (q_is_red(cl->cl_q)) + return red_getq(cl->cl_red, cl->cl_q); +#endif + return _getq(cl->cl_q); +} + +static struct mbuf * +priq_pollq(struct priq_class *cl) +{ + return qhead(cl->cl_q); +} + +static void +priq_purgeq(struct priq_class *cl) +{ + struct mbuf *m; + + if (qempty(cl->cl_q)) + return; + + while ((m = _getq(cl->cl_q)) != NULL) { + PKTCNTR_ADD(&cl->cl_dropcnt, m_pktlen(m)); + m_freem(m); + } + KKASSERT(qlen(cl->cl_q) == 0); +} + +static void +get_class_stats(struct priq_classstats *sp, struct priq_class *cl) +{ + sp->class_handle = cl->cl_handle; + sp->qlength = qlen(cl->cl_q); + sp->qlimit = qlimit(cl->cl_q); + sp->period = cl->cl_period; + sp->xmitcnt = cl->cl_xmitcnt; + sp->dropcnt = cl->cl_dropcnt; + + sp->qtype = qtype(cl->cl_q); +#ifdef ALTQ_RED + if (q_is_red(cl->cl_q)) + red_getstats(cl->cl_red, &sp->red[0]); +#endif +#ifdef ALTQ_RIO + if (q_is_rio(cl->cl_q)) + 
rio_getstats((rio_t *)cl->cl_red, &sp->red[0]); +#endif +} + +/* convert a class handle to the corresponding class pointer */ +static struct priq_class * +clh_to_clp(struct priq_if *pif, uint32_t chandle) +{ + struct priq_class *cl; + int idx; + + if (chandle == 0) + return (NULL); + + for (idx = pif->pif_maxpri; idx >= 0; idx--) + if ((cl = pif->pif_classes[idx]) != NULL && + cl->cl_handle == chandle) + return (cl); + + return (NULL); +} + +#endif /* ALTQ_PRIQ */ diff --git a/sys/net/altq/altq_priq.h b/sys/net/altq/altq_priq.h new file mode 100644 index 0000000000..79b1b5a174 --- /dev/null +++ b/sys/net/altq/altq_priq.h @@ -0,0 +1,95 @@ +/* $KAME: altq_priq.h,v 1.7 2003/10/03 05:05:15 kjc Exp $ */ +/* $DragonFly: src/sys/net/altq/altq_priq.h,v 1.1 2005/02/11 22:25:57 joerg Exp $ */ + +/* + * Copyright (C) 2000-2003 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _ALTQ_ALTQ_PRIQ_H_ +#define _ALTQ_ALTQ_PRIQ_H_ + +#include +#include +#include +#include + +#define PRIQ_MAXPRI 16 /* upper limit of the number of priorities */ + +/* priq class flags */ +#define PRCF_RED 0x0001 /* use RED */ +#define PRCF_ECN 0x0002 /* use RED/ECN */ +#define PRCF_RIO 0x0004 /* use RIO */ +#define PRCF_CLEARDSCP 0x0010 /* clear diffserv codepoint */ +#define PRCF_DEFAULTCLASS 0x1000 /* default class */ + +/* special class handles */ +#define PRIQ_NULLCLASS_HANDLE 0 + +struct priq_classstats { + uint32_t class_handle; + + u_int qlength; + u_int qlimit; + u_int period; + struct pktcntr xmitcnt; /* transmitted packet counter */ + struct pktcntr dropcnt; /* dropped packet counter */ + + /* red and rio related info */ + int qtype; + struct redstats red[3]; /* rio has 3 red stats */ +}; + +#ifdef _KERNEL + +struct priq_class { + uint32_t cl_handle; /* class handle */ + class_queue_t *cl_q; /* class queue structure */ + struct red *cl_red; /* RED state */ + int cl_pri; /* priority */ + int cl_flags; /* class flags */ + struct priq_if *cl_pif; /* back pointer to pif */ + struct altq_pktattr *cl_pktattr; /* saved header used by ECN */ + + /* statistics */ + u_int cl_period; /* backlog period */ + struct pktcntr cl_xmitcnt; /* transmitted packet counter */ + struct pktcntr cl_dropcnt; /* dropped packet counter */ +}; + +/* + * priq interface state + */ +struct priq_if { + struct priq_if *pif_next; /* interface state list */ + struct ifaltq *pif_ifq; /* backpointer to ifaltq */ + u_int pif_bandwidth; /* link bandwidth in bps */ + int pif_maxpri; /* max priority in use */ + struct priq_class *pif_default; /* default class */ + struct priq_class *pif_classes[PRIQ_MAXPRI]; /* classes */ +}; + +#endif /* _KERNEL */ + +#endif /* _ALTQ_ALTQ_PRIQ_H_ */ diff --git a/sys/net/altq/altq_red.c b/sys/net/altq/altq_red.c new file mode 100644 index 0000000000..f93287f614 --- /dev/null +++ b/sys/net/altq/altq_red.c @@ -0,0 +1,601 @@ +/* $KAME: altq_red.c,v 1.19 2004/04/17 10:54:49 kjc Exp $ */ +/* $DragonFly: src/sys/net/altq/altq_red.c,v 1.1 2005/02/11 22:25:57 joerg Exp $ */ + +/* + * Copyright (C) 1997-2003 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *
+ */
+/*
+ * Copyright (c) 1990-1994 Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the Computer Systems
+ *	Engineering Group at Lawrence Berkeley Laboratory.
+ * 4. Neither the name of the University nor of the Laboratory may be used
+ *    to endorse or promote products derived from this software without
+ *    specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "opt_altq.h"
+#include "opt_inet.h"
+#include "opt_inet6.h"
+
+#ifdef ALTQ_RED	/* red is enabled by ALTQ_RED option in opt_altq.h */
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+#ifdef INET6
+#include
+#endif
+
+#include
+#include
+#include
+
+/*
+ * ALTQ/RED (Random Early Detection) implementation using 32-bit
+ * fixed-point calculation.
+ *
+ * written by kjc using the ns code as a reference.
+ * you can learn more about red and ns from Sally's home page at
+ * http://www-nrg.ee.lbl.gov/floyd/
+ *
+ * most of the red parameter values are fixed in this implementation
+ * to prevent fixed-point overflow/underflow.
+ * if you change the parameters, watch out for overflow/underflow!
+ *
+ * the parameters used are recommended values by Sally.
+ * the corresponding ns config looks like:
+ *	q_weight=0.00195
+ *	minthresh=5 maxthresh=15 queue-size=60
+ *	linterm=30
+ *	dropmech=drop-tail
+ *	bytes=false (can't be handled by 32-bit fixed-point)
+ *	doubleq=false dqthresh=false
+ *	wait=true
+ */
+/*
+ * alternative red parameters for a slow link.
+ *
+ * assume the queue length grows from zero to L and then stays at L;
+ * it takes N packets for q_avg to reach 63% of L.
+ * when q_weight is 0.002, N is about 500 packets.
+ * for a slow link like dial-up, 500 packets takes more than 1 minute!
+ * when q_weight is 0.008, N is about 127 packets.
+ * when q_weight is 0.016, N is about 63 packets.
+ * bursts of 50 packets are allowed for 0.002, bursts of 25 packets
+ * are allowed for 0.016.
+ * see Sally's paper for more details.
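+ *
+ * (the 63% figure comes from the EWMA recurrence
+ * avg_n = (1 - Wq) * avg_{n-1} + Wq * L, which solves to
+ * avg_n = L * (1 - (1 - Wq)^n) starting from avg_0 = 0; with
+ * Wq = 1/N this approaches L * (1 - 1/e), about 0.63 * L, after
+ * roughly N packets.)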
+ */
+/* normal red parameters */
+#define	W_WEIGHT	512	/* inverse of weight of EWMA (511/512) */
+				/* q_weight = 0.00195 */
+
+/* red parameters for a slow link */
+#define	W_WEIGHT_1	128	/* inverse of weight of EWMA (127/128) */
+				/* q_weight = 0.0078125 */
+
+/* red parameters for a very slow link (e.g., dialup) */
+#define	W_WEIGHT_2	64	/* inverse of weight of EWMA (63/64) */
+				/* q_weight = 0.015625 */
+
+/* fixed-point uses 12-bit decimal places */
+#define	FP_SHIFT	12	/* fixed-point shift */
+
+/* red parameters for drop probability */
+#define	INV_P_MAX	10	/* inverse of max drop probability */
+#define	TH_MIN		5	/* min threshold */
+#define	TH_MAX		15	/* max threshold */
+
+#define	RED_LIMIT	60	/* default max queue length */
+#define	RED_STATS		/* collect statistics */
+
+/*
+ * our default policy for forced-drop is drop-tail.
+ * (in altq-1.1.2 or earlier, the default was random-drop.
+ * but it makes more sense to punish the cause of the surge.)
+ * to switch to the random-drop policy, define "RED_RANDOM_DROP".
+ */
+
+/* default red parameter values */
+static int default_th_min = TH_MIN;
+static int default_th_max = TH_MAX;
+static int default_inv_pmax = INV_P_MAX;
+
+/*
+ * red support routines
+ */
+red_t *
+red_alloc(int weight, int inv_pmax, int th_min, int th_max, int flags, int pkttime)
+{
+	red_t *rp;
+	int w, i;
+	int npkts_per_sec;
+
+	rp = malloc(sizeof(*rp), M_ALTQ, M_WAITOK | M_ZERO);
+	rp->red_avg = 0;
+	rp->red_idle = 1;
+
+	if (weight == 0)
+		rp->red_weight = W_WEIGHT;
+	else
+		rp->red_weight = weight;
+	if (inv_pmax == 0)
+		rp->red_inv_pmax = default_inv_pmax;
+	else
+		rp->red_inv_pmax = inv_pmax;
+	if (th_min == 0)
+		rp->red_thmin = default_th_min;
+	else
+		rp->red_thmin = th_min;
+	if (th_max == 0)
+		rp->red_thmax = default_th_max;
+	else
+		rp->red_thmax = th_max;
+
+	rp->red_flags = flags;
+
+	if (pkttime == 0)
+		/* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */
+		rp->red_pkttime = 800;
+	else
+		rp->red_pkttime = pkttime;
+
+	if (weight == 0) {
+		/* when the link is very slow, adjust red parameters */
+		npkts_per_sec = 1000000 / rp->red_pkttime;
+		if (npkts_per_sec < 50) {
+			/* up to about 400Kbps */
+			rp->red_weight = W_WEIGHT_2;
+		} else if (npkts_per_sec < 300) {
+			/* up to about 2.4Mbps */
+			rp->red_weight = W_WEIGHT_1;
+		}
+	}
+
+	/* calculate wshift.  weight must be a power of 2 */
+	w = rp->red_weight;
+	for (i = 0; w > 1; i++)
+		w = w >> 1;
+	rp->red_wshift = i;
+	w = 1 << rp->red_wshift;
+	if (w != rp->red_weight) {
+		printf("invalid weight value %d for red! use %d\n",
+		    rp->red_weight, w);
+		rp->red_weight = w;
+	}
+
+	/*
+	 * thmin_s and thmax_s are scaled versions of th_min and th_max
+	 * to be compared with avg.
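+	 * (avg carries both the EWMA shift (red_wshift) and the
+	 * fixed-point shift (FP_SHIFT), so the thresholds get the same
+	 * combined shift; with the defaults W_WEIGHT=512 and FP_SHIFT=12,
+	 * red_wshift is 9 and TH_MIN=5 scales to 5 << 21.)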
+ */ + rp->red_thmin_s = rp->red_thmin << (rp->red_wshift + FP_SHIFT); + rp->red_thmax_s = rp->red_thmax << (rp->red_wshift + FP_SHIFT); + + /* + * precompute probability denominator + * probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point + */ + rp->red_probd = (2 * (rp->red_thmax - rp->red_thmin) + * rp->red_inv_pmax) << FP_SHIFT; + + /* allocate weight table */ + rp->red_wtab = wtab_alloc(rp->red_weight); + + microtime(&rp->red_last); + return (rp); +} + +void +red_destroy(red_t *rp) +{ + wtab_destroy(rp->red_wtab); + free(rp, M_ALTQ); +} + +void +red_getstats(red_t *rp, struct redstats *sp) +{ + sp->q_avg = rp->red_avg >> rp->red_wshift; + sp->xmit_cnt = rp->red_stats.xmit_cnt; + sp->drop_cnt = rp->red_stats.drop_cnt; + sp->drop_forced = rp->red_stats.drop_forced; + sp->drop_unforced = rp->red_stats.drop_unforced; + sp->marked_packets = rp->red_stats.marked_packets; +} + +int +red_addq(red_t *rp, class_queue_t *q, struct mbuf *m, struct altq_pktattr *pktattr) +{ + int avg, droptype; + int n; + + avg = rp->red_avg; + + /* + * if we were idle, we pretend that n packets arrived during + * the idle period. + */ + if (rp->red_idle) { + struct timeval now; + int t; + + rp->red_idle = 0; + microtime(&now); + t = (now.tv_sec - rp->red_last.tv_sec); + if (t > 60) { + /* + * being idle for more than 1 minute, set avg to zero. + * this prevents t from overflow. + */ + avg = 0; + } else { + t = t * 1000000 + (now.tv_usec - rp->red_last.tv_usec); + n = t / rp->red_pkttime - 1; + + /* the following line does (avg = (1 - Wq)^n * avg) */ + if (n > 0) + avg = (avg >> FP_SHIFT) * + pow_w(rp->red_wtab, n); + } + } + + /* run estimator. (note: avg is scaled by WEIGHT in fixed-point) */ + avg += (qlen(q) << FP_SHIFT) - (avg >> rp->red_wshift); + rp->red_avg = avg; /* save the new value */ + + /* + * red_count keeps a tally of arriving traffic that has not + * been dropped. + */ + rp->red_count++; + + /* see if we drop early */ + droptype = DTYPE_NODROP; + if (avg >= rp->red_thmin_s && qlen(q) > 1) { + if (avg >= rp->red_thmax_s) { + /* avg >= th_max: forced drop */ + droptype = DTYPE_FORCED; + } else if (rp->red_old == 0) { + /* first exceeds th_min */ + rp->red_count = 1; + rp->red_old = 1; + } else if (drop_early((avg - rp->red_thmin_s) >> rp->red_wshift, + rp->red_probd, rp->red_count)) { + /* mark or drop by red */ + if ((rp->red_flags & REDF_ECN) && + mark_ecn(m, pktattr, rp->red_flags)) { + /* successfully marked. do not drop. */ + rp->red_count = 0; +#ifdef RED_STATS + rp->red_stats.marked_packets++; +#endif + } else { + /* unforced drop by red */ + droptype = DTYPE_EARLY; + } + } + } else { + /* avg < th_min */ + rp->red_old = 0; + } + + /* + * if the queue length hits the hard limit, it's a forced drop. + */ + if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q)) + droptype = DTYPE_FORCED; + +#ifdef RED_RANDOM_DROP + /* if successful or forced drop, enqueue this packet. */ + if (droptype != DTYPE_EARLY) + _addq(q, m); +#else + /* if successful, enqueue this packet. */ + if (droptype == DTYPE_NODROP) + _addq(q, m); +#endif + if (droptype != DTYPE_NODROP) { + if (droptype == DTYPE_EARLY) { + /* drop the incoming packet */ +#ifdef RED_STATS + rp->red_stats.drop_unforced++; +#endif + } else { + /* forced drop, select a victim packet in the queue. 
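+			 * (with RED_RANDOM_DROP the incoming packet was
+			 * already enqueued above, so a random packet from
+			 * the queue is chosen as the victim instead; with
+			 * the default drop-tail policy the incoming packet
+			 * itself is the victim.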
*/ +#ifdef RED_RANDOM_DROP + m = _getq_random(q); +#endif +#ifdef RED_STATS + rp->red_stats.drop_forced++; +#endif + } +#ifdef RED_STATS + PKTCNTR_ADD(&rp->red_stats.drop_cnt, m_pktlen(m)); +#endif + rp->red_count = 0; + m_freem(m); + return (-1); + } + /* successfully queued */ +#ifdef RED_STATS + PKTCNTR_ADD(&rp->red_stats.xmit_cnt, m_pktlen(m)); +#endif + return (0); +} + +/* + * early-drop probability is calculated as follows: + * prob = p_max * (avg - th_min) / (th_max - th_min) + * prob_a = prob / (2 - count*prob) + * = (avg-th_min) / (2*(th_max-th_min)*inv_p_max - count*(avg-th_min)) + * here prob_a increases as successive undrop count increases. + * (prob_a starts from prob/2, becomes prob when (count == (1 / prob)), + * becomes 1 when (count >= (2 / prob))). + */ +int +drop_early(int fp_len, int fp_probd, int count) +{ + int d; /* denominator of drop-probability */ + + d = fp_probd - count * fp_len; + if (d <= 0) { + /* count exceeds the hard limit: drop or mark */ + return (1); + } + + /* + * now the range of d is [1..600] in fixed-point. (when + * th_max-th_min=10 and p_max=1/30) + * drop probability = (avg - TH_MIN) / d + */ + + if ((arc4random() % d) < fp_len) { + /* drop or mark */ + return (1); + } + /* no drop/mark */ + return (0); +} + +/* + * try to mark CE bit to the packet. + * returns 1 if successfully marked, 0 otherwise. + */ +int +mark_ecn(struct mbuf *m, struct altq_pktattr *pktattr, int flags) +{ + struct mbuf *m0; + void *hdr; + int af; + + if ((m->m_pkthdr.fw_flags & ALTQ_MBUF_TAGGED) == 0) + return (0); + af = m->m_pkthdr.ecn_af; + hdr = m->m_pkthdr.header; + + if (af != AF_INET && af != AF_INET6) + return (0); + + /* verify that pattr_hdr is within the mbuf data */ + for (m0 = m; m0 != NULL; m0 = m0->m_next) { + if (((caddr_t)hdr >= m0->m_data) && + ((caddr_t)hdr < m0->m_data + m0->m_len)) + break; + } + if (m0 == NULL) { + /* ick, tag info is stale */ + return (0); + } + + switch (af) { + case AF_INET: + if (flags & REDF_ECN4) { + struct ip *ip = hdr; + uint8_t otos; + int sum; + + if (ip->ip_v != 4) + return (0); /* version mismatch! */ + + if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT) + return (0); /* not-ECT */ + if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) + return (1); /* already marked */ + + /* + * ecn-capable but not marked, + * mark CE and update checksum + */ + otos = ip->ip_tos; + ip->ip_tos |= IPTOS_ECN_CE; + /* + * update checksum (from RFC1624) + * HC' = ~(~HC + ~m + m') + */ + sum = ~ntohs(ip->ip_sum) & 0xffff; + sum += (~otos & 0xffff) + ip->ip_tos; + sum = (sum >> 16) + (sum & 0xffff); + sum += (sum >> 16); /* add carry */ + ip->ip_sum = htons(~sum & 0xffff); + return (1); + } + break; +#ifdef INET6 + case AF_INET6: + if (flags & REDF_ECN6) { + struct ip6_hdr *ip6 = hdr; + uint32_t flowlabel; + + flowlabel = ntohl(ip6->ip6_flow); + if ((flowlabel >> 28) != 6) + return (0); /* version mismatch! 
*/ + if ((flowlabel & (IPTOS_ECN_MASK << 20)) == + (IPTOS_ECN_NOTECT << 20)) + return (0); /* not-ECT */ + if ((flowlabel & (IPTOS_ECN_MASK << 20)) == + (IPTOS_ECN_CE << 20)) + return (1); /* already marked */ + /* + * ecn-capable but not marked, mark CE + */ + flowlabel |= (IPTOS_ECN_CE << 20); + ip6->ip6_flow = htonl(flowlabel); + return (1); + } + break; +#endif /* INET6 */ + } + + /* not marked */ + return (0); +} + +struct mbuf * +red_getq(red_t *rp, class_queue_t *q) +{ + struct mbuf *m; + + if ((m = _getq(q)) == NULL) { + if (rp->red_idle == 0) { + rp->red_idle = 1; + microtime(&rp->red_last); + } + return NULL; + } + + rp->red_idle = 0; + return (m); +} + +/* + * helper routine to calibrate avg during idle. + * pow_w(wtab, n) returns (1 - Wq)^n in fixed-point + * here Wq = 1/weight and the code assumes Wq is close to zero. + * + * w_tab[n] holds ((1 - Wq)^(2^n)) in fixed-point. + */ +static SLIST_HEAD(, wtab) wtab_list = SLIST_HEAD_INITIALIZER(&wtab_list); + +struct wtab * +wtab_alloc(int weight) +{ + struct wtab *w; + int i; + + SLIST_FOREACH(w, &wtab_list, w_link) { + if (w->w_weight == weight) { + w->w_refcount++; + return (w); + } + } + + w = malloc(sizeof(*w), M_ALTQ, M_WAITOK | M_ZERO); + w->w_weight = weight; + w->w_refcount = 1; + SLIST_INSERT_HEAD(&wtab_list, w, w_link); + + /* initialize the weight table */ + w->w_tab[0] = ((weight - 1) << FP_SHIFT) / weight; + for (i = 1; i < 32; i++) { + w->w_tab[i] = (w->w_tab[i-1] * w->w_tab[i-1]) >> FP_SHIFT; + if (w->w_tab[i] == 0 && w->w_param_max == 0) + w->w_param_max = 1 << i; + } + + return (w); +} + +int +wtab_destroy(struct wtab *w) +{ + if (--w->w_refcount > 0) + return (0); + + SLIST_REMOVE(&wtab_list, w, wtab, w_link); + free(w, M_ALTQ); + + return (0); +} + +int32_t +pow_w(struct wtab *w, int n) +{ + int i, bit; + int32_t val; + + if (n >= w->w_param_max) + return (0); + + val = 1 << FP_SHIFT; + if (n <= 0) + return (val); + + bit = 1; + i = 0; + while (n) { + if (n & bit) { + val = (val * w->w_tab[i]) >> FP_SHIFT; + n &= ~bit; + } + i++; + bit <<= 1; + } + return (val); +} + +#endif /* ALTQ_RED */ diff --git a/sys/net/altq/altq_red.h b/sys/net/altq/altq_red.h new file mode 100644 index 0000000000..dffe5dd479 --- /dev/null +++ b/sys/net/altq/altq_red.h @@ -0,0 +1,123 @@ +/* $KAME: altq_red.h,v 1.8 2003/07/10 12:07:49 kjc Exp $ */ +/* $DragonFly: src/sys/net/altq/altq_red.h,v 1.1 2005/02/11 22:25:57 joerg Exp $ */ + +/* + * Copyright (C) 1997-2003 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _ALTQ_ALTQ_RED_H_ +#define _ALTQ_ALTQ_RED_H_ + +#include + +/* red flags */ +#define REDF_ECN4 0x01 /* use packet marking for IPv4 packets */ +#define REDF_ECN6 0x02 /* use packet marking for IPv6 packets */ +#define REDF_ECN (REDF_ECN4 | REDF_ECN6) + +/* + * simpler versions of red parameters and statistics used by other + * disciplines (e.g., CBQ) + */ +struct redparams { + int th_min; /* red min threshold */ + int th_max; /* red max threshold */ + int inv_pmax; /* inverse of max drop probability */ +}; + +struct redstats { + int q_avg; + struct pktcntr xmit_cnt; + struct pktcntr drop_cnt; + u_int drop_forced; + u_int drop_unforced; + u_int marked_packets; +}; + +#ifdef _KERNEL + +/* weight table structure for idle time calibration */ +struct wtab { + SLIST_ENTRY(wtab) w_link; + int w_weight; + int w_param_max; + int w_refcount; + int32_t w_tab[32]; +}; + +typedef struct red { + int red_pkttime; /* average packet time in micro sec + used for idle calibration */ + int red_flags; /* red flags */ + + /* red parameters */ + int red_weight; /* weight for EWMA */ + int red_inv_pmax; /* inverse of max drop probability */ + int red_thmin; /* red min threshold */ + int red_thmax; /* red max threshold */ + + /* variables for internal use */ + int red_wshift; /* log(red_weight) */ + int red_thmin_s; /* th_min scaled by avgshift */ + int red_thmax_s; /* th_max scaled by avgshift */ + int red_probd; /* drop probability denominator */ + + int red_avg; /* queue len avg scaled by avgshift */ + int red_count; /* packet count since last dropped/ + marked packet */ + int red_idle; /* queue was empty */ + int red_old; /* avg is above th_min */ + struct wtab *red_wtab; /* weight table */ + struct timeval red_last; /* time when the queue becomes idle */ + + struct { + struct pktcntr xmit_cnt; + struct pktcntr drop_cnt; + u_int drop_forced; + u_int drop_unforced; + u_int marked_packets; + } red_stats; +} red_t; + +/* red drop types */ +#define DTYPE_NODROP 0 /* no drop */ +#define DTYPE_FORCED 1 /* a "forced" drop */ +#define DTYPE_EARLY 2 /* an "unforced" (early) drop */ + +red_t *red_alloc(int, int, int, int, int, int); +void red_destroy(red_t *); +void red_getstats(red_t *, struct redstats *); +int red_addq(red_t *, class_queue_t *, struct mbuf *, + struct altq_pktattr *); +struct mbuf *red_getq(red_t *, class_queue_t *); +int drop_early(int, int, int); +int mark_ecn(struct mbuf *, struct altq_pktattr *, int); +struct wtab *wtab_alloc(int); +int wtab_destroy(struct wtab *); +int32_t pow_w(struct wtab *, int); + +#endif /* _KERNEL */ + +#endif /* _ALTQ_ALTQ_RED_H_ */ diff --git a/sys/net/altq/altq_rio.c b/sys/net/altq/altq_rio.c new file mode 100644 index 0000000000..5f98b3f5a2 --- /dev/null +++ b/sys/net/altq/altq_rio.c @@ -0,0 +1,424 @@ +/* $KAME: altq_rio.c,v 1.17 2003/07/10 12:07:49 kjc Exp $ */ +/* $DragonFly: src/sys/net/altq/altq_rio.c,v 1.1 2005/02/11 22:25:57 joerg Exp $ */ + +/* + * Copyright (C) 1998-2003 + * Sony Computer Science Laboratories Inc. 
All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * Copyright (c) 1990-1994 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the Computer Systems + * Engineering Group at Lawrence Berkeley Laboratory. + * 4. Neither the name of the University nor of the Laboratory may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
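Both RED (above) and RIO (below) age the queue-length average across idle periods using the fixed-point weight table built by wtab_alloc() in altq_red.c: w_tab[n] holds (1 - Wq)^(2^n), and pow_w() multiplies together the entries selected by the bits of n. A minimal userland sketch of the same arithmetic (hypothetical standalone code, not part of the import; FP_SHIFT = 12 and weight = 512 as in these files), compilable with "cc -lm" to check the table against libm:

#include <stdio.h>
#include <stdint.h>
#include <math.h>

#define FP_SHIFT 12	/* 12-bit fixed-point fraction, as in altq_red.c */

static int32_t tab[32];

/* tab[n] = ((weight-1)/weight)^(2^n) in fixed-point, by repeated squaring */
static void
build_tab(int weight)
{
	int i;

	tab[0] = ((weight - 1) << FP_SHIFT) / weight;
	for (i = 1; i < 32; i++)
		tab[i] = (tab[i - 1] * tab[i - 1]) >> FP_SHIFT;
}

/*
 * (1 - 1/weight)^n from the bits of n, as pow_w() does; the kernel
 * version also short-cuts to 0 once n reaches w_param_max.
 */
static int32_t
pow_w_fixed(int n)
{
	int32_t val = 1 << FP_SHIFT;
	int i;

	for (i = 0; n != 0; i++, n >>= 1) {
		if (n & 1)
			val = (val * tab[i]) >> FP_SHIFT;
	}
	return (val);
}

int
main(void)
{
	int weight = 512;	/* W_WEIGHT: Wq = 1/512 */
	int n;

	build_tab(weight);
	for (n = 1; n <= 4096; n *= 4) {
		printf("n=%4d  fixed=%.6f  exact=%.6f\n", n,
		    pow_w_fixed(n) / 4096.0, pow(1.0 - 1.0 / weight, n));
	}
	return (0);
}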
+ */ + +#include "opt_altq.h" +#include "opt_inet.h" +#include "opt_inet6.h" + +#ifdef ALTQ_RIO /* rio is enabled by ALTQ_RIO option in opt_altq.h */ + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#ifdef INET6 +#include +#endif + +#include +#include +#include +#include + +/* + * RIO: RED with IN/OUT bit + * described in + * "Explicit Allocation of Best Effort Packet Delivery Service" + * David D. Clark and Wenjia Fang, MIT Lab for Computer Science + * http://diffserv.lcs.mit.edu/Papers/exp-alloc-ddc-wf.{ps,pdf} + * + * this implementation is extended to support more than 2 drop precedence + * values as described in RFC2597 (Assured Forwarding PHB Group). + * + */ +/* + * AF DS (differentiated service) codepoints. + * (classes can be mapped to CBQ or H-FSC classes.) + * + * 0 1 2 3 4 5 6 7 + * +---+---+---+---+---+---+---+---+ + * | CLASS |DropPre| 0 | CU | + * +---+---+---+---+---+---+---+---+ + * + * class 1: 001 + * class 2: 010 + * class 3: 011 + * class 4: 100 + * + * low drop prec: 01 + * medium drop prec: 10 + * high drop prec: 11 + */ + +/* normal red parameters */ +#define W_WEIGHT 512 /* inverse of weight of EWMA (511/512) */ + /* q_weight = 0.00195 */ + +/* red parameters for a slow link */ +#define W_WEIGHT_1 128 /* inverse of weight of EWMA (127/128) */ + /* q_weight = 0.0078125 */ + +/* red parameters for a very slow link (e.g., dialup) */ +#define W_WEIGHT_2 64 /* inverse of weight of EWMA (63/64) */ + /* q_weight = 0.015625 */ + +/* fixed-point uses 12-bit decimal places */ +#define FP_SHIFT 12 /* fixed-point shift */ + +/* red parameters for drop probability */ +#define INV_P_MAX 10 /* inverse of max drop probability */ +#define TH_MIN 5 /* min threshold */ +#define TH_MAX 15 /* max threshold */ + +#define RIO_LIMIT 60 /* default max queue lenght */ +#define RIO_STATS /* collect statistics */ + +/* default rio parameter values */ +static struct redparams default_rio_params[RIO_NDROPPREC] = { + /* th_min, th_max, inv_pmax */ + { TH_MAX * 2 + TH_MIN, TH_MAX * 3, INV_P_MAX }, /* low drop precedence */ + { TH_MAX + TH_MIN, TH_MAX * 2, INV_P_MAX }, /* medium drop precedence */ + { TH_MIN, TH_MAX, INV_P_MAX } /* high drop precedence */ +}; + +/* internal function prototypes */ +static int dscp2index(uint8_t); +/* YYY Do we really need this? */ +#define AF_DROPPRECMASK 0x18 +#define DSCP_MASK 0xfc + +rio_t * +rio_alloc(int weight, struct redparams *params, int flags, int pkttime) +{ + rio_t *rp; + int w, i; + int npkts_per_sec; + + rp = malloc(sizeof(*rp), M_ALTQ, M_WAITOK | M_ZERO); + + rp->rio_flags = flags; + if (pkttime == 0) + /* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */ + rp->rio_pkttime = 800; + else + rp->rio_pkttime = pkttime; + + if (weight != 0) + rp->rio_weight = weight; + else { + /* use default */ + rp->rio_weight = W_WEIGHT; + + /* when the link is very slow, adjust red parameters */ + npkts_per_sec = 1000000 / rp->rio_pkttime; + if (npkts_per_sec < 50) { + /* up to about 400Kbps */ + rp->rio_weight = W_WEIGHT_2; + } else if (npkts_per_sec < 300) { + /* up to about 2.4Mbps */ + rp->rio_weight = W_WEIGHT_1; + } + } + + /* calculate wshift. weight must be power of 2 */ + w = rp->rio_weight; + for (i = 0; w > 1; i++) + w = w >> 1; + rp->rio_wshift = i; + w = 1 << rp->rio_wshift; + if (w != rp->rio_weight) { + printf("invalid weight value %d for red! 
use %d\n", + rp->rio_weight, w); + rp->rio_weight = w; + } + + /* allocate weight table */ + rp->rio_wtab = wtab_alloc(rp->rio_weight); + + for (i = 0; i < RIO_NDROPPREC; i++) { + struct dropprec_state *prec = &rp->rio_precstate[i]; + + prec->avg = 0; + prec->idle = 1; + + if (params == NULL || params[i].inv_pmax == 0) + prec->inv_pmax = default_rio_params[i].inv_pmax; + else + prec->inv_pmax = params[i].inv_pmax; + if (params == NULL || params[i].th_min == 0) + prec->th_min = default_rio_params[i].th_min; + else + prec->th_min = params[i].th_min; + if (params == NULL || params[i].th_max == 0) + prec->th_max = default_rio_params[i].th_max; + else + prec->th_max = params[i].th_max; + + /* + * th_min_s and th_max_s are scaled versions of th_min + * and th_max to be compared with avg. + */ + prec->th_min_s = prec->th_min << (rp->rio_wshift + FP_SHIFT); + prec->th_max_s = prec->th_max << (rp->rio_wshift + FP_SHIFT); + + /* + * precompute probability denominator + * probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point + */ + prec->probd = (2 * (prec->th_max - prec->th_min) + * prec->inv_pmax) << FP_SHIFT; + + microtime(&prec->last); + } + + return (rp); +} + +void +rio_destroy(rio_t *rp) +{ + wtab_destroy(rp->rio_wtab); + free(rp, M_ALTQ); +} + +void +rio_getstats(rio_t *rp, struct redstats *sp) +{ + int i; + + for (i = 0; i < RIO_NDROPPREC; i++) { + bcopy(&rp->q_stats[i], sp, sizeof(struct redstats)); + sp->q_avg = rp->rio_precstate[i].avg >> rp->rio_wshift; + sp++; + } +} + +#if (RIO_NDROPPREC == 3) +/* + * internally, a drop precedence value is converted to an index + * starting from 0. + */ +static int +dscp2index(uint8_t dscp) +{ + int dpindex = dscp & AF_DROPPRECMASK; + + if (dpindex == 0) + return (0); + return ((dpindex >> 3) - 1); +} +#endif + +#if 1 +/* + * kludge: when a packet is dequeued, we need to know its drop precedence + * in order to keep the queue length of each drop precedence. + * use m_pkthdr.rcvif to pass this info. + */ +#define RIOM_SET_PRECINDEX(m, idx) \ + do { (m)->m_pkthdr.rcvif = (struct ifnet *)((long)(idx)); } while (0) +#define RIOM_GET_PRECINDEX(m) \ + ({ long idx; idx = (long)((m)->m_pkthdr.rcvif); \ + (m)->m_pkthdr.rcvif = NULL; idx; }) +#endif + +int +rio_addq(rio_t *rp, class_queue_t *q, struct mbuf *m, struct altq_pktattr *pktattr) +{ + int avg, droptype; + uint8_t dsfield, odsfield; + int dpindex, i, n, t; + struct timeval now; + struct dropprec_state *prec; + + dsfield = odsfield = read_dsfield(m, pktattr); + dpindex = dscp2index(dsfield); + + /* + * update avg of the precedence states whose drop precedence + * is larger than or equal to the drop precedence of the packet + */ + now.tv_sec = 0; + for (i = dpindex; i < RIO_NDROPPREC; i++) { + prec = &rp->rio_precstate[i]; + avg = prec->avg; + if (prec->idle) { + prec->idle = 0; + if (now.tv_sec == 0) + microtime(&now); + t = (now.tv_sec - prec->last.tv_sec); + if (t > 60) + avg = 0; + else { + t = t * 1000000 + + (now.tv_usec - prec->last.tv_usec); + n = t / rp->rio_pkttime; + /* calculate (avg = (1 - Wq)^n * avg) */ + if (n > 0) + avg = (avg >> FP_SHIFT) * + pow_w(rp->rio_wtab, n); + } + } + + /* run estimator. (avg is scaled by WEIGHT in fixed-point) */ + avg += (prec->qlen << FP_SHIFT) - (avg >> rp->rio_wshift); + prec->avg = avg; /* save the new value */ + /* + * count keeps a tally of arriving traffic that has not + * been dropped. 
+ */ + prec->count++; + } + + prec = &rp->rio_precstate[dpindex]; + avg = prec->avg; + + /* see if we drop early */ + droptype = DTYPE_NODROP; + if (avg >= prec->th_min_s && prec->qlen > 1) { + if (avg >= prec->th_max_s) { + /* avg >= th_max: forced drop */ + droptype = DTYPE_FORCED; + } else if (prec->old == 0) { + /* first exceeds th_min */ + prec->count = 1; + prec->old = 1; + } else if (drop_early((avg - prec->th_min_s) >> rp->rio_wshift, + prec->probd, prec->count)) { + /* unforced drop by red */ + droptype = DTYPE_EARLY; + } + } else { + /* avg < th_min */ + prec->old = 0; + } + + /* + * if the queue length hits the hard limit, it's a forced drop. + */ + if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q)) + droptype = DTYPE_FORCED; + + if (droptype != DTYPE_NODROP) { + /* always drop incoming packet (as opposed to randomdrop) */ + for (i = dpindex; i < RIO_NDROPPREC; i++) + rp->rio_precstate[i].count = 0; +#ifdef RIO_STATS + if (droptype == DTYPE_EARLY) + rp->q_stats[dpindex].drop_unforced++; + else + rp->q_stats[dpindex].drop_forced++; + PKTCNTR_ADD(&rp->q_stats[dpindex].drop_cnt, m_pktlen(m)); +#endif + m_freem(m); + return (-1); + } + + for (i = dpindex; i < RIO_NDROPPREC; i++) + rp->rio_precstate[i].qlen++; + + /* save drop precedence index in mbuf hdr */ + RIOM_SET_PRECINDEX(m, dpindex); + + if (rp->rio_flags & RIOF_CLEARDSCP) + dsfield &= ~DSCP_MASK; + + if (dsfield != odsfield) + write_dsfield(m, pktattr, dsfield); + + _addq(q, m); + +#ifdef RIO_STATS + PKTCNTR_ADD(&rp->q_stats[dpindex].xmit_cnt, m_pktlen(m)); +#endif + return (0); +} + +struct mbuf * +rio_getq(rio_t *rp, class_queue_t *q) +{ + struct mbuf *m; + int dpindex, i; + + if ((m = _getq(q)) == NULL) + return (NULL); + + dpindex = RIOM_GET_PRECINDEX(m); + for (i = dpindex; i < RIO_NDROPPREC; i++) { + if (--rp->rio_precstate[i].qlen == 0) { + if (rp->rio_precstate[i].idle == 0) { + rp->rio_precstate[i].idle = 1; + microtime(&rp->rio_precstate[i].last); + } + } + } + return (m); +} + +#endif /* ALTQ_RIO */ diff --git a/sys/net/altq/altq_rio.h b/sys/net/altq/altq_rio.h new file mode 100644 index 0000000000..d5c23c923f --- /dev/null +++ b/sys/net/altq/altq_rio.h @@ -0,0 +1,94 @@ +/* $KAME: altq_rio.h,v 1.9 2003/07/10 12:07:49 kjc Exp $ */ +/* $DragonFly: src/sys/net/altq/altq_rio.h,v 1.1 2005/02/11 22:25:57 joerg Exp $ */ + +/* + * Copyright (C) 1998-2003 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _ALTQ_ALTQ_RIO_H_ +#define _ALTQ_ALTQ_RIO_H_ + +#include + +/* + * RIO: RED with IN/OUT bit + * (extended to support more than 2 drop precedence values) + */ +#define RIO_NDROPPREC 3 /* number of drop precedence values */ + +/* rio flags */ +#define RIOF_ECN4 0x01 /* use packet marking for IPv4 packets */ +#define RIOF_ECN6 0x02 /* use packet marking for IPv6 packets */ +#define RIOF_ECN (RIOF_ECN4 | RIOF_ECN6) +#define RIOF_CLEARDSCP 0x200 /* clear diffserv codepoint */ + +#ifdef _KERNEL + +typedef struct rio { + /* per drop precedence structure */ + struct dropprec_state { + /* red parameters */ + int inv_pmax; /* inverse of max drop probability */ + int th_min; /* red min threshold */ + int th_max; /* red max threshold */ + + /* variables for internal use */ + int th_min_s; /* th_min scaled by avgshift */ + int th_max_s; /* th_max scaled by avgshift */ + int probd; /* drop probability denominator */ + + int qlen; /* queue length */ + int avg; /* (scaled) queue length average */ + int count; /* packet count since the last dropped/ + marked packet */ + int idle; /* queue was empty */ + int old; /* avg is above th_min */ + struct timeval last; /* timestamp when queue becomes idle */ + } rio_precstate[RIO_NDROPPREC]; + + int rio_wshift; /* log(red_weight) */ + int rio_weight; /* weight for EWMA */ + struct wtab *rio_wtab; /* weight table */ + + int rio_pkttime; /* average packet time in micro sec + used for idle calibration */ + int rio_flags; /* rio flags */ + + uint8_t rio_codepoint; /* codepoint value to tag packets */ + uint8_t rio_codepointmask; /* codepoint mask bits */ + + struct redstats q_stats[RIO_NDROPPREC]; /* statistics */ +} rio_t; + +rio_t *rio_alloc(int, struct redparams *, int, int); +void rio_destroy(rio_t *); +void rio_getstats(rio_t *, struct redstats *); +int rio_addq(rio_t *, class_queue_t *, struct mbuf *, + struct altq_pktattr *); +struct mbuf *rio_getq(rio_t *, class_queue_t *); + +#endif /* _KERNEL */ + +#endif /* _ALTQ_ALTQ_RIO_H_ */ diff --git a/sys/net/altq/altq_rmclass.c b/sys/net/altq/altq_rmclass.c new file mode 100644 index 0000000000..f8006187c2 --- /dev/null +++ b/sys/net/altq/altq_rmclass.c @@ -0,0 +1,1652 @@ +/* $KAME: altq_rmclass.c,v 1.18 2003/11/06 06:32:53 kjc Exp $ */ +/* $DragonFly: src/sys/net/altq/altq_rmclass.c,v 1.1 2005/02/11 22:25:57 joerg Exp $ */ + +/* + * Copyright (c) 1991-1997 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the Network Research + * Group at Lawrence Berkeley Laboratory. + * 4. Neither the name of the University nor of the Laboratory may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * LBL code modified by speer@eng.sun.com, May 1977. + * For questions and/or comments, please send mail to cbq@ee.lbl.gov + */ + +#ident "@(#)rm_class.c 1.48 97/12/05 SMI" + +#include "opt_altq.h" +#include "opt_inet.h" +#include "opt_inet6.h" + +#ifdef ALTQ_CBQ /* cbq is enabled by ALTQ_CBQ option in opt_altq.h */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + +#ifdef CBQ_TRACE +static struct cbqtrace cbqtrace_buffer[NCBQTRACE+1]; +static struct cbqtrace *cbqtrace_ptr = NULL; +static int cbqtrace_count; +#endif + +/* + * Local Macros + */ + +#define reset_cutoff(ifd) { ifd->cutoff_ = RM_MAXDEPTH; } + +/* + * Local routines. + */ + +static int rmc_satisfied(struct rm_class *, struct timeval *); +static void rmc_wrr_set_weights(struct rm_ifdat *); +static void rmc_depth_compute(struct rm_class *); +static void rmc_depth_recompute(rm_class_t *); + +static struct mbuf *_rmc_wrr_dequeue_next(struct rm_ifdat *, int); +static struct mbuf *_rmc_prr_dequeue_next(struct rm_ifdat *, int); + +static int _rmc_addq(rm_class_t *, struct mbuf *); +static void _rmc_dropq(rm_class_t *); +static struct mbuf *_rmc_getq(rm_class_t *); +static struct mbuf *_rmc_pollq(rm_class_t *); + +static int rmc_under_limit(struct rm_class *, struct timeval *); +static void rmc_tl_satisfied(struct rm_ifdat *, struct timeval *); +static void rmc_drop_action(struct rm_class *); +static void rmc_restart(void *); +static void rmc_root_overlimit(struct rm_class *, struct rm_class *); + +#define BORROW_OFFTIME +/* + * BORROW_OFFTIME (experimental): + * borrow the offtime of the class borrowing from. + * the reason is that when its own offtime is set, the class is unable + * to borrow much, especially when cutoff is taking effect. + * but when the borrowed class is overloaded (advidle is close to minidle), + * use the borrowing class's offtime to avoid overload. + */ +#define ADJUST_CUTOFF +/* + * ADJUST_CUTOFF (experimental): + * if no underlimit class is found due to cutoff, increase cutoff and + * retry the scheduling loop. + * also, don't invoke delay_actions while cutoff is taking effect, + * since a sleeping class won't have a chance to be scheduled in the + * next loop. 
+ * + * now heuristics for setting the top-level variable (cutoff_) becomes: + * 1. if a packet arrives for a not-overlimit class, set cutoff + * to the depth of the class. + * 2. if cutoff is i, and a packet arrives for an overlimit class + * with an underlimit ancestor at a lower level than i (say j), + * then set cutoff to j. + * 3. at scheduling a packet, if there is no underlimit class + * due to the current cutoff level, increase cutoff by 1 and + * then try to schedule again. + */ + +/* + * rm_class_t * + * rmc_newclass(...) - Create a new resource management class at priority + * 'pri' on the interface given by 'ifd'. + * + * nsecPerByte is the data rate of the interface in nanoseconds/byte. + * E.g., 800 for a 10Mb/s ethernet. If the class gets less + * than 100% of the bandwidth, this number should be the + * 'effective' rate for the class. Let f be the + * bandwidth fraction allocated to this class, and let + * nsPerByte be the data rate of the output link in + * nanoseconds/byte. Then nsecPerByte is set to + * nsPerByte / f. E.g., 1600 (= 800 / .5) + * for a class that gets 50% of an ethernet's bandwidth. + * + * action the routine to call when the class is over limit. + * + * maxq max allowable queue size for class (in packets). + * + * parent parent class pointer. + * + * borrow class to borrow from (should be either 'parent' or null). + * + * maxidle max value allowed for class 'idle' time estimate (this + * parameter determines how large an initial burst of packets + * can be before overlimit action is invoked. + * + * offtime how long 'delay' action will delay when class goes over + * limit (this parameter determines the steady-state burst + * size when a class is running over its limit). + * + * Maxidle and offtime have to be computed from the following: If the + * average packet size is s, the bandwidth fraction allocated to this + * class is f, we want to allow b packet bursts, and the gain of the + * averaging filter is g (= 1 - 2^(-RM_FILTER_GAIN)), then: + * + * ptime = s * nsPerByte * (1 - f) / f + * maxidle = ptime * (1 - g^b) / g^b + * minidle = -ptime * (1 / (f - 1)) + * offtime = ptime * (1 + 1/(1 - g) * (1 - g^(b - 1)) / g^(b - 1) + * + * Operationally, it's convenient to specify maxidle & offtime in units + * independent of the link bandwidth so the maxidle & offtime passed to + * this routine are the above values multiplied by 8*f/(1000*nsPerByte). + * (The constant factor is a scale factor needed to make the parameters + * integers. This scaling also means that the 'unscaled' values of + * maxidle*nsecPerByte/8 and offtime*nsecPerByte/8 will be in microseconds, + * not nanoseconds.) Also note that the 'idle' filter computation keeps + * an estimate scaled upward by 2^RM_FILTER_GAIN so the passed value of + * maxidle also must be scaled upward by this value. 
Thus, the passed + * values for maxidle and offtime can be computed as follows: + * + * maxidle = maxidle * 2^RM_FILTER_GAIN * 8 / (1000 * nsecPerByte) + * offtime = offtime * 8 / (1000 * nsecPerByte) + * + * When USE_HRTIME is employed, then maxidle and offtime become: + * maxidle = maxilde * (8.0 / nsecPerByte); + * offtime = offtime * (8.0 / nsecPerByte); + */ +struct rm_class * +rmc_newclass(int pri, struct rm_ifdat *ifd, u_int nsecPerByte, + void (*action)(rm_class_t *, rm_class_t *), int maxq, + struct rm_class *parent, struct rm_class *borrow, u_int maxidle, + int minidle, u_int offtime, int pktsize, int flags) +{ + struct rm_class *cl; + struct rm_class *peer; + int s; + + if (pri >= RM_MAXPRIO) + return (NULL); +#ifndef ALTQ_RED + if (flags & RMCF_RED) { +#ifdef ALTQ_DEBUG + printf("rmc_newclass: RED not configured for CBQ!\n"); +#endif + return (NULL); + } +#endif +#ifndef ALTQ_RIO + if (flags & RMCF_RIO) { +#ifdef ALTQ_DEBUG + printf("rmc_newclass: RIO not configured for CBQ!\n"); +#endif + return (NULL); + } +#endif + + cl = malloc(sizeof(*cl), M_ALTQ, M_WAITOK | M_ZERO); + callout_init(&cl->callout_); + cl->q_ = malloc(sizeof(*cl->q_), M_ALTQ, M_WAITOK | M_ZERO); + + /* + * Class initialization. + */ + cl->children_ = NULL; + cl->parent_ = parent; + cl->borrow_ = borrow; + cl->leaf_ = 1; + cl->ifdat_ = ifd; + cl->pri_ = pri; + cl->allotment_ = RM_NS_PER_SEC / nsecPerByte; /* Bytes per sec */ + cl->depth_ = 0; + cl->qthresh_ = 0; + cl->ns_per_byte_ = nsecPerByte; + + qlimit(cl->q_) = maxq; + qtype(cl->q_) = Q_DROPHEAD; + qlen(cl->q_) = 0; + cl->flags_ = flags; + +#if 1 /* minidle is also scaled in ALTQ */ + cl->minidle_ = (minidle * (int)nsecPerByte) / 8; + if (cl->minidle_ > 0) + cl->minidle_ = 0; +#else + cl->minidle_ = minidle; +#endif + cl->maxidle_ = (maxidle * nsecPerByte) / 8; + if (cl->maxidle_ == 0) + cl->maxidle_ = 1; +#if 1 /* offtime is also scaled in ALTQ */ + cl->avgidle_ = cl->maxidle_; + cl->offtime_ = ((offtime * nsecPerByte) / 8) >> RM_FILTER_GAIN; + if (cl->offtime_ == 0) + cl->offtime_ = 1; +#else + cl->avgidle_ = 0; + cl->offtime_ = (offtime * nsecPerByte) / 8; +#endif + cl->overlimit = action; + +#ifdef ALTQ_RED + if (flags & (RMCF_RED|RMCF_RIO)) { + int red_flags, red_pkttime; + + red_flags = 0; + if (flags & RMCF_ECN) + red_flags |= REDF_ECN; +#ifdef ALTQ_RIO + if (flags & RMCF_CLEARDSCP) + red_flags |= RIOF_CLEARDSCP; +#endif + red_pkttime = nsecPerByte * pktsize / 1000; + + if (flags & RMCF_RED) { + cl->red_ = red_alloc(0, 0, + qlimit(cl->q_) * 10/100, + qlimit(cl->q_) * 30/100, + red_flags, red_pkttime); + if (cl->red_ != NULL) + qtype(cl->q_) = Q_RED; + } +#ifdef ALTQ_RIO + else { + cl->red_ = (red_t *)rio_alloc(0, NULL, + red_flags, red_pkttime); + if (cl->red_ != NULL) + qtype(cl->q_) = Q_RIO; + } +#endif + } +#endif /* ALTQ_RED */ + + /* + * put the class into the class tree + */ + s = splimp(); + if ((peer = ifd->active_[pri]) != NULL) { + /* find the last class at this pri */ + cl->peer_ = peer; + while (peer->peer_ != ifd->active_[pri]) + peer = peer->peer_; + peer->peer_ = cl; + } else { + ifd->active_[pri] = cl; + cl->peer_ = cl; + } + + if (cl->parent_) { + cl->next_ = parent->children_; + parent->children_ = cl; + parent->leaf_ = 0; + } + + /* + * Compute the depth of this class and its ancestors in the class + * hierarchy. + */ + rmc_depth_compute(cl); + + /* + * If CBQ's WRR is enabled, then initialize the class WRR state. 
+ */ + if (ifd->wrr_) { + ifd->num_[pri]++; + ifd->alloc_[pri] += cl->allotment_; + rmc_wrr_set_weights(ifd); + } + splx(s); + return (cl); +} + +int +rmc_modclass(struct rm_class *cl, u_int nsecPerByte, int maxq, u_int maxidle, + int minidle, u_int offtime, int pktsize) +{ + struct rm_ifdat *ifd; + u_int old_allotment; + int s; + + ifd = cl->ifdat_; + old_allotment = cl->allotment_; + + s = splimp(); + cl->allotment_ = RM_NS_PER_SEC / nsecPerByte; /* Bytes per sec */ + cl->qthresh_ = 0; + cl->ns_per_byte_ = nsecPerByte; + + qlimit(cl->q_) = maxq; + +#if 1 /* minidle is also scaled in ALTQ */ + cl->minidle_ = (minidle * nsecPerByte) / 8; + if (cl->minidle_ > 0) + cl->minidle_ = 0; +#else + cl->minidle_ = minidle; +#endif + cl->maxidle_ = (maxidle * nsecPerByte) / 8; + if (cl->maxidle_ == 0) + cl->maxidle_ = 1; +#if 1 /* offtime is also scaled in ALTQ */ + cl->avgidle_ = cl->maxidle_; + cl->offtime_ = ((offtime * nsecPerByte) / 8) >> RM_FILTER_GAIN; + if (cl->offtime_ == 0) + cl->offtime_ = 1; +#else + cl->avgidle_ = 0; + cl->offtime_ = (offtime * nsecPerByte) / 8; +#endif + + /* + * If CBQ's WRR is enabled, then initialize the class WRR state. + */ + if (ifd->wrr_) { + ifd->alloc_[cl->pri_] += cl->allotment_ - old_allotment; + rmc_wrr_set_weights(ifd); + } + splx(s); + return (0); +} + +/* + * static void + * rmc_wrr_set_weights(struct rm_ifdat *ifdat) - This function computes + * the appropriate run robin weights for the CBQ weighted round robin + * algorithm. + * + * Returns: NONE + */ + +static void +rmc_wrr_set_weights(struct rm_ifdat *ifd) +{ + int i; + struct rm_class *cl, *clh; + + for (i = 0; i < RM_MAXPRIO; i++) { + /* + * This is inverted from that of the simulator to + * maintain precision. + */ + if (ifd->num_[i] == 0) + ifd->M_[i] = 0; + else + ifd->M_[i] = ifd->alloc_[i] / + (ifd->num_[i] * ifd->maxpkt_); + /* + * Compute the weighted allotment for each class. + * This takes the expensive div instruction out + * of the main loop for the wrr scheduling path. + * These only get recomputed when a class comes or + * goes. + */ + if (ifd->active_[i] != NULL) { + clh = cl = ifd->active_[i]; + do { + /* safe-guard for slow link or alloc_ == 0 */ + if (ifd->M_[i] == 0) + cl->w_allotment_ = 0; + else + cl->w_allotment_ = cl->allotment_ / + ifd->M_[i]; + cl = cl->peer_; + } while ((cl != NULL) && (cl != clh)); + } + } +} + +int +rmc_get_weight(struct rm_ifdat *ifd, int pri) +{ + if ((pri >= 0) && (pri < RM_MAXPRIO)) + return (ifd->M_[pri]); + else + return (0); +} + +/* + * static void + * rmc_depth_compute(struct rm_class *cl) - This function computes the + * appropriate depth of class 'cl' and its ancestors. + * + * Returns: NONE + */ + +static void +rmc_depth_compute(struct rm_class *cl) +{ + rm_class_t *t = cl, *p; + + /* + * Recompute the depth for the branch of the tree. + */ + while (t != NULL) { + p = t->parent_; + if (p && (t->depth_ >= p->depth_)) { + p->depth_ = t->depth_ + 1; + t = p; + } else + t = NULL; + } +} + +/* + * static void + * rmc_depth_recompute(struct rm_class *cl) - This function re-computes + * the depth of the tree after a class has been deleted. 
+ * + * Returns: NONE + */ + +static void +rmc_depth_recompute(rm_class_t *cl) +{ +#if 1 /* ALTQ */ + rm_class_t *p, *t; + + p = cl; + while (p != NULL) { + if ((t = p->children_) == NULL) { + p->depth_ = 0; + } else { + int cdepth = 0; + + while (t != NULL) { + if (t->depth_ > cdepth) + cdepth = t->depth_; + t = t->next_; + } + + if (p->depth_ == cdepth + 1) + /* no change to this parent */ + return; + + p->depth_ = cdepth + 1; + } + + p = p->parent_; + } +#else + rm_class_t *t; + + if (cl->depth_ >= 1) { + if (cl->children_ == NULL) { + cl->depth_ = 0; + } else if ((t = cl->children_) != NULL) { + while (t != NULL) { + if (t->children_ != NULL) + rmc_depth_recompute(t); + t = t->next_; + } + } else + rmc_depth_compute(cl); + } +#endif +} + +/* + * void + * rmc_delete_class(struct rm_ifdat *ifdat, struct rm_class *cl) - This + * function deletes a class from the link-sharing structure and frees + * all resources associated with the class. + * + * Returns: NONE + */ + +void +rmc_delete_class(struct rm_ifdat *ifd, struct rm_class *cl) +{ + struct rm_class *p, *head, *previous; + int s; + + KKASSERT(cl->children_ == NULL); + + if (cl->sleeping_) + callout_stop(&cl->callout_); + + s = splimp(); + /* + * Free packets in the packet queue. + * XXX - this may not be a desired behavior. Packets should be + * re-queued. + */ + rmc_dropall(cl); + + /* + * If the class has a parent, then remove the class from the + * class from the parent's children chain. + */ + if (cl->parent_ != NULL) { + head = cl->parent_->children_; + p = previous = head; + if (head->next_ == NULL) { + KKASSERT(head == cl); + cl->parent_->children_ = NULL; + cl->parent_->leaf_ = 1; + } else while (p != NULL) { + if (p == cl) { + if (cl == head) + cl->parent_->children_ = cl->next_; + else + previous->next_ = cl->next_; + cl->next_ = NULL; + p = NULL; + } else { + previous = p; + p = p->next_; + } + } + } + + /* + * Delete class from class priority peer list. + */ + if ((p = ifd->active_[cl->pri_]) != NULL) { + /* + * If there is more than one member of this priority + * level, then look for class(cl) in the priority level. + */ + if (p != p->peer_) { + while (p->peer_ != cl) + p = p->peer_; + p->peer_ = cl->peer_; + + if (ifd->active_[cl->pri_] == cl) + ifd->active_[cl->pri_] = cl->peer_; + } else { + KKASSERT(p == cl); + ifd->active_[cl->pri_] = NULL; + } + } + + /* + * Recompute the WRR weights. + */ + if (ifd->wrr_) { + ifd->alloc_[cl->pri_] -= cl->allotment_; + ifd->num_[cl->pri_]--; + rmc_wrr_set_weights(ifd); + } + + /* + * Re-compute the depth of the tree. + */ +#if 1 /* ALTQ */ + rmc_depth_recompute(cl->parent_); +#else + rmc_depth_recompute(ifd->root_); +#endif + + splx(s); + + /* + * Free the class structure. + */ + if (cl->red_ != NULL) { +#ifdef ALTQ_RIO + if (q_is_rio(cl->q_)) + rio_destroy((rio_t *)cl->red_); +#endif +#ifdef ALTQ_RED + if (q_is_red(cl->q_)) + red_destroy(cl->red_); +#endif + } + free(cl->q_, M_ALTQ); + free(cl, M_ALTQ); +} + +/* + * void + * rmc_init(...) - Initialize the resource management data structures + * associated with the output portion of interface 'ifp'. 'ifd' is + * where the structures will be built (for backwards compatibility, the + * structures aren't kept in the ifnet struct). 'nsecPerByte' + * gives the link speed (inverse of bandwidth) in nanoseconds/byte. + * 'restart' is the driver-specific routine that the generic 'delay + * until under limit' action will call to restart output. `maxq' + * is the queue size of the 'link' & 'default' classes. 
'maxqueued' + * is the maximum number of packets that the resource management + * code will allow to be queued 'downstream' (this is typically 1). + * + * Returns: NONE + */ + +void +rmc_init(struct ifaltq *ifq, struct rm_ifdat *ifd, u_int nsecPerByte, + void (*restart)(struct ifaltq *), int maxq, int maxqueued, u_int maxidle, + int minidle, u_int offtime, int flags) +{ + int i, mtu; + + /* + * Initialize the CBQ tracing/debug facility. + */ + CBQTRACEINIT(); + + bzero(ifd, sizeof (*ifd)); + mtu = ifq->altq_ifp->if_mtu; + ifd->ifq_ = ifq; + ifd->restart = restart; + ifd->maxqueued_ = maxqueued; + ifd->ns_per_byte_ = nsecPerByte; + ifd->maxpkt_ = mtu; + ifd->wrr_ = (flags & RMCF_WRR) ? 1 : 0; + ifd->efficient_ = (flags & RMCF_EFFICIENT) ? 1 : 0; +#if 1 + ifd->maxiftime_ = mtu * nsecPerByte / 1000 * 16; + if (mtu * nsecPerByte > 10 * 1000000) + ifd->maxiftime_ /= 4; +#endif + + reset_cutoff(ifd); + CBQTRACE(rmc_init, 'INIT', ifd->cutoff_); + + /* + * Initialize the CBQ's WRR state. + */ + for (i = 0; i < RM_MAXPRIO; i++) { + ifd->alloc_[i] = 0; + ifd->M_[i] = 0; + ifd->num_[i] = 0; + ifd->na_[i] = 0; + ifd->active_[i] = NULL; + } + + /* + * Initialize current packet state. + */ + ifd->qi_ = 0; + ifd->qo_ = 0; + for (i = 0; i < RM_MAXQUEUED; i++) { + ifd->class_[i] = NULL; + ifd->curlen_[i] = 0; + ifd->borrowed_[i] = NULL; + } + + /* + * Create the root class of the link-sharing structure. + */ + ifd->root_ = rmc_newclass(0, ifd, nsecPerByte, rmc_root_overlimit, + maxq, 0, 0, maxidle, minidle, offtime, 0, 0); + if (ifd->root_ == NULL) { + printf("rmc_init: root class not allocated\n"); + return ; + } + ifd->root_->depth_ = 0; +} + +/* + * void + * rmc_queue_packet(struct rm_class *cl, struct mbuf *m) - Add packet given by + * mbuf 'm' to queue for resource class 'cl'. This routine is called + * by a driver's if_output routine. This routine must be called with + * output packet completion interrupts locked out (to avoid racing with + * rmc_dequeue_next). + * + * Returns: 0 on successful queueing + * -1 when packet drop occurs + */ +int +rmc_queue_packet(struct rm_class *cl, struct mbuf *m) +{ + struct timeval now; + struct rm_ifdat *ifd = cl->ifdat_; + int cpri = cl->pri_; + int is_empty = qempty(cl->q_); + + RM_GETTIME(now); + if (ifd->cutoff_ > 0) { + if (TV_LT(&cl->undertime_, &now)) { + if (ifd->cutoff_ > cl->depth_) + ifd->cutoff_ = cl->depth_; + CBQTRACE(rmc_queue_packet, 'ffoc', cl->depth_); + } +#if 1 /* ALTQ */ + else { + /* + * the class is overlimit. if the class has + * underlimit ancestors, set cutoff to the lowest + * depth among them. 
+ */ + struct rm_class *borrow = cl->borrow_; + + while (borrow != NULL && + borrow->depth_ < ifd->cutoff_) { + if (TV_LT(&borrow->undertime_, &now)) { + ifd->cutoff_ = borrow->depth_; + CBQTRACE(rmc_queue_packet, 'ffob', ifd->cutoff_); + break; + } + borrow = borrow->borrow_; + } + } +#else /* !ALTQ */ + else if ((ifd->cutoff_ > 1) && cl->borrow_) { + if (TV_LT(&cl->borrow_->undertime_, &now)) { + ifd->cutoff_ = cl->borrow_->depth_; + CBQTRACE(rmc_queue_packet, 'ffob', + cl->borrow_->depth_); + } + } +#endif /* !ALTQ */ + } + + if (_rmc_addq(cl, m) < 0) + /* failed */ + return (-1); + + if (is_empty) { + CBQTRACE(rmc_queue_packet, 'ytpe', cl->stats_.handle); + ifd->na_[cpri]++; + } + + if (qlen(cl->q_) > qlimit(cl->q_)) { + /* note: qlimit can be set to 0 or 1 */ + rmc_drop_action(cl); + return (-1); + } + return (0); +} + +/* + * void + * rmc_tl_satisfied(struct rm_ifdat *ifd, struct timeval *now) - Check all + * classes to see if there are satified. + */ + +static void +rmc_tl_satisfied(struct rm_ifdat *ifd, struct timeval *now) +{ + int i; + rm_class_t *p, *bp; + + for (i = RM_MAXPRIO - 1; i >= 0; i--) { + if ((bp = ifd->active_[i]) != NULL) { + p = bp; + do { + if (!rmc_satisfied(p, now)) { + ifd->cutoff_ = p->depth_; + return; + } + p = p->peer_; + } while (p != bp); + } + } + + reset_cutoff(ifd); +} + +/* + * rmc_satisfied - Return 1 of the class is satisfied. O, otherwise. + */ + +static int +rmc_satisfied(struct rm_class *cl, struct timeval *now) +{ + rm_class_t *p; + + if (cl == NULL) + return (1); + if (TV_LT(now, &cl->undertime_)) + return (1); + if (cl->depth_ == 0) { + if (!cl->sleeping_ && (qlen(cl->q_) > cl->qthresh_)) + return (0); + else + return (1); + } + if (cl->children_ != NULL) { + p = cl->children_; + while (p != NULL) { + if (!rmc_satisfied(p, now)) + return (0); + p = p->next_; + } + } + + return (1); +} + +/* + * Return 1 if class 'cl' is under limit or can borrow from a parent, + * 0 if overlimit. As a side-effect, this routine will invoke the + * class overlimit action if the class if overlimit. + */ + +static int +rmc_under_limit(struct rm_class *cl, struct timeval *now) +{ + rm_class_t *p = cl; + rm_class_t *top; + struct rm_ifdat *ifd = cl->ifdat_; + + ifd->borrowed_[ifd->qi_] = NULL; + /* + * If cl is the root class, then always return that it is + * underlimit. Otherwise, check to see if the class is underlimit. + */ + if (cl->parent_ == NULL) + return (1); + + if (cl->sleeping_) { + if (TV_LT(now, &cl->undertime_)) + return (0); + + callout_stop(&cl->callout_); + cl->sleeping_ = 0; + cl->undertime_.tv_sec = 0; + return (1); + } + + top = NULL; + while (cl->undertime_.tv_sec && TV_LT(now, &cl->undertime_)) { + if (((cl = cl->borrow_) == NULL) || + (cl->depth_ > ifd->cutoff_)) { +#ifdef ADJUST_CUTOFF + if (cl != NULL) + /* cutoff is taking effect, just + return false without calling + the delay action. */ + return (0); +#endif +#ifdef BORROW_OFFTIME + /* + * check if the class can borrow offtime too. + * borrow offtime from the top of the borrow + * chain if the top class is not overloaded. + */ + if (cl != NULL) { + /* cutoff is taking effect, use this class as top. 
*/ + top = cl; + CBQTRACE(rmc_under_limit, 'ffou', ifd->cutoff_); + } + if (top != NULL && top->avgidle_ == top->minidle_) + top = NULL; + p->overtime_ = *now; + (p->overlimit)(p, top); +#else + p->overtime_ = *now; + (p->overlimit)(p, NULL); +#endif + return (0); + } + top = cl; + } + + if (cl != p) + ifd->borrowed_[ifd->qi_] = cl; + return (1); +} + +/* + * _rmc_wrr_dequeue_next() - This is scheduler for WRR as opposed to + * Packet-by-packet round robin. + * + * The heart of the weighted round-robin scheduler, which decides which + * class next gets to send a packet. Highest priority first, then + * weighted round-robin within priorites. + * + * Each able-to-send class gets to send until its byte allocation is + * exhausted. Thus, the active pointer is only changed after a class has + * exhausted its allocation. + * + * If the scheduler finds no class that is underlimit or able to borrow, + * then the first class found that had a nonzero queue and is allowed to + * borrow gets to send. + */ + +static struct mbuf * +_rmc_wrr_dequeue_next(struct rm_ifdat *ifd, int op) +{ + struct rm_class *cl = NULL, *first = NULL; + u_int deficit; + int cpri; + struct mbuf *m; + struct timeval now; + + RM_GETTIME(now); + + /* + * if the driver polls the top of the queue and then removes + * the polled packet, we must return the same packet. + */ + if (op == ALTDQ_REMOVE && ifd->pollcache_) { + cl = ifd->pollcache_; + cpri = cl->pri_; + if (ifd->efficient_) { + /* check if this class is overlimit */ + if (cl->undertime_.tv_sec != 0 && + rmc_under_limit(cl, &now) == 0) + first = cl; + } + ifd->pollcache_ = NULL; + goto _wrr_out; + } + else { + /* mode == ALTDQ_POLL || pollcache == NULL */ + ifd->pollcache_ = NULL; + ifd->borrowed_[ifd->qi_] = NULL; + } +#ifdef ADJUST_CUTOFF + _again: +#endif + for (cpri = RM_MAXPRIO - 1; cpri >= 0; cpri--) { + if (ifd->na_[cpri] == 0) + continue; + deficit = 0; + /* + * Loop through twice for a priority level, if some class + * was unable to send a packet the first round because + * of the weighted round-robin mechanism. + * During the second loop at this level, deficit==2. + * (This second loop is not needed if for every class, + * "M[cl->pri_])" times "cl->allotment" is greater than + * the byte size for the largest packet in the class.) + */ + _wrr_loop: + cl = ifd->active_[cpri]; + KKASSERT(cl != NULL); + do { + if ((deficit < 2) && (cl->bytes_alloc_ <= 0)) + cl->bytes_alloc_ += cl->w_allotment_; + if (!qempty(cl->q_)) { + if ((cl->undertime_.tv_sec == 0) || + rmc_under_limit(cl, &now)) { + if (cl->bytes_alloc_ > 0 || deficit > 1) + goto _wrr_out; + + /* underlimit but no alloc */ + deficit = 1; +#if 1 + ifd->borrowed_[ifd->qi_] = NULL; +#endif + } + else if (first == NULL && cl->borrow_ != NULL) + first = cl; /* borrowing candidate */ + } + + cl->bytes_alloc_ = 0; + cl = cl->peer_; + } while (cl != ifd->active_[cpri]); + + if (deficit == 1) { + /* first loop found an underlimit class with deficit */ + /* Loop on same priority level, with new deficit. */ + deficit = 2; + goto _wrr_loop; + } + } + +#ifdef ADJUST_CUTOFF + /* + * no underlimit class found. if cutoff is taking effect, + * increase cutoff and try again. 
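Stripped of the link-sharing and borrowing tests, the loop above is a byte-deficit weighted round robin: a class banks w_allotment_ bytes when its balance goes non-positive and sends while the balance is positive, and the second pass (deficit == 2) lets a class send with no allocation at all, so a zero quantum (the slow-link case where M_ rounds to 0) cannot starve it. A self-contained toy version under those assumptions (hypothetical structures, fixed packet sizes, single priority):

#include <stdio.h>

struct toyclass {
	int w_allotment;	/* byte quantum per round */
	int bytes_alloc;	/* running balance */
	int npkts;		/* packets waiting */
	int pktlen;		/* (fixed) packet size */
};

/* dequeue one packet, WRR-style, from a circular set of classes */
static int
wrr_next(struct toyclass *cls, int n, int *start)
{
	int deficit = 0, i, idx;

again:
	for (i = 0; i < n; i++) {
		idx = (*start + i) % n;
		if (deficit < 2 && cls[idx].bytes_alloc <= 0)
			cls[idx].bytes_alloc += cls[idx].w_allotment;
		if (cls[idx].npkts > 0 &&
		    (cls[idx].bytes_alloc > 0 || deficit > 1)) {
			cls[idx].npkts--;
			cls[idx].bytes_alloc -= cls[idx].pktlen;
			if (cls[idx].bytes_alloc <= 0)	/* quantum spent */
				*start = (idx + 1) % n;
			return (idx);
		}
		cls[idx].bytes_alloc = 0;	/* passed over: zeroed, as above */
	}
	if (deficit == 0) {	/* nothing sent; retry allowing overdraft */
		deficit = 2;
		goto again;
	}
	return (-1);		/* all queues empty */
}

int
main(void)
{
	struct toyclass cls[3] = {
		{ 2000, 0, 4, 1000 },	/* 2:1 weight ratio ... */
		{ 1000, 0, 4, 1000 },
		{    0, 0, 2, 1000 },	/* ... and a zero-quantum class */
	};
	int start = 0, c;

	/* classes 0 and 1 drain roughly 2:1; class 2 only via pass two */
	while ((c = wrr_next(cls, 3, &start)) >= 0)
		printf("%d ", c);
	printf("\n");
	return (0);
}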
+ */ + if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) { + ifd->cutoff_++; + CBQTRACE(_rmc_wrr_dequeue_next, 'ojda', ifd->cutoff_); + goto _again; + } +#endif /* ADJUST_CUTOFF */ + /* + * If LINK_EFFICIENCY is turned on, then the first overlimit + * class we encounter will send a packet if all the classes + * of the link-sharing structure are overlimit. + */ + reset_cutoff(ifd); + CBQTRACE(_rmc_wrr_dequeue_next, 'otsr', ifd->cutoff_); + + if (!ifd->efficient_ || first == NULL) + return (NULL); + + cl = first; + cpri = cl->pri_; +#if 0 /* too time-consuming for nothing */ + if (cl->sleeping_) + callout_stop(&cl->callout_); + cl->sleeping_ = 0; + cl->undertime_.tv_sec = 0; +#endif + ifd->borrowed_[ifd->qi_] = cl->borrow_; + ifd->cutoff_ = cl->borrow_->depth_; + + /* + * Deque the packet and do the book keeping... + */ + _wrr_out: + if (op == ALTDQ_REMOVE) { + m = _rmc_getq(cl); + if (m == NULL) + panic("_rmc_wrr_dequeue_next"); + if (qempty(cl->q_)) + ifd->na_[cpri]--; + + /* + * Update class statistics and link data. + */ + if (cl->bytes_alloc_ > 0) + cl->bytes_alloc_ -= m_pktlen(m); + + if ((cl->bytes_alloc_ <= 0) || first == cl) + ifd->active_[cl->pri_] = cl->peer_; + else + ifd->active_[cl->pri_] = cl; + + ifd->class_[ifd->qi_] = cl; + ifd->curlen_[ifd->qi_] = m_pktlen(m); + ifd->now_[ifd->qi_] = now; + ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_; + ifd->queued_++; + } else { + /* mode == ALTDQ_PPOLL */ + m = _rmc_pollq(cl); + ifd->pollcache_ = cl; + } + return (m); +} + +/* + * Dequeue & return next packet from the highest priority class that + * has a packet to send & has enough allocation to send it. This + * routine is called by a driver whenever it needs a new packet to + * output. + */ +static struct mbuf * +_rmc_prr_dequeue_next(struct rm_ifdat *ifd, int op) +{ + struct mbuf *m; + int cpri; + struct rm_class *cl, *first = NULL; + struct timeval now; + + RM_GETTIME(now); + + /* + * if the driver polls the top of the queue and then removes + * the polled packet, we must return the same packet. + */ + if (op == ALTDQ_REMOVE && ifd->pollcache_) { + cl = ifd->pollcache_; + cpri = cl->pri_; + ifd->pollcache_ = NULL; + goto _prr_out; + } else { + /* mode == ALTDQ_POLL || pollcache == NULL */ + ifd->pollcache_ = NULL; + ifd->borrowed_[ifd->qi_] = NULL; + } +#ifdef ADJUST_CUTOFF + _again: +#endif + for (cpri = RM_MAXPRIO - 1; cpri >= 0; cpri--) { + if (ifd->na_[cpri] == 0) + continue; + cl = ifd->active_[cpri]; + KKASSERT(cl != NULL); + do { + if (!qempty(cl->q_)) { + if ((cl->undertime_.tv_sec == 0) || + rmc_under_limit(cl, &now)) + goto _prr_out; + if (first == NULL && cl->borrow_ != NULL) + first = cl; + } + cl = cl->peer_; + } while (cl != ifd->active_[cpri]); + } + +#ifdef ADJUST_CUTOFF + /* + * no underlimit class found. if cutoff is taking effect, increase + * cutoff and try again. + */ + if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) { + ifd->cutoff_++; + goto _again; + } +#endif /* ADJUST_CUTOFF */ + /* + * If LINK_EFFICIENCY is turned on, then the first overlimit + * class we encounter will send a packet if all the classes + * of the link-sharing structure are overlimit. 
+ */ + reset_cutoff(ifd); + if (!ifd->efficient_ || first == NULL) + return (NULL); + + cl = first; + cpri = cl->pri_; +#if 0 /* too time-consuming for nothing */ + if (cl->sleeping_) + callout_stop(&cl->callout_); + cl->sleeping_ = 0; + cl->undertime_.tv_sec = 0; +#endif + ifd->borrowed_[ifd->qi_] = cl->borrow_; + ifd->cutoff_ = cl->borrow_->depth_; + + /* + * Deque the packet and do the book keeping... + */ + _prr_out: + if (op == ALTDQ_REMOVE) { + m = _rmc_getq(cl); + if (m == NULL) + panic("_rmc_prr_dequeue_next"); + if (qempty(cl->q_)) + ifd->na_[cpri]--; + + ifd->active_[cpri] = cl->peer_; + + ifd->class_[ifd->qi_] = cl; + ifd->curlen_[ifd->qi_] = m_pktlen(m); + ifd->now_[ifd->qi_] = now; + ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_; + ifd->queued_++; + } else { + /* mode == ALTDQ_POLL */ + m = _rmc_pollq(cl); + ifd->pollcache_ = cl; + } + return (m); +} + +/* + * struct mbuf * + * rmc_dequeue_next(struct rm_ifdat *ifd, struct timeval *now) - this function + * is invoked by the packet driver to get the next packet to be + * dequeued and output on the link. If WRR is enabled, then the + * WRR dequeue next routine will determine the next packet to sent. + * Otherwise, packet-by-packet round robin is invoked. + * + * Returns: NULL, if a packet is not available or if all + * classes are overlimit. + * + * Otherwise, Pointer to the next packet. + */ + +struct mbuf * +rmc_dequeue_next(struct rm_ifdat *ifd, int mode) +{ + if (ifd->queued_ >= ifd->maxqueued_) + return (NULL); + else if (ifd->wrr_) + return (_rmc_wrr_dequeue_next(ifd, mode)); + else + return (_rmc_prr_dequeue_next(ifd, mode)); +} + +/* + * Update the utilization estimate for the packet that just completed. + * The packet's class & the parent(s) of that class all get their + * estimators updated. This routine is called by the driver's output- + * packet-completion interrupt service routine. + */ + +/* + * a macro to approximate "divide by 1000" that gives 0.000999, + * if a value has enough effective digits. + * (on pentium, mul takes 9 cycles but div takes 46!) + */ +#define NSEC_TO_USEC(t) (((t) >> 10) + ((t) >> 16) + ((t) >> 17)) +void +rmc_update_class_util(struct rm_ifdat *ifd) +{ + int idle, avgidle, pktlen; + int pkt_time, tidle; + rm_class_t *cl, *borrowed; + rm_class_t *borrows; + struct timeval *nowp; + + /* + * Get the most recent completed class. + */ + if ((cl = ifd->class_[ifd->qo_]) == NULL) + return; + + pktlen = ifd->curlen_[ifd->qo_]; + borrowed = ifd->borrowed_[ifd->qo_]; + borrows = borrowed; + + PKTCNTR_ADD(&cl->stats_.xmit_cnt, pktlen); + + /* + * Run estimator on class and its ancestors. + */ + /* + * rm_update_class_util is designed to be called when the + * transfer is completed from a xmit complete interrupt, + * but most drivers don't implement an upcall for that. + * so, just use estimated completion time. + * as a result, ifd->qi_ and ifd->qo_ are always synced. + */ + nowp = &ifd->now_[ifd->qo_]; + /* get pkt_time (for link) in usec */ +#if 1 /* use approximation */ + pkt_time = ifd->curlen_[ifd->qo_] * ifd->ns_per_byte_; + pkt_time = NSEC_TO_USEC(pkt_time); +#else + pkt_time = ifd->curlen_[ifd->qo_] * ifd->ns_per_byte_ / 1000; +#endif +#if 1 /* ALTQ4PPP */ + if (TV_LT(nowp, &ifd->ifnow_)) { + int iftime; + + /* + * make sure the estimated completion time does not go + * too far. it can happen when the link layer supports + * data compression or the interface speed is set to + * a much lower value. 
+ */ + TV_DELTA(&ifd->ifnow_, nowp, iftime); + if (iftime+pkt_time < ifd->maxiftime_) { + TV_ADD_DELTA(&ifd->ifnow_, pkt_time, &ifd->ifnow_); + } else { + TV_ADD_DELTA(nowp, ifd->maxiftime_, &ifd->ifnow_); + } + } else { + TV_ADD_DELTA(nowp, pkt_time, &ifd->ifnow_); + } +#else + if (TV_LT(nowp, &ifd->ifnow_)) { + TV_ADD_DELTA(&ifd->ifnow_, pkt_time, &ifd->ifnow_); + } else { + TV_ADD_DELTA(nowp, pkt_time, &ifd->ifnow_); + } +#endif + + while (cl != NULL) { + TV_DELTA(&ifd->ifnow_, &cl->last_, idle); + if (idle >= 2000000) + /* + * this class is idle enough, reset avgidle. + * (TV_DELTA returns 2000000 us when delta is large.) + */ + cl->avgidle_ = cl->maxidle_; + + /* get pkt_time (for class) in usec */ +#if 1 /* use approximation */ + pkt_time = pktlen * cl->ns_per_byte_; + pkt_time = NSEC_TO_USEC(pkt_time); +#else + pkt_time = pktlen * cl->ns_per_byte_ / 1000; +#endif + idle -= pkt_time; + + avgidle = cl->avgidle_; + avgidle += idle - (avgidle >> RM_FILTER_GAIN); + cl->avgidle_ = avgidle; + + /* Are we overlimit ? */ + if (avgidle <= 0) { + CBQTRACE(rmc_update_class_util, 'milo', cl->stats_.handle); +#if 1 /* ALTQ */ + /* + * need some lower bound for avgidle, otherwise + * a borrowing class gets unbounded penalty. + */ + if (avgidle < cl->minidle_) + avgidle = cl->avgidle_ = cl->minidle_; +#endif + /* set next idle to make avgidle 0 */ + tidle = pkt_time + + (((1 - RM_POWER) * avgidle) >> RM_FILTER_GAIN); + TV_ADD_DELTA(nowp, tidle, &cl->undertime_); + ++cl->stats_.over; + } else { + cl->avgidle_ = + (avgidle > cl->maxidle_) ? cl->maxidle_ : avgidle; + cl->undertime_.tv_sec = 0; + if (cl->sleeping_) { + callout_stop(&cl->callout_); + cl->sleeping_ = 0; + } + } + + if (borrows != NULL) { + if (borrows != cl) + ++cl->stats_.borrows; + else + borrows = NULL; + } + cl->last_ = ifd->ifnow_; + cl->last_pkttime_ = pkt_time; + +#if 1 + if (cl->parent_ == NULL) { + /* take stats of root class */ + PKTCNTR_ADD(&cl->stats_.xmit_cnt, pktlen); + } +#endif + + cl = cl->parent_; + } + + /* + * Check to see if cutoff needs to set to a new level. + */ + cl = ifd->class_[ifd->qo_]; + if (borrowed && (ifd->cutoff_ >= borrowed->depth_)) { +#if 1 /* ALTQ */ + if ((qlen(cl->q_) <= 0) || TV_LT(nowp, &borrowed->undertime_)) { + rmc_tl_satisfied(ifd, nowp); + CBQTRACE(rmc_update_class_util, 'broe', ifd->cutoff_); + } else { + ifd->cutoff_ = borrowed->depth_; + CBQTRACE(rmc_update_class_util, 'ffob', borrowed->depth_); + } +#else /* !ALTQ */ + if ((qlen(cl->q_) <= 1) || TV_LT(&now, &borrowed->undertime_)) { + reset_cutoff(ifd); +#ifdef notdef + rmc_tl_satisfied(ifd, &now); +#endif + CBQTRACE(rmc_update_class_util, 'broe', ifd->cutoff_); + } else { + ifd->cutoff_ = borrowed->depth_; + CBQTRACE(rmc_update_class_util, 'ffob', borrowed->depth_); + } +#endif /* !ALTQ */ + } + + /* + * Release class slot + */ + ifd->borrowed_[ifd->qo_] = NULL; + ifd->class_[ifd->qo_] = NULL; + ifd->qo_ = (ifd->qo_ + 1) % ifd->maxqueued_; + ifd->queued_--; +} + +/* + * void + * rmc_drop_action(struct rm_class *cl) - Generic (not protocol-specific) + * over-limit action routines. These get invoked by rmc_under_limit() + * if a class with packets to send if over its bandwidth limit & can't + * borrow from a parent class. 
+ * + * Returns: NONE + */ + +static void +rmc_drop_action(struct rm_class *cl) +{ + struct rm_ifdat *ifd = cl->ifdat_; + + KKASSERT(qlen(cl->q_) > 0); + _rmc_dropq(cl); + if (qempty(cl->q_)) + ifd->na_[cl->pri_]--; +} + +void rmc_dropall(struct rm_class *cl) +{ + struct rm_ifdat *ifd = cl->ifdat_; + + if (!qempty(cl->q_)) { + _flushq(cl->q_); + + ifd->na_[cl->pri_]--; + } +} + +/* + * void + * rmc_delay_action(struct rm_class *cl) - This function is the generic CBQ + * delay action routine. It is invoked via rmc_under_limit when the + * packet is discoverd to be overlimit. + * + * If the delay action is result of borrow class being overlimit, then + * delay for the offtime of the borrowing class that is overlimit. + * + * Returns: NONE + */ + +void +rmc_delay_action(struct rm_class *cl, struct rm_class *borrow) +{ + int delay, t, extradelay; + + cl->stats_.overactions++; + TV_DELTA(&cl->undertime_, &cl->overtime_, delay); +#ifndef BORROW_OFFTIME + delay += cl->offtime_; +#endif + + if (!cl->sleeping_) { + CBQTRACE(rmc_delay_action, 'yled', cl->stats_.handle); +#ifdef BORROW_OFFTIME + if (borrow != NULL) + extradelay = borrow->offtime_; + else +#endif + extradelay = cl->offtime_; + +#ifdef ALTQ + /* + * XXX recalculate suspend time: + * current undertime is (tidle + pkt_time) calculated + * from the last transmission. + * tidle: time required to bring avgidle back to 0 + * pkt_time: target waiting time for this class + * we need to replace pkt_time by offtime + */ + extradelay -= cl->last_pkttime_; +#endif + if (extradelay > 0) { + TV_ADD_DELTA(&cl->undertime_, extradelay, &cl->undertime_); + delay += extradelay; + } + + cl->sleeping_ = 1; + cl->stats_.delays++; + + /* + * Since packets are phased randomly with respect to the + * clock, 1 tick (the next clock tick) can be an arbitrarily + * short time so we have to wait for at least two ticks. + * NOTE: If there's no other traffic, we need the timer as + * a 'backstop' to restart this class. + */ + if (delay > tick * 2) + t = (delay + tick - 1) / tick; + else + t = 2; + callout_reset(&cl->callout_, t, rmc_restart, cl); + } +} + +/* + * void + * rmc_restart() - is just a helper routine for rmc_delay_action -- it is + * called by the system timer code & is responsible checking if the + * class is still sleeping (it might have been restarted as a side + * effect of the queue scan on a packet arrival) and, if so, restarting + * output for the class. Inspecting the class state & restarting output + * require locking the class structure. In general the driver is + * responsible for locking but this is the only routine that is not + * called directly or indirectly from the interface driver so it has + * know about system locking conventions. Under bsd, locking is done + * by raising IPL to splimp so that's what's implemented here. On a + * different system this would probably need to be changed. + * + * Returns: NONE + */ + +static void +rmc_restart(void *arg) +{ + struct rm_class *cl = arg; + struct rm_ifdat *ifd = cl->ifdat_; + int s; + + s = splimp(); + if (cl->sleeping_) { + cl->sleeping_ = 0; + cl->undertime_.tv_sec = 0; + + if (ifd->queued_ < ifd->maxqueued_ && ifd->restart != NULL) { + CBQTRACE(rmc_restart, 'trts', cl->stats_.handle); + (ifd->restart)(ifd->ifq_); + } + } + splx(s); +} + +/* + * void + * rmc_root_overlimit(struct rm_class *cl) - This the generic overlimit + * handling routine for the root class of the link sharing structure. 
+ * + * Returns: NONE + */ + +static void +rmc_root_overlimit(struct rm_class *cl, struct rm_class *borrow) +{ + panic("rmc_root_overlimit"); +} + +/* + * Packet Queue handling routines. Eventually, this is to localize the + * effects on the code whether queues are red queues or droptail + * queues. + */ + +static int +_rmc_addq(rm_class_t *cl, struct mbuf *m) +{ +#ifdef ALTQ_RIO + if (q_is_rio(cl->q_)) + return rio_addq((rio_t *)cl->red_, cl->q_, m, cl->pktattr_); +#endif +#ifdef ALTQ_RED + if (q_is_red(cl->q_)) + return red_addq(cl->red_, cl->q_, m, cl->pktattr_); +#endif /* ALTQ_RED */ + + if (cl->flags_ & RMCF_CLEARDSCP) + write_dsfield(m, cl->pktattr_, 0); + + _addq(cl->q_, m); + return (0); +} + +/* note: _rmc_dropq is not called for red */ +static void +_rmc_dropq(rm_class_t *cl) +{ + struct mbuf *m; + + if ((m = _getq(cl->q_)) != NULL) + m_freem(m); +} + +static struct mbuf * +_rmc_getq(rm_class_t *cl) +{ +#ifdef ALTQ_RIO + if (q_is_rio(cl->q_)) + return rio_getq((rio_t *)cl->red_, cl->q_); +#endif +#ifdef ALTQ_RED + if (q_is_red(cl->q_)) + return red_getq(cl->red_, cl->q_); +#endif + return _getq(cl->q_); +} + +static struct mbuf * +_rmc_pollq(rm_class_t *cl) +{ + return qhead(cl->q_); +} + +#ifdef CBQ_TRACE +/* + * DDB hook to trace cbq events: + * the last 1024 events are held in a circular buffer. + * use "call cbqtrace_dump(N)" to display 20 events from Nth event. + */ +void cbqtrace_dump(int); +static char *rmc_funcname(void *); + +static struct rmc_funcs { + void *func; + char *name; +} rmc_funcs[] = { + rmc_init, "rmc_init", + rmc_queue_packet, "rmc_queue_packet", + rmc_under_limit, "rmc_under_limit", + rmc_update_class_util, "rmc_update_class_util", + rmc_delay_action, "rmc_delay_action", + rmc_restart, "rmc_restart", + _rmc_wrr_dequeue_next, "_rmc_wrr_dequeue_next", + NULL, NULL +}; + +static char *rmc_funcname(void *func) +{ + struct rmc_funcs *fp; + + for (fp = rmc_funcs; fp->func != NULL; fp++) { + if (fp->func == func) + return (fp->name); + } + + return ("unknown"); +} + +void +cbqtrace_dump(int counter) +{ + int i, *p; + char *cp; + + counter = counter % NCBQTRACE; + p = (int *)&cbqtrace_buffer[counter]; + + for (i=0; i<20; i++) { + printf("[0x%x] ", *p++); + printf("%s: ", rmc_funcname((void *)*p++)); + cp = (char *)p++; + printf("%c%c%c%c: ", cp[0], cp[1], cp[2], cp[3]); + printf("%d\n",*p++); + + if (p >= (int *)&cbqtrace_buffer[NCBQTRACE]) + p = (int *)cbqtrace_buffer; + } +} +#endif /* CBQ_TRACE */ +#endif /* ALTQ_CBQ */ diff --git a/sys/net/altq/altq_rmclass.h b/sys/net/altq/altq_rmclass.h new file mode 100644 index 0000000000..fa20242da0 --- /dev/null +++ b/sys/net/altq/altq_rmclass.h @@ -0,0 +1,255 @@ +/* $KAME: altq_rmclass.h,v 1.10 2003/08/20 23:30:23 itojun Exp $ */ +/* $DragonFly: src/sys/net/altq/altq_rmclass.h,v 1.1 2005/02/11 22:25:57 joerg Exp $ */ + +/* + * Copyright (c) 1991-1997 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the Network Research + * Group at Lawrence Berkeley Laboratory. + * 4. Neither the name of the University nor of the Laboratory may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _ALTQ_ALTQ_RMCLASS_H_ +#define _ALTQ_ALTQ_RMCLASS_H_ + +#include + +/* #pragma ident "@(#)rm_class.h 1.20 97/10/23 SMI" */ + +#define RM_MAXPRIO 8 /* Max priority */ + +#ifdef _KERNEL + +typedef struct rm_ifdat rm_ifdat_t; +typedef struct rm_class rm_class_t; + +struct red; + +/* + * Macros for dealing with time values. We assume all times are + * 'timevals'. `microtime' is used to get the best available clock + * resolution. If `microtime' *doesn't* return a value that's about + * ten times smaller than the average packet time on the fastest + * link that will use these routines, a slightly different clock + * scheme than this one should be used. + * (Bias due to truncation error in this scheme will overestimate utilization + * and discriminate against high bandwidth classes. To remove this bias an + * integrator needs to be added. The simplest integrator uses a history of + * 10 * avg.packet.time / min.tick.time packet completion entries. This is + * straight forward to add but we don't want to pay the extra memory + * traffic to maintain it if it's not necessary (occasionally a vendor + * accidentally builds a workstation with a decent clock - e.g., Sun & HP).) + */ + +#define RM_GETTIME(now) microtime(&now) + +#define TV_LT(a, b) (((a)->tv_sec < (b)->tv_sec) || \ + (((a)->tv_usec < (b)->tv_usec) && ((a)->tv_sec <= (b)->tv_sec))) + +#define TV_DELTA(a, b, delta) { \ + register int xxs; \ + \ + delta = (a)->tv_usec - (b)->tv_usec; \ + if ((xxs = (a)->tv_sec - (b)->tv_sec)) { \ + switch (xxs) { \ + default: \ + /* if (xxs < 0) \ + printf("rm_class: bogus time values\n"); */ \ + delta = 0; \ + /* fall through */ \ + case 2: \ + delta += 1000000; \ + /* fall through */ \ + case 1: \ + delta += 1000000; \ + break; \ + } \ + } \ +} + +#define TV_ADD_DELTA(a, delta, res) { \ + register int xxus = (a)->tv_usec + (delta); \ + \ + (res)->tv_sec = (a)->tv_sec; \ + while (xxus >= 1000000) { \ + ++((res)->tv_sec); \ + xxus -= 1000000; \ + } \ + (res)->tv_usec = xxus; \ +} + +#define RM_TIMEOUT 2 /* 1 Clock tick. 
 */
+
+#if 1
+#define	RM_MAXQUEUED	1	/* this isn't used in ALTQ/CBQ */
+#else
+#define	RM_MAXQUEUED	16	/* Max number of packets downstream of CBQ */
+#endif
+#define	RM_MAXQUEUE	64	/* Max queue length */
+#define	RM_FILTER_GAIN	5	/* log2 of gain, e.g., 5 => 31/32 */
+#define	RM_POWER	(1 << RM_FILTER_GAIN)
+#define	RM_MAXDEPTH	32
+#define	RM_NS_PER_SEC	(1000000000)
+
+typedef struct _rm_class_stats_ {
+	u_int	handle;
+	u_int	depth;
+
+	struct pktcntr	xmit_cnt;	/* packets sent in this class */
+	struct pktcntr	drop_cnt;	/* dropped packets */
+	u_int	over;		/* # times went over limit */
+	u_int	borrows;	/* # times tried to borrow */
+	u_int	overactions;	/* # times invoked overlimit action */
+	u_int	delays;		/* # times invoked delay actions */
+} rm_class_stats_t;
+
+/*
+ * CBQ Class state structure
+ */
+struct rm_class {
+	class_queue_t	*q_;		/* Queue of packets */
+	rm_ifdat_t	*ifdat_;
+	int		pri_;		/* Class priority. */
+	int		depth_;		/* Class depth */
+	u_int		ns_per_byte_;	/* NanoSeconds per byte. */
+	u_int		maxrate_;	/* Bytes per second for this class. */
+	u_int		allotment_;	/* Fraction of link bandwidth. */
+	u_int		w_allotment_;	/* Weighted allotment for WRR */
+	int		bytes_alloc_;	/* Allocation for round of WRR */
+
+	int		avgidle_;
+	int		maxidle_;
+	int		minidle_;
+	int		offtime_;
+	int		sleeping_;	/* != 0 if delaying */
+	int		qthresh_;	/* Queue threshold for formal link sharing */
+	int		leaf_;		/* Note whether leaf class or not. */
+
+	rm_class_t	*children_;	/* Children of this class */
+	rm_class_t	*next_;		/* Next pointer, used if child */
+
+	rm_class_t	*peer_;		/* Peer class */
+	rm_class_t	*borrow_;	/* Borrow class */
+	rm_class_t	*parent_;	/* Parent class */
+
+	void	(*overlimit)(struct rm_class *, struct rm_class *);
+	void	(*drop)(struct rm_class *);	/* Class drop action. */
+
+	struct red	*red_;		/* RED state pointer */
+	struct altq_pktattr *pktattr_;	/* saved hdr used by RED/ECN */
+	int		flags_;
+
+	int		last_pkttime_;	/* saved pkt_time */
+	struct timeval	undertime_;	/* time can next send */
+	struct timeval	last_;		/* time last packet sent */
+	struct timeval	overtime_;
+	struct callout	callout_;	/* for timeout() calls */
+
+	rm_class_stats_t stats_;	/* Class Statistics */
+};
+
+/*
+ * CBQ Interface state
+ */
+struct rm_ifdat {
+	int	queued_;	/* # pkts queued downstream */
+	int	efficient_;	/* Link Efficiency bit */
+	int	wrr_;		/* Enable Weighted Round-Robin */
+	u_long	ns_per_byte_;	/* Link byte speed. */
+	int	maxqueued_;	/* Max packets to queue */
+	int	maxpkt_;	/* Max packet size. */
+	int	qi_;		/* In/out pointers for downstream */
+	int	qo_;		/* packets */
+
+	/*
+	 * Active class state and WRR state.
+	 */
+	rm_class_t *active_[RM_MAXPRIO];	/* Active cl's in each pri */
+	int	na_[RM_MAXPRIO];	/* # of active cl's in a pri */
+	int	num_[RM_MAXPRIO];	/* # of cl's per pri */
+	int	alloc_[RM_MAXPRIO];	/* Byte Allocation */
+	u_long	M_[RM_MAXPRIO];		/* WRR weights. */
+
+	/*
+	 * Network Interface/Solaris Queue state pointer.
+	 */
+	struct ifaltq	*ifq_;
+	rm_class_t	*default_;	/* Default Pkt class, BE */
+	rm_class_t	*root_;		/* Root Link class. */
+	rm_class_t	*ctl_;		/* Control Traffic class. */
+	void	(*restart)(struct ifaltq *);	/* Restart routine. */
+
+	/*
+	 * Current downstream packet state and dynamic state.
+	 */
+	rm_class_t	*borrowed_[RM_MAXQUEUED]; /* Class borrowed last */
+	rm_class_t	*class_[RM_MAXQUEUED];	/* class sending */
+	int		curlen_[RM_MAXQUEUED];	/* Current pktlen */
+	struct timeval	now_[RM_MAXQUEUED];	/* Current packet time. */
+	int		is_overlimit_[RM_MAXQUEUED]; /* Current packet overlimit status */
+
+	int		cutoff_;	/* Cut-off depth for borrowing */
+
+	struct timeval	ifnow_;		/* expected xmit completion time */
+#if 1 /* ALTQ4PPP */
+	int		maxiftime_;	/* max delay inside interface */
+#endif
+	rm_class_t	*pollcache_;	/* cached rm_class by poll operation */
+};
+
+/* flags for rmc_init and rmc_newclass */
+/* class flags */
+#define	RMCF_RED	0x0001
+#define	RMCF_ECN	0x0002
+#define	RMCF_RIO	0x0004
+#define	RMCF_CLEARDSCP	0x0008	/* clear diffserv codepoint */
+
+/* flags for rmc_init */
+#define	RMCF_WRR	0x0100
+#define	RMCF_EFFICIENT	0x0200
+
+#define	is_a_parent_class(cl)	((cl)->children_ != NULL)
+
+rm_class_t *rmc_newclass(int, struct rm_ifdat *, u_int,
+			 void (*)(struct rm_class *, struct rm_class *),
+			 int, struct rm_class *, struct rm_class *,
+			 u_int, int, u_int, int, int);
+void	rmc_delete_class(struct rm_ifdat *, struct rm_class *);
+int	rmc_modclass(struct rm_class *, u_int, int, u_int, int, u_int, int);
+void	rmc_init(struct ifaltq *, struct rm_ifdat *, u_int,
+		 void (*)(struct ifaltq *), int, int, u_int, int, u_int, int);
+int	rmc_queue_packet(struct rm_class *, struct mbuf *);
+struct mbuf *rmc_dequeue_next(struct rm_ifdat *, int);
+void	rmc_update_class_util(struct rm_ifdat *);
+void	rmc_delay_action(struct rm_class *, struct rm_class *);
+void	rmc_dropall(struct rm_class *);
+int	rmc_get_weight(struct rm_ifdat *, int);
+
+#endif /* _KERNEL */
+
+#endif /* _ALTQ_ALTQ_RMCLASS_H_ */
diff --git a/sys/net/altq/altq_rmclass_debug.h b/sys/net/altq/altq_rmclass_debug.h
new file mode 100644
index 0000000000..d46daafc7b
--- /dev/null
+++ b/sys/net/altq/altq_rmclass_debug.h
@@ -0,0 +1,101 @@
+/* $KAME: altq_rmclass_debug.h,v 1.3 2002/11/29 04:36:24 kjc Exp $ */
+/* $DragonFly: src/sys/net/altq/altq_rmclass_debug.h,v 1.1 2005/02/11 22:25:57 joerg Exp $ */
+
+/*
+ * Copyright (c) Sun Microsystems, Inc. 1998 All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *      This product includes software developed by the SMCC Technology
+ *      Development Group at Sun Microsystems, Inc.
+ *
+ * 4. The name of the Sun Microsystems, Inc nor may not be used to endorse or
+ *    promote products derived from this software without specific prior
+ *    written permission.
+ *
+ * SUN MICROSYSTEMS DOES NOT CLAIM MERCHANTABILITY OF THIS SOFTWARE OR THE
+ * SUITABILITY OF THIS SOFTWARE FOR ANY PARTICULAR PURPOSE.  The software is
+ * provided "as is" without express or implied warranty of any kind.
+ *
+ * These notices must be retained in any copies of any part of this software.
+ */ + +#ifndef _ALTQ_ALTQ_RMCLASS_DEBUG_H_ +#define _ALTQ_ALTQ_RMCLASS_DEBUG_H_ + +/* #pragma ident "@(#)rm_class_debug.h 1.7 98/05/04 SMI" */ + +/* + * Cbq debugging macros + */ + +#ifdef CBQ_TRACE +#ifndef NCBQTRACE +#define NCBQTRACE (16 * 1024) +#endif + +/* + * To view the trace output, using adb, type: + * adb -k /dev/ksyms /dev/mem , then type + * cbqtrace_count/D to get the count, then type + * cbqtrace_buffer,0tcount/Dp4C" "Xn + * This will dump the trace buffer from 0 to count. + */ +/* + * in ALTQ, "call cbqtrace_dump(N)" from DDB to display 20 events + * from Nth event in the circular buffer. + */ + +struct cbqtrace { + int count; + int function; /* address of function */ + int trace_action; /* descriptive 4 characters */ + int object; /* object operated on */ +}; + +#define CBQTRACEINIT() { \ + if (cbqtrace_ptr == NULL) \ + cbqtrace_ptr = cbqtrace_buffer; \ + else { \ + cbqtrace_ptr = cbqtrace_buffer; \ + bzero((void *)cbqtrace_ptr, sizeof(cbqtrace_buffer)); \ + cbqtrace_count = 0; \ + } \ +} + +#define LOCK_TRACE() splimp() +#define UNLOCK_TRACE(x) splx(x) + +#define CBQTRACE(func, act, obj) { \ + int __s = LOCK_TRACE(); \ + int *_p = &cbqtrace_ptr->count; \ + *_p++ = ++cbqtrace_count; \ + *_p++ = (int)(func); \ + *_p++ = (int)(act); \ + *_p++ = (int)(obj); \ + if ((struct cbqtrace *)(void *)_p >= &cbqtrace_buffer[NCBQTRACE])\ + cbqtrace_ptr = cbqtrace_buffer; \ + else \ + cbqtrace_ptr = (struct cbqtrace *)(void *)_p; \ + UNLOCK_TRACE(__s); \ + } +#else + +/* If no tracing, define no-ops */ +#define CBQTRACEINIT() +#define CBQTRACE(a, b, c) + +#endif /* !CBQ_TRACE */ + +#endif /* _ALTQ_ALTQ_RMCLASS_DEBUG_H_ */ diff --git a/sys/net/altq/altq_subr.c b/sys/net/altq/altq_subr.c new file mode 100644 index 0000000000..46cfbe31f0 --- /dev/null +++ b/sys/net/altq/altq_subr.c @@ -0,0 +1,785 @@ +/* $KAME: altq_subr.c,v 1.23 2004/04/20 16:10:06 itojun Exp $ */ +/* $DragonFly: src/sys/net/altq/altq_subr.c,v 1.1 2005/02/11 22:25:57 joerg Exp $ */ + +/* + * Copyright (C) 1997-2003 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include "opt_altq.h" +#include "opt_inet.h" +#include "opt_inet6.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#ifdef INET6 +#include +#endif +#include +#include + +#include +#include + +/* machine dependent clock related includes */ +#if defined(__i386__) +#include /* for tsc_freq */ +#include /* for cpu_feature */ +#include /* for CPUID_TSC */ +#endif /* __i386__ */ + +/* + * internal function prototypes + */ +static void tbr_timeout(void *); +int (*altq_input)(struct mbuf *, int) = NULL; +static int tbr_timer = 0; /* token bucket regulator timer */ +static struct callout tbr_callout; + +int pfaltq_running; /* keep track of running state */ + +MALLOC_DEFINE(M_ALTQ, "altq", "ALTQ structures"); + +/* + * alternate queueing support routines + */ + +/* look up the queue state by the interface name and the queueing type. */ +void * +altq_lookup(const char *name, int type) +{ + struct ifnet *ifp; + + if ((ifp = ifunit(name)) != NULL) { + if (type != ALTQT_NONE && ifp->if_snd.altq_type == type) + return (ifp->if_snd.altq_disc); + } + + return (NULL); +} + +int +altq_attach(struct ifaltq *ifq, int type, void *discipline, + int (*enqueue)(struct ifaltq *, struct mbuf *, struct altq_pktattr *), + struct mbuf *(*dequeue)(struct ifaltq *, int), + int (*request)(struct ifaltq *, int, void *), + void *clfier, + void *(*classify)(struct ifaltq *, struct mbuf *, + struct altq_pktattr *)) +{ + if (!ifq_is_ready(ifq)) + return ENXIO; + + ifq->altq_type = type; + ifq->altq_disc = discipline; + ifq->altq_enqueue = enqueue; + ifq->altq_dequeue = dequeue; + ifq->altq_request = request; + ifq->altq_clfier = clfier; + ifq->altq_classify = classify; + ifq->altq_flags &= (ALTQF_CANTCHANGE|ALTQF_ENABLED); + return 0; +} + +int +altq_detach(struct ifaltq *ifq) +{ + if (!ifq_is_ready(ifq)) + return ENXIO; + if (ifq_is_enabled(ifq)) + return EBUSY; + if (!ifq_is_attached(ifq)) + return (0); + + ifq->altq_type = ALTQT_NONE; + ifq->altq_disc = NULL; + ifq->altq_enqueue = NULL; + ifq->altq_dequeue = NULL; + ifq->altq_request = NULL; + ifq->altq_clfier = NULL; + ifq->altq_classify = NULL; + ifq->altq_flags &= ALTQF_CANTCHANGE; + return 0; +} + +int +altq_enable(struct ifaltq *ifq) +{ + int s; + + if (!ifq_is_ready(ifq)) + return ENXIO; + if (ifq_is_enabled(ifq)) + return 0; + + s = splimp(); + ifq_purge(ifq); + KKASSERT(ifq->ifq_len == 0); + ifq->altq_flags |= ALTQF_ENABLED; + if (ifq->altq_clfier != NULL) + ifq->altq_flags |= ALTQF_CLASSIFY; + splx(s); + + return 0; +} + +int +altq_disable(struct ifaltq *ifq) +{ + int s; + + if (!ifq_is_enabled(ifq)) + return 0; + + s = splimp(); + ifq_purge(ifq); + KKASSERT(ifq->ifq_len == 0); + ifq->altq_flags &= ~(ALTQF_ENABLED|ALTQF_CLASSIFY); + splx(s); + return 0; +} + +/* + * internal representation of token bucket parameters + * rate: byte_per_unittime << 32 + * (((bits_per_sec) / 8) << 32) / machclk_freq + * depth: byte << 32 + * + */ +#define TBR_SHIFT 32 +#define TBR_SCALE(x) ((int64_t)(x) << TBR_SHIFT) +#define TBR_UNSCALE(x) ((x) >> TBR_SHIFT) + +struct mbuf * +tbr_dequeue(struct ifaltq *ifq, int op) +{ + struct tb_regulator *tbr; + struct mbuf *m; + int64_t interval; + uint64_t now; + + tbr = ifq->altq_tbr; + if (op == ALTDQ_REMOVE && tbr->tbr_lastop == ALTDQ_POLL) { + /* if this is a remove after poll, bypass tbr check */ + } else { + /* update token only when it is negative */ + if (tbr->tbr_token <= 
0) { + now = read_machclk(); + interval = now - tbr->tbr_last; + if (interval >= tbr->tbr_filluptime) + tbr->tbr_token = tbr->tbr_depth; + else { + tbr->tbr_token += interval * tbr->tbr_rate; + if (tbr->tbr_token > tbr->tbr_depth) + tbr->tbr_token = tbr->tbr_depth; + } + tbr->tbr_last = now; + } + /* if token is still negative, don't allow dequeue */ + if (tbr->tbr_token <= 0) + return (NULL); + } + + if (ifq_is_enabled(ifq)) + m = (*ifq->altq_dequeue)(ifq, op); + else if (op == ALTDQ_POLL) + IF_POLL(ifq, m); + else + IF_DEQUEUE(ifq, m); + + if (m != NULL && op == ALTDQ_REMOVE) + tbr->tbr_token -= TBR_SCALE(m_pktlen(m)); + tbr->tbr_lastop = op; + return (m); +} + +/* + * set a token bucket regulator. + * if the specified rate is zero, the token bucket regulator is deleted. + */ +int +tbr_set(struct ifaltq *ifq, struct tb_profile *profile) +{ + struct tb_regulator *tbr, *otbr; + + if (machclk_freq == 0) + init_machclk(); + if (machclk_freq == 0) { + printf("tbr_set: no cpu clock available!\n"); + return (ENXIO); + } + + if (profile->rate == 0) { + /* delete this tbr */ + if ((tbr = ifq->altq_tbr) == NULL) + return (ENOENT); + ifq->altq_tbr = NULL; + free(tbr, M_ALTQ); + return (0); + } + + tbr = malloc(sizeof(*tbr), M_ALTQ, M_WAITOK | M_ZERO); + tbr->tbr_rate = TBR_SCALE(profile->rate / 8) / machclk_freq; + tbr->tbr_depth = TBR_SCALE(profile->depth); + if (tbr->tbr_rate > 0) + tbr->tbr_filluptime = tbr->tbr_depth / tbr->tbr_rate; + else + tbr->tbr_filluptime = 0xffffffffffffffffLL; + tbr->tbr_token = tbr->tbr_depth; + tbr->tbr_last = read_machclk(); + tbr->tbr_lastop = ALTDQ_REMOVE; + + otbr = ifq->altq_tbr; + ifq->altq_tbr = tbr; /* set the new tbr */ + + if (otbr != NULL) + free(otbr, M_ALTQ); + else if (tbr_timer == 0) { + callout_reset(&tbr_callout, 1, tbr_timeout, NULL); + tbr_timer = 1; + } + return (0); +} + +/* + * tbr_timeout goes through the interface list, and kicks the drivers + * if necessary. + */ +static void +tbr_timeout(void *arg) +{ + struct ifnet *ifp; + int active, s; + + active = 0; + s = splimp(); + for (ifp = TAILQ_FIRST(&ifnet); ifp; ifp = TAILQ_NEXT(ifp, if_list)) { + if (ifp->if_snd.altq_tbr == NULL) + continue; + active++; + if (!ifq_is_empty(&ifp->if_snd) && ifp->if_start != NULL) + (*ifp->if_start)(ifp); + } + splx(s); + if (active > 0) + callout_reset(&tbr_callout, 1, tbr_timeout, NULL); + else + tbr_timer = 0; /* don't need tbr_timer anymore */ +} + +/* + * get token bucket regulator profile + */ +int +tbr_get(struct ifaltq *ifq, struct tb_profile *profile) +{ + struct tb_regulator *tbr; + + if ((tbr = ifq->altq_tbr) == NULL) { + profile->rate = 0; + profile->depth = 0; + } else { + profile->rate = + (u_int)TBR_UNSCALE(tbr->tbr_rate * 8 * machclk_freq); + profile->depth = (u_int)TBR_UNSCALE(tbr->tbr_depth); + } + return (0); +} + +/* + * attach a discipline to the interface. if one already exists, it is + * overridden. 
+ */
+int
+altq_pfattach(struct pf_altq *a)
+{
+	struct ifnet *ifp;
+	struct tb_profile tb;
+	int s, error = 0;
+
+	switch (a->scheduler) {
+	case ALTQT_NONE:
+		break;
+#ifdef ALTQ_CBQ
+	case ALTQT_CBQ:
+		error = cbq_pfattach(a);
+		break;
+#endif
+#ifdef ALTQ_PRIQ
+	case ALTQT_PRIQ:
+		error = priq_pfattach(a);
+		break;
+#endif
+#ifdef ALTQ_HFSC
+	case ALTQT_HFSC:
+		error = hfsc_pfattach(a);
+		break;
+#endif
+	default:
+		error = ENXIO;
+	}
+
+	ifp = ifunit(a->ifname);
+
+	/* if the state is running, enable altq */
+	if (error == 0 && pfaltq_running &&
+	    ifp != NULL && ifp->if_snd.altq_type != ALTQT_NONE &&
+	    !ifq_is_enabled(&ifp->if_snd))
+		error = altq_enable(&ifp->if_snd);
+
+	/* if altq is already enabled, reset the token bucket regulator */
+	if (error == 0 && ifp != NULL && ifq_is_enabled(&ifp->if_snd)) {
+		tb.rate = a->ifbandwidth;
+		tb.depth = a->tbrsize;
+		s = splimp();
+		error = tbr_set(&ifp->if_snd, &tb);
+		splx(s);
+	}
+
+	return (error);
+}
+
+/*
+ * detach a discipline from the interface.
+ * it is possible that the discipline was already overridden by another
+ * discipline.
+ */
+int
+altq_pfdetach(struct pf_altq *a)
+{
+	struct ifnet *ifp;
+	int s, error = 0;
+
+	if ((ifp = ifunit(a->ifname)) == NULL)
+		return (EINVAL);
+
+	/* if this discipline is no longer referenced, just return */
+	if (a->altq_disc == NULL || a->altq_disc != ifp->if_snd.altq_disc)
+		return (0);
+
+	s = splimp();
+	if (ifq_is_enabled(&ifp->if_snd))
+		error = altq_disable(&ifp->if_snd);
+	if (error == 0)
+		error = altq_detach(&ifp->if_snd);
+	splx(s);
+
+	return (error);
+}
+
+/*
+ * add a discipline or a queue
+ */
+int
+altq_add(struct pf_altq *a)
+{
+	int error = 0;
+
+	if (a->qname[0] != 0)
+		return (altq_add_queue(a));
+
+	if (machclk_freq == 0)
+		init_machclk();
+	if (machclk_freq == 0)
+		panic("altq_add: no cpu clock");
+
+	switch (a->scheduler) {
+#ifdef ALTQ_CBQ
+	case ALTQT_CBQ:
+		error = cbq_add_altq(a);
+		break;
+#endif
+#ifdef ALTQ_PRIQ
+	case ALTQT_PRIQ:
+		error = priq_add_altq(a);
+		break;
+#endif
+#ifdef ALTQ_HFSC
+	case ALTQT_HFSC:
+		error = hfsc_add_altq(a);
+		break;
+#endif
+	default:
+		error = ENXIO;
+	}
+
+	return (error);
+}
+
+/*
+ * remove a discipline or a queue
+ */
+int
+altq_remove(struct pf_altq *a)
+{
+	int error = 0;
+
+	if (a->qname[0] != 0)
+		return (altq_remove_queue(a));
+
+	switch (a->scheduler) {
+#ifdef ALTQ_CBQ
+	case ALTQT_CBQ:
+		error = cbq_remove_altq(a);
+		break;
+#endif
+#ifdef ALTQ_PRIQ
+	case ALTQT_PRIQ:
+		error = priq_remove_altq(a);
+		break;
+#endif
+#ifdef ALTQ_HFSC
+	case ALTQT_HFSC:
+		error = hfsc_remove_altq(a);
+		break;
+#endif
+	default:
+		error = ENXIO;
+	}
+
+	return (error);
+}
+
+/*
+ * add a queue to the discipline
+ */
+int
+altq_add_queue(struct pf_altq *a)
+{
+	int error = 0;
+
+	switch (a->scheduler) {
+#ifdef ALTQ_CBQ
+	case ALTQT_CBQ:
+		error = cbq_add_queue(a);
+		break;
+#endif
+#ifdef ALTQ_PRIQ
+	case ALTQT_PRIQ:
+		error = priq_add_queue(a);
+		break;
+#endif
+#ifdef ALTQ_HFSC
+	case ALTQT_HFSC:
+		error = hfsc_add_queue(a);
+		break;
+#endif
+	default:
+		error = ENXIO;
+	}
+
+	return (error);
+}
+
+/*
+ * remove a queue from the discipline
+ */
+int
+altq_remove_queue(struct pf_altq *a)
+{
+	int error = 0;
+
+	switch (a->scheduler) {
+#ifdef ALTQ_CBQ
+	case ALTQT_CBQ:
+		error = cbq_remove_queue(a);
+		break;
+#endif
+#ifdef ALTQ_PRIQ
+	case ALTQT_PRIQ:
+		error = priq_remove_queue(a);
+		break;
+#endif
+#ifdef ALTQ_HFSC
+	case ALTQT_HFSC:
+		error = hfsc_remove_queue(a);
+		break;
+#endif
+	default:
+		error = ENXIO;
+	}
+
+	return (error);
+}
+
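+/*
+ * illustrative sketch, not part of the KAME import: how a caller
+ * holding an ifnet pointer could attach a token bucket regulator
+ * via tbr_set().  the 10Mbps rate and eight-frame burst depth are
+ * made-up example values; the block is deliberately compiled out.
+ */
+#if 0
+static int
+example_set_tbr(struct ifnet *ifp)
+{
+	struct tb_profile tb;
+
+	tb.rate = 10000000;		/* profile rate in bits per second */
+	tb.depth = 8 * 1514;		/* burst allowance in bytes */
+	return (tbr_set(&ifp->if_snd, &tb));
+}
+#endif
+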
+/* + * get queue statistics + */ +int +altq_getqstats(struct pf_altq *a, void *ubuf, int *nbytes) +{ + int error = 0; + + switch (a->scheduler) { +#ifdef ALTQ_CBQ + case ALTQT_CBQ: + error = cbq_getqstats(a, ubuf, nbytes); + break; +#endif +#ifdef ALTQ_PRIQ + case ALTQT_PRIQ: + error = priq_getqstats(a, ubuf, nbytes); + break; +#endif +#ifdef ALTQ_HFSC + case ALTQT_HFSC: + error = hfsc_getqstats(a, ubuf, nbytes); + break; +#endif + default: + error = ENXIO; + } + + return (error); +} + +/* + * read and write diffserv field in IPv4 or IPv6 header + */ +uint8_t +read_dsfield(struct mbuf *m, struct altq_pktattr *pktattr) +{ + struct mbuf *m0; + uint8_t ds_field = 0; + + if (pktattr == NULL || + (pktattr->pattr_af != AF_INET && pktattr->pattr_af != AF_INET6)) + return ((uint8_t)0); + + /* verify that pattr_hdr is within the mbuf data */ + for (m0 = m; m0 != NULL; m0 = m0->m_next) { + if ((pktattr->pattr_hdr >= m0->m_data) && + (pktattr->pattr_hdr < m0->m_data + m0->m_len)) + break; + } + if (m0 == NULL) { + /* ick, pattr_hdr is stale */ + pktattr->pattr_af = AF_UNSPEC; +#ifdef ALTQ_DEBUG + printf("read_dsfield: can't locate header!\n"); +#endif + return ((uint8_t)0); + } + + if (pktattr->pattr_af == AF_INET) { + struct ip *ip = (struct ip *)pktattr->pattr_hdr; + + if (ip->ip_v != 4) + return ((uint8_t)0); /* version mismatch! */ + ds_field = ip->ip_tos; + } +#ifdef INET6 + else if (pktattr->pattr_af == AF_INET6) { + struct ip6_hdr *ip6 = (struct ip6_hdr *)pktattr->pattr_hdr; + uint32_t flowlabel; + + flowlabel = ntohl(ip6->ip6_flow); + if ((flowlabel >> 28) != 6) + return ((uint8_t)0); /* version mismatch! */ + ds_field = (flowlabel >> 20) & 0xff; + } +#endif + return (ds_field); +} + +void +write_dsfield(struct mbuf *m, struct altq_pktattr *pktattr, uint8_t dsfield) +{ + struct mbuf *m0; + + if (pktattr == NULL || + (pktattr->pattr_af != AF_INET && pktattr->pattr_af != AF_INET6)) + return; + + /* verify that pattr_hdr is within the mbuf data */ + for (m0 = m; m0 != NULL; m0 = m0->m_next) { + if ((pktattr->pattr_hdr >= m0->m_data) && + (pktattr->pattr_hdr < m0->m_data + m0->m_len)) + break; + } + if (m0 == NULL) { + /* ick, pattr_hdr is stale */ + pktattr->pattr_af = AF_UNSPEC; +#ifdef ALTQ_DEBUG + printf("write_dsfield: can't locate header!\n"); +#endif + return; + } + + if (pktattr->pattr_af == AF_INET) { + struct ip *ip = (struct ip *)pktattr->pattr_hdr; + uint8_t old; + int32_t sum; + + if (ip->ip_v != 4) + return; /* version mismatch! */ + old = ip->ip_tos; + dsfield |= old & 3; /* leave CU bits */ + if (old == dsfield) + return; + ip->ip_tos = dsfield; + /* + * update checksum (from RFC1624) + * HC' = ~(~HC + ~m + m') + */ + sum = ~ntohs(ip->ip_sum) & 0xffff; + sum += 0xff00 + (~old & 0xff) + dsfield; + sum = (sum >> 16) + (sum & 0xffff); + sum += (sum >> 16); /* add carry */ + + ip->ip_sum = htons(~sum & 0xffff); + } +#ifdef INET6 + else if (pktattr->pattr_af == AF_INET6) { + struct ip6_hdr *ip6 = (struct ip6_hdr *)pktattr->pattr_hdr; + uint32_t flowlabel; + + flowlabel = ntohl(ip6->ip6_flow); + if ((flowlabel >> 28) != 6) + return; /* version mismatch! */ + flowlabel = (flowlabel & 0xf03fffff) | (dsfield << 20); + ip6->ip6_flow = htonl(flowlabel); + } +#endif +} + +/* + * high resolution clock support taking advantage of a machine dependent + * high resolution time counter (e.g., timestamp counter of intel pentium). 
+ * we assume + * - 64-bit-long monotonically-increasing counter + * - frequency range is 100M-4GHz (CPU speed) + */ +/* if pcc is not available or disabled, emulate 256MHz using microtime() */ +#define MACHCLK_SHIFT 8 + +int machclk_usepcc; +uint32_t machclk_freq = 0; +uint32_t machclk_per_tick = 0; + +void +init_machclk(void) +{ + callout_init(&tbr_callout); + + machclk_usepcc = 1; + +#if !defined(__i386__) || defined(ALTQ_NOPCC) + machclk_usepcc = 0; +#elif defined(__DragonFly__) && defined(SMP) + machclk_usepcc = 0; +#elif defined(__i386__) + /* check if TSC is available */ + if (machclk_usepcc == 1 && (cpu_feature & CPUID_TSC) == 0) + machclk_usepcc = 0; +#endif + + if (machclk_usepcc == 0) { + /* emulate 256MHz using microtime() */ + machclk_freq = 1000000 << MACHCLK_SHIFT; + machclk_per_tick = machclk_freq / hz; +#ifdef ALTQ_DEBUG + printf("altq: emulate %uHz cpu clock\n", machclk_freq); +#endif + return; + } + + /* + * if the clock frequency (of Pentium TSC or Alpha PCC) is + * accessible, just use it. + */ +#ifdef __i386__ + machclk_freq = tsc_freq; +#else +#error "machclk_freq interface not implemented" +#endif + + /* + * if we don't know the clock frequency, measure it. + */ + if (machclk_freq == 0) { + static int wait; + struct timeval tv_start, tv_end; + uint64_t start, end, diff; + int timo; + + microtime(&tv_start); + start = read_machclk(); + timo = hz; /* 1 sec */ + tsleep(&wait, PCATCH, "init_machclk", timo); + microtime(&tv_end); + end = read_machclk(); + diff = (uint64_t)(tv_end.tv_sec - tv_start.tv_sec) * 1000000 + + tv_end.tv_usec - tv_start.tv_usec; + if (diff != 0) + machclk_freq = (u_int)((end - start) * 1000000 / diff); + } + + machclk_per_tick = machclk_freq / hz; + +#ifdef ALTQ_DEBUG + printf("altq: CPU clock: %uHz\n", machclk_freq); +#endif +} + +uint64_t +read_machclk(void) +{ + uint64_t val; + + if (machclk_usepcc) { +#if defined(__i386__) + val = rdtsc(); +#else + panic("read_machclk"); +#endif + } else { + struct timeval tv; + + microtime(&tv); + val = (((uint64_t)(tv.tv_sec - boottime.tv_sec) * 1000000 + + tv.tv_usec) << MACHCLK_SHIFT); + } + return (val); +} diff --git a/sys/net/altq/altq_var.h b/sys/net/altq/altq_var.h new file mode 100644 index 0000000000..7f2b679fb0 --- /dev/null +++ b/sys/net/altq/altq_var.h @@ -0,0 +1,96 @@ +/* $KAME: altq_var.h,v 1.17 2004/04/20 05:09:08 kjc Exp $ */ +/* $DragonFly: src/sys/net/altq/altq_var.h,v 1.1 2005/02/11 22:25:57 joerg Exp $ */ + +/* + * Copyright (C) 1998-2003 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +#ifndef _ALTQ_ALTQ_VAR_H_ +#define _ALTQ_ALTQ_VAR_H_ + +#ifdef _KERNEL + +#include +#include +#include +#include + +MALLOC_DECLARE(M_ALTQ); + +/* + * machine dependent clock + * a 64bit high resolution time counter. + */ +extern int machclk_usepcc; +extern uint32_t machclk_freq; +extern uint32_t machclk_per_tick; + +void init_machclk(void); +uint64_t read_machclk(void); + +#define m_pktlen(m) ((m)->m_pkthdr.len) + +extern int pfaltq_running; + +struct ifnet; +struct mbuf; +struct pf_altq; + +void *altq_lookup(const char *, int); +uint8_t read_dsfield(struct mbuf *, struct altq_pktattr *); +void write_dsfield(struct mbuf *, struct altq_pktattr *, uint8_t); +int tbr_set(struct ifaltq *, struct tb_profile *); +int tbr_get(struct ifaltq *, struct tb_profile *); + +int altq_pfattach(struct pf_altq *); +int altq_pfdetach(struct pf_altq *); +int altq_add(struct pf_altq *); +int altq_remove(struct pf_altq *); +int altq_add_queue(struct pf_altq *); +int altq_remove_queue(struct pf_altq *); +int altq_getqstats(struct pf_altq *, void *, int *); + +int cbq_pfattach(struct pf_altq *); +int cbq_add_altq(struct pf_altq *); +int cbq_remove_altq(struct pf_altq *); +int cbq_add_queue(struct pf_altq *); +int cbq_remove_queue(struct pf_altq *); +int cbq_getqstats(struct pf_altq *, void *, int *); + +int priq_pfattach(struct pf_altq *); +int priq_add_altq(struct pf_altq *); +int priq_remove_altq(struct pf_altq *); +int priq_add_queue(struct pf_altq *); +int priq_remove_queue(struct pf_altq *); +int priq_getqstats(struct pf_altq *, void *, int *); + +int hfsc_pfattach(struct pf_altq *); +int hfsc_add_altq(struct pf_altq *); +int hfsc_remove_altq(struct pf_altq *); +int hfsc_add_queue(struct pf_altq *); +int hfsc_remove_queue(struct pf_altq *); +int hfsc_getqstats(struct pf_altq *, void *, int *); + +#endif /* _KERNEL */ +#endif /* _ALTQ_ALTQ_VAR_H_ */ diff --git a/sys/net/altq/if_altq.h b/sys/net/altq/if_altq.h new file mode 100644 index 0000000000..56af60587c --- /dev/null +++ b/sys/net/altq/if_altq.h @@ -0,0 +1,144 @@ +/* $KAME: if_altq.h,v 1.11 2003/07/10 12:07:50 kjc Exp $ */ +/* $DragonFly: src/sys/net/altq/if_altq.h,v 1.1 2005/02/11 22:25:57 joerg Exp $ */ + +/* + * Copyright (C) 1997-2003 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +#ifndef _ALTQ_IF_ALTQ_H_ +#define _ALTQ_IF_ALTQ_H_ + +struct altq_pktattr; + +/* + * Structure defining a queue for a network interface. + */ +struct ifaltq { + /* fields compatible with struct ifqueue */ + struct mbuf *ifq_head; + struct mbuf *ifq_tail; + int ifq_len; + int ifq_maxlen; + int ifq_drops; + + /* alternate queueing related fields */ + int altq_type; /* discipline type */ + int altq_flags; /* flags (e.g. ready, in-use) */ + void *altq_disc; /* for discipline-specific use */ + struct ifnet *altq_ifp; /* back pointer to interface */ + + int (*altq_enqueue)(struct ifaltq *, struct mbuf *, + struct altq_pktattr *); + struct mbuf *(*altq_dequeue)(struct ifaltq *, int); + int (*altq_request)(struct ifaltq *, int, void *); + + /* classifier fields */ + void *altq_clfier; /* classifier-specific use */ + void *(*altq_classify)(struct ifaltq *, struct mbuf *, + struct altq_pktattr *); + + /* token bucket regulator */ + struct tb_regulator *altq_tbr; +}; + + +#ifdef _KERNEL + +/* + * packet attributes used by queueing disciplines. + * pattr_class is a discipline-dependent scheduling class that is + * set by a classifier. + * pattr_hdr and pattr_af may be used by a discipline to access + * the header within a mbuf. (e.g. ECN needs to update the CE bit) + * note that pattr_hdr could be stale after m_pullup, though link + * layer output routines usually don't use m_pullup. link-level + * compression also invalidates these fields. thus, pattr_hdr needs + * to be verified when a discipline touches the header. + */ +struct altq_pktattr { + void *pattr_class; /* sched class set by classifier */ + int pattr_af; /* address family */ + caddr_t pattr_hdr; /* saved header position in mbuf */ +}; + +/* + * a token-bucket regulator limits the rate that a network driver can + * dequeue packets from the output queue. + * modern cards are able to buffer a large amount of packets and dequeue + * too many packets at a time. this bursty dequeue behavior makes it + * impossible to schedule packets by queueing disciplines. + * a token-bucket is used to control the burst size in a device + * independent manner. 
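+ *
+ * as a worked example (the numbers are only illustrative): with
+ * machclk_freq at 1GHz, a 100Mbps profile is stored as
+ *	tbr_rate = ((100000000 / 8) << 32) / 1000000000
+ * i.e. roughly 53.7M scaled bytes per machine clock tick; the token
+ * count is then replenished by tbr_rate for every tick elapsed since
+ * it was last updated.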
+ */ +struct tb_regulator { + int64_t tbr_rate; /* (scaled) token bucket rate */ + int64_t tbr_depth; /* (scaled) token bucket depth */ + + int64_t tbr_token; /* (scaled) current token */ + int64_t tbr_filluptime; /* (scaled) time to fill up bucket */ + uint64_t tbr_last; /* last time token was updated */ + + int tbr_lastop; /* last dequeue operation type + needed for poll-and-dequeue */ +}; + +/* if_altqflags */ +#define ALTQF_READY 0x01 /* driver supports alternate queueing */ +#define ALTQF_ENABLED 0x02 /* altq is in use */ +#define ALTQF_CLASSIFY 0x04 /* classify packets */ +#define ALTQF_DRIVER1 0x40 /* driver specific */ + +/* if_altqflags set internally only: */ +#define ALTQF_CANTCHANGE (ALTQF_READY) + +/* altq_dequeue 2nd arg */ +#define ALTDQ_REMOVE 1 /* dequeue mbuf from the queue */ +#define ALTDQ_POLL 2 /* don't dequeue mbuf from the queue */ + +/* altq request types (currently only purge is defined) */ +#define ALTRQ_PURGE 1 /* purge all packets */ + +#define ALTQ_ENQUEUE(ifq, m, pa, err) \ + (err) = (*(ifq)->altq_enqueue)((ifq),(m),(pa)) +#define ALTQ_DEQUEUE(ifq, m) \ + (m) = (*(ifq)->altq_dequeue)((ifq), ALTDQ_REMOVE) +#define ALTQ_POLL(ifq, m) \ + (m) = (*(ifq)->altq_dequeue)((ifq), ALTDQ_POLL) +#define ALTQ_PURGE(ifq) \ + (void)(*(ifq)->altq_request)((ifq), ALTRQ_PURGE, (void *)0) + +int altq_attach(struct ifaltq *, int, void *, + int (*)(struct ifaltq *, struct mbuf *, struct altq_pktattr *), + struct mbuf *(*)(struct ifaltq *, int), + int (*)(struct ifaltq *, int, void *), + void *, void *(*)(struct ifaltq *, struct mbuf *, + struct altq_pktattr *)); +int altq_detach(struct ifaltq *); +int altq_enable(struct ifaltq *); +int altq_disable(struct ifaltq *); +struct mbuf *tbr_dequeue(struct ifaltq *, int); +extern int (*altq_input)(struct mbuf *, int); +#endif /* _KERNEL */ + +#endif /* _ALTQ_IF_ALTQ_H_ */ diff --git a/sys/net/bridge/bridge.c b/sys/net/bridge/bridge.c index f5208897d5..0e52e99a43 100644 --- a/sys/net/bridge/bridge.c +++ b/sys/net/bridge/bridge.c @@ -25,7 +25,7 @@ * SUCH DAMAGE. * * $FreeBSD: src/sys/net/bridge.c,v 1.16.2.25 2003/01/23 21:06:44 sam Exp $ - * $DragonFly: src/sys/net/bridge/Attic/bridge.c,v 1.12 2005/01/23 13:47:24 joerg Exp $ + * $DragonFly: src/sys/net/bridge/Attic/bridge.c,v 1.13 2005/02/11 22:25:57 joerg Exp $ */ /* @@ -100,7 +100,9 @@ #include #include +#include #include +#include #include #include /* for struct arpcom */ @@ -777,7 +779,7 @@ bdg_forward(struct mbuf *m0, struct ether_header *const eh, struct ifnet *dst) struct ifnet *src; struct ifnet *ifp, *last; int shared = bdg_copy ; /* someone else is using the mbuf */ - int once = 0; /* loop only once */ + int error, once = 0; /* loop only once */ struct ifnet *real_dst = dst ; /* real dst from ether_output */ struct ip_fw_args args; @@ -978,6 +980,8 @@ forward: for (;;) { if (last) { /* need to forward packet leftover from previous loop */ struct mbuf *m ; + struct altq_pktattr pktattr; + if (shared == 0 && once ) { /* no need to copy */ m = m0 ; m0 = NULL ; /* original is gone */ @@ -988,6 +992,26 @@ forward: return m0 ; /* the original is still there... */ } } + if (ifq_is_enabled(&last->if_snd)) { + uint16_t ether_type; + int af; + + /* + * If the queueing discipline needs packet classification, + * do it before prepending link headers. 
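+			 * (ifq_classify() saves a pointer to the
+			 * network-level header in the pktattr, so it
+			 * has to run while that header is still at the
+			 * front of the mbuf chain.)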
+ */ + ether_type = ntohs(eh->ether_type); + if (ether_type == ETHERTYPE_IP) + af = AF_INET; +#ifdef INET6 + else if (ether_type == ETHERTYPE_IPV6) + af = AF_INET6; +#endif + else + af = AF_UNSPEC; + ifq_classify(&last->if_snd, m, af, &pktattr); + } + /* * Add header (optimized for the common case of eh pointing * already into the mbuf) and execute last part of ether_output: @@ -1006,7 +1030,8 @@ forward: return m0; bcopy(&save_eh, mtod(m, struct ether_header *), ETHER_HDR_LEN); } - if (!IF_HANDOFF(&last->if_snd, m, last)) { + error = ifq_handoff(last, m, &pktattr); + if (error != 0) { #if 0 BDG_MUTE(last); /* should I also mute ? */ #endif @@ -1023,7 +1048,10 @@ forward: * up and running, is not the source interface, and belongs to * the same cluster as the 'real_dst', then send here. */ - if ( BDG_USED(ifp) && !BDG_MUTED(ifp) && !IF_QFULL(&ifp->if_snd) && + if ( BDG_USED(ifp) && !BDG_MUTED(ifp) && +#ifndef ALTQ + !IF_QFULL(&ifp->if_snd) && +#endif (ifp->if_flags & (IFF_UP|IFF_RUNNING)) == (IFF_UP|IFF_RUNNING) && ifp != src && BDG_SAMECLUSTER(ifp, real_dst) ) last = ifp ; diff --git a/sys/net/ethernet.h b/sys/net/ethernet.h index 15154e4527..62f805a0a1 100644 --- a/sys/net/ethernet.h +++ b/sys/net/ethernet.h @@ -2,7 +2,7 @@ * Fundamental constants relating to ethernet. * * $FreeBSD: src/sys/net/ethernet.h,v 1.12.2.8 2002/12/01 14:03:09 sobomax Exp $ - * $DragonFly: src/sys/net/ethernet.h,v 1.10 2004/12/21 02:54:14 hsu Exp $ + * $DragonFly: src/sys/net/ethernet.h,v 1.11 2005/02/11 22:25:57 joerg Exp $ * */ @@ -376,6 +376,11 @@ extern int (*vlan_input_tag_p)(struct mbuf *m, uint16_t t); /* XXX: unlock */ \ } while (0) +struct altq_pktattr; +struct ifaltq; + +void altq_etherclassify(struct ifaltq *, struct mbuf *, struct altq_pktattr *); + #else /* _KERNEL */ #include diff --git a/sys/net/if.c b/sys/net/if.c index 3dde7d8346..327bcf99c6 100644 --- a/sys/net/if.c +++ b/sys/net/if.c @@ -32,7 +32,7 @@ * * @(#)if.c 8.3 (Berkeley) 1/4/94 * $FreeBSD: src/sys/net/if.c,v 1.185 2004/03/13 02:35:03 brooks Exp $ - * $DragonFly: src/sys/net/if.c,v 1.27 2005/02/01 16:09:37 hrs Exp $ + * $DragonFly: src/sys/net/if.c,v 1.28 2005/02/11 22:25:57 joerg Exp $ */ #include "opt_compat.h" @@ -60,6 +60,7 @@ #include #include #include +#include #include #include #include @@ -88,7 +89,6 @@ static void if_attachdomain(void *); static void if_attachdomain1(struct ifnet *); static int ifconf (u_long, caddr_t, struct thread *); static void ifinit (void *); -static void if_qflush (struct ifqueue *); static void if_slowtimo (void *); static void link_rtrequest (int, struct rtentry *, struct rt_addrinfo *); static int if_rtdel (struct radix_node *, void *); @@ -240,6 +240,12 @@ if_attach(struct ifnet *ifp) EVENTHANDLER_INVOKE(ifnet_attach_event, ifp); + ifp->if_snd.altq_type = 0; + ifp->if_snd.altq_disc = NULL; + ifp->if_snd.altq_flags &= ALTQF_CANTCHANGE; + ifp->if_snd.altq_tbr = NULL; + ifp->if_snd.altq_ifp = ifp; + if (domains) if_attachdomain1(ifp); @@ -300,6 +306,11 @@ if_detach(struct ifnet *ifp) s = splnet(); if_down(ifp); + if (ifq_is_enabled(&ifp->if_snd)) + altq_disable(&ifp->if_snd); + if (ifq_is_attached(&ifp->if_snd)) + altq_detach(&ifp->if_snd); + /* * Remove address from ifnet_addrs[] and maybe decrement if_index. * Clean up all addresses. 
@@ -877,7 +888,7 @@ if_unroute(struct ifnet *ifp, int flag, int fam) TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family)) pfctlinput(PRC_IFDOWN, ifa->ifa_addr); - if_qflush(&ifp->if_snd); + ifq_purge(&ifp->if_snd); rt_ifmsg(ifp); } @@ -931,24 +942,6 @@ if_up(struct ifnet *ifp) if_route(ifp, IFF_UP, AF_UNSPEC); } -/* - * Flush an interface queue. - */ -static void -if_qflush(struct ifqueue *ifq) -{ - struct mbuf *m, *n; - - n = ifq->ifq_head; - while ((m = n) != 0) { - n = m->m_nextpkt; - m_freem(m); - } - ifq->ifq_head = 0; - ifq->ifq_tail = 0; - ifq->ifq_len = 0; -} - /* * Handle interface watchdog timer routines. Called * from softclock, we decrement timers (if set) and diff --git a/sys/net/if_arcsubr.c b/sys/net/if_arcsubr.c index 0e88f753f8..d417a4fea6 100644 --- a/sys/net/if_arcsubr.c +++ b/sys/net/if_arcsubr.c @@ -1,6 +1,6 @@ /* $NetBSD: if_arcsubr.c,v 1.36 2001/06/14 05:44:23 itojun Exp $ */ /* $FreeBSD: src/sys/net/if_arcsubr.c,v 1.1.2.5 2003/02/05 18:42:15 fjoe Exp $ */ -/* $DragonFly: src/sys/net/Attic/if_arcsubr.c,v 1.13 2005/01/23 20:23:22 joerg Exp $ */ +/* $DragonFly: src/sys/net/Attic/if_arcsubr.c,v 1.14 2005/02/11 22:25:57 joerg Exp $ */ /* * Copyright (c) 1994, 1995 Ignatios Souvatzis @@ -63,6 +63,7 @@ #include #include #include +#include #include #if defined(INET) || defined(INET6) @@ -113,10 +114,17 @@ arc_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, u_int8_t atype, adst; int loop_copy = 0; int isphds; + struct altq_pktattr pktattr; if ((ifp->if_flags & (IFF_UP | IFF_RUNNING)) != (IFF_UP | IFF_RUNNING)) return (ENETDOWN); /* m, m1 aren't initialized yet */ + /* + * If the queueing discipline needs packet classification, + * do it before prepending link headers. + */ + ifq_classify(&ifp->if_snd, m, dst->sa_family, &pktattr); + switch (dst->sa_family) { #ifdef INET case AF_INET: @@ -207,12 +215,8 @@ arc_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, BPF_MTAP(ifp, m); - if (!IF_HANDOFF(&ifp->if_snd, m, ifp)) { - m = NULL; - gotoerr(ENOBUFS); - } - - return (0); + error = ifq_handoff(ifp, m, &pktattr); + return (error); bad: if (m != NULL) diff --git a/sys/net/if_atmsubr.c b/sys/net/if_atmsubr.c index 0db7d5a925..1b7445d2db 100644 --- a/sys/net/if_atmsubr.c +++ b/sys/net/if_atmsubr.c @@ -32,7 +32,7 @@ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD: src/sys/net/if_atmsubr.c,v 1.10.2.1 2001/03/06 00:29:26 obrien Exp $ - * $DragonFly: src/sys/net/if_atmsubr.c,v 1.11 2005/01/06 09:14:13 hsu Exp $ + * $DragonFly: src/sys/net/if_atmsubr.c,v 1.12 2005/02/11 22:25:57 joerg Exp $ */ /* @@ -51,6 +51,8 @@ #include #include +#include +#include #include #include #include @@ -101,10 +103,18 @@ atm_output(struct ifnet *ifp, struct mbuf *m0, struct sockaddr *dst, struct atmllc *atmllc; struct atmllc *llc_hdr = NULL; u_int32_t atm_flags; + struct altq_pktattr pktattr; if ((ifp->if_flags & (IFF_UP|IFF_RUNNING)) != (IFF_UP|IFF_RUNNING)) gotoerr(ENETDOWN); + /* + * if the queueing discipline needs packet classification, + * do it before prepending link headers. + */ + ifq_classify(&ifp->if_snd, m, + (dst != NULL ? dst->sa_family : AF_UNSPEC), &pktattr); + /* * check route */ @@ -203,13 +213,12 @@ atm_output(struct ifnet *ifp, struct mbuf *m0, struct sockaddr *dst, * not yet active. 
*/ s = splimp(); - if (IF_QFULL(&ifp->if_snd)) { - IF_DROP(&ifp->if_snd); + error = ifq_enqueue(&ifp->if_snd, m, &pktattr); + if (error) { splx(s); - gotoerr(ENOBUFS); + return (ENOBUFS); } ifp->if_obytes += m->m_pkthdr.len; - IF_ENQUEUE(&ifp->if_snd, m); if (!(ifp->if_flags & IFF_OACTIVE)) (*ifp->if_start)(ifp); splx(s); @@ -318,7 +327,7 @@ atm_ifattach(ifp) ifp->if_input = atm_input; #endif ifp->if_output = atm_output; - ifp->if_snd.ifq_maxlen = 50; /* dummy */ + ifq_set_maxlen(&ifp->if_snd, 50); if_attach(ifp); #if defined(__NetBSD__) || defined(__OpenBSD__) diff --git a/sys/net/if_ethersubr.c b/sys/net/if_ethersubr.c index 543695bf9e..f6a3c0616b 100644 --- a/sys/net/if_ethersubr.c +++ b/sys/net/if_ethersubr.c @@ -32,7 +32,7 @@ * * @(#)if_ethersubr.c 8.1 (Berkeley) 6/10/93 * $FreeBSD: src/sys/net/if_ethersubr.c,v 1.70.2.33 2003/04/28 15:45:53 archie Exp $ - * $DragonFly: src/sys/net/if_ethersubr.c,v 1.26 2005/01/26 00:37:39 joerg Exp $ + * $DragonFly: src/sys/net/if_ethersubr.c,v 1.27 2005/02/11 22:25:57 joerg Exp $ */ #include "opt_atalk.h" @@ -57,6 +57,7 @@ #include #include #include +#include #include #include #include @@ -364,6 +365,7 @@ ether_output_frame(struct ifnet *ifp, struct mbuf *m) struct ip_fw *rule = NULL; int error = 0; int s; + struct altq_pktattr pktattr; /* Extract info from dummynet tag, ignore others */ while (m->m_type == MT_TAG) { @@ -388,6 +390,8 @@ ether_output_frame(struct ifnet *ifp, struct mbuf *m) } no_bridge: + if (ifq_is_enabled(&ifp->if_snd)) + altq_etherclassify(&ifp->if_snd, m, &pktattr); s = splimp(); if (IPFW_LOADED && ether_ipfw != 0) { struct ether_header save_eh, *eh; @@ -421,8 +425,7 @@ no_bridge: * Queue message on interface, update output statistics if * successful, and start output if interface not yet active. */ - if (!IF_HANDOFF(&ifp->if_snd, m, ifp)) - error = ENOBUFS; + error = ifq_handoff(ifp, m, &pktattr); splx(s); return (error); } @@ -1094,3 +1097,73 @@ ether_crc32_be(const uint8_t *buf, size_t len) return (crc); } + +#ifdef ALTQ +/* + * find the size of ethernet header, and call classifier + */ +void +altq_etherclassify(struct ifaltq *ifq, struct mbuf *m, + struct altq_pktattr *pktattr) +{ + struct ether_header *eh; + uint16_t ether_type; + int hlen, af, hdrsize; + caddr_t hdr; + + hlen = sizeof(struct ether_header); + eh = mtod(m, struct ether_header *); + + ether_type = ntohs(eh->ether_type); + if (ether_type < ETHERMTU) { + /* ick! LLC/SNAP */ + struct llc *llc = (struct llc *)(eh + 1); + hlen += 8; + + if (m->m_len < hlen || + llc->llc_dsap != LLC_SNAP_LSAP || + llc->llc_ssap != LLC_SNAP_LSAP || + llc->llc_control != LLC_UI) + goto bad; /* not snap! */ + + ether_type = ntohs(llc->llc_un.type_snap.ether_type); + } + + if (ether_type == ETHERTYPE_IP) { + af = AF_INET; + hdrsize = 20; /* sizeof(struct ip) */ +#ifdef INET6 + } else if (ether_type == ETHERTYPE_IPV6) { + af = AF_INET6; + hdrsize = 40; /* sizeof(struct ip6_hdr) */ +#endif + } else + goto bad; + + while (m->m_len <= hlen) { + hlen -= m->m_len; + m = m->m_next; + } + hdr = m->m_data + hlen; + if (m->m_len < hlen + hdrsize) { + /* + * ip header is not in a single mbuf. this should not + * happen in the current code. 
+ * (todo: use m_pulldown in the future) + */ + goto bad; + } + m->m_data += hlen; + m->m_len -= hlen; + ifq_classify(ifq, m, af, pktattr); + m->m_data -= hlen; + m->m_len += hlen; + + return; + +bad: + pktattr->pattr_class = NULL; + pktattr->pattr_hdr = NULL; + pktattr->pattr_af = AF_UNSPEC; +} +#endif /* ALTQ */ diff --git a/sys/net/if_fddisubr.c b/sys/net/if_fddisubr.c index c16e702329..54fac538d7 100644 --- a/sys/net/if_fddisubr.c +++ b/sys/net/if_fddisubr.c @@ -34,7 +34,7 @@ * * from: if_ethersubr.c,v 1.5 1994/12/13 22:31:45 wollman Exp * $FreeBSD: src/sys/net/if_fddisubr.c,v 1.41.2.8 2002/02/20 23:34:09 fjoe Exp $ - * $DragonFly: src/sys/net/Attic/if_fddisubr.c,v 1.15 2005/01/06 17:59:32 hsu Exp $ + * $DragonFly: src/sys/net/Attic/if_fddisubr.c,v 1.16 2005/02/11 22:25:57 joerg Exp $ */ #include "opt_atalk.h" @@ -55,6 +55,7 @@ #include #include #include +#include #if defined(INET) || defined(INET6) #include @@ -136,10 +137,17 @@ fddi_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, struct fddi_header *fh; boolean_t hdrcmplt = FALSE; int s, loop_copy = 0, error; + struct altq_pktattr pktattr; if ((ifp->if_flags & (IFF_UP|IFF_RUNNING)) != (IFF_UP|IFF_RUNNING)) senderr(ENETDOWN); + /* + * If the queueing discipline needs packet classification, + * do it before prepending link headers. + */ + ifq_classify(&ifp->if_snd, m, dst->sa_family, &pktattr); + switch (dst->sa_family) { #ifdef INET case AF_INET: { @@ -336,15 +344,14 @@ queue_it: * Queue message on interface, and start output if interface * not yet active. */ - if (IF_QFULL(&ifp->if_snd)) { - IF_DROP(&ifp->if_snd); + error = ifq_enqueue(&ifp->if_snd, m, &pktattr); + if (error) { splx(s); - senderr(ENOBUFS); + return(ENOBUFS); } ifp->if_obytes += m->m_pkthdr.len; if (m->m_flags & M_MCAST) ifp->if_omcasts++; - IF_ENQUEUE(&ifp->if_snd, m); if ((ifp->if_flags & IFF_OACTIVE) == 0) (*ifp->if_start)(ifp); splx(s); diff --git a/sys/net/if_loop.c b/sys/net/if_loop.c index 127fa2869e..d714473955 100644 --- a/sys/net/if_loop.c +++ b/sys/net/if_loop.c @@ -32,7 +32,7 @@ * * @(#)if_loop.c 8.1 (Berkeley) 6/10/93 * $FreeBSD: src/sys/net/if_loop.c,v 1.47.2.8 2003/06/01 01:46:11 silby Exp $ - * $DragonFly: src/sys/net/if_loop.c,v 1.13 2005/01/26 00:37:39 joerg Exp $ + * $DragonFly: src/sys/net/if_loop.c,v 1.14 2005/02/11 22:25:57 joerg Exp $ */ /* @@ -54,6 +54,7 @@ #include #include +#include #include #include #include @@ -91,6 +92,9 @@ int loioctl (struct ifnet *, u_long, caddr_t, struct ucred *); static void lortrequest (int, struct rtentry *, struct rt_addrinfo *); static void loopattach (void *); +#ifdef ALTQ +static void lo_altqstart(struct ifnet *); +#endif PSEUDO_SET(loopattach, if_loop); int looutput (struct ifnet *ifp, @@ -120,7 +124,11 @@ loopattach(void *dummy) ifp->if_ioctl = loioctl; ifp->if_output = looutput; ifp->if_type = IFT_LOOP; - ifp->if_snd.ifq_maxlen = ifqmaxlen; + ifq_set_maxlen(&ifp->if_snd, ifqmaxlen); + ifq_set_ready(&ifp->if_snd); +#ifdef ALTQ + ifp->if_start = lo_altqstart; +#endif if_attach(ifp); bpfattach(ifp, DLT_NULL, sizeof(u_int)); } @@ -236,6 +244,37 @@ if_simloop(struct ifnet *ifp, struct mbuf *m, int af, int hlen) } #endif } + +#ifdef ALTQ + /* + * altq for loop is just for debugging. + * only used when called for loop interface (not for + * a simplex interface). 
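+	 * the address family is prepended to the packet as a 32-bit
+	 * word below and stripped off again by lo_altqstart() after
+	 * the packet has passed through the queue (see the matching
+	 * M_PREPEND/m_adj pair).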
+ */ + if (ifq_is_enabled(&ifp->if_snd) && ifp->if_start == lo_altqstart) { + struct altq_pktattr pktattr; + int32_t *afp; + int error, s; + + /* + * if the queueing discipline needs packet classification, + * do it before prepending link headers. + */ + ifq_classify(&ifp->if_snd, m, af, &pktattr); + + M_PREPEND(m, sizeof(int32_t), MB_DONTWAIT); + if (m == 0) + return(ENOBUFS); + afp = mtod(m, int32_t *); + *afp = (int32_t)af; + + s = splimp(); + error = ifq_enqueue(&ifp->if_snd, m, &pktattr); + (*ifp->if_start)(ifp); + splx(s); + return (error); + } +#endif /* ALTQ */ /* Deliver to upper layer protocol */ switch (af) { @@ -277,6 +316,70 @@ if_simloop(struct ifnet *ifp, struct mbuf *m, int af, int hlen) return (0); } +#ifdef ALTQ +static void +lo_altqstart(struct ifnet *ifp) +{ + struct mbuf *m; + int32_t af, *afp; + int s, isr; + + while (1) { + s = splimp(); + m = ifq_dequeue(&ifp->if_snd); + splx(s); + if (m == NULL) + return; + + afp = mtod(m, int32_t *); + af = *afp; + m_adj(m, sizeof(int32_t)); + + switch (af) { +#ifdef INET + case AF_INET: + isr = NETISR_IP; + break; +#endif +#ifdef INET6 + case AF_INET6: + m->m_flags |= M_LOOP; + isr = NETISR_IPV6; + break; +#endif +#ifdef IPX + case AF_IPX: + isr = NETISR_IPX; + break; +#endif +#ifdef NS + case AF_NS: + isr = NETISR_NS; + break; +#endif +#ifdef ISO + case AF_ISO: + isr = NETISR_ISO; + break; +#endif +#ifdef NETATALK + case AF_APPLETALK: + isr = NETISR_ATALK2; + break; +#endif NETATALK + default: + printf("lo_altqstart: can't handle af%d\n", af); + m_freem(m); + return; + } + + ifp->if_ipackets++; + ifp->if_ibytes += m->m_pkthdr.len; + netisr_queue(isr, m); + } +} +#endif /* ALTQ */ + /* ARGSUSED */ static void lortrequest(int cmd, struct rtentry *rt, struct rt_addrinfo *info) diff --git a/sys/net/if_var.h b/sys/net/if_var.h index e25c849594..1a5add4281 100644 --- a/sys/net/if_var.h +++ b/sys/net/if_var.h @@ -32,7 +32,7 @@ * * From: @(#)if.h 8.1 (Berkeley) 6/10/93 * $FreeBSD: src/sys/net/if_var.h,v 1.18.2.16 2003/04/15 18:11:19 fjoe Exp $ - * $DragonFly: src/sys/net/if_var.h,v 1.23 2005/02/01 23:14:54 joerg Exp $ + * $DragonFly: src/sys/net/if_var.h,v 1.24 2005/02/11 22:25:57 joerg Exp $ */ #ifndef _NET_IF_VAR_H_ @@ -80,6 +80,8 @@ struct ucred; #include /* get TAILQ macros */ +#include + #ifdef _KERNEL #include #include @@ -171,7 +173,7 @@ struct ifnet { (void *); int (*if_resolvemulti) /* validate/resolve multicast */ (struct ifnet *, struct sockaddr **, struct sockaddr *); - struct ifqueue if_snd; /* output queue */ + struct ifaltq if_snd; /* output queue (includes altq) */ struct ifprefixhead if_prefixhead; /* list of prefixes per if */ const uint8_t *if_broadcastaddr; void *if_afdata[AF_MAX]; @@ -284,57 +286,6 @@ if_handoff(struct ifqueue *ifq, struct mbuf *m, struct ifnet *ifp, int adjust) return (1); } -#define IFQ_ENQUEUE(ifq, m, err) \ -do { \ - if (IF_QFULL(ifq)) { \ - m_freem(m); \ - (err) = ENOBUFS; \ - } else { \ - IF_ENQUEUE(ifq, m); \ - (err) = 0; \ - } \ - if (err) \ - (ifq)->ifq_drops++; \ -} while (0) - -#define IFQ_DEQUEUE(ifq, m) IF_DEQUEUE(ifq, m) -#define IFQ_POLL(ifq, m) IF_POLL(ifq, m) -#define IFQ_PURGE(ifq) IF_DRAIN(ifq) - -#define IFQ_SET_READY(ifq) /* nothing */ - -#define IFQ_IS_EMPTY(ifq) ((ifq)->ifq_len == 0) -#define IFQ_INC_LEN(ifq) ((ifq)->ifq_len++) -#define IFQ_DEC_LEN(ifq) (--(ifq)->ifq_len) -#define IFQ_INC_DROPS(ifq) ((ifq)->ifq_drops++) -#define IFQ_SET_MAXLEN(ifq, len) ((ifq)->ifq_maxlen = (len)) -#define IFQ_SET_DRV_MAXLEN(ifq, len) /* nothing */ - -#define IFQ_HANDOFF_ADJ(ifp, m, adj, err) 
\ -do { \ - int len; \ - short mflags; \ - \ - len = (m)->m_pkthdr.len; \ - mflags = (m)->m_flags; \ - IFQ_ENQUEUE(&(ifp)->if_snd, m, err); \ - if ((err) == 0) { \ - (ifp)->if_obytes += len + (adj); \ - if (mflags & M_MCAST) \ - (ifp)->if_omcasts++; \ - if (((ifp)->if_flags & IFF_OACTIVE) == 0) \ - (*(ifp)->if_start)(ifp); \ - } \ -} while (0) - -#define IFQ_HANDOFF(ifp, m, err) \ - IFQ_HANDOFF_ADJ(ifp, m, 0, err) - -#define IFQ_DRV_DEQUEUE(ifq, m) IF_DEQUEUE(ifq, m) -#define IFQ_DRV_PREPEND(ifq, m) IF_PREPEND(ifq, m) -#define IFQ_DRV_IS_EMPTY(ifq) IFQ_IS_EMPTY(ifq) -#define IFQ_DRV_PURGE(ifq) IFQ_PURGE(ifq) - /* * 72 was chosen below because it is the size of a TCP/IP * header (40) + the minimum mss (32). diff --git a/sys/net/ifq_var.h b/sys/net/ifq_var.h new file mode 100644 index 0000000000..310925419a --- /dev/null +++ b/sys/net/ifq_var.h @@ -0,0 +1,175 @@ +/*- + * Copyright (c) 2005 The DragonFly Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of The DragonFly Project nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific, prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $DragonFly: src/sys/net/ifq_var.h,v 1.1 2005/02/11 22:25:57 joerg Exp $ + */ +#ifndef _NET_IFQ_VAR_H +#define _NET_IFQ_VAR_H + +#ifdef ALTQ +static __inline int +ifq_is_enabled(struct ifaltq *_ifq) +{ + return(_ifq->altq_flags & ALTQF_ENABLED); +} + +static __inline int +ifq_is_attached(struct ifaltq *_ifq) +{ + return(_ifq->altq_disc != NULL); +} +#else +static __inline int +ifq_is_enabled(struct ifaltq *_ifq) +{ + return(0); +} + +static __inline int +ifq_is_attached(struct ifaltq *_ifq) +{ + return(0); +} +#endif + +static __inline int +ifq_is_ready(struct ifaltq *_ifq) +{ + return(_ifq->altq_flags & ALTQF_READY); +} + +static __inline void +ifq_set_ready(struct ifaltq *_ifq) +{ + _ifq->altq_flags |= ALTQF_READY; +} + +static __inline int +ifq_enqueue(struct ifaltq *_ifq, struct mbuf *_m, struct altq_pktattr *_pa) +{ + if (ifq_is_enabled(_ifq)) { + return((*_ifq->altq_enqueue)(_ifq, _m, _pa)); + } else { + if (IF_QFULL(_ifq)) { + m_freem(_m); + return(ENOBUFS); + } else { + IF_ENQUEUE(_ifq, _m); + return(0); + } + } +} + +static __inline struct mbuf * +ifq_dequeue(struct ifaltq *_ifq) +{ +#ifdef ALTQ + if (_ifq->altq_tbr != NULL) + return(tbr_dequeue(_ifq, ALTDQ_REMOVE)); +#endif + if (ifq_is_enabled(_ifq)) { + return((*_ifq->altq_dequeue)(_ifq, ALTDQ_REMOVE)); + } else { + struct mbuf *_m; + + IF_DEQUEUE(_ifq, _m); + return(_m); + } +} + +static __inline struct mbuf * +ifq_poll(struct ifaltq *_ifq) +{ +#ifdef ALTQ + if (_ifq->altq_tbr != NULL) + return(tbr_dequeue(_ifq, ALTDQ_POLL)); +#endif + if (ifq_is_enabled(_ifq)) { + return((*_ifq->altq_dequeue)(_ifq, ALTDQ_POLL)); + } else { + struct mbuf *_m; + + IF_POLL(_ifq, _m); + return(_m); + } +} + +static __inline void +ifq_purge(struct ifaltq *_ifq) +{ + if (ifq_is_enabled(_ifq)) + (*_ifq->altq_request)(_ifq, ALTRQ_PURGE, NULL); + else + IF_DRAIN(_ifq); +} + +static __inline void +ifq_classify(struct ifaltq *_ifq, struct mbuf *_m, uint8_t _af, + struct altq_pktattr *_pa) +{ + if (!ifq_is_enabled(_ifq)) + return; + _pa->pattr_af = _af; + _pa->pattr_hdr = mtod(_m, caddr_t); + if (_ifq->altq_flags & ALTQF_CLASSIFY) + (*_ifq->altq_classify)(_ifq, _m, _pa); +} + +static __inline int +ifq_handoff(struct ifnet *_ifp, struct mbuf *_m, struct altq_pktattr *_pa) +{ + int _error, _s; + + _s = splimp(); + _error = ifq_enqueue(&_ifp->if_snd, _m, _pa); + if (_error == 0) { + _ifp->if_obytes += _m->m_pkthdr.len; + if (_m->m_flags & M_MCAST) + _ifp->if_omcasts++; + if ((_ifp->if_flags & IFF_OACTIVE) == 0) + (*_ifp->if_start)(_ifp); + } + splx(_s); + return(_error); +} + +static __inline int +ifq_is_empty(struct ifaltq *_ifq) +{ + return(_ifq->ifq_len == 0); +} + +static __inline void +ifq_set_maxlen(struct ifaltq *_ifq, int _len) +{ + _ifq->ifq_maxlen = _len; +} + +#endif diff --git a/sys/net/ip_mroute/ip_mroute.c b/sys/net/ip_mroute/ip_mroute.c index a95af9c47a..dc3e3745cb 100644 --- a/sys/net/ip_mroute/ip_mroute.c +++ b/sys/net/ip_mroute/ip_mroute.c @@ -18,7 +18,7 @@ * bandwidth metering and signaling * * $FreeBSD: src/sys/netinet/ip_mroute.c,v 1.56.2.10 2003/08/24 21:37:34 hsu Exp $ - * $DragonFly: src/sys/net/ip_mroute/ip_mroute.c,v 1.15 2004/09/16 23:30:10 joerg Exp $ + * $DragonFly: src/sys/net/ip_mroute/ip_mroute.c,v 1.16 2005/02/11 22:25:57 joerg Exp $ */ #include "opt_mrouting.h" @@ -58,6 +58,9 @@ #include #include #endif +#ifdef ALTQ +#include +#endif #include /* @@ -2119,6 +2122,12 @@ X_rsvp_input(struct mbuf *m, ...) 
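/*
 * [Usage sketch, hypothetical names] A driver if_start loop written
 * against the inline API above: ifq_poll() peeks at the head packet
 * without removing it, so the driver can check for transmit resources
 * first, and ifq_dequeue() then commits.  Both fall back to the plain
 * ifqueue operations when ALTQ is not enabled.  Assumes the caller
 * runs at splimp(); xx_softc, xx_txring_full() and xx_encap() are
 * invented for illustration.
 */
static void
xx_start(struct ifnet *ifp)
{
	struct xx_softc *sc = ifp->if_softc;
	struct mbuf *m;

	for (;;) {
		m = ifq_poll(&ifp->if_snd);
		if (m == NULL)
			break;
		if (xx_txring_full(sc)) {
			/* leave it queued; restart from the tx interrupt */
			ifp->if_flags |= IFF_OACTIVE;
			return;
		}
		m = ifq_dequeue(&ifp->if_snd);
		if (m == NULL)
			break;
		BPF_MTAP(ifp, m);
		xx_encap(sc, m);	/* consumes the mbuf */
	}
	ifp->if_flags &= ~IFF_OACTIVE;
}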
int s; struct ifnet *ifp; int off, proto; +#ifdef ALTQ + /* support IP_RECVIF used by rsvpd rel4.2a1 */ + struct inpcb *inp; + struct socket *so; + struct mbuf *opts; +#endif __va_list ap; __va_start(ap, m); @@ -2154,7 +2163,11 @@ X_rsvp_input(struct mbuf *m, ...) if (viftable[vifi].v_ifp == ifp) break; +#ifdef ALTQ + if (vifi == numvifs || (so = viftable[vifi].v_rsvpd) == NULL) { +#else if (vifi == numvifs || viftable[vifi].v_rsvpd == NULL) { +#endif /* * If the old-style non-vif-associated socket is set, * then use it. Otherwise, drop packet since there @@ -2180,6 +2193,26 @@ X_rsvp_input(struct mbuf *m, ...) printf("rsvp_input: m->m_len = %d, sbspace() = %ld\n", m->m_len,sbspace(&(viftable[vifi].v_rsvpd->so_rcv))); +#ifdef ALTQ + opts = NULL; + inp = (struct inpcb *)so->so_pcb; + if (inp->inp_flags & INP_CONTROLOPTS || + inp->inp_socket->so_options & SO_TIMESTAMP) + ip_savecontrol(inp, &opts, ip, m); + if (sbappendaddr(&so->so_rcv, + (struct sockaddr *)&rsvp_src,m, opts) == 0) { + m_freem(m); + if (opts) + m_freem(opts); + if (rsvpdebug) + printf("rsvp_input: Failed to append to socket\n"); + } + else { + sorwakeup(so); + if (rsvpdebug) + printf("rsvp_input: send packet up\n"); + } +#else /* !ALTQ */ if (socket_send(viftable[vifi].v_rsvpd, m, &rsvp_src) < 0) { if (rsvpdebug) printf("rsvp_input: Failed to append to socket\n"); @@ -2187,6 +2220,7 @@ X_rsvp_input(struct mbuf *m, ...) if (rsvpdebug) printf("rsvp_input: send packet up\n"); } +#endif /* !ALTQ */ splx(s); } diff --git a/sys/net/oldbridge/bridge.c b/sys/net/oldbridge/bridge.c index 95d2c6251e..57beccd9e0 100644 --- a/sys/net/oldbridge/bridge.c +++ b/sys/net/oldbridge/bridge.c @@ -25,7 +25,7 @@ * SUCH DAMAGE. * * $FreeBSD: src/sys/net/bridge.c,v 1.16.2.25 2003/01/23 21:06:44 sam Exp $ - * $DragonFly: src/sys/net/oldbridge/Attic/bridge.c,v 1.12 2005/01/23 13:47:24 joerg Exp $ + * $DragonFly: src/sys/net/oldbridge/Attic/bridge.c,v 1.13 2005/02/11 22:25:57 joerg Exp $ */ /* @@ -100,7 +100,9 @@ #include #include +#include #include +#include #include #include /* for struct arpcom */ @@ -777,7 +779,7 @@ bdg_forward(struct mbuf *m0, struct ether_header *const eh, struct ifnet *dst) struct ifnet *src; struct ifnet *ifp, *last; int shared = bdg_copy ; /* someone else is using the mbuf */ - int once = 0; /* loop only once */ + int error, once = 0; /* loop only once */ struct ifnet *real_dst = dst ; /* real dst from ether_output */ struct ip_fw_args args; @@ -978,6 +980,8 @@ forward: for (;;) { if (last) { /* need to forward packet leftover from previous loop */ struct mbuf *m ; + struct altq_pktattr pktattr; + if (shared == 0 && once ) { /* no need to copy */ m = m0 ; m0 = NULL ; /* original is gone */ @@ -988,6 +992,26 @@ forward: return m0 ; /* the original is still there... */ } } + if (ifq_is_enabled(&last->if_snd)) { + uint16_t ether_type; + int af; + + /* + * If the queueing discipline needs packet classification, + * do it before prepending link headers. 
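/*
 * [Sketch] The ALTQ branch added to X_rsvp_input() here follows the
 * standard raw-socket delivery convention: sbappendaddr() returns 0 on
 * failure, in which case both the data chain and the control chain
 * still belong to the caller and must be freed.  The same logic,
 * factored into a hypothetical helper for readability:
 */
static int
rsvp_deliver(struct socket *so, struct sockaddr *src, struct mbuf *m,
    struct mbuf *opts)
{
	if (sbappendaddr(&so->so_rcv, src, m, opts) == 0) {
		m_freem(m);
		if (opts != NULL)
			m_freem(opts);
		return (ENOBUFS);
	}
	sorwakeup(so);		/* wake up rsvpd blocked in recvmsg() */
	return (0);
}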
+			 */
+			ether_type = ntohs(eh->ether_type);
+			if (ether_type == ETHERTYPE_IP)
+				af = AF_INET;
+#ifdef INET6
+			else if (ether_type == ETHERTYPE_IPV6)
+				af = AF_INET6;
+#endif
+			else
+				af = AF_UNSPEC;
+			ifq_classify(&last->if_snd, m, af, &pktattr);
+		}
+
 	/*
 	 * Add header (optimized for the common case of eh pointing
 	 * already into the mbuf) and execute last part of ether_output:
@@ -1006,7 +1030,8 @@ forward:
 			return m0;
 		bcopy(&save_eh, mtod(m, struct ether_header *), ETHER_HDR_LEN);
 	}
-	if (!IF_HANDOFF(&last->if_snd, m, last)) {
+	error = ifq_handoff(last, m, &pktattr);
+	if (error != 0) {
 #if 0
 		BDG_MUTE(last); /* should I also mute ? */
 #endif
@@ -1023,7 +1048,10 @@ forward:
 	 * up and running, is not the source interface, and belongs to
 	 * the same cluster as the 'real_dst', then send here.
	 */
-	if ( BDG_USED(ifp) && !BDG_MUTED(ifp) && !IF_QFULL(&ifp->if_snd) &&
+	if ( BDG_USED(ifp) && !BDG_MUTED(ifp) &&
+#ifndef ALTQ
+	     !IF_QFULL(&ifp->if_snd) &&
+#endif
 	     (ifp->if_flags & (IFF_UP|IFF_RUNNING)) == (IFF_UP|IFF_RUNNING) &&
 	     ifp != src && BDG_SAMECLUSTER(ifp, real_dst) )
 		last = ifp ;
diff --git a/sys/net/pf/pf.c b/sys/net/pf/pf.c
index f664adad94..a7a8f3fb52 100644
--- a/sys/net/pf/pf.c
+++ b/sys/net/pf/pf.c
@@ -1,7 +1,7 @@
 /*	$FreeBSD: src/sys/contrib/pf/net/pf.c,v 1.19 2004/09/11 11:18:25 mlaier Exp $ */
 /*	$OpenBSD: pf.c,v 1.433.2.2 2004/07/17 03:22:34 brad Exp $ */
 /* add	$OpenBSD: pf.c,v 1.448 2004/05/11 07:34:11 dhartmei Exp $ */
-/*	$DragonFly: src/sys/net/pf/pf.c,v 1.3 2004/12/21 02:54:15 hsu Exp $ */
+/*	$DragonFly: src/sys/net/pf/pf.c,v 1.4 2005/02/11 22:25:57 joerg Exp $ */

 /*
  * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
@@ -1305,20 +1305,13 @@ pf_send_tcp(const struct pf_rule *r, sa_family_t af,
 	m = m_gethdr(MB_DONTWAIT, MT_HEADER);
 	if (m == NULL)
 		return;
-	m->m_pkthdr.pf_flags |= PF_MBUF_GENERATED;
+	m->m_pkthdr.fw_flags |= PF_MBUF_GENERATED;
 #ifdef ALTQ
 	if (r != NULL && r->qid) {
-		struct altq_tag *atag;
-
-		mtag = m_tag_get(PACKET_TAG_PF_QID, sizeof(*atag), MB_DONTWAIT);
-		if (mtag != NULL) {
-			atag = (struct altq_tag *)(mtag + 1);
-			atag->qid = r->qid;
-			/* add hints for ecn */
-			atag->af = af;
-			atag->hdr = mtod(m, struct ip *);
-			m_tag_prepend(m, mtag);
-		}
+		m->m_pkthdr.fw_flags |= ALTQ_MBUF_TAGGED;
+		m->m_pkthdr.altq_qid = r->qid;
+		m->m_pkthdr.ecn_af = af;
+		m->m_pkthdr.header = mtod(m, struct ip *);
 	}
 #endif
 	m->m_data += max_linkhdr;
@@ -1413,21 +1406,14 @@ pf_send_icmp(struct mbuf *m, u_int8_t type, u_int8_t code, sa_family_t af,
 	m0 = m_copypacket(m, MB_DONTWAIT);
 	if (m0 == NULL)
 		return;
-	m0->m_pkthdr.pf_flags |= PF_MBUF_GENERATED;
+	m0->m_pkthdr.fw_flags |= PF_MBUF_GENERATED;
 #ifdef ALTQ
 	if (r->qid) {
-		struct altq_tag *atag;
-
-		mtag = m_tag_get(PACKET_TAG_PF_QID, sizeof(*atag), MB_DONTWAIT);
-		if (mtag != NULL) {
-			atag = (struct altq_tag *)(mtag + 1);
-			atag->qid = r->qid;
-			/* add hints for ecn */
-			atag->af = af;
-			atag->hdr = mtod(m0, struct ip *);
-			m_tag_prepend(m0, mtag);
-		}
+		m0->m_pkthdr.fw_flags |= ALTQ_MBUF_TAGGED;
+		m0->m_pkthdr.altq_qid = r->qid;
+		m0->m_pkthdr.ecn_af = af;
+		m0->m_pkthdr.header = mtod(m0, struct ip *);
 	}
 #endif

@@ -1549,7 +1535,7 @@ pf_match_tag(struct mbuf *m, struct pf_rule *r, struct pf_rule *nat_rule,
 	if (*tag == -1) {	/* find mbuf tag */
 		if (nat_rule != NULL && nat_rule->tag)
 			*tag = nat_rule->tag;
-		else if (m->m_pkthdr.pf_flags & PF_MBUF_TAGGED)
+		else if (m->m_pkthdr.fw_flags & PF_MBUF_TAGGED)
 			*tag = m->m_pkthdr.pf_tag;
 		else
 			*tag = 0;
@@ -1565,7 +1551,7 @@ pf_tag_packet(struct mbuf *m, int tag)
 	if (tag <= 0)
 		return;

-	m->m_pkthdr.pf_flags |= PF_MBUF_TAGGED;
+	m->m_pkthdr.fw_flags |= PF_MBUF_TAGGED;
 	m->m_pkthdr.pf_tag = tag;
 }

@@ -4884,8 +4870,8 @@ pf_route(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp,
 	    (dir != PF_IN && dir != PF_OUT) || oifp == NULL)
 		panic("pf_route: invalid parameters");

-	if (((*m)->m_pkthdr.pf_flags & PF_MBUF_ROUTED) == 0) {
-		(*m)->m_pkthdr.pf_flags |= PF_MBUF_ROUTED;
+	if (((*m)->m_pkthdr.fw_flags & PF_MBUF_ROUTED) == 0) {
+		(*m)->m_pkthdr.fw_flags |= PF_MBUF_ROUTED;
 		(*m)->m_pkthdr.pf_routed = 1;
 	} else {
 		if ((*m)->m_pkthdr.pf_routed > 3) {
@@ -5054,8 +5040,8 @@ pf_route6(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp,
 	    (dir != PF_IN && dir != PF_OUT) || oifp == NULL)
 		panic("pf_route6: invalid parameters");

-	if (((*m)->m_pkthdr.pf_flags & PF_MBUF_ROUTED) == 0) {
-		(*m)->m_pkthdr.pf_flags |= PF_MBUF_ROUTED;
+	if (((*m)->m_pkthdr.fw_flags & PF_MBUF_ROUTED) == 0) {
+		(*m)->m_pkthdr.fw_flags |= PF_MBUF_ROUTED;
 		(*m)->m_pkthdr.pf_routed = 1;
 	} else {
 		if ((*m)->m_pkthdr.pf_routed > 3) {
@@ -5088,7 +5074,7 @@ pf_route6(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp,

 	/* Cheat. */
 	if (r->rt == PF_FASTROUTE) {
-		m0->m_pkthdr.pf_flags |= PF_MBUF_GENERATED;
+		m0->m_pkthdr.fw_flags |= PF_MBUF_GENERATED;
 		ip6_output(m0, NULL, NULL, 0, NULL, NULL, NULL);
 		return;
 	}
@@ -5284,7 +5270,7 @@ pf_test(int dir, struct ifnet *ifp, struct mbuf **m0)
 	struct pf_pdesc pd;
 	int off, dirndx, pqid = 0;

-	if (!pf_status.running || (m->m_pkthdr.pf_flags & PF_MBUF_GENERATED))
+	if (!pf_status.running || (m->m_pkthdr.fw_flags & PF_MBUF_GENERATED))
 		return (PF_PASS);

 	kif = pfi_index2kif[ifp->if_index];
@@ -5462,21 +5448,13 @@ done:

 #ifdef ALTQ
 	if (action == PF_PASS && r->qid) {
-		struct m_tag *mtag;
-		struct altq_tag *atag;
-
-		mtag = m_tag_get(PACKET_TAG_PF_QID, sizeof(*atag), MB_DONTWAIT);
-		if (mtag != NULL) {
-			atag = (struct altq_tag *)(mtag + 1);
-			if (pqid || pd.tos == IPTOS_LOWDELAY)
-				atag->qid = r->pqid;
-			else
-				atag->qid = r->qid;
-			/* add hints for ecn */
-			atag->af = AF_INET;
-			atag->hdr = h;
-			m_tag_prepend(m, mtag);
-		}
+		m->m_pkthdr.fw_flags |= ALTQ_MBUF_TAGGED;
+		if (pqid || pd.tos == IPTOS_LOWDELAY)
+			m->m_pkthdr.altq_qid = r->pqid;
+		else
+			m->m_pkthdr.altq_qid = r->qid;
+		m->m_pkthdr.ecn_af = AF_INET;
+		m->m_pkthdr.header = h;
 	}
 #endif

@@ -5494,7 +5472,7 @@ done:
 			REASON_SET(&reason, PFRES_MEMORY);
 		}

-	m->m_pkthdr.pf_flags |= PF_MBUF_TRANSLATE_LOCALHOST;
+	m->m_pkthdr.fw_flags |= PF_MBUF_TRANSLATE_LOCALHOST;

 	if (log)
 		PFLOG_PACKET(kif, h, m, AF_INET, dir, reason, r, a, ruleset);
@@ -5582,14 +5560,14 @@ pf_test6(int dir, struct ifnet *ifp, struct mbuf **m0)
 	struct pfi_kif *kif;
 	u_short action, reason = 0, log = 0;
 	struct mbuf *m = *m0;
-	struct ip6_hdr *h;
+	struct ip6_hdr *h = NULL;
 	struct pf_rule *a = NULL, *r = &pf_default_rule, *tr, *nr;
 	struct pf_state *s = NULL;
 	struct pf_ruleset *ruleset = NULL;
 	struct pf_pdesc pd;
 	int off, terminal = 0, dirndx;

-	if (!pf_status.running || (m->m_pkthdr.pf_flags & PF_MBUF_GENERATED))
+	if (!pf_status.running || (m->m_pkthdr.fw_flags & PF_MBUF_GENERATED))
 		return (PF_PASS);

 	kif = pfi_index2kif[ifp->if_index];
@@ -5780,21 +5758,13 @@ done:

 #ifdef ALTQ
 	if (action == PF_PASS && r->qid) {
-		struct m_tag *mtag;
-		struct altq_tag *atag;
-
-		mtag = m_tag_get(PACKET_TAG_PF_QID, sizeof(*atag), MB_DONTWAIT);
-		if (mtag != NULL) {
-			atag = (struct altq_tag *)(mtag + 1);
-			if (pd.tos == IPTOS_LOWDELAY)
-				atag->qid = r->pqid;
-			else
-				atag->qid = r->qid;
-			/* add hints for ecn */
-			atag->af = AF_INET6;
-			atag->hdr = h;
-			m_tag_prepend(m, mtag);
-		}
+		m->m_pkthdr.fw_flags |= ALTQ_MBUF_TAGGED;
+		if (pd.tos == IPTOS_LOWDELAY)
+			m->m_pkthdr.altq_qid = r->pqid;
+		else
+			m->m_pkthdr.altq_qid = r->qid;
+		m->m_pkthdr.ecn_af = AF_INET6;
+		m->m_pkthdr.header = h;
 	}
 #endif

@@ -5807,7 +5777,7 @@ done:
 			REASON_SET(&reason, PFRES_MEMORY);
 		}

-	m->m_pkthdr.pf_flags |= PF_MBUF_TRANSLATE_LOCALHOST;
+	m->m_pkthdr.fw_flags |= PF_MBUF_TRANSLATE_LOCALHOST;

 	if (log)
 		PFLOG_PACKET(kif, h, m, AF_INET6, dir, reason, r, a, ruleset);
diff --git a/sys/net/pf/pf_ioctl.c b/sys/net/pf/pf_ioctl.c
index cc7d97ab23..804a0611ae 100644
--- a/sys/net/pf/pf_ioctl.c
+++ b/sys/net/pf/pf_ioctl.c
@@ -1,6 +1,6 @@
 /*	$FreeBSD: src/sys/contrib/pf/net/pf_ioctl.c,v 1.12 2004/08/12 14:15:42 mlaier Exp $ */
 /*	$OpenBSD: pf_ioctl.c,v 1.112.2.2 2004/07/24 18:28:12 brad Exp $ */
-/*	$DragonFly: src/sys/net/pf/pf_ioctl.c,v 1.3 2004/09/21 21:20:58 joerg Exp $ */
+/*	$DragonFly: src/sys/net/pf/pf_ioctl.c,v 1.4 2005/02/11 22:25:57 joerg Exp $ */

 /*
  * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
@@ -80,7 +80,7 @@
 #endif /* INET6 */

 #ifdef ALTQ
-#include <altq/altq.h>
+#include <net/altq/altq.h>
 #endif

 #include
diff --git a/sys/net/pf/pf_norm.c b/sys/net/pf/pf_norm.c
index 71d90b484c..53986c59a7 100644
--- a/sys/net/pf/pf_norm.c
+++ b/sys/net/pf/pf_norm.c
@@ -1,7 +1,7 @@
 /*	$FreeBSD: src/sys/contrib/pf/net/pf_norm.c,v 1.10 2004/08/14 15:32:40 dwmalone Exp $ */
 /*	$OpenBSD: pf_norm.c,v 1.80.2.1 2004/04/30 21:46:33 brad Exp $ */
 /* add	$OpenBSD: pf_norm.c,v 1.87 2004/05/11 07:34:11 dhartmei Exp $ */
-/*	$DragonFly: src/sys/net/pf/pf_norm.c,v 1.1 2004/09/19 22:32:47 joerg Exp $ */
+/*	$DragonFly: src/sys/net/pf/pf_norm.c,v 1.2 2005/02/11 22:25:57 joerg Exp $ */

 /*
  * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
@@ -954,7 +954,7 @@ pf_normalize_ip(struct mbuf **m0, int dir, struct pfi_kif *kif, u_short *reason)
 	int nomem = 0;

 	if (dir == PF_OUT) {
-		if (m->m_pkthdr.pf_flags & PF_MBUF_FRAGCACHE) {
+		if (m->m_pkthdr.fw_flags & PF_MBUF_FRAGCACHE) {
 			/* Already passed the fragment cache in the
 			 * input direction.  If we continued, it would
 			 * appear to be a dup and would be dropped.
@@ -982,7 +982,7 @@ pf_normalize_ip(struct mbuf **m0, int dir, struct pfi_kif *kif, u_short *reason)
 	}

 	if (dir == PF_IN)
-		m->m_pkthdr.pf_flags |= PF_MBUF_FRAGCACHE;
+		m->m_pkthdr.fw_flags |= PF_MBUF_FRAGCACHE;

 	if (frag != NULL && (frag->fr_flags & PFFRAG_DROP))
 		goto drop;
diff --git a/sys/net/pf/pfvar.h b/sys/net/pf/pfvar.h
index 657000ca82..ce6e25638c 100644
--- a/sys/net/pf/pfvar.h
+++ b/sys/net/pf/pfvar.h
@@ -1,7 +1,7 @@
 /*	$FreeBSD: src/sys/contrib/pf/net/pfvar.h,v 1.8 2004/08/12 13:59:44 mlaier Exp $ */
 /*	$OpenBSD: pfvar.h,v 1.187 2004/03/22 04:54:18 mcbride Exp $ */
 /* add	$OpenBSD: pfvar.h,v 1.194 2004/05/11 07:34:11 dhartmei Exp $ */
-/*	$DragonFly: src/sys/net/pf/pfvar.h,v 1.1 2004/09/19 22:32:47 joerg Exp $ */
+/*	$DragonFly: src/sys/net/pf/pfvar.h,v 1.2 2005/02/11 22:25:57 joerg Exp $ */

 /*
  * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
@@ -44,6 +44,11 @@
 #include
 #include
+
+#ifdef _KERNEL
+#include
+#endif
+
 /*
  * XXX
  * If we include , we need _KERNEL definition.
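/*
 * [Sketch, assumption] pf now records its classification verdict in
 * the packet header itself instead of allocating an m_tag: cheaper,
 * but only one verdict fits per packet.  A consumer on the enqueue
 * side is expected to test the flag before trusting altq_qid; the
 * real consumer lives in net/altq and is not part of this diff, so
 * the reader below is purely illustrative.
 */
static __inline uint32_t
pkt_altq_qid(const struct mbuf *m)
{
	if (m->m_pkthdr.fw_flags & ALTQ_MBUF_TAGGED)
		return (m->m_pkthdr.altq_qid);
	return (0);		/* untagged: scheduler's default class */
}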
diff --git a/sys/net/ppp/if_ppp.c b/sys/net/ppp/if_ppp.c index 1d04448f20..1575849a40 100644 --- a/sys/net/ppp/if_ppp.c +++ b/sys/net/ppp/if_ppp.c @@ -70,7 +70,7 @@ */ /* $FreeBSD: src/sys/net/if_ppp.c,v 1.67.2.4 2002/04/14 21:41:48 luigi Exp $ */ -/* $DragonFly: src/sys/net/ppp/if_ppp.c,v 1.23 2005/01/26 00:37:39 joerg Exp $ */ +/* $DragonFly: src/sys/net/ppp/if_ppp.c,v 1.24 2005/02/11 22:25:57 joerg Exp $ */ /* from if_sl.c,v 1.11 84/10/04 12:54:47 rick Exp */ /* from NetBSD: if_ppp.c,v 1.15.2.2 1994/07/28 05:17:58 cgd Exp */ @@ -100,6 +100,7 @@ #include #include +#include #include #include @@ -148,6 +149,7 @@ static void ppp_ccp (struct ppp_softc *, struct mbuf *m, int rcvd); static void ppp_ccp_closed (struct ppp_softc *); static void ppp_inproc (struct ppp_softc *, struct mbuf *); static void pppdumpm (struct mbuf *m0); +static void ppp_ifstart(struct ifnet *ifp); /* * Some useful mbuf macros not in mbuf.h. @@ -205,7 +207,7 @@ pppintr(struct netmsg *msg) for (i = 0; i < NPPP; ++i, ++sc) { s = splimp(); if (!(sc->sc_flags & SC_TBUSY) - && (sc->sc_if.if_snd.ifq_head || sc->sc_fastq.ifq_head)) { + && (!ifq_is_empty(&sc->sc_if.if_snd) || !IF_QEMPTY(&sc->sc_fastq))) { sc->sc_flags |= SC_TBUSY; splx(s); (*sc->sc_start)(sc); @@ -242,7 +244,9 @@ pppattach(dummy) sc->sc_if.if_hdrlen = PPP_HDRLEN; sc->sc_if.if_ioctl = pppsioctl; sc->sc_if.if_output = pppoutput; - sc->sc_if.if_snd.ifq_maxlen = IFQ_MAXLEN; + sc->sc_if.if_start = ppp_ifstart; + ifq_set_maxlen(&sc->sc_if.if_snd, IFQ_MAXLEN); + ifq_set_ready(&sc->sc_if.if_snd); sc->sc_inq.ifq_maxlen = IFQ_MAXLEN; sc->sc_fastq.ifq_maxlen = IFQ_MAXLEN; sc->sc_rawq.ifq_maxlen = IFQ_MAXLEN; @@ -724,6 +728,7 @@ pppoutput(ifp, m0, dst, rtp) enum NPmode mode; int len; struct mbuf *m; + struct altq_pktattr pktattr; if (sc->sc_devp == NULL || (ifp->if_flags & IFF_RUNNING) == 0 || ((ifp->if_flags & IFF_UP) == 0 && dst->sa_family != AF_UNSPEC)) { @@ -731,6 +736,8 @@ pppoutput(ifp, m0, dst, rtp) goto bad; } + ifq_classify(&ifp->if_snd, m0, dst->sa_family, &pktattr); + /* * Compute PPP header. */ @@ -862,16 +869,25 @@ pppoutput(ifp, m0, dst, rtp) sc->sc_npqtail = &m0->m_nextpkt; } else { /* fastq and if_snd are emptied at spl[soft]net now */ - ifq = (m0->m_flags & M_HIGHPRI)? &sc->sc_fastq: &ifp->if_snd; - if (IF_QFULL(ifq) && dst->sa_family != AF_UNSPEC) { - IF_DROP(ifq); + if ((m0->m_flags & M_HIGHPRI) && !ifq_is_enabled(&sc->sc_if.if_snd)) { + ifq = &sc->sc_fastq; + if (IF_QFULL(ifq) && dst->sa_family != AF_UNSPEC) { + IF_DROP(ifq); + m_freem(m0); + error = ENOBUFS; + } else { + IF_ENQUEUE(ifq, m0); + error = 0; + } + } else { + error = ifq_enqueue(&sc->sc_if.if_snd, m0, &pktattr); + } + if (error) { splx(s); sc->sc_if.if_oerrors++; sc->sc_stats.ppp_oerrors++; - error = ENOBUFS; - goto bad; + return (error); } - IF_ENQUEUE(ifq, m0); (*sc->sc_start)(sc); } getmicrotime(&ifp->if_lastchange); @@ -898,6 +914,7 @@ ppp_requeue(sc) struct mbuf *m, **mpp; struct ifqueue *ifq; enum NPmode mode; + int error; for (mpp = &sc->sc_npqueue; (m = *mpp) != NULL; ) { switch (PPP_PROTOCOL(mtod(m, u_char *))) { @@ -915,13 +932,22 @@ ppp_requeue(sc) */ *mpp = m->m_nextpkt; m->m_nextpkt = NULL; - ifq = (m->m_flags & M_HIGHPRI)? 
&sc->sc_fastq: &sc->sc_if.if_snd; - if (IF_QFULL(ifq)) { - IF_DROP(ifq); - sc->sc_if.if_oerrors++; - sc->sc_stats.ppp_oerrors++; - } else - IF_ENQUEUE(ifq, m); + if ((m->m_flags & M_HIGHPRI) && !ifq_is_enabled(&sc->sc_if.if_snd)) { + ifq = &sc->sc_fastq; + if (IF_QFULL(ifq)) { + IF_DROP(ifq); + error = ENOBUFS; + } else { + IF_ENQUEUE(ifq, m); + error = 0; + } + } else { + error = ifq_enqueue(&sc->sc_if.if_snd, m, NULL); + } + if (error) { + sc->sc_if.if_oerrors++; + sc->sc_stats.ppp_oerrors++; + } break; case NPMODE_DROP: @@ -974,7 +1000,7 @@ ppp_dequeue(sc) */ IF_DEQUEUE(&sc->sc_fastq, m); if (m == NULL) - IF_DEQUEUE(&sc->sc_if.if_snd, m); + m = ifq_dequeue(&sc->sc_if.if_snd); if (m == NULL) return NULL; @@ -1562,3 +1588,16 @@ done: *bp = 0; printf("%s\n", buf); } + +/* + * a wrapper to transmit a packet from if_start since ALTQ uses + * if_start to send a packet. + */ +static void +ppp_ifstart(struct ifnet *ifp) +{ + struct ppp_softc *sc; + + sc = ifp->if_softc; + (*sc->sc_start)(sc); +} diff --git a/sys/net/ppp_layer/ppp_tty.c b/sys/net/ppp_layer/ppp_tty.c index 192c06f955..b32993300f 100644 --- a/sys/net/ppp_layer/ppp_tty.c +++ b/sys/net/ppp_layer/ppp_tty.c @@ -71,7 +71,7 @@ */ /* $FreeBSD: src/sys/net/ppp_tty.c,v 1.43.2.1 2002/02/13 00:43:11 dillon Exp $ */ -/* $DragonFly: src/sys/net/ppp_layer/ppp_tty.c,v 1.11 2004/09/16 04:39:31 dillon Exp $ */ +/* $DragonFly: src/sys/net/ppp_layer/ppp_tty.c,v 1.12 2005/02/11 22:25:57 joerg Exp $ */ #include "opt_ppp.h" /* XXX for ppp_defs.h */ @@ -93,6 +93,9 @@ #include #endif +#include +#include + #ifdef PPP_FILTER #include #endif @@ -778,6 +781,14 @@ pppstart(tp) if (tp->t_oproc != NULL) (*tp->t_oproc)(tp); + /* + * If ALTQ is enabled, don't invoke NETISR_PPP. + * pppintr() could loop without doing anything useful + * under rate-limiting. + */ + if (ifq_is_enabled(&sc->sc_if.if_snd)) + return 0; + /* * If the transmit queue has drained and the tty has not hung up * or been disconnected from the ppp unit, then tell if_ppp.c that diff --git a/sys/net/sl/if_sl.c b/sys/net/sl/if_sl.c index 8c851c4327..b7157e5efd 100644 --- a/sys/net/sl/if_sl.c +++ b/sys/net/sl/if_sl.c @@ -32,7 +32,7 @@ * * @(#)if_sl.c 8.6 (Berkeley) 2/1/94 * $FreeBSD: src/sys/net/if_sl.c,v 1.84.2.2 2002/02/13 00:43:10 dillon Exp $ - * $DragonFly: src/sys/net/sl/if_sl.c,v 1.16 2005/01/26 00:37:39 joerg Exp $ + * $DragonFly: src/sys/net/sl/if_sl.c,v 1.17 2005/02/11 22:25:57 joerg Exp $ */ /* @@ -89,6 +89,7 @@ #include #include +#include #include #if INET @@ -223,7 +224,8 @@ slattach(dummy) sc->sc_if.if_type = IFT_SLIP; sc->sc_if.if_ioctl = slioctl; sc->sc_if.if_output = sloutput; - sc->sc_if.if_snd.ifq_maxlen = 50; + ifq_set_maxlen(&sc->sc_if.if_snd, 50); + ifq_set_ready(&sc->sc_if.if_snd); sc->sc_fastq.ifq_maxlen = 32; sc->sc_if.if_linkmib = sc; sc->sc_if.if_linkmiblen = sizeof *sc; @@ -471,8 +473,10 @@ sloutput(ifp, m, dst, rtp) { struct sl_softc *sc = &sl_softc[ifp->if_dunit]; struct ip *ip; - struct ifqueue *ifq; - int s; + int error, s; + struct altq_pktattr pktattr; + + ifq_classify(&ifp->if_snd, m, dst->sa_family, &pktattr); /* * `Cannot happen' (see slioctl). Someday we will extend @@ -494,23 +498,29 @@ sloutput(ifp, m, dst, rtp) m_freem(m); return (EHOSTUNREACH); } - ifq = &sc->sc_if.if_snd; ip = mtod(m, struct ip *); if (sc->sc_if.if_flags & SC_NOICMP && ip->ip_p == IPPROTO_ICMP) { m_freem(m); return (ENETRESET); /* XXX ? 
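/*
 * [Sketch, hypothetical helper] The queueing decision if_ppp.c makes
 * above, in isolation: M_HIGHPRI traffic bypasses into the private
 * fast queue only while ALTQ is disabled, because once a discipline
 * is active, prioritization is the discipline's job and packets must
 * not jump around it.
 */
static int
ppp_like_enqueue(struct ifaltq *ifq, struct ifqueue *fastq,
    struct mbuf *m, struct altq_pktattr *pa)
{
	if ((m->m_flags & M_HIGHPRI) && !ifq_is_enabled(ifq)) {
		if (IF_QFULL(fastq)) {
			IF_DROP(fastq);
			m_freem(m);
			return (ENOBUFS);
		}
		IF_ENQUEUE(fastq, m);
		return (0);
	}
	return (ifq_enqueue(ifq, m, pa));
}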
*/ } - if (ip->ip_tos & IPTOS_LOWDELAY) - ifq = &sc->sc_fastq; s = splimp(); - if (IF_QFULL(ifq)) { - IF_DROP(ifq); - m_freem(m); - splx(s); + if ((ip->ip_tos & IPTOS_LOWDELAY) && !ifq_is_enabled(&sc->sc_if.if_snd)) { + if (IF_QFULL(&sc->sc_fastq)) { + IF_DROP(&sc->sc_fastq); + m_freem(m); + error = ENOBUFS; + } else { + IF_ENQUEUE(&sc->sc_fastq, m); + error = 0; + } + } else { + error = ifq_enqueue(&sc->sc_if.if_snd, m, &pktattr); + } + if (error) { sc->sc_if.if_oerrors++; - return (ENOBUFS); + splx(s); + return (error); } - IF_ENQUEUE(ifq, m); if (sc->sc_ttyp->t_outq.c_cc == 0) slstart(sc->sc_ttyp); splx(s); @@ -563,7 +573,7 @@ slstart(tp) if (m) sc->sc_if.if_omcasts++; /* XXX */ else - IF_DEQUEUE(&sc->sc_if.if_snd, m); + m = ifq_dequeue(&sc->sc_if.if_snd); splx(s); if (m == NULL) return 0; diff --git a/sys/net/sppp/if_spppsubr.c b/sys/net/sppp/if_spppsubr.c index 0afecbbd0d..8f4c2aae5a 100644 --- a/sys/net/sppp/if_spppsubr.c +++ b/sys/net/sppp/if_spppsubr.c @@ -18,7 +18,7 @@ * From: Version 2.4, Thu Apr 30 17:17:21 MSD 1997 * * $FreeBSD: src/sys/net/if_spppsubr.c,v 1.59.2.13 2002/07/03 15:44:41 joerg Exp $ - * $DragonFly: src/sys/net/sppp/if_spppsubr.c,v 1.18 2004/09/16 14:56:32 joerg Exp $ + * $DragonFly: src/sys/net/sppp/if_spppsubr.c,v 1.19 2005/02/11 22:25:57 joerg Exp $ */ #include @@ -56,6 +56,7 @@ #endif #include +#include #include #include #include @@ -400,7 +401,6 @@ static void sppp_keepalive(void *dummy); static void sppp_phase_network(struct sppp *sp); static void sppp_print_bytes(const u_char *p, u_short len); static void sppp_print_string(const char *p, u_short len); -static void sppp_qflush(struct ifqueue *ifq); static void sppp_set_ip_addr(struct sppp *sp, u_long src); #ifdef INET6 static void sppp_get_ip6_addrs(struct sppp *sp, struct in6_addr *src, @@ -749,6 +749,7 @@ sppp_output(struct ifnet *ifp, struct mbuf *m, int s, rv = 0; int ipproto = PPP_IP; int debug = ifp->if_flags & IFF_DEBUG; + struct altq_pktattr pktattr; s = splimp(); @@ -786,7 +787,12 @@ sppp_output(struct ifnet *ifp, struct mbuf *m, s = splimp(); } - ifq = &ifp->if_snd; + /* + * if the queueing discipline needs packet classification, + * do it before prepending link headers. + */ + ifq_classify(&ifp->if_snd, m, dst->sa_family, &pktattr); + #ifdef INET if (dst->sa_family == AF_INET) { /* XXX Check mbuf length here? */ @@ -950,14 +956,24 @@ sppp_output(struct ifnet *ifp, struct mbuf *m, * Queue message on interface, and start output if interface * not yet active. */ - if (IF_QFULL (ifq)) { - IF_DROP (&ifp->if_snd); - m_freem (m); + if (ifq != NULL) { + if (IF_QFULL(ifq)) { + IF_DROP(ifq); + m_freem(m); + rv = ENOBUFS; + ifq->ifq_drops++; + } else { + IF_ENQUEUE(ifq, m); + rv = 0; + } + } else { + rv = ifq_enqueue(&ifp->if_snd, m, &pktattr); + } + if (rv) { ++ifp->if_oerrors; - splx (s); - return (rv? rv: ENOBUFS); + splx(s); + return(rv); } - IF_ENQUEUE (ifq, m); if (! 
(ifp->if_flags & IFF_OACTIVE)) (*ifp->if_start) (ifp); @@ -1056,9 +1072,9 @@ sppp_flush(struct ifnet *ifp) { struct sppp *sp = (struct sppp*) ifp; - sppp_qflush (&sp->pp_if.if_snd); - sppp_qflush (&sp->pp_fastq); - sppp_qflush (&sp->pp_cpq); + ifq_purge(&sp->pp_if.if_snd); + IF_DRAIN(&sp->pp_fastq); + IF_DRAIN(&sp->pp_cpq); } /* @@ -1071,8 +1087,8 @@ sppp_isempty(struct ifnet *ifp) int empty, s; s = splimp(); - empty = !sp->pp_fastq.ifq_head && !sp->pp_cpq.ifq_head && - !sp->pp_if.if_snd.ifq_head; + empty = IF_QEMPTY(&sp->pp_fastq) && IF_QEMPTY(&sp->pp_cpq) && + ifq_is_empty(&sp->pp_if.if_snd); splx(s); return (empty); } @@ -1099,7 +1115,7 @@ sppp_dequeue(struct ifnet *ifp) (sppp_ncp_check(sp) || sp->pp_mode == IFF_CISCO)) { IF_DEQUEUE(&sp->pp_fastq, m); if (m == NULL) - IF_DEQUEUE (&sp->pp_if.if_snd, m); + m = ifq_dequeue(&sp->pp_if.if_snd); } splx(s); return m; @@ -1119,9 +1135,10 @@ sppp_pick(struct ifnet *ifp) m = sp->pp_cpq.ifq_head; if (m == NULL && - (sp->pp_phase == PHASE_NETWORK || sp->pp_mode == IFF_CISCO)) + (sp->pp_phase == PHASE_NETWORK || sp->pp_mode == IFF_CISCO)) { if ((m = sp->pp_fastq.ifq_head) == NULL) - m = sp->pp_if.if_snd.ifq_head; + m = ifq_poll(&sp->pp_if.if_snd); + } splx (s); return (m); } @@ -1288,7 +1305,7 @@ sppp_cisco_input(struct sppp *sp, struct mbuf *m) sp->pp_loopcnt = 0; if (ifp->if_flags & IFF_UP) { if_down (ifp); - sppp_qflush (&sp->pp_cpq); + IF_DRAIN(&sp->pp_cpq); } } ++sp->pp_loopcnt; @@ -1806,7 +1823,7 @@ sppp_cp_input(const struct cp *cp, struct sppp *sp, struct mbuf *m) printf(SPP_FMT "loopback\n", SPP_ARGS(ifp)); sp->pp_loopcnt = MAXALIVECNT * 5; if_down (ifp); - sppp_qflush (&sp->pp_cpq); + IF_DRAIN(&sp->pp_cpq); /* Shut down the PPP link. */ /* XXX */ @@ -2413,7 +2430,7 @@ sppp_lcp_RCR(struct sppp *sp, struct lcp_header *h, int len) SPP_ARGS(ifp)); if (ifp->if_flags & IFF_UP) { if_down(ifp); - sppp_qflush(&sp->pp_cpq); + IF_DRAIN(&sp->pp_cpq); /* XXX ? */ lcp.Down(sp); lcp.Up(sp); @@ -4645,24 +4662,6 @@ sppp_auth_send(const struct cp *cp, struct sppp *sp, ifp->if_obytes += m->m_pkthdr.len + 3; } -/* - * Flush interface queue. - */ -static void -sppp_qflush(struct ifqueue *ifq) -{ - struct mbuf *m, *n; - - n = ifq->ifq_head; - while ((m = n)) { - n = m->m_nextpkt; - m_freem (m); - } - ifq->ifq_head = 0; - ifq->ifq_tail = 0; - ifq->ifq_len = 0; -} - /* * Send keepalive packets, every 10 seconds. */ @@ -4690,7 +4689,7 @@ sppp_keepalive(void *dummy) /* No keepalive packets got. Stop the interface. */ printf (SPP_FMT "down\n", SPP_ARGS(ifp)); if_down (ifp); - sppp_qflush (&sp->pp_cpq); + IF_DRAIN(&sp->pp_cpq); if (sp->pp_mode != IFF_CISCO) { /* XXX */ /* Shut down the PPP link. */ diff --git a/sys/net/tun/if_tun.c b/sys/net/tun/if_tun.c index 7e07abca9d..5f72f662f2 100644 --- a/sys/net/tun/if_tun.c +++ b/sys/net/tun/if_tun.c @@ -14,7 +14,7 @@ * operation though. 
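/*
 * [Sketch] Why the pseudo devices in this patch grow an if_start
 * method: when ALTQ rate-limits a queue, packets are released later
 * by the token-bucket regulator, and ifp->if_start is the only hook
 * ALTQ has for kicking transmission at that point.  A device whose
 * real transmit entry sits elsewhere only needs a trivial wrapper,
 * as if_ppp.c does with ppp_ifstart() above and if_tun.c does with
 * tunstart() below.  Generic shape, names illustrative:
 */
static void
xx_ifstart(struct ifnet *ifp)
{
	struct xx_softc *sc = ifp->if_softc;

	(*sc->sc_start)(sc);	/* forward to the device's own starter */
}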
* * $FreeBSD: src/sys/net/if_tun.c,v 1.74.2.8 2002/02/13 00:43:11 dillon Exp $ - * $DragonFly: src/sys/net/tun/if_tun.c,v 1.16 2005/01/26 00:37:40 joerg Exp $ + * $DragonFly: src/sys/net/tun/if_tun.c,v 1.17 2005/02/11 22:25:57 joerg Exp $ */ #include "opt_atalk.h" @@ -42,6 +42,7 @@ #include #include +#include #include #include @@ -69,6 +70,7 @@ static int tunoutput (struct ifnet *, struct mbuf *, struct sockaddr *, struct rtentry *rt); static int tunifioctl (struct ifnet *, u_long, caddr_t, struct ucred *); static int tuninit (struct ifnet *); +static void tunstart(struct ifnet *); static d_open_t tunopen; static d_close_t tunclose; @@ -122,9 +124,11 @@ tuncreate(dev) ifp->if_mtu = TUNMTU; ifp->if_ioctl = tunifioctl; ifp->if_output = tunoutput; + ifp->if_start = tunstart; ifp->if_flags = IFF_POINTOPOINT | IFF_MULTICAST; ifp->if_type = IFT_PPP; - ifp->if_snd.ifq_maxlen = ifqmaxlen; + ifq_set_maxlen(&ifp->if_snd, ifqmaxlen); + ifq_set_ready(&ifp->if_snd); ifp->if_softc = sc; if_attach(ifp); bpfattach(ifp, DLT_NULL, sizeof(u_int)); @@ -170,7 +174,6 @@ tunclose(dev_t dev, int foo, int bar, struct thread *td) int s; struct tun_softc *tp; struct ifnet *ifp; - struct mbuf *m; tp = dev->si_drv1; ifp = &tp->tun_if; @@ -178,16 +181,10 @@ tunclose(dev_t dev, int foo, int bar, struct thread *td) tp->tun_flags &= ~TUN_OPEN; tp->tun_pid = 0; - /* - * junk all pending output - */ - do { - s = splimp(); - IF_DEQUEUE(&ifp->if_snd, m); - splx(s); - if (m) - m_freem(m); - } while (m); + /* Junk all pending output. */ + s = splimp(); + ifq_purge(&ifp->if_snd); + splx(s); if (ifp->if_flags & IFF_UP) { s = splimp(); @@ -312,7 +309,8 @@ tunoutput(ifp, m0, dst, rt) struct rtentry *rt; { struct tun_softc *tp = ifp->if_softc; - int s; + int error, s; + struct altq_pktattr pktattr; TUNDEBUG ("%s: tunoutput\n", ifp->if_xname); @@ -323,6 +321,12 @@ tunoutput(ifp, m0, dst, rt) return EHOSTDOWN; } + /* + * if the queueing discipline needs packet classification, + * do it before prepending link headers. 
+ */ + ifq_classify(&ifp->if_snd, m0, dst->sa_family, &pktattr); + /* BPF write needs to be handled specially */ if (dst->sa_family == AF_UNSPEC) { dst->sa_family = *(mtod(m0, int *)); @@ -382,15 +386,13 @@ tunoutput(ifp, m0, dst, rt) } s = splimp(); - if (IF_QFULL(&ifp->if_snd)) { - IF_DROP(&ifp->if_snd); - m_freem(m0); + error = ifq_enqueue(&ifp->if_snd, m0, &pktattr); + if (error) { splx(s); ifp->if_collisions++; return ENOBUFS; } ifp->if_obytes += m0->m_pkthdr.len; - IF_ENQUEUE(&ifp->if_snd, m0); splx(s); ifp->if_opackets++; @@ -480,8 +482,10 @@ tunioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct thread *td) break; case FIONREAD: s = splimp(); - if (tp->tun_if.if_snd.ifq_head) { - struct mbuf *mb = tp->tun_if.if_snd.ifq_head; + if (!ifq_is_empty(&tp->tun_if.if_snd)) { + struct mbuf *mb; + + mb = ifq_poll(&tp->tun_if.if_snd); for( *(int *)data = 0; mb != 0; mb = mb->m_next) *(int *)data += mb->m_len; } else @@ -536,7 +540,7 @@ tunread(dev, uio, flag) s = splimp(); do { - IF_DEQUEUE(&ifp->if_snd, m0); + m0 = ifq_dequeue(&ifp->if_snd); if (m0 == 0) { if (flag & IO_NDELAY) { splx(s); @@ -710,7 +714,7 @@ tunpoll(dev_t dev, int events, struct thread *td) TUNDEBUG("%s: tunpoll\n", ifp->if_xname); if (events & (POLLIN | POLLRDNORM)) { - if (ifp->if_snd.ifq_len > 0) { + if (!ifq_is_empty(&ifp->if_snd)) { TUNDEBUG("%s: tunpoll q=%d\n", ifp->if_xname, ifp->if_snd.ifq_len); revents |= events & (POLLIN | POLLRDNORM); @@ -725,3 +729,30 @@ tunpoll(dev_t dev, int events, struct thread *td) splx(s); return (revents); } + +/* + * Start packet transmission on the interface. + * when the interface queue is rate-limited by ALTQ, + * if_start is needed to drain packets from the queue in order + * to notify readers when outgoing packets become ready. + */ +static void +tunstart(struct ifnet *ifp) +{ + struct tun_softc *tp = ifp->if_softc; + struct mbuf *m; + + if (!ifq_is_enabled(&ifp->if_snd)) + return; + + m = ifq_poll(&ifp->if_snd); + if (m != NULL) { + if (tp->tun_flags & TUN_RWAIT) { + tp->tun_flags &= ~TUN_RWAIT; + wakeup((caddr_t)tp); + } + if (tp->tun_flags & TUN_ASYNC && tp->tun_sigio) + pgsigio(tp->tun_sigio, SIGIO, 0); + selwakeup(&tp->tun_rsel); + } +} diff --git a/sys/net/vlan/if_vlan.c b/sys/net/vlan/if_vlan.c index c9a1626dd7..91b226f8be 100644 --- a/sys/net/vlan/if_vlan.c +++ b/sys/net/vlan/if_vlan.c @@ -27,7 +27,7 @@ * SUCH DAMAGE. * * $FreeBSD: src/sys/net/if_vlan.c,v 1.15.2.13 2003/02/14 22:25:58 fenner Exp $ - * $DragonFly: src/sys/net/vlan/if_vlan.c,v 1.12 2005/01/26 00:37:40 joerg Exp $ + * $DragonFly: src/sys/net/vlan/if_vlan.c,v 1.13 2005/02/11 22:25:57 joerg Exp $ */ /* @@ -78,6 +78,7 @@ #include #include #include +#include #include "if_vlan_var.h" #ifdef INET @@ -229,7 +230,8 @@ vlan_clone_create(struct if_clone *ifc, int unit) ifp->if_init = vlan_ifinit; ifp->if_start = vlan_start; ifp->if_ioctl = vlan_ioctl; - ifp->if_snd.ifq_maxlen = ifqmaxlen; + ifq_set_maxlen(&ifp->if_snd, ifqmaxlen); + ifq_set_ready(&ifp->if_snd); ether_ifattach(ifp, ifv->ifv_ac.ac_enaddr); /* Now undo some of the damage... 
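/*
 * [Sketch] The classification calls sprinkled through this patch all
 * run before any link header is prepended: a discipline classifying
 * on IP fields needs the network header at the front of the mbuf.
 * On ethernet paths that already hold a full frame (bridge above,
 * vlan here) the address family must first be recovered from the
 * type field; the mapping, in isolation:
 */
static int
ether_type_to_af(uint16_t ether_type_net)	/* network byte order */
{
	switch (ntohs(ether_type_net)) {
	case ETHERTYPE_IP:
		return (AF_INET);
#ifdef INET6
	case ETHERTYPE_IPV6:
		return (AF_INET6);
#endif
	default:
		return (AF_UNSPEC);	/* classifier passes it through */
	}
}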
*/ ifp->if_data.ifi_type = IFT_L2VLAN; @@ -267,13 +269,15 @@ vlan_start(struct ifnet *ifp) struct ifnet *p; struct ether_vlan_header *evl; struct mbuf *m; + int error; + struct altq_pktattr pktattr; ifv = ifp->if_softc; p = ifv->ifv_p; ifp->if_flags |= IFF_OACTIVE; for (;;) { - IF_DEQUEUE(&ifp->if_snd, m); + m = ifq_dequeue(&ifp->if_snd); if (m == 0) break; BPF_MTAP(ifp, m); @@ -289,6 +293,15 @@ vlan_start(struct ifnet *ifp) continue; } + /* + * If ALTQ is enabled on the parent interface, do + * classification; the queueing discipline might + * not require classification, but might require + * the address family/header pointer in the pktattr. + */ + if (ifq_is_enabled(&p->if_snd)) + altq_etherclassify(&p->if_snd, m, &pktattr); + /* * If the LINK0 flag is set, it means the underlying interface * can do VLAN tag insertion itself and doesn't require us to @@ -346,14 +359,11 @@ vlan_start(struct ifnet *ifp) * Send it, precisely as ether_output() would have. * We are already running at splimp. */ - if (IF_QFULL(&p->if_snd)) { - IF_DROP(&p->if_snd); - /* XXX stats */ + error = ifq_enqueue(&p->if_snd, m, &pktattr); + if (error) { ifp->if_oerrors++; - m_freem(m); continue; } - IF_ENQUEUE(&p->if_snd, m); ifp->if_opackets++; p->if_obytes += m->m_pkthdr.len; if (m->m_flags & M_MCAST) diff --git a/sys/netinet/ip.h b/sys/netinet/ip.h index 9a0d459626..b34ea33f12 100644 --- a/sys/netinet/ip.h +++ b/sys/netinet/ip.h @@ -32,7 +32,7 @@ * * @(#)ip.h 8.2 (Berkeley) 6/1/94 * $FreeBSD: src/sys/netinet/ip.h,v 1.17 1999/12/22 19:13:20 shin Exp $ - * $DragonFly: src/sys/netinet/ip.h,v 1.3 2004/09/23 16:44:32 joerg Exp $ + * $DragonFly: src/sys/netinet/ip.h,v 1.4 2005/02/11 22:25:57 joerg Exp $ */ #ifndef _NETINET_IP_H_ @@ -89,9 +89,11 @@ struct ip { #define IPTOS_THROUGHPUT 0x08 #define IPTOS_RELIABILITY 0x04 #define IPTOS_MINCOST 0x02 -/* ECN bits proposed by Sally Floyd */ +#if 1 +/* ECN RFC3168 obsoletes RFC2481, and these will be deprecated soon. */ #define IPTOS_CE 0x01 /* congestion experienced */ #define IPTOS_ECT 0x02 /* ECN-capable transport */ +#endif /* @@ -106,6 +108,16 @@ struct ip { #define IPTOS_PREC_PRIORITY 0x20 #define IPTOS_PREC_ROUTINE 0x00 +/* + * ECN (Explicit Congestion Notification) codepoints in RFC3168 + * mapped to the lower 2 bits of the TOS field. + */ +#define IPTOS_ECN_NOTECT 0x00 /* not-ECT */ +#define IPTOS_ECN_ECT1 0x01 /* ECN-capable transport (1) */ +#define IPTOS_ECN_ECT0 0x02 /* ECN-capable transport (0) */ +#define IPTOS_ECN_CE 0x03 /* congestion experienced */ +#define IPTOS_ECN_MASK 0x03 /* ECN field mask */ + /* * Definitions for options. 
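/*
 * [Sketch, illustrative only] How a queueing discipline would use the
 * RFC 3168 codepoints defined above, e.g. RED marking instead of
 * dropping: the two low TOS bits form a single field, so always mask
 * before comparing.  A real implementation (cf. altq_red) must also
 * patch up ip_sum after rewriting the TOS byte; that is omitted here.
 */
static int
ip_ecn_mark_ce(struct ip *ip)
{
	if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT)
		return (0);	/* peer not ECN capable: caller must drop */
	ip->ip_tos = (ip->ip_tos & ~IPTOS_ECN_MASK) | IPTOS_ECN_CE;
	return (1);		/* congestion signalled in-band */
}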
*/ diff --git a/sys/netinet/ip_icmp.c b/sys/netinet/ip_icmp.c index 5883807d59..a5e28c5c14 100644 --- a/sys/netinet/ip_icmp.c +++ b/sys/netinet/ip_icmp.c @@ -32,7 +32,7 @@ * * @(#)ip_icmp.c 8.2 (Berkeley) 1/4/94 * $FreeBSD: src/sys/netinet/ip_icmp.c,v 1.39.2.19 2003/01/24 05:11:34 sam Exp $ - * $DragonFly: src/sys/netinet/ip_icmp.c,v 1.19 2005/01/06 17:59:32 hsu Exp $ + * $DragonFly: src/sys/netinet/ip_icmp.c,v 1.20 2005/02/11 22:25:57 joerg Exp $ */ #include "opt_ipsec.h" @@ -237,7 +237,7 @@ icmp_error(n, type, code, dest, destifp) nip->ip_vhl = IP_VHL_BORING; nip->ip_p = IPPROTO_ICMP; nip->ip_tos = 0; - m->m_pkthdr.pf_flags |= n->m_pkthdr.pf_flags & PF_MBUF_GENERATED; + m->m_pkthdr.fw_flags |= n->m_pkthdr.fw_flags & PF_MBUF_GENERATED; icmp_reflect(m); freeit: @@ -742,7 +742,7 @@ match: bcopy((caddr_t)ip + optlen, ip + 1, m->m_len - sizeof(struct ip)); } - m->m_pkthdr.pf_flags &= PF_MBUF_GENERATED; + m->m_pkthdr.fw_flags &= PF_MBUF_GENERATED; m->m_flags &= ~(M_BCAST|M_MCAST); icmp_send(m, opts, ro); done: diff --git a/sys/netinet/ip_input.c b/sys/netinet/ip_input.c index 7c05549c22..b72f8b1404 100644 --- a/sys/netinet/ip_input.c +++ b/sys/netinet/ip_input.c @@ -82,7 +82,7 @@ * * @(#)ip_input.c 8.2 (Berkeley) 1/4/94 * $FreeBSD: src/sys/netinet/ip_input.c,v 1.130.2.52 2003/03/07 07:01:28 silby Exp $ - * $DragonFly: src/sys/netinet/ip_input.c,v 1.45 2005/01/26 23:09:57 hsu Exp $ + * $DragonFly: src/sys/netinet/ip_input.c,v 1.46 2005/02/11 22:25:57 joerg Exp $ */ #define _IP_VHL @@ -555,6 +555,12 @@ ip_input(struct mbuf *m) goto bad; } +#ifdef ALTQ + if (altq_input != NULL && (*altq_input)(m, AF_INET) == 0) { + /* packet is dropped by traffic conditioner */ + return; + } +#endif /* * Convert fields to host representation. */ diff --git a/sys/netinet/ip_output.c b/sys/netinet/ip_output.c index 41da6154b9..c732ad272d 100644 --- a/sys/netinet/ip_output.c +++ b/sys/netinet/ip_output.c @@ -28,7 +28,7 @@ * * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 * $FreeBSD: src/sys/netinet/ip_output.c,v 1.99.2.37 2003/04/15 06:44:45 silby Exp $ - * $DragonFly: src/sys/netinet/ip_output.c,v 1.25 2005/02/08 22:56:19 hsu Exp $ + * $DragonFly: src/sys/netinet/ip_output.c,v 1.26 2005/02/11 22:25:57 joerg Exp $ */ #define _IP_VHL @@ -407,6 +407,12 @@ ip_output(struct mbuf *m0, struct mbuf *opt, struct route *ro, } } #endif /* notdef */ +#ifdef ALTQ + /* + * Disable packet drop hack. + * Packetdrop should be done by queueing. + */ +#else /* !ALTQ */ /* * Verify that we have any chance at all of being able to queue * the packet or packet fragments @@ -417,6 +423,7 @@ ip_output(struct mbuf *m0, struct mbuf *opt, struct route *ro, ipstat.ips_odropped++; goto bad; } +#endif /* !ALTQ */ /* * Look for broadcast address and diff --git a/sys/netinet6/ip6_input.c b/sys/netinet6/ip6_input.c index 1c964005f1..dd9b2b9579 100644 --- a/sys/netinet6/ip6_input.c +++ b/sys/netinet6/ip6_input.c @@ -1,5 +1,5 @@ /* $FreeBSD: src/sys/netinet6/ip6_input.c,v 1.11.2.15 2003/01/24 05:11:35 sam Exp $ */ -/* $DragonFly: src/sys/netinet6/ip6_input.c,v 1.24 2005/02/01 16:09:37 hrs Exp $ */ +/* $DragonFly: src/sys/netinet6/ip6_input.c,v 1.25 2005/02/11 22:25:57 joerg Exp $ */ /* $KAME: ip6_input.c,v 1.259 2002/01/21 04:58:09 jinmei Exp $ */ /* @@ -360,6 +360,13 @@ ip6_input(struct netmsg *msg) ip6stat.ip6s_nxthist[ip6->ip6_nxt]++; +#ifdef ALTQ + if (altq_input != NULL && (*altq_input)(m, AF_INET6) == 0) { + /* packet is dropped by traffic conditioner */ + return(ENOBUFS); + } +#endif + /* * Check with the firewall... 
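/*
 * [Sketch] The hook used by both ip_input() and ip6_input() above:
 * altq_input stays NULL until a traffic conditioner is configured, so
 * the common case costs one pointer test.  A return of 0 means the
 * conditioner consumed (policed/dropped) the packet.  Prototype as in
 * the KAME code this import is based on:
 */
extern int (*altq_input)(struct mbuf *, int);

static int
xx_run_conditioner(struct mbuf *m, int af)
{
	if (altq_input != NULL && (*altq_input)(m, af) == 0)
		return (0);	/* consumed; stop input processing */
	return (1);		/* proceed with the normal input path */
}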
*/ diff --git a/sys/sys/mbuf.h b/sys/sys/mbuf.h index eac16a9df4..3ec210c730 100644 --- a/sys/sys/mbuf.h +++ b/sys/sys/mbuf.h @@ -34,7 +34,7 @@ * * @(#)mbuf.h 8.5 (Berkeley) 2/19/95 * $FreeBSD: src/sys/sys/mbuf.h,v 1.44.2.17 2003/04/15 06:15:02 silby Exp $ - * $DragonFly: src/sys/sys/mbuf.h,v 1.18 2004/09/25 15:20:33 joerg Exp $ + * $DragonFly: src/sys/sys/mbuf.h,v 1.19 2005/02/11 22:26:35 joerg Exp $ */ #ifndef _SYS_MBUF_H_ @@ -88,6 +88,9 @@ SLIST_HEAD(packet_tags, m_tag); /* * Record/packet header in first mbuf of chain; valid only if M_PKTHDR is set. + * + * Be careful: The fields have been carefully ordered to avoid hidden padding. + * Keep this in mind, when adding or removing fields! */ struct pkthdr { struct ifnet *rcvif; /* rcv interface */ @@ -101,11 +104,16 @@ struct pkthdr { int csum_flags; /* flags regarding checksum */ int csum_data; /* data field used by csum routines */ + /* firewall flags */ + uint32_t fw_flags; /* flags for PF */ + /* variables for PF processing */ - int pf_flags; /* flags for PF */ uint16_t pf_tag; /* PF tag id */ uint8_t pf_routed; /* PF routing counter */ - uint8_t pf_unused01; /* pad */ + + /* variables for ALTQ processing */ + uint8_t ecn_af; /* address family for ECN */ + uint32_t altq_qid; /* queue id */ }; /* @@ -217,6 +225,7 @@ struct mbuf { #define PF_MBUF_TRANSLATE_LOCALHOST \ 0x00000008 #define PF_MBUF_FRAGCACHE 0x00000010 +#define ALTQ_MBUF_TAGGED 0x00000020 /* altq_qid is valid */ /* * mbuf types.
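/*
 * [Sketch, not in the patch] One way to enforce the "no hidden
 * padding" comment added to struct pkthdr above at compile time: a
 * negative array size fails the build if the field order changes
 * badly.  Offsets assume the layout in this diff (fw_flags uint32_t,
 * pf_tag uint16_t, pf_routed uint8_t, ecn_af uint8_t, altq_qid
 * uint32_t); offsetof() is assumed available via <stddef.h> or
 * <sys/types.h>.
 */
struct pkthdr_layout_check {
	char	no_padding[offsetof(struct pkthdr, altq_qid) ==
	    offsetof(struct pkthdr, fw_flags) + 8 ? 1 : -1];
};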