Import ALTQ support from KAME. This is based on the FreeBSD 4 snapshot.
author     Joerg Sonnenberger <joerg@dragonflybsd.org>
           Fri, 11 Feb 2005 22:26:35 +0000 (22:26 +0000)
committer  Joerg Sonnenberger <joerg@dragonflybsd.org>
           Fri, 11 Feb 2005 22:26:35 +0000 (22:26 +0000)
This includes neither the ALTQ3 compat code nor the !DragonFly defines.
The queueing macros (IF_HANDOFF(), IFQ_POLL(), IFQ_DEQUEUE(), etc.) have
been replaced with inline functions in net/ifq_var.h.

This also renames pkthdr.pf_flags to fw_flags, as it is intended as a
general flags field.
Currently supported are ppp(4), sppp(4), tun(4), and wi(4); more drivers
will follow.
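
The conversion pattern is the same in every driver.  A minimal sketch
(hypothetical xx(4) driver, using only the ifq_var.h functions that
appear in the hunks below):

	static void
	xx_attach(struct ifnet *ifp)
	{
		ifq_set_maxlen(&ifp->if_snd, IFQ_MAXLEN);
		ifq_set_ready(&ifp->if_snd);	/* was IFQ_SET_READY() */
	}

	static void
	xx_start(struct ifnet *ifp)
	{
		struct mbuf *m;

		while (!ifq_is_empty(&ifp->if_snd)) {	/* was IF_QLEN() != 0 */
			m = ifq_poll(&ifp->if_snd);	/* was IFQ_POLL() */
			if (m == NULL)
				break;
			/* stop here if the hardware is out of descriptors */
			m = ifq_dequeue(&ifp->if_snd);	/* was IFQ_DEQUEUE() */
			/* ... hand m to the hardware ... */
		}
	}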

Reviewed-by: corecode, dillon, hsu
Comments-from: hmp

56 files changed:
sys/conf/files
sys/conf/options
sys/config/LINT
sys/dev/netif/cm/smc90cx6.c
sys/dev/netif/owi/owi_hostap.c
sys/dev/netif/pdq_layer/pdq_ifsubr.c
sys/dev/netif/re/if_re.c
sys/dev/netif/wi/if_wi.c
sys/i386/conf/LINT
sys/kern/uipc_mbuf.c
sys/net/altq/altq.h [new file with mode: 0644]
sys/net/altq/altq_cbq.c [new file with mode: 0644]
sys/net/altq/altq_cbq.h [new file with mode: 0644]
sys/net/altq/altq_classq.h [new file with mode: 0644]
sys/net/altq/altq_hfsc.c [new file with mode: 0644]
sys/net/altq/altq_hfsc.h [new file with mode: 0644]
sys/net/altq/altq_priq.c [new file with mode: 0644]
sys/net/altq/altq_priq.h [new file with mode: 0644]
sys/net/altq/altq_red.c [new file with mode: 0644]
sys/net/altq/altq_red.h [new file with mode: 0644]
sys/net/altq/altq_rio.c [new file with mode: 0644]
sys/net/altq/altq_rio.h [new file with mode: 0644]
sys/net/altq/altq_rmclass.c [new file with mode: 0644]
sys/net/altq/altq_rmclass.h [new file with mode: 0644]
sys/net/altq/altq_rmclass_debug.h [new file with mode: 0644]
sys/net/altq/altq_subr.c [new file with mode: 0644]
sys/net/altq/altq_var.h [new file with mode: 0644]
sys/net/altq/if_altq.h [new file with mode: 0644]
sys/net/bridge/bridge.c
sys/net/ethernet.h
sys/net/if.c
sys/net/if_arcsubr.c
sys/net/if_atmsubr.c
sys/net/if_ethersubr.c
sys/net/if_fddisubr.c
sys/net/if_loop.c
sys/net/if_var.h
sys/net/ifq_var.h [new file with mode: 0644]
sys/net/ip_mroute/ip_mroute.c
sys/net/oldbridge/bridge.c
sys/net/pf/pf.c
sys/net/pf/pf_ioctl.c
sys/net/pf/pf_norm.c
sys/net/pf/pfvar.h
sys/net/ppp/if_ppp.c
sys/net/ppp_layer/ppp_tty.c
sys/net/sl/if_sl.c
sys/net/sppp/if_spppsubr.c
sys/net/tun/if_tun.c
sys/net/vlan/if_vlan.c
sys/netinet/ip.h
sys/netinet/ip_icmp.c
sys/netinet/ip_input.c
sys/netinet/ip_output.c
sys/netinet6/ip6_input.c
sys/sys/mbuf.h

diff --git a/sys/conf/files b/sys/conf/files
index a6717a6..552b442 100644
@@ -1,5 +1,5 @@
 # $FreeBSD: src/sys/conf/files,v 1.340.2.137 2003/06/04 17:10:30 sam Exp $
-# $DragonFly: src/sys/conf/files,v 1.85 2005/01/31 23:44:34 joerg Exp $
+# $DragonFly: src/sys/conf/files,v 1.86 2005/02/11 22:25:56 joerg Exp $
 #
 # The long compile-with and dependency lines are required because of
 # limitations in config: backslash-newline doesn't work in strings, and
@@ -777,6 +777,13 @@ vfs/ntfs/ntfs_vnops.c              optional ntfs
 vfs/ntfs/ntfs_subr.c           optional ntfs
 vfs/ntfs/ntfs_compr.c          optional ntfs
 vfs/ntfs/ntfs_ihash.c          optional ntfs
+net/altq/altq_cbq.c            optional altq
+net/altq/altq_hfsc.c           optional altq
+net/altq/altq_priq.c           optional altq
+net/altq/altq_red.c            optional altq
+net/altq/altq_rio.c            optional altq
+net/altq/altq_rmclass.c                optional altq
+net/altq/altq_subr.c           optional altq
 net/bpf.c                      standard
 net/bpf_filter.c               optional bpf
 net/bridge/bridge.c            optional bridge
diff --git a/sys/conf/options b/sys/conf/options
index 072bf39..9d51423 100644
@@ -1,5 +1,5 @@
 # $FreeBSD: src/sys/conf/options,v 1.191.2.53 2003/06/04 17:56:58 sam Exp $
-# $DragonFly: src/sys/conf/options,v 1.29 2005/01/31 23:44:34 joerg Exp $
+# $DragonFly: src/sys/conf/options,v 1.30 2005/02/11 22:25:56 joerg Exp $
 #
 #        On the handling of kernel options
 #
@@ -256,6 +256,16 @@ ISP_TARGET_MODE            opt_isp.h
 ATA_STATIC_ID          opt_ata.h
 
 # Net stuff.
+# altq stuff
+ALTQ                   opt_global.h
+ALTQ_CBQ               opt_altq.h
+ALTQ_RED               opt_altq.h
+ALTQ_RIO               opt_altq.h
+ALTQ_HFSC              opt_altq.h
+ALTQ_PRIQ              opt_altq.h
+ALTQ_NOPCC             opt_altq.h
+ALTQ_DEBUG             opt_altq.h
+
 ACCEPT_FILTER_DATA
 ACCEPT_FILTER_HTTP
 BOOTP                  opt_bootp.h
diff --git a/sys/config/LINT b/sys/config/LINT
index 07b04b8..1a39e0b 100644
@@ -3,7 +3,7 @@
 #      as much of the source tree as it can.
 #
 # $FreeBSD: src/sys/i386/conf/LINT,v 1.749.2.144 2003/06/04 17:56:59 sam Exp $
-# $DragonFly: src/sys/config/LINT,v 1.46 2005/01/31 23:44:35 joerg Exp $
+# $DragonFly: src/sys/config/LINT,v 1.47 2005/02/11 22:25:56 joerg Exp $
 #
 # NB: You probably don't want to try running a kernel built from this
 # file.  Instead, you should start from GENERIC, and add options from
@@ -2796,3 +2796,16 @@ options  KTR_COMPILE=(KTR_ALL)   # Every trace class, see sys/ktr.h for
                                        # the different class numbers
 options        KTR_ENTRIES=1024
 options        KTR_VERBOSE=1
+
+# ALTQ
+options        ALTQ            #alternate queueing
+options        ALTQ_CBQ        #class based queueing
+options        ALTQ_RED        #random early detection
+options        ALTQ_RIO        #triple red for diffserv (needs RED)
+options        ALTQ_HFSC       #hierarchical fair service curve
+options        ALTQ_PRIQ       #priority queue
+#options       ALTQ_NOPCC      #don't use processor cycle counter
+options        ALTQ_DEBUG      #for debugging
+# you might want to set kernel timer to 1kHz if you use CBQ,
+# especially with 100baseT
+#options       HZ=1000
diff --git a/sys/dev/netif/cm/smc90cx6.c b/sys/dev/netif/cm/smc90cx6.c
index b73f2a5..20ac62b 100644
@@ -1,6 +1,6 @@
 /*     $NetBSD: smc90cx6.c,v 1.38 2001/07/07 15:57:53 thorpej Exp $ */
 /*     $FreeBSD: src/sys/dev/cm/smc90cx6.c,v 1.1.2.3 2003/02/05 18:42:14 fjoe Exp $ */
-/*     $DragonFly: src/sys/dev/netif/cm/Attic/smc90cx6.c,v 1.10 2004/07/23 07:16:25 joerg Exp $ */
+/*     $DragonFly: src/sys/dev/netif/cm/Attic/smc90cx6.c,v 1.11 2005/02/11 22:25:56 joerg Exp $ */
 
 /*-
  * Copyright (c) 1994, 1995, 1998 The NetBSD Foundation, Inc.
@@ -318,7 +318,6 @@ cm_attach(dev)
        ifp->if_ioctl = cm_ioctl;
        ifp->if_watchdog  = cm_watchdog;
        ifp->if_init = cm_init;
-       /* XXX IFQ_SET_READY(&ifp->if_snd); */
        ifp->if_snd.ifq_maxlen = IFQ_MAXLEN;
        ifp->if_timer = 0;
        ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX;
diff --git a/sys/dev/netif/owi/owi_hostap.c b/sys/dev/netif/owi/owi_hostap.c
index 5ffee95..6b6fa17 100644
@@ -30,7 +30,7 @@
  * THE POSSIBILITY OF SUCH DAMAGE.
  *
  * $FreeBSD: src/sys/dev/wi/wi_hostap.c,v 1.7.2.4 2002/08/02 07:11:34 imp Exp $
- * $DragonFly: src/sys/dev/netif/owi/Attic/owi_hostap.c,v 1.2 2004/09/15 00:21:09 joerg Exp $
+ * $DragonFly: src/sys/dev/netif/owi/Attic/owi_hostap.c,v 1.3 2005/02/11 22:25:56 joerg Exp $
  */
 
 /* This is experimental Host AP software for Prism 2 802.11b interfaces.
@@ -65,6 +65,7 @@
 #include <sys/rman.h>
 
 #include <net/if.h>
+#include <net/ifq_var.h>
 #include <net/if_arp.h>
 #include <net/ethernet.h>
 #include <net/if_dl.h>
@@ -1113,7 +1114,7 @@ owihap_data_input(struct wi_softc *sc, struct wi_frame *rxfrm, struct mbuf *m)
 
                /* Queue up for repeating.
                 */
-               IF_HANDOFF(&ifp->if_snd, m, ifp);
+               ifq_handoff(ifp, m, NULL);
                return (!mcast);
        }
 
diff --git a/sys/dev/netif/pdq_layer/pdq_ifsubr.c b/sys/dev/netif/pdq_layer/pdq_ifsubr.c
index 7e4105c..c2b4a88 100644
@@ -22,7 +22,7 @@
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * $FreeBSD: src/sys/dev/pdq/pdq_ifsubr.c,v 1.11.2.1 2000/08/02 22:39:30 peter Exp $
- * $DragonFly: src/sys/dev/netif/pdq_layer/Attic/pdq_ifsubr.c,v 1.9 2005/02/10 00:09:17 joerg Exp $
+ * $DragonFly: src/sys/dev/netif/pdq_layer/Attic/pdq_ifsubr.c,v 1.10 2005/02/11 22:25:56 joerg Exp $
  *
  */
 
@@ -364,7 +364,7 @@ pdq_ifattach(
 
     ifp->if_ioctl = pdq_ifioctl;
     ifp->if_start = pdq_ifstart;
-    IFQ_SET_MAXLEN(&ifp->if_snd, IFQ_MAXLEN);
+    ifp->if_snd.ifq_maxlen = IFQ_MAXLEN;
   
     fddi_ifattach(ifp);
 }
diff --git a/sys/dev/netif/re/if_re.c b/sys/dev/netif/re/if_re.c
index 878a27b..1bc70a2 100644
@@ -33,7 +33,7 @@
  * THE POSSIBILITY OF SUCH DAMAGE.
  *
  * $FreeBSD: src/sys/dev/re/if_re.c,v 1.25 2004/06/09 14:34:01 naddy Exp $
- * $DragonFly: src/sys/dev/netif/re/if_re.c,v 1.7 2005/01/25 19:35:11 dillon Exp $
+ * $DragonFly: src/sys/dev/netif/re/if_re.c,v 1.8 2005/02/11 22:25:56 joerg Exp $
  */
 
 /*
 #include <sys/socket.h>
 
 #include <net/if.h>
+#include <net/ifq_var.h>
 #include <net/if_arp.h>
 #include <net/ethernet.h>
 #include <net/if_dl.h>
@@ -630,11 +631,15 @@ re_diag(struct re_softc *sc)
 
        /*
         * Queue the packet, start transmission.
-        * Note: IF_HANDOFF() ultimately calls re_start() for us.
+        * Note: ifq_handoff() ultimately calls re_start() for us.
         */
 
        CSR_WRITE_2(sc, RE_ISR, 0xFFFF);
-       IF_HANDOFF(&ifp->if_snd, m0, ifp);
+       error = ifq_handoff(ifp, m0, NULL);
+       if (error) {
+               m0 = NULL;
+               goto done;
+       }
        m0 = NULL;
 
        /* Wait for it to propagate through the chip */
diff --git a/sys/dev/netif/wi/if_wi.c b/sys/dev/netif/wi/if_wi.c
index 5c05db4..bfdb18a 100644
@@ -32,7 +32,7 @@
  * THE POSSIBILITY OF SUCH DAMAGE.
  *
  * $FreeBSD: src/sys/dev/wi/if_wi.c,v 1.166 2004/04/01 00:38:45 sam Exp $
- * $DragonFly: src/sys/dev/netif/wi/if_wi.c,v 1.19 2005/02/02 14:14:20 joerg Exp $
+ * $DragonFly: src/sys/dev/netif/wi/if_wi.c,v 1.20 2005/02/11 22:25:56 joerg Exp $
  */
 
 /*
@@ -93,6 +93,7 @@
 #include <net/if_dl.h>
 #include <net/if_media.h>
 #include <net/if_types.h>
+#include <net/ifq_var.h>
 
 #include <netproto/802_11/ieee80211_var.h>
 #include <netproto/802_11/ieee80211_ioctl.h>
@@ -291,7 +292,8 @@ wi_attach(device_t dev)
        ifp->if_start = wi_start;
        ifp->if_watchdog = wi_watchdog;
        ifp->if_init = wi_init;
-       ifp->if_snd.ifq_maxlen = IFQ_MAXLEN;
+       ifq_set_maxlen(&ifp->if_snd, IFQ_MAXLEN);
+       ifq_set_ready(&ifp->if_snd);
 #ifdef DEVICE_POLLING
        ifp->if_capabilities |= IFCAP_POLLING;
 #endif
@@ -548,8 +550,7 @@ wi_poll(struct ifnet *ifp, enum poll_cmd cmd, int count)
        }
 
        if ((ifp->if_flags & IFF_OACTIVE) == 0 &&
-           (sc->sc_flags & WI_FLAGS_OUTRANGE) == 0 &&
-           IF_QLEN(&ifp->if_snd) != NULL)
+           (sc->sc_flags & WI_FLAGS_OUTRANGE) == 0 && !ifq_is_empty(&ifp->if_snd))
                wi_start(ifp);
 }
 #endif /* DEVICE_POLLING */
@@ -595,7 +596,7 @@ wi_intr(void *arg)
                wi_info_intr(sc);
        if ((ifp->if_flags & IFF_OACTIVE) == 0 &&
            (sc->sc_flags & WI_FLAGS_OUTRANGE) == 0 &&
-           IF_QLEN(&ifp->if_snd) != 0)
+           !ifq_is_empty(&ifp->if_snd))
                wi_start(ifp);
 
        /* Re-enable interrupts. */
@@ -891,14 +892,14 @@ wi_start(struct ifnet *ifp)
                } else {
                        if (ic->ic_state != IEEE80211_S_RUN)
                                break;
-                       IFQ_POLL(&ifp->if_snd, m0);
+                       m0 = ifq_poll(&ifp->if_snd);
                        if (m0 == NULL)
                                break;
                        if (sc->sc_txd[cur].d_len != 0) {
                                ifp->if_flags |= IFF_OACTIVE;
                                break;
                        }
-                       IFQ_DEQUEUE(&ifp->if_snd, m0);
+                       m0 = ifq_dequeue(&ifp->if_snd);
                        ifp->if_opackets++;
                        m_copydata(m0, 0, ETHER_HDR_LEN, 
                            (caddr_t)&frmhdr.wi_ehdr);
diff --git a/sys/i386/conf/LINT b/sys/i386/conf/LINT
index 5c2ac52..92c6010 100644
@@ -3,7 +3,7 @@
 #      as much of the source tree as it can.
 #
 # $FreeBSD: src/sys/i386/conf/LINT,v 1.749.2.144 2003/06/04 17:56:59 sam Exp $
-# $DragonFly: src/sys/i386/conf/Attic/LINT,v 1.46 2005/01/31 23:44:35 joerg Exp $
+# $DragonFly: src/sys/i386/conf/Attic/LINT,v 1.47 2005/02/11 22:25:56 joerg Exp $
 #
 # NB: You probably don't want to try running a kernel built from this
 # file.  Instead, you should start from GENERIC, and add options from
@@ -2796,3 +2796,16 @@ options  KTR_COMPILE=(KTR_ALL)   # Every trace class, see sys/ktr.h for
                                        # the different class numbers
 options        KTR_ENTRIES=1024
 options        KTR_VERBOSE=1
+
+# ALTQ
+options        ALTQ            #alternate queueing
+options        ALTQ_CBQ        #class based queueing
+options        ALTQ_RED        #random early detection
+options        ALTQ_RIO        #triple red for diffserv (needs RED)
+options        ALTQ_HFSC       #hierarchical fair service curve
+options        ALTQ_PRIQ       #priority queue
+#options       ALTQ_NOPCC      #don't use processor cycle counter
+options        ALTQ_DEBUG      #for debugging
+# you might want to set kernel timer to 1kHz if you use CBQ,
+# especially with 100baseT
+#options       HZ=1000
diff --git a/sys/kern/uipc_mbuf.c b/sys/kern/uipc_mbuf.c
index 56ecf03..b86355f 100644
@@ -82,7 +82,7 @@
  *
  * @(#)uipc_mbuf.c     8.2 (Berkeley) 1/4/94
  * $FreeBSD: src/sys/kern/uipc_mbuf.c,v 1.51.2.24 2003/04/15 06:59:29 silby Exp $
- * $DragonFly: src/sys/kern/uipc_mbuf.c,v 1.31 2005/02/04 19:16:00 dillon Exp $
+ * $DragonFly: src/sys/kern/uipc_mbuf.c,v 1.32 2005/02/11 22:25:57 joerg Exp $
  */
 
 #include "opt_param.h"
@@ -770,7 +770,7 @@ m_gethdr(int how, int type)
        m->m_pkthdr.rcvif = NULL;
        SLIST_INIT(&m->m_pkthdr.tags);
        m->m_pkthdr.csum_flags = 0;
-       m->m_pkthdr.pf_flags = 0;
+       m->m_pkthdr.fw_flags = 0;
        return (m);
 }
 
diff --git a/sys/net/altq/altq.h b/sys/net/altq/altq.h
new file mode 100644
index 0000000..2216303
--- /dev/null
@@ -0,0 +1,63 @@
+/*     $KAME: altq.h,v 1.10 2003/07/10 12:07:47 kjc Exp $      */
+/*     $DragonFly: src/sys/net/altq/altq.h,v 1.1 2005/02/11 22:25:57 joerg Exp $ */
+
+/*
+ * Copyright (C) 1998-2003
+ *     Sony Computer Science Laboratories Inc.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#ifndef _ALTQ_ALTQ_H_
+#define        _ALTQ_ALTQ_H_
+
+/* altq discipline type */
+#define        ALTQT_NONE              0       /* reserved */
+#define        ALTQT_CBQ               1       /* cbq */
+#define        ALTQT_RED               2       /* red */
+#define        ALTQT_RIO               3       /* rio */
+#define        ALTQT_HFSC              4       /* hfsc */
+#define        ALTQT_PRIQ              5       /* priority queue */
+#define        ALTQT_MAX               6       /* should be max discipline type + 1 */
+
+/* simple token packet meter profile */
+struct tb_profile {
+       u_int   rate;   /* rate in bit-per-sec */
+       u_int   depth;  /* depth in bytes */
+};
+
+/*
+ * generic packet counter
+ */
+struct pktcntr {
+       uint64_t        packets;
+       uint64_t        bytes;
+};
+
+#define        PKTCNTR_ADD(cntr, len)          do {                            \
+       (cntr)->packets++; (cntr)->bytes += len;                        \
+} while (0)
+
+#ifdef _KERNEL
+#include <net/altq/altq_var.h>
+#endif
+
+#endif /* _ALTQ_ALTQ_H_ */
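
The pktcntr structure and PKTCNTR_ADD() above are used by every
discipline in this commit for per-class accounting.  A minimal usage
sketch (len is assumed to hold the packet length, e.g. from m_pktlen(),
sampled before the mbuf can be freed):

	struct pktcntr cnt = { 0, 0 };

	/* account one packet of len bytes: cnt.packets++, cnt.bytes += len */
	PKTCNTR_ADD(&cnt, len);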
diff --git a/sys/net/altq/altq_cbq.c b/sys/net/altq/altq_cbq.c
new file mode 100644
index 0000000..5e237c0
--- /dev/null
@@ -0,0 +1,542 @@
+/*     $KAME: altq_cbq.c,v 1.20 2004/04/17 10:54:48 kjc Exp $  */
+/*     $DragonFly: src/sys/net/altq/altq_cbq.c,v 1.1 2005/02/11 22:25:57 joerg Exp $ */
+
+/*
+ * Copyright (c) Sun Microsystems, Inc. 1993-1998 All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *      This product includes software developed by the SMCC Technology
+ *      Development Group at Sun Microsystems, Inc.
+ *
+ * 4. The name of the Sun Microsystems, Inc nor may not be used to endorse or
+ *      promote products derived from this software without specific prior
+ *      written permission.
+ *
+ * SUN MICROSYSTEMS DOES NOT CLAIM MERCHANTABILITY OF THIS SOFTWARE OR THE
+ * SUITABILITY OF THIS SOFTWARE FOR ANY PARTICULAR PURPOSE.  The software is
+ * provided "as is" without express or implied warranty of any kind.
+ *
+ * These notices must be retained in any copies of any part of this software.
+ */
+
+#include "opt_altq.h"
+#include "opt_inet.h"
+#include "opt_inet6.h"
+
+#ifdef ALTQ_CBQ        /* cbq is enabled by ALTQ_CBQ option in opt_altq.h */
+
+#include <sys/param.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/socket.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/callout.h>
+#include <sys/errno.h>
+#include <sys/time.h>
+
+#include <net/if.h>
+#include <net/ifq_var.h>
+#include <netinet/in.h>
+
+#include <net/pf/pfvar.h>
+#include <net/altq/altq.h>
+#include <net/altq/altq_cbq.h>
+
+/*
+ * Forward Declarations.
+ */
+static int              cbq_class_destroy(cbq_state_t *, struct rm_class *);
+static struct rm_class  *clh_to_clp(cbq_state_t *, uint32_t);
+static int              cbq_clear_interface(cbq_state_t *);
+static int              cbq_request(struct ifaltq *, int, void *);
+static int              cbq_enqueue(struct ifaltq *, struct mbuf *,
+                            struct altq_pktattr *);
+static struct mbuf     *cbq_dequeue(struct ifaltq *, int);
+static void             cbqrestart(struct ifaltq *);
+static void             get_class_stats(class_stats_t *, struct rm_class *);
+static void             cbq_purge(cbq_state_t *);
+
+/*
+ * int
+ * cbq_class_destroy(cbq_state_t *, struct rm_class *) - This
+ *     function destroys a given traffic class.  Before destroying
+ *     the class, all traffic for that class is released.
+ */
+static int
+cbq_class_destroy(cbq_state_t *cbqp, struct rm_class *cl)
+{
+       int     i;
+
+       /* delete the class */
+       rmc_delete_class(&cbqp->ifnp, cl);
+
+       /*
+        * free the class handle
+        */
+       for (i = 0; i < CBQ_MAX_CLASSES; i++)
+               if (cbqp->cbq_class_tbl[i] == cl)
+                       cbqp->cbq_class_tbl[i] = NULL;
+
+       if (cl == cbqp->ifnp.root_)
+               cbqp->ifnp.root_ = NULL;
+       if (cl == cbqp->ifnp.default_)
+               cbqp->ifnp.default_ = NULL;
+       return (0);
+}
+
+/* convert class handle to class pointer */
+static struct rm_class *
+clh_to_clp(cbq_state_t *cbqp, uint32_t chandle)
+{
+       int i;
+       struct rm_class *cl;
+
+       if (chandle == 0)
+               return (NULL);
+       /*
+        * first, try optimistically the slot matching the lower bits of
+        * the handle.  if it fails, do the linear table search.
+        */
+       i = chandle % CBQ_MAX_CLASSES;
+       if ((cl = cbqp->cbq_class_tbl[i]) != NULL &&
+           cl->stats_.handle == chandle)
+               return (cl);
+       for (i = 0; i < CBQ_MAX_CLASSES; i++)
+               if ((cl = cbqp->cbq_class_tbl[i]) != NULL &&
+                   cl->stats_.handle == chandle)
+                       return (cl);
+       return (NULL);
+}
+
+static int
+cbq_clear_interface(cbq_state_t *cbqp)
+{
+       int              again, i;
+       struct rm_class *cl;
+
+       /* clear out the classes now */
+       do {
+               again = 0;
+               for (i = 0; i < CBQ_MAX_CLASSES; i++) {
+                       if ((cl = cbqp->cbq_class_tbl[i]) != NULL) {
+                               if (is_a_parent_class(cl))
+                                       again++;
+                               else {
+                                       cbq_class_destroy(cbqp, cl);
+                                       cbqp->cbq_class_tbl[i] = NULL;
+                                       if (cl == cbqp->ifnp.root_)
+                                               cbqp->ifnp.root_ = NULL;
+                                       if (cl == cbqp->ifnp.default_)
+                                               cbqp->ifnp.default_ = NULL;
+                               }
+                       }
+               }
+       } while (again);
+
+       return (0);
+}
+
+static int
+cbq_request(struct ifaltq *ifq, int req, void *arg)
+{
+       cbq_state_t     *cbqp = (cbq_state_t *)ifq->altq_disc;
+
+       switch (req) {
+       case ALTRQ_PURGE:
+               cbq_purge(cbqp);
+               break;
+       }
+       return (0);
+}
+
+/* copy the stats info in rm_class to class_states_t */
+static void
+get_class_stats(class_stats_t *statsp, struct rm_class *cl)
+{
+       statsp->xmit_cnt        = cl->stats_.xmit_cnt;
+       statsp->drop_cnt        = cl->stats_.drop_cnt;
+       statsp->over            = cl->stats_.over;
+       statsp->borrows         = cl->stats_.borrows;
+       statsp->overactions     = cl->stats_.overactions;
+       statsp->delays          = cl->stats_.delays;
+
+       statsp->depth           = cl->depth_;
+       statsp->priority        = cl->pri_;
+       statsp->maxidle         = cl->maxidle_;
+       statsp->minidle         = cl->minidle_;
+       statsp->offtime         = cl->offtime_;
+       statsp->qmax            = qlimit(cl->q_);
+       statsp->ns_per_byte     = cl->ns_per_byte_;
+       statsp->wrr_allot       = cl->w_allotment_;
+       statsp->qcnt            = qlen(cl->q_);
+       statsp->avgidle         = cl->avgidle_;
+
+       statsp->qtype           = qtype(cl->q_);
+#ifdef ALTQ_RED
+       if (q_is_red(cl->q_))
+               red_getstats(cl->red_, &statsp->red[0]);
+#endif
+#ifdef ALTQ_RIO
+       if (q_is_rio(cl->q_))
+               rio_getstats((rio_t *)cl->red_, &statsp->red[0]);
+#endif
+}
+
+int
+cbq_pfattach(struct pf_altq *a)
+{
+       struct ifnet    *ifp;
+       int              s, error;
+
+       if ((ifp = ifunit(a->ifname)) == NULL || a->altq_disc == NULL)
+               return (EINVAL);
+       s = splimp();
+       error = altq_attach(&ifp->if_snd, ALTQT_CBQ, a->altq_disc,
+           cbq_enqueue, cbq_dequeue, cbq_request, NULL, NULL);
+       splx(s);
+       return (error);
+}
+
+int
+cbq_add_altq(struct pf_altq *a)
+{
+       cbq_state_t     *cbqp;
+       struct ifnet    *ifp;
+
+       if ((ifp = ifunit(a->ifname)) == NULL)
+               return (EINVAL);
+       if (!ifq_is_ready(&ifp->if_snd))
+               return (ENODEV);
+
+       /* allocate and initialize cbq_state_t */
+       cbqp = malloc(sizeof(*cbqp), M_ALTQ, M_WAITOK | M_ZERO);
+       callout_init(&cbqp->cbq_callout);
+       cbqp->cbq_qlen = 0;
+       cbqp->ifnp.ifq_ = &ifp->if_snd;     /* keep the ifq */
+
+       /* keep the state in pf_altq */
+       a->altq_disc = cbqp;
+
+       return (0);
+}
+
+int
+cbq_remove_altq(struct pf_altq *a)
+{
+       cbq_state_t     *cbqp;
+
+       if ((cbqp = a->altq_disc) == NULL)
+               return (EINVAL);
+       a->altq_disc = NULL;
+
+       cbq_clear_interface(cbqp);
+
+       if (cbqp->ifnp.default_)
+               cbq_class_destroy(cbqp, cbqp->ifnp.default_);
+       if (cbqp->ifnp.root_)
+               cbq_class_destroy(cbqp, cbqp->ifnp.root_);
+
+       /* deallocate cbq_state_t */
+       free(cbqp, M_ALTQ);
+
+       return (0);
+}
+
+int
+cbq_add_queue(struct pf_altq *a)
+{
+       struct rm_class *borrow, *parent;
+       cbq_state_t     *cbqp;
+       struct rm_class *cl;
+       struct cbq_opts *opts;
+       int             i;
+
+       if ((cbqp = a->altq_disc) == NULL)
+               return (EINVAL);
+       if (a->qid == 0)
+               return (EINVAL);
+
+       /*
+        * find a free slot in the class table.  if the slot matching
+        * the lower bits of qid is free, use this slot.  otherwise,
+        * use the first free slot.
+        */
+       i = a->qid % CBQ_MAX_CLASSES;
+       if (cbqp->cbq_class_tbl[i] != NULL) {
+               for (i = 0; i < CBQ_MAX_CLASSES; i++)
+                       if (cbqp->cbq_class_tbl[i] == NULL)
+                               break;
+               if (i == CBQ_MAX_CLASSES)
+                       return (EINVAL);
+       }
+
+       opts = &a->pq_u.cbq_opts;
+       /* check parameters */
+       if (a->priority >= CBQ_MAXPRI)
+               return (EINVAL);
+
+       /* Get pointers to parent and borrow classes.  */
+       parent = clh_to_clp(cbqp, a->parent_qid);
+       if (opts->flags & CBQCLF_BORROW)
+               borrow = parent;
+       else
+               borrow = NULL;
+
+       /*
+	 * A class must borrow from its parent or it cannot
+	 * borrow at all.  Hence, borrow can be NULL.
+        */
+       if (parent == NULL && (opts->flags & CBQCLF_ROOTCLASS) == 0) {
+               printf("cbq_add_queue: no parent class!\n");
+               return (EINVAL);
+       }
+
+       if ((borrow != parent)  && (borrow != NULL)) {
+               printf("cbq_add_class: borrow class != parent\n");
+               return (EINVAL);
+       }
+
+       /*
+        * check parameters
+        */
+       switch (opts->flags & CBQCLF_CLASSMASK) {
+       case CBQCLF_ROOTCLASS:
+               if (parent != NULL)
+                       return (EINVAL);
+               if (cbqp->ifnp.root_)
+                       return (EINVAL);
+               break;
+       case CBQCLF_DEFCLASS:
+               if (cbqp->ifnp.default_)
+                       return (EINVAL);
+               break;
+       case 0:
+               if (a->qid == 0)
+                       return (EINVAL);
+               break;
+       default:
+               /* more than two flags bits set */
+               return (EINVAL);
+       }
+
+       /*
+        * create a class.  if this is a root class, initialize the
+        * interface.
+        */
+       if ((opts->flags & CBQCLF_CLASSMASK) == CBQCLF_ROOTCLASS) {
+               rmc_init(cbqp->ifnp.ifq_, &cbqp->ifnp, opts->ns_per_byte,
+                   cbqrestart, a->qlimit, RM_MAXQUEUED,
+                   opts->maxidle, opts->minidle, opts->offtime,
+                   opts->flags);
+               cl = cbqp->ifnp.root_;
+       } else {
+               cl = rmc_newclass(a->priority,
+                                 &cbqp->ifnp, opts->ns_per_byte,
+                                 rmc_delay_action, a->qlimit, parent, borrow,
+                                 opts->maxidle, opts->minidle, opts->offtime,
+                                 opts->pktsize, opts->flags);
+       }
+       if (cl == NULL)
+               return (ENOMEM);
+
+       /* return handle to user space. */
+       cl->stats_.handle = a->qid;
+       cl->stats_.depth = cl->depth_;
+
+       /* save the allocated class */
+       cbqp->cbq_class_tbl[i] = cl;
+
+       if ((opts->flags & CBQCLF_CLASSMASK) == CBQCLF_DEFCLASS)
+               cbqp->ifnp.default_ = cl;
+
+       return (0);
+}
+
+int
+cbq_remove_queue(struct pf_altq *a)
+{
+       struct rm_class *cl;
+       cbq_state_t     *cbqp;
+       int             i;
+
+       if ((cbqp = a->altq_disc) == NULL)
+               return (EINVAL);
+
+       if ((cl = clh_to_clp(cbqp, a->qid)) == NULL)
+               return (EINVAL);
+
+       /* if we are a parent class, then return an error. */
+       if (is_a_parent_class(cl))
+               return (EINVAL);
+
+       /* delete the class */
+       rmc_delete_class(&cbqp->ifnp, cl);
+
+       /*
+        * free the class handle
+        */
+       for (i = 0; i < CBQ_MAX_CLASSES; i++)
+               if (cbqp->cbq_class_tbl[i] == cl) {
+                       cbqp->cbq_class_tbl[i] = NULL;
+                       if (cl == cbqp->ifnp.root_)
+                               cbqp->ifnp.root_ = NULL;
+                       if (cl == cbqp->ifnp.default_)
+                               cbqp->ifnp.default_ = NULL;
+                       break;
+               }
+
+       return (0);
+}
+
+int
+cbq_getqstats(struct pf_altq *a, void *ubuf, int *nbytes)
+{
+       cbq_state_t     *cbqp;
+       struct rm_class *cl;
+       class_stats_t    stats;
+       int              error = 0;
+
+       if ((cbqp = altq_lookup(a->ifname, ALTQT_CBQ)) == NULL)
+               return (EBADF);
+
+       if ((cl = clh_to_clp(cbqp, a->qid)) == NULL)
+               return (EINVAL);
+
+       if (*nbytes < sizeof(stats))
+               return (EINVAL);
+
+       get_class_stats(&stats, cl);
+
+       if ((error = copyout((caddr_t)&stats, ubuf, sizeof(stats))) != 0)
+               return (error);
+       *nbytes = sizeof(stats);
+       return (0);
+}
+
+/*
+ * int
+ * cbq_enqueue(struct ifaltq *ifq, struct mbuf *m, struct altq_pktattr *pattr)
+ *             - Queue data packets.
+ *
+ *     cbq_enqueue is set to ifp->if_altqenqueue and called by an upper
+ *     layer (e.g. ether_output).  cbq_enqueue queues the given packet
+ *     to the cbq, then invokes the driver's start routine.
+ *
+ *     Assumptions:    called in splimp
+ *     Returns:        0 if the queueing is successful.
+ *                     ENOBUFS if a packet dropping occurred as a result of
+ *                     the queueing.
+ */
+
+static int
+cbq_enqueue(struct ifaltq *ifq, struct mbuf *m, struct altq_pktattr *pktattr)
+{
+       cbq_state_t     *cbqp = (cbq_state_t *)ifq->altq_disc;
+       struct rm_class *cl;
+       int              len;
+
+       /* grab class set by classifier */
+       if ((m->m_flags & M_PKTHDR) == 0) {
+               /* should not happen */
+               if_printf(ifq->altq_ifp, "altq: packet does not have pkthdr\n");
+               m_freem(m);
+               return (ENOBUFS);
+       }
+       if (m->m_pkthdr.fw_flags & ALTQ_MBUF_TAGGED)
+               cl = clh_to_clp(cbqp, m->m_pkthdr.altq_qid);
+       else
+               cl = NULL;
+       if (cl == NULL) {
+               cl = cbqp->ifnp.default_;
+               if (cl == NULL) {
+                       m_freem(m);
+                       return (ENOBUFS);
+               }
+       }
+       cl->pktattr_ = NULL;
+       len = m_pktlen(m);
+       if (rmc_queue_packet(cl, m) != 0) {
+               /* drop occurred.  some mbuf was freed in rmc_queue_packet. */
+               PKTCNTR_ADD(&cl->stats_.drop_cnt, len);
+               return (ENOBUFS);
+       }
+
+       /* successfully queued. */
+       ++cbqp->cbq_qlen;
+       ++ifq->ifq_len;
+       return (0);
+}
+
+static struct mbuf *
+cbq_dequeue(struct ifaltq *ifq, int op)
+{
+       cbq_state_t     *cbqp = (cbq_state_t *)ifq->altq_disc;
+       struct mbuf     *m;
+
+       m = rmc_dequeue_next(&cbqp->ifnp, op);
+
+       if (m && op == ALTDQ_REMOVE) {
+               --cbqp->cbq_qlen;  /* decrement # of packets in cbq */
+               --ifq->ifq_len;
+
+               /* Update the class. */
+               rmc_update_class_util(&cbqp->ifnp);
+       }
+       return (m);
+}
+
+/*
+ * void
+ * cbqrestart(queue_t *) - Restart sending of data.
+ * called from rmc_restart in splimp via timeout after waking up
+ * a suspended class.
+ *     Returns:        NONE
+ */
+
+static void
+cbqrestart(struct ifaltq *ifq)
+{
+       cbq_state_t     *cbqp;
+       struct ifnet    *ifp;
+
+       if (!ifq_is_enabled(ifq))
+               /* cbq must have been detached */
+               return;
+
+       if ((cbqp = (cbq_state_t *)ifq->altq_disc) == NULL)
+               /* should not happen */
+               return;
+
+       ifp = ifq->altq_ifp;
+       if (ifp->if_start &&
+           cbqp->cbq_qlen > 0 && (ifp->if_flags & IFF_OACTIVE) == 0)
+               (*ifp->if_start)(ifp);
+}
+
+static void
+cbq_purge(cbq_state_t *cbqp)
+{
+       struct rm_class *cl;
+       int              i;
+
+       for (i = 0; i < CBQ_MAX_CLASSES; i++)
+               if ((cl = cbqp->cbq_class_tbl[i]) != NULL)
+                       rmc_dropall(cl);
+       if (ifq_is_enabled(cbqp->ifnp.ifq_))
+               cbqp->ifnp.ifq_->ifq_len = 0;
+}
+
+#endif /* ALTQ_CBQ */
diff --git a/sys/net/altq/altq_cbq.h b/sys/net/altq/altq_cbq.h
new file mode 100644
index 0000000..ef28af2
--- /dev/null
@@ -0,0 +1,114 @@
+/*     $KAME: altq_cbq.h,v 1.12 2003/10/03 05:05:15 kjc Exp $  */
+/*     $DragonFly: src/sys/net/altq/altq_cbq.h,v 1.1 2005/02/11 22:25:57 joerg Exp $ */
+
+/*
+ * Copyright (c) Sun Microsystems, Inc. 1993-1998 All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *      This product includes software developed by the SMCC Technology
+ *      Development Group at Sun Microsystems, Inc.
+ *
+ * 4. The name of the Sun Microsystems, Inc nor may not be used to endorse or
+ *      promote products derived from this software without specific prior
+ *      written permission.
+ *
+ * SUN MICROSYSTEMS DOES NOT CLAIM MERCHANTABILITY OF THIS SOFTWARE OR THE
+ * SUITABILITY OF THIS SOFTWARE FOR ANY PARTICULAR PURPOSE.  The software is
+ * provided "as is" without express or implied warranty of any kind.
+ *
+ * These notices must be retained in any copies of any part of this software.
+ */
+
+#ifndef _ALTQ_ALTQ_CBQ_H_
+#define        _ALTQ_ALTQ_CBQ_H_
+
+#include <net/altq/altq.h>
+#include <net/altq/altq_rmclass.h>
+#include <net/altq/altq_red.h>
+#include <net/altq/altq_rio.h>
+
+#define        NULL_CLASS_HANDLE       0
+
+/* class flags should be same as class flags in rm_class.h */
+#define        CBQCLF_RED              0x0001  /* use RED */
+#define        CBQCLF_ECN              0x0002  /* use RED/ECN */
+#define        CBQCLF_RIO              0x0004  /* use RIO */
+#define        CBQCLF_CLEARDSCP        0x0008  /* clear diffserv codepoint */
+#define        CBQCLF_BORROW           0x0010  /* borrow from parent */
+
+/* class flags only for root class */
+#define        CBQCLF_WRR              0x0100  /* weighted-round robin */
+#define        CBQCLF_EFFICIENT        0x0200  /* work-conserving */
+
+/* class flags for special classes */
+#define        CBQCLF_ROOTCLASS        0x1000  /* root class */
+#define        CBQCLF_DEFCLASS         0x2000  /* default class */
+#define        CBQCLF_CLASSMASK        0xf000  /* class mask */
+
+#define        CBQ_MAXQSIZE            200
+#define        CBQ_MAXPRI              RM_MAXPRIO
+
+typedef struct _cbq_class_stats_ {
+       uint32_t        handle;
+       u_int           depth;
+
+       struct pktcntr  xmit_cnt;       /* packets sent in this class */
+       struct pktcntr  drop_cnt;       /* dropped packets */
+       u_int           over;           /* # times went over limit */
+       u_int           borrows;        /* # times tried to borrow */
+       u_int           overactions;    /* # times invoked overlimit action */
+       u_int           delays;         /* # times invoked delay actions */
+
+       /* other static class parameters useful for debugging */
+       int             priority;
+       int             maxidle;
+       int             minidle;
+       int             offtime;
+       int             qmax;
+       int             ns_per_byte;
+       int             wrr_allot;
+
+       int             qcnt;           /* # packets in queue */
+       int             avgidle;
+
+       /* red and rio related info */
+       int             qtype;
+       struct redstats red[3];
+} class_stats_t;
+
+#ifdef _KERNEL
+/*
+ * Define macros only good for kernel drivers and modules.
+ */
+#define        CBQ_WATCHDOG            (hz / 20)
+#define        CBQ_TIMEOUT             10
+#define        CBQ_LS_TIMEOUT          (20 * hz / 1000)
+
+#define        CBQ_MAX_CLASSES 256
+
+/*
+ * Define State structures.
+ */
+typedef struct cbqstate {
+       int                      cbq_qlen;      /* # of packets in cbq */
+       struct rm_class         *cbq_class_tbl[CBQ_MAX_CLASSES];
+
+       struct rm_ifdat          ifnp;
+       struct callout           cbq_callout;   /* for timeouts */
+} cbq_state_t;
+
+#endif /* _KERNEL */
+
+#endif /* !_ALTQ_ALTQ_CBQ_H_ */
diff --git a/sys/net/altq/altq_classq.h b/sys/net/altq/altq_classq.h
new file mode 100644
index 0000000..3924c79
--- /dev/null
@@ -0,0 +1,184 @@
+/*     $KAME: altq_classq.h,v 1.6 2003/01/07 07:33:38 kjc Exp $        */
+/*     $DragonFly: src/sys/net/altq/altq_classq.h,v 1.1 2005/02/11 22:25:57 joerg Exp $ */
+
+/*
+ * Copyright (c) 1991-1997 Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *     This product includes software developed by the Network Research
+ *     Group at Lawrence Berkeley Laboratory.
+ * 4. Neither the name of the University nor of the Laboratory may be used
+ *    to endorse or promote products derived from this software without
+ *    specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * class queue definitions extracted from rm_class.h.
+ */
+#ifndef _ALTQ_ALTQ_CLASSQ_H_
+#define        _ALTQ_ALTQ_CLASSQ_H_
+
+/*
+ * Packet Queue types: RED or DROPHEAD.
+ */
+#define        Q_DROPHEAD      0x00
+#define        Q_RED           0x01
+#define        Q_RIO           0x02
+#define        Q_DROPTAIL      0x03
+
+#ifdef _KERNEL
+
+/*
+ * Packet Queue structures and macros to manipulate them.
+ */
+struct _class_queue_ {
+       struct mbuf     *tail_; /* Tail of packet queue */
+       int     qlen_;          /* Queue length (in number of packets) */
+	int	qlim_;		/* Queue limit (in number of packets) */
+       int     qtype_;         /* Queue type */
+};
+
+typedef struct _class_queue_   class_queue_t;
+
+#define        qtype(q)        (q)->qtype_             /* Get queue type */
+#define        qlimit(q)       (q)->qlim_              /* Max packets to be queued */
+#define        qlen(q)         (q)->qlen_              /* Current queue length. */
+#define        qtail(q)        (q)->tail_              /* Tail of the queue */
+#define        qhead(q)        ((q)->tail_ ? (q)->tail_->m_nextpkt : NULL)
+
+#define	qempty(q)	((q)->qlen_ == 0)	/* Is the queue empty? */
+#define        q_is_red(q)     ((q)->qtype_ == Q_RED)  /* Is the queue a red queue */
+#define        q_is_rio(q)     ((q)->qtype_ == Q_RIO)  /* Is the queue a rio queue */
+#define        q_is_red_or_rio(q)      ((q)->qtype_ == Q_RED || (q)->qtype_ == Q_RIO)
+
+static __inline void
+_addq(class_queue_t *q, struct mbuf *m)
+{
+        struct mbuf *m0;
+
+       if ((m0 = qtail(q)) != NULL)
+               m->m_nextpkt = m0->m_nextpkt;
+       else
+               m0 = m;
+       m0->m_nextpkt = m;
+       qtail(q) = m;
+       qlen(q)++;
+}
+
+static __inline struct mbuf *
+_getq(class_queue_t *q)
+{
+       struct mbuf  *m, *m0;
+
+       if ((m = qtail(q)) == NULL)
+               return (NULL);
+       if ((m0 = m->m_nextpkt) != m)
+               m->m_nextpkt = m0->m_nextpkt;
+       else
+               qtail(q) = NULL;
+       qlen(q)--;
+       m0->m_nextpkt = NULL;
+       return (m0);
+}
+
+/* drop a packet at the tail of the queue */
+static __inline struct mbuf *
+_getq_tail(class_queue_t *q)
+{
+       struct mbuf *m, *m0, *prev;
+
+       if ((m = m0 = qtail(q)) == NULL)
+               return NULL;
+       do {
+               prev = m0;
+               m0 = m0->m_nextpkt;
+       } while (m0 != m);
+       prev->m_nextpkt = m->m_nextpkt;
+       if (prev == m)
+               qtail(q) = NULL;
+       else
+               qtail(q) = prev;
+       qlen(q)--;
+       m->m_nextpkt = NULL;
+       return (m);
+}
+
+/* randomly select a packet in the queue */
+static __inline struct mbuf *
+_getq_random(class_queue_t *q)
+{
+       struct mbuf *m;
+       int i, n;
+
+       if ((m = qtail(q)) == NULL)
+               return NULL;
+       if (m->m_nextpkt == m)
+               qtail(q) = NULL;
+       else {
+               struct mbuf *prev = NULL;
+
+               n = random() % qlen(q) + 1;
+               for (i = 0; i < n; i++) {
+                       prev = m;
+                       m = m->m_nextpkt;
+               }
+               prev->m_nextpkt = m->m_nextpkt;
+               if (m == qtail(q))
+                       qtail(q) = prev;
+       }
+       qlen(q)--;
+       m->m_nextpkt = NULL;
+       return (m);
+}
+
+static __inline void
+_removeq(class_queue_t *q, struct mbuf *m)
+{
+       struct mbuf *m0, *prev;
+
+       m0 = qtail(q);
+       do {
+               prev = m0;
+               m0 = m0->m_nextpkt;
+       } while (m0 != m);
+       prev->m_nextpkt = m->m_nextpkt;
+       if (prev == m)
+               qtail(q) = NULL;
+       else if (qtail(q) == m)
+               qtail(q) = prev;
+       qlen(q)--;
+}
+
+static __inline void
+_flushq(class_queue_t *q)
+{
+       struct mbuf *m;
+
+       while ((m = _getq(q)) != NULL)
+               m_freem(m);
+}
+
+#endif /* _KERNEL */
+
+#endif /* _ALTQ_ALTQ_CLASSQ_H_ */
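
The queues above are circular singly-linked lists threaded through
m_nextpkt: only the tail pointer is stored and the head is reached as
tail_->m_nextpkt, so enqueue at the tail and dequeue at the head are
both O(1).  A minimal usage sketch (a discipline normally embeds the
queue in its per-class structure, cf. the 50-packet Q_DROPTAIL default
in altq_hfsc.c below):

	class_queue_t q;
	struct mbuf *m;	/* points to a packet obtained elsewhere */

	qtail(&q) = NULL;	/* start with an empty queue */
	qlen(&q) = 0;
	qlimit(&q) = 50;
	qtype(&q) = Q_DROPTAIL;

	_addq(&q, m);		/* enqueue m at the tail */
	if (!qempty(&q))
		m = _getq(&q);	/* dequeue from the head */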
diff --git a/sys/net/altq/altq_hfsc.c b/sys/net/altq/altq_hfsc.c
new file mode 100644
index 0000000..0efcbe4
--- /dev/null
@@ -0,0 +1,1624 @@
+/*     $KAME: altq_hfsc.c,v 1.25 2004/04/17 10:54:48 kjc Exp $ */
+/*     $DragonFly: src/sys/net/altq/altq_hfsc.c,v 1.1 2005/02/11 22:25:57 joerg Exp $ */
+
+/*
+ * Copyright (c) 1997-1999 Carnegie Mellon University. All Rights Reserved.
+ *
+ * Permission to use, copy, modify, and distribute this software and
+ * its documentation is hereby granted (including for commercial or
+ * for-profit use), provided that both the copyright notice and this
+ * permission notice appear in all copies of the software, derivative
+ * works, or modified versions, and any portions thereof.
+ *
+ * THIS SOFTWARE IS EXPERIMENTAL AND IS KNOWN TO HAVE BUGS, SOME OF
+ * WHICH MAY HAVE SERIOUS CONSEQUENCES.  CARNEGIE MELLON PROVIDES THIS
+ * SOFTWARE IN ITS ``AS IS'' CONDITION, AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+ * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ * Carnegie Mellon encourages (but does not require) users of this
+ * software to return any improvements or extensions that they make,
+ * and to grant Carnegie Mellon the rights to redistribute these
+ * changes without encumbrance.
+ */
+/*
+ * H-FSC is described in Proceedings of SIGCOMM'97,
+ * "A Hierarchical Fair Service Curve Algorithm for Link-Sharing,
+ * Real-Time and Priority Service"
+ * by Ion Stoica, Hui Zhang, and T. S. Eugene Ng.
+ *
+ * Oleg Cherevko <olwi@aq.ml.com.ua> added the upperlimit for link-sharing.
+ * when a class has an upperlimit, the fit-time is computed from the
+ * upperlimit service curve.  the link-sharing scheduler does not schedule
+ * a class whose fit-time exceeds the current time.
+ */
+
+#include "opt_altq.h"
+#include "opt_inet.h"
+#include "opt_inet6.h"
+
+#ifdef ALTQ_HFSC  /* hfsc is enabled by ALTQ_HFSC option in opt_altq.h */
+
+#include <sys/param.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/socket.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/queue.h>
+
+#include <net/if.h>
+#include <net/ifq_var.h>
+#include <netinet/in.h>
+
+#include <net/pf/pfvar.h>
+#include <net/altq/altq.h>
+#include <net/altq/altq_hfsc.h>
+
+/*
+ * function prototypes
+ */
+static int     hfsc_clear_interface(struct hfsc_if *);
+static int     hfsc_request(struct ifaltq *, int, void *);
+static void    hfsc_purge(struct hfsc_if *);
+static struct hfsc_class *hfsc_class_create(struct hfsc_if *,
+                                           struct service_curve *,
+                                           struct service_curve *,
+                                           struct service_curve *,
+                                           struct hfsc_class *, int, int, int);
+static int     hfsc_class_destroy(struct hfsc_class *);
+static struct hfsc_class *hfsc_nextclass(struct hfsc_class *);
+static int     hfsc_enqueue(struct ifaltq *, struct mbuf *,
+                            struct altq_pktattr *);
+static struct mbuf *hfsc_dequeue(struct ifaltq *, int);
+
+static int     hfsc_addq(struct hfsc_class *, struct mbuf *);
+static struct mbuf *hfsc_getq(struct hfsc_class *);
+static struct mbuf *hfsc_pollq(struct hfsc_class *);
+static void    hfsc_purgeq(struct hfsc_class *);
+
+static void    update_cfmin(struct hfsc_class *);
+static void    set_active(struct hfsc_class *, int);
+static void    set_passive(struct hfsc_class *);
+
+static void    init_ed(struct hfsc_class *, int);
+static void    update_ed(struct hfsc_class *, int);
+static void    update_d(struct hfsc_class *, int);
+static void    init_vf(struct hfsc_class *, int);
+static void    update_vf(struct hfsc_class *, int, uint64_t);
+static ellist_t *ellist_alloc(void);
+static void    ellist_destroy(ellist_t *);
+static void    ellist_insert(struct hfsc_class *);
+static void    ellist_remove(struct hfsc_class *);
+static void    ellist_update(struct hfsc_class *);
+struct hfsc_class *ellist_get_mindl(ellist_t *, uint64_t);
+static actlist_t *actlist_alloc(void);
+static void    actlist_destroy(actlist_t *);
+static void    actlist_insert(struct hfsc_class *);
+static void    actlist_remove(struct hfsc_class *);
+static void    actlist_update(struct hfsc_class *);
+
+static struct hfsc_class *actlist_firstfit(struct hfsc_class *, uint64_t);
+
+static __inline uint64_t       seg_x2y(uint64_t, uint64_t);
+static __inline uint64_t       seg_y2x(uint64_t, uint64_t);
+static __inline uint64_t       m2sm(u_int);
+static __inline uint64_t       m2ism(u_int);
+static __inline uint64_t       d2dx(u_int);
+static u_int                   sm2m(uint64_t);
+static u_int                   dx2d(uint64_t);
+
+static void    sc2isc(struct service_curve *, struct internal_sc *);
+static void    rtsc_init(struct runtime_sc *, struct internal_sc *,
+                         uint64_t, uint64_t);
+static uint64_t        rtsc_y2x(struct runtime_sc *, uint64_t);
+static uint64_t        rtsc_x2y(struct runtime_sc *, uint64_t);
+static void    rtsc_min(struct runtime_sc *, struct internal_sc *,
+                        uint64_t, uint64_t);
+
+static void    get_class_stats(struct hfsc_classstats *, struct hfsc_class *);
+static struct hfsc_class *clh_to_clp(struct hfsc_if *, uint32_t);
+
+/*
+ * macros
+ */
+#define        is_a_parent_class(cl)   ((cl)->cl_children != NULL)
+
+#define        HT_INFINITY     0xffffffffffffffffLL    /* infinite time value */
+
+int
+hfsc_pfattach(struct pf_altq *a)
+{
+       struct ifnet *ifp;
+       int s, error;
+
+       if ((ifp = ifunit(a->ifname)) == NULL || a->altq_disc == NULL)
+               return (EINVAL);
+       s = splimp();
+       error = altq_attach(&ifp->if_snd, ALTQT_HFSC, a->altq_disc,
+           hfsc_enqueue, hfsc_dequeue, hfsc_request, NULL, NULL);
+       splx(s);
+       return (error);
+}
+
+int
+hfsc_add_altq(struct pf_altq *a)
+{
+       struct hfsc_if *hif;
+       struct ifnet *ifp;
+
+       if ((ifp = ifunit(a->ifname)) == NULL)
+               return (EINVAL);
+       if (!ifq_is_ready(&ifp->if_snd))
+               return (ENODEV);
+
+       hif = malloc(sizeof(struct hfsc_if), M_ALTQ, M_WAITOK | M_ZERO);
+
+       hif->hif_eligible = ellist_alloc();
+       hif->hif_ifq = &ifp->if_snd;
+
+       /* keep the state in pf_altq */
+       a->altq_disc = hif;
+
+       return (0);
+}
+
+int
+hfsc_remove_altq(struct pf_altq *a)
+{
+       struct hfsc_if *hif;
+
+       if ((hif = a->altq_disc) == NULL)
+               return (EINVAL);
+       a->altq_disc = NULL;
+
+       hfsc_clear_interface(hif);
+       hfsc_class_destroy(hif->hif_rootclass);
+
+       ellist_destroy(hif->hif_eligible);
+
+       free(hif, M_ALTQ);
+
+       return (0);
+}
+
+int
+hfsc_add_queue(struct pf_altq *a)
+{
+       struct hfsc_if *hif;
+       struct hfsc_class *cl, *parent;
+       struct hfsc_opts *opts;
+       struct service_curve rtsc, lssc, ulsc;
+
+       if ((hif = a->altq_disc) == NULL)
+               return (EINVAL);
+
+       opts = &a->pq_u.hfsc_opts;
+
+       if (a->parent_qid == HFSC_NULLCLASS_HANDLE && hif->hif_rootclass == NULL)
+               parent = NULL;
+       else if ((parent = clh_to_clp(hif, a->parent_qid)) == NULL)
+               return (EINVAL);
+
+       if (a->qid == 0)
+               return (EINVAL);
+
+       if (clh_to_clp(hif, a->qid) != NULL)
+               return (EBUSY);
+
+       rtsc.m1 = opts->rtsc_m1;
+       rtsc.d  = opts->rtsc_d;
+       rtsc.m2 = opts->rtsc_m2;
+       lssc.m1 = opts->lssc_m1;
+       lssc.d  = opts->lssc_d;
+       lssc.m2 = opts->lssc_m2;
+       ulsc.m1 = opts->ulsc_m1;
+       ulsc.d  = opts->ulsc_d;
+       ulsc.m2 = opts->ulsc_m2;
+
+       cl = hfsc_class_create(hif, &rtsc, &lssc, &ulsc, parent, a->qlimit,
+                              opts->flags, a->qid);
+       if (cl == NULL)
+               return (ENOMEM);
+
+       return (0);
+}
+
+int
+hfsc_remove_queue(struct pf_altq *a)
+{
+       struct hfsc_if *hif;
+       struct hfsc_class *cl;
+
+       if ((hif = a->altq_disc) == NULL)
+               return (EINVAL);
+
+       if ((cl = clh_to_clp(hif, a->qid)) == NULL)
+               return (EINVAL);
+
+       return (hfsc_class_destroy(cl));
+}
+
+int
+hfsc_getqstats(struct pf_altq *a, void *ubuf, int *nbytes)
+{
+       struct hfsc_if *hif;
+       struct hfsc_class *cl;
+       struct hfsc_classstats stats;
+       int error = 0;
+
+       if ((hif = altq_lookup(a->ifname, ALTQT_HFSC)) == NULL)
+               return (EBADF);
+
+       if ((cl = clh_to_clp(hif, a->qid)) == NULL)
+               return (EINVAL);
+
+       if (*nbytes < sizeof(stats))
+               return (EINVAL);
+
+       get_class_stats(&stats, cl);
+
+       if ((error = copyout((caddr_t)&stats, ubuf, sizeof(stats))) != 0)
+               return (error);
+       *nbytes = sizeof(stats);
+       return (0);
+}
+
+/*
+ * bring the interface back to the initial state by discarding
+ * all the filters and classes except the root class.
+ */
+static int
+hfsc_clear_interface(struct hfsc_if *hif)
+{
+       struct hfsc_class *cl;
+
+       if (hif->hif_rootclass == NULL)
+               return (0);
+
+
+       /* clear out the classes */
+       while ((cl = hif->hif_rootclass->cl_children) != NULL) {
+               /*
+                * remove the first leaf class found in the hierarchy
+                * then start over
+                */
+               for (; cl != NULL; cl = hfsc_nextclass(cl)) {
+                       if (!is_a_parent_class(cl)) {
+                               hfsc_class_destroy(cl);
+                               break;
+                       }
+               }
+       }
+
+       return (0);
+}
+
+static int
+hfsc_request(struct ifaltq *ifq, int req, void *arg)
+{
+       struct hfsc_if *hif = (struct hfsc_if *)ifq->altq_disc;
+
+       switch (req) {
+       case ALTRQ_PURGE:
+               hfsc_purge(hif);
+               break;
+       }
+       return (0);
+}
+
+/* discard all the queued packets on the interface */
+static void
+hfsc_purge(struct hfsc_if *hif)
+{
+       struct hfsc_class *cl;
+
+       for (cl = hif->hif_rootclass; cl != NULL; cl = hfsc_nextclass(cl)) {
+               if (!qempty(cl->cl_q))
+                       hfsc_purgeq(cl);
+       }
+       if (ifq_is_enabled(hif->hif_ifq))
+               hif->hif_ifq->ifq_len = 0;
+}
+
+struct hfsc_class *
+hfsc_class_create(struct hfsc_if *hif, struct service_curve *rsc,
+                 struct service_curve *fsc, struct service_curve *usc,
+                 struct hfsc_class *parent, int qlimit, int flags, int qid)
+{
+       struct hfsc_class *cl, *p;
+       int i, s;
+
+       if (hif->hif_classes >= HFSC_MAX_CLASSES)
+               return (NULL);
+
+#ifndef ALTQ_RED
+       if (flags & HFCF_RED) {
+#ifdef ALTQ_DEBUG
+               printf("hfsc_class_create: RED not configured for HFSC!\n");
+#endif
+               return (NULL);
+       }
+#endif
+
+       cl = malloc(sizeof(*cl), M_ALTQ, M_WAITOK | M_ZERO);
+       cl->cl_q = malloc(sizeof(*cl->cl_q), M_ALTQ, M_WAITOK | M_ZERO);
+       cl->cl_actc = actlist_alloc();
+
+       if (qlimit == 0)
+               qlimit = 50;  /* use default */
+       qlimit(cl->cl_q) = qlimit;
+       qtype(cl->cl_q) = Q_DROPTAIL;
+       qlen(cl->cl_q) = 0;
+       cl->cl_flags = flags;
+#ifdef ALTQ_RED
+       if (flags & (HFCF_RED|HFCF_RIO)) {
+               int red_flags, red_pkttime;
+               u_int m2;
+
+               m2 = 0;
+               if (rsc != NULL && rsc->m2 > m2)
+                       m2 = rsc->m2;
+               if (fsc != NULL && fsc->m2 > m2)
+                       m2 = fsc->m2;
+               if (usc != NULL && usc->m2 > m2)
+                       m2 = usc->m2;
+
+               red_flags = 0;
+               if (flags & HFCF_ECN)
+                       red_flags |= REDF_ECN;
+#ifdef ALTQ_RIO
+               if (flags & HFCF_CLEARDSCP)
+                       red_flags |= RIOF_CLEARDSCP;
+#endif
+               if (m2 < 8)
+                       red_pkttime = 1000 * 1000 * 1000; /* 1 sec */
+               else
+                       red_pkttime = (int64_t)hif->hif_ifq->altq_ifp->if_mtu
+                               * 1000 * 1000 * 1000 / (m2 / 8);
+               if (flags & HFCF_RED) {
+                       cl->cl_red = red_alloc(0, 0,
+                           qlimit(cl->cl_q) * 10/100,
+                           qlimit(cl->cl_q) * 30/100,
+                           red_flags, red_pkttime);
+                       if (cl->cl_red != NULL)
+                               qtype(cl->cl_q) = Q_RED;
+               }
+#ifdef ALTQ_RIO
+               else {
+                       cl->cl_red = (red_t *)rio_alloc(0, NULL,
+                           red_flags, red_pkttime);
+                       if (cl->cl_red != NULL)
+                               qtype(cl->cl_q) = Q_RIO;
+               }
+#endif
+       }
+#endif /* ALTQ_RED */
+
+       if (rsc != NULL && (rsc->m1 != 0 || rsc->m2 != 0)) {
+               cl->cl_rsc = malloc(sizeof(*cl->cl_rsc), M_ALTQ, M_WAITOK);
+               sc2isc(rsc, cl->cl_rsc);
+               rtsc_init(&cl->cl_deadline, cl->cl_rsc, 0, 0);
+               rtsc_init(&cl->cl_eligible, cl->cl_rsc, 0, 0);
+       }
+       if (fsc != NULL && (fsc->m1 != 0 || fsc->m2 != 0)) {
+               cl->cl_fsc = malloc(sizeof(*cl->cl_fsc), M_ALTQ, M_WAITOK);
+               if (cl->cl_fsc == NULL)
+                       goto err_ret;
+               sc2isc(fsc, cl->cl_fsc);
+               rtsc_init(&cl->cl_virtual, cl->cl_fsc, 0, 0);
+       }
+       if (usc != NULL && (usc->m1 != 0 || usc->m2 != 0)) {
+               cl->cl_usc = malloc(sizeof(*cl->cl_usc), M_ALTQ, M_WAITOK);
+               if (cl->cl_usc == NULL)
+                       goto err_ret;
+               sc2isc(usc, cl->cl_usc);
+               rtsc_init(&cl->cl_ulimit, cl->cl_usc, 0, 0);
+       }
+
+       cl->cl_id = hif->hif_classid++;
+       cl->cl_handle = qid;
+       cl->cl_hif = hif;
+       cl->cl_parent = parent;
+
+       s = splimp();
+       hif->hif_classes++;
+
+       /*
+        * find a free slot in the class table.  if the slot matching
+        * the lower bits of qid is free, use this slot.  otherwise,
+        * use the first free slot.
+        */
+       i = qid % HFSC_MAX_CLASSES;
+       if (hif->hif_class_tbl[i] == NULL)
+               hif->hif_class_tbl[i] = cl;
+       else {
+               for (i = 0; i < HFSC_MAX_CLASSES; i++) {
+                       if (hif->hif_class_tbl[i] == NULL) {
+                               hif->hif_class_tbl[i] = cl;
+                               break;
+                       }
+               }
+               if (i == HFSC_MAX_CLASSES) {
+                       splx(s);
+                       goto err_ret;
+               }
+       }
+
+       if (flags & HFCF_DEFAULTCLASS)
+               hif->hif_defaultclass = cl;
+
+       if (parent == NULL) {
+               /* this is root class */
+               hif->hif_rootclass = cl;
+       } else if (parent->cl_children == NULL) {
+               /* add this class to the children list of the parent */
+               parent->cl_children = cl;
+       } else {
+               p = parent->cl_children;
+               while (p->cl_siblings != NULL)
+                       p = p->cl_siblings;
+               p->cl_siblings = cl;
+       }
+       splx(s);
+
+       return (cl);
+
+ err_ret:
+       if (cl->cl_actc != NULL)
+               actlist_destroy(cl->cl_actc);
+       if (cl->cl_red != NULL) {
+#ifdef ALTQ_RIO
+               if (q_is_rio(cl->cl_q))
+                       rio_destroy((rio_t *)cl->cl_red);
+#endif
+#ifdef ALTQ_RED
+               if (q_is_red(cl->cl_q))
+                       red_destroy(cl->cl_red);
+#endif
+       }
+       if (cl->cl_fsc != NULL)
+               free(cl->cl_fsc, M_ALTQ);
+       if (cl->cl_rsc != NULL)
+               free(cl->cl_rsc, M_ALTQ);
+       if (cl->cl_usc != NULL)
+               free(cl->cl_usc, M_ALTQ);
+       if (cl->cl_q != NULL)
+               free(cl->cl_q, M_ALTQ);
+       free(cl, M_ALTQ);
+       return (NULL);
+}
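+
+/*
+ * usage sketch (the qids and service-curve values below are
+ * illustrative, not from this import): building a root with two
+ * leaves on an attached hfsc_if *hif, with the parameter order
+ * (hif, rsc, fsc, usc, parent, qlimit, flags, qid):
+ *
+ *	struct service_curve rt = { 2000000, 50, 1000000 };
+ *	struct service_curve ls = { 0, 0, 5000000 };
+ *	struct hfsc_class *root, *a, *b;
+ *
+ *	root = hfsc_class_create(hif, NULL, &ls, NULL, NULL, 0, 0, 1);
+ *	a = hfsc_class_create(hif, &rt, &ls, NULL, root, 0,
+ *	    HFCF_DEFAULTCLASS, 2);
+ *	b = hfsc_class_create(hif, NULL, &ls, NULL, root, 0, 0, 3);
+ */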
+
+static int
+hfsc_class_destroy(struct hfsc_class *cl)
+{
+       int i, s;
+
+       if (cl == NULL)
+               return (0);
+
+       if (is_a_parent_class(cl))
+               return (EBUSY);
+
+       s = splimp();
+
+       if (!qempty(cl->cl_q))
+               hfsc_purgeq(cl);
+
+       if (cl->cl_parent == NULL) {
+               /* this is root class */
+       } else {
+               struct hfsc_class *p = cl->cl_parent->cl_children;
+
+               if (p == cl) {
+                       cl->cl_parent->cl_children = cl->cl_siblings;
+               } else {
+                       do {
+                               if (p->cl_siblings == cl) {
+                                       p->cl_siblings = cl->cl_siblings;
+                                       break;
+                               }
+                       } while ((p = p->cl_siblings) != NULL);
+               }
+               KKASSERT(p != NULL);
+       }
+
+       for (i = 0; i < HFSC_MAX_CLASSES; i++) {
+               if (cl->cl_hif->hif_class_tbl[i] == cl) {
+                       cl->cl_hif->hif_class_tbl[i] = NULL;
+                       break;
+               }
+       }
+
+       cl->cl_hif->hif_classes--;
+       splx(s);
+
+       actlist_destroy(cl->cl_actc);
+
+       if (cl->cl_red != NULL) {
+#ifdef ALTQ_RIO
+               if (q_is_rio(cl->cl_q))
+                       rio_destroy((rio_t *)cl->cl_red);
+#endif
+#ifdef ALTQ_RED
+               if (q_is_red(cl->cl_q))
+                       red_destroy(cl->cl_red);
+#endif
+       }
+
+       if (cl == cl->cl_hif->hif_rootclass)
+               cl->cl_hif->hif_rootclass = NULL;
+       if (cl == cl->cl_hif->hif_defaultclass)
+               cl->cl_hif->hif_defaultclass = NULL;
+
+       if (cl->cl_usc != NULL)
+               free(cl->cl_usc, M_ALTQ);
+       if (cl->cl_fsc != NULL)
+               free(cl->cl_fsc, M_ALTQ);
+       if (cl->cl_rsc != NULL)
+               free(cl->cl_rsc, M_ALTQ);
+       free(cl->cl_q, M_ALTQ);
+       free(cl, M_ALTQ);
+
+       return (0);
+}
+
+/*
+ * hfsc_nextclass returns the next class in the tree.
+ *   usage:
+ *     for (cl = hif->hif_rootclass; cl != NULL; cl = hfsc_nextclass(cl))
+ *             do_something;
+ */
+static struct hfsc_class *
+hfsc_nextclass(struct hfsc_class *cl)
+{
+       if (cl->cl_children != NULL) {
+               cl = cl->cl_children;
+       } else if (cl->cl_siblings != NULL) {
+               cl = cl->cl_siblings;
+       } else {
+               while ((cl = cl->cl_parent) != NULL) {
+                       if (cl->cl_siblings != NULL) {
+                               cl = cl->cl_siblings;
+                               break;
+                       }
+               }
+       }
+
+       return (cl);
+}
+
+/*
+ * hfsc_enqueue is an enqueue function to be registered to
+ * (*altq_enqueue) in struct ifaltq.
+ */
+static int
+hfsc_enqueue(struct ifaltq *ifq, struct mbuf *m, struct altq_pktattr *pktattr)
+{
+       struct hfsc_if  *hif = (struct hfsc_if *)ifq->altq_disc;
+       struct hfsc_class *cl;
+       int len;
+
+       /* grab class set by classifier */
+       if ((m->m_flags & M_PKTHDR) == 0) {
+               /* should not happen */
+               if_printf(ifq->altq_ifp, "altq: packet does not have pkthdr\n");
+               m_freem(m);
+               return (ENOBUFS);
+       }
+       if (m->m_pkthdr.fw_flags & ALTQ_MBUF_TAGGED)
+               cl = clh_to_clp(hif, m->m_pkthdr.altq_qid);
+       else
+               cl = NULL;
+       if (cl == NULL || is_a_parent_class(cl)) {
+               cl = hif->hif_defaultclass;
+               if (cl == NULL) {
+                       m_freem(m);
+                       return (ENOBUFS);
+               }
+       }
+       cl->cl_pktattr = NULL;
+       len = m_pktlen(m);
+       if (hfsc_addq(cl, m) != 0) {
+               /* drop occurred.  mbuf was freed in hfsc_addq. */
+               PKTCNTR_ADD(&cl->cl_stats.drop_cnt, len);
+               return (ENOBUFS);
+       }
+       ifq->ifq_len++;
+       cl->cl_hif->hif_packets++;
+
+       /* successfully queued. */
+       if (qlen(cl->cl_q) == 1)
+               set_active(cl, m_pktlen(m));
+
+       return (0);
+}
+
+/*
+ * hfsc_dequeue is a dequeue function to be registered to
+ * (*altq_dequeue) in struct ifaltq.
+ *
+ * note: ALTDQ_POLL returns the next packet without removing the packet
+ *     from the queue.  ALTDQ_REMOVE is a normal dequeue operation.
+ *     ALTDQ_REMOVE must return the same packet if called immediately
+ *     after ALTDQ_POLL.
+ */
+static struct mbuf *
+hfsc_dequeue(struct ifaltq *ifq, int op)
+{
+       struct hfsc_if  *hif = (struct hfsc_if *)ifq->altq_disc;
+       struct hfsc_class *cl;
+       struct mbuf *m;
+       int len, next_len;
+       int realtime = 0;
+       uint64_t cur_time;
+
+       if (hif->hif_packets == 0) {
+               /* no packet in the tree */
+               return (NULL);
+       }
+
+       cur_time = read_machclk();
+
+       if (op == ALTDQ_REMOVE && hif->hif_pollcache != NULL) {
+               cl = hif->hif_pollcache;
+               hif->hif_pollcache = NULL;
+               /* check if the class was scheduled by real-time criteria */
+               if (cl->cl_rsc != NULL)
+                       realtime = (cl->cl_e <= cur_time);
+       } else {
+               /*
+                * if there are eligible classes, use real-time criteria.
+                * find the class with the minimum deadline among
+                * the eligible classes.
+                */
+               if ((cl = ellist_get_mindl(hif->hif_eligible, cur_time)) != NULL) {
+                       realtime = 1;
+               } else {
+#ifdef ALTQ_DEBUG
+                       int fits = 0;
+#endif
+                       /*
+                        * use link-sharing criteria
+                        * get the class with the minimum vt in the hierarchy
+                        */
+                       cl = hif->hif_rootclass;
+                       while (is_a_parent_class(cl)) {
+                               cl = actlist_firstfit(cl, cur_time);
+                               if (cl == NULL) {
+#ifdef ALTQ_DEBUG
+                                       if (fits > 0)
+                                               printf("%d fit but none found\n", fits);
+#endif
+                                       return (NULL);
+                               }
+                               /*
+                                * update parent's cl_cvtmin.
+                                * don't update if the new vt is smaller.
+                                */
+                               if (cl->cl_parent->cl_cvtmin < cl->cl_vt)
+                                       cl->cl_parent->cl_cvtmin = cl->cl_vt;
+#ifdef ALTQ_DEBUG
+                               fits++;
+#endif
+                       }
+               }
+
+               if (op == ALTDQ_POLL) {
+                       hif->hif_pollcache = cl;
+                       m = hfsc_pollq(cl);
+                       return (m);
+               }
+       }
+
+       m = hfsc_getq(cl);
+       if (m == NULL)
+               panic("hfsc_dequeue:");
+       len = m_pktlen(m);
+       cl->cl_hif->hif_packets--;
+       ifq->ifq_len--;
+       PKTCNTR_ADD(&cl->cl_stats.xmit_cnt, len);
+
+       update_vf(cl, len, cur_time);
+       if (realtime)
+               cl->cl_cumul += len;
+
+       if (!qempty(cl->cl_q)) {
+               if (cl->cl_rsc != NULL) {
+                       /* update ed */
+                       next_len = m_pktlen(qhead(cl->cl_q));
+
+                       if (realtime)
+                               update_ed(cl, next_len);
+                       else
+                               update_d(cl, next_len);
+               }
+       } else {
+               /* the class becomes passive */
+               set_passive(cl);
+       }
+
+       return (m);
+}
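+
+/*
+ * a minimal sketch of the poll/remove contract above, as a driver
+ * might exercise it (driver_has_room() is hypothetical):
+ *
+ *	m = hfsc_dequeue(ifq, ALTDQ_POLL);
+ *	if (m != NULL && driver_has_room(m))
+ *		m = hfsc_dequeue(ifq, ALTDQ_REMOVE);
+ *
+ * the REMOVE call picks the class up from hif_pollcache, so the
+ * schedule walk is not repeated and the same packet is returned.
+ */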
+
+static int
+hfsc_addq(struct hfsc_class *cl, struct mbuf *m)
+{
+#ifdef ALTQ_RIO
+       if (q_is_rio(cl->cl_q))
+               return rio_addq((rio_t *)cl->cl_red, cl->cl_q,
+                               m, cl->cl_pktattr);
+#endif
+#ifdef ALTQ_RED
+       if (q_is_red(cl->cl_q))
+               return red_addq(cl->cl_red, cl->cl_q, m, cl->cl_pktattr);
+#endif
+       if (qlen(cl->cl_q) >= qlimit(cl->cl_q)) {
+               m_freem(m);
+               return (-1);
+       }
+
+       if (cl->cl_flags & HFCF_CLEARDSCP)
+               write_dsfield(m, cl->cl_pktattr, 0);
+
+       _addq(cl->cl_q, m);
+
+       return (0);
+}
+
+static struct mbuf *
+hfsc_getq(struct hfsc_class *cl)
+{
+#ifdef ALTQ_RIO
+       if (q_is_rio(cl->cl_q))
+               return rio_getq((rio_t *)cl->cl_red, cl->cl_q);
+#endif
+#ifdef ALTQ_RED
+       if (q_is_red(cl->cl_q))
+               return red_getq(cl->cl_red, cl->cl_q);
+#endif
+       return _getq(cl->cl_q);
+}
+
+static struct mbuf *
+hfsc_pollq(struct hfsc_class *cl)
+{
+       return qhead(cl->cl_q);
+}
+
+static void
+hfsc_purgeq(struct hfsc_class *cl)
+{
+       struct mbuf *m;
+
+       if (qempty(cl->cl_q))
+               return;
+
+       while ((m = _getq(cl->cl_q)) != NULL) {
+               PKTCNTR_ADD(&cl->cl_stats.drop_cnt, m_pktlen(m));
+               m_freem(m);
+               cl->cl_hif->hif_packets--;
+               cl->cl_hif->hif_ifq->ifq_len--;
+       }
+       KKASSERT(qlen(cl->cl_q) == 0);
+
+       update_vf(cl, 0, 0);    /* remove cl from the actlist */
+       set_passive(cl);
+}
+
+static void
+set_active(struct hfsc_class *cl, int len)
+{
+       if (cl->cl_rsc != NULL)
+               init_ed(cl, len);
+       if (cl->cl_fsc != NULL)
+               init_vf(cl, len);
+
+       cl->cl_stats.period++;
+}
+
+static void
+set_passive(struct hfsc_class *cl)
+{
+       if (cl->cl_rsc != NULL)
+               ellist_remove(cl);
+
+       /*
+        * the actlist is now handled in update_vf(), so update_vf(cl, 0, 0)
+        * must be called explicitly to remove a class from the actlist
+        */
+}
+
+static void
+init_ed(struct hfsc_class *cl, int next_len)
+{
+       uint64_t cur_time;
+
+       cur_time = read_machclk();
+
+       /* update the deadline curve */
+       rtsc_min(&cl->cl_deadline, cl->cl_rsc, cur_time, cl->cl_cumul);
+
+       /*
+        * update the eligible curve.
+        * for concave, it is equal to the deadline curve.
+        * for convex, it is a linear curve with slope m2.
+        */
+       cl->cl_eligible = cl->cl_deadline;
+       if (cl->cl_rsc->sm1 <= cl->cl_rsc->sm2) {
+               cl->cl_eligible.dx = 0;
+               cl->cl_eligible.dy = 0;
+       }
+
+       /* compute e and d */
+       cl->cl_e = rtsc_y2x(&cl->cl_eligible, cl->cl_cumul);
+       cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len);
+
+       ellist_insert(cl);
+}
+
+static void
+update_ed(struct hfsc_class *cl, int next_len)
+{
+       cl->cl_e = rtsc_y2x(&cl->cl_eligible, cl->cl_cumul);
+       cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len);
+
+       ellist_update(cl);
+}
+
+static void
+update_d(struct hfsc_class *cl, int next_len)
+{
+       cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len);
+}
+
+static void
+init_vf(struct hfsc_class *cl, int len)
+{
+       struct hfsc_class *max_cl, *p;
+       uint64_t vt, f, cur_time;
+       int go_active;
+
+       cur_time = 0;
+       go_active = 1;
+       for ( ; cl->cl_parent != NULL; cl = cl->cl_parent) {
+               if (go_active && cl->cl_nactive++ == 0)
+                       go_active = 1;
+               else
+                       go_active = 0;
+
+               if (go_active) {
+                       max_cl = actlist_last(cl->cl_parent->cl_actc);
+                       if (max_cl != NULL) {
+                               /*
+                                * set vt to the average of the min and max
+                                * classes.  if the parent's period didn't
+                                * change, don't decrease vt of the class.
+                                */
+                               vt = max_cl->cl_vt;
+                               if (cl->cl_parent->cl_cvtmin != 0)
+                                       vt = (cl->cl_parent->cl_cvtmin + vt)/2;
+
+                               if (cl->cl_parent->cl_vtperiod !=
+                                   cl->cl_parentperiod || vt > cl->cl_vt)
+                                       cl->cl_vt = vt;
+                       } else {
+                               /*
+                                * first child for a new parent backlog period.
+                                * add parent's cvtmax to vtoff of children
+                                * to make a new vt (vtoff + vt) larger than
+                                * the vt in the last period for all children.
+                                */
+                               vt = cl->cl_parent->cl_cvtmax;
+                               for (p = cl->cl_parent->cl_children; p != NULL;
+                                    p = p->cl_siblings)
+                                       p->cl_vtoff += vt;
+                               cl->cl_vt = 0;
+                               cl->cl_parent->cl_cvtmax = 0;
+                               cl->cl_parent->cl_cvtmin = 0;
+                       }
+                       cl->cl_initvt = cl->cl_vt;
+
+                       /* update the virtual curve */
+                       vt = cl->cl_vt + cl->cl_vtoff;
+                       rtsc_min(&cl->cl_virtual, cl->cl_fsc, vt, cl->cl_total);
+                       if (cl->cl_virtual.x == vt) {
+                               cl->cl_virtual.x -= cl->cl_vtoff;
+                               cl->cl_vtoff = 0;
+                       }
+                       cl->cl_vtadj = 0;
+
+                       cl->cl_vtperiod++;  /* increment vt period */
+                       cl->cl_parentperiod = cl->cl_parent->cl_vtperiod;
+                       if (cl->cl_parent->cl_nactive == 0)
+                               cl->cl_parentperiod++;
+                       cl->cl_f = 0;
+
+                       actlist_insert(cl);
+
+                       if (cl->cl_usc != NULL) {
+                               /* class has upper limit curve */
+                               if (cur_time == 0)
+                                       cur_time = read_machclk();
+
+                               /* update the ulimit curve */
+                               rtsc_min(&cl->cl_ulimit, cl->cl_usc, cur_time,
+                                   cl->cl_total);
+                               /* compute myf */
+                               cl->cl_myf = rtsc_y2x(&cl->cl_ulimit,
+                                   cl->cl_total);
+                               cl->cl_myfadj = 0;
+                       }
+               }
+
+               if (cl->cl_myf > cl->cl_cfmin)
+                       f = cl->cl_myf;
+               else
+                       f = cl->cl_cfmin;
+               if (f != cl->cl_f) {
+                       cl->cl_f = f;
+                       update_cfmin(cl->cl_parent);
+               }
+       }
+}
+
+static void
+update_vf(struct hfsc_class *cl, int len, uint64_t cur_time)
+{
+       uint64_t f, myf_bound, delta;
+       int go_passive;
+
+       go_passive = qempty(cl->cl_q);
+
+       for (; cl->cl_parent != NULL; cl = cl->cl_parent) {
+               cl->cl_total += len;
+
+               if (cl->cl_fsc == NULL || cl->cl_nactive == 0)
+                       continue;
+
+               if (go_passive && --cl->cl_nactive == 0)
+                       go_passive = 1;
+               else
+                       go_passive = 0;
+
+               if (go_passive) {
+                       /* no more active child, going passive */
+
+                       /* update cvtmax of the parent class */
+                       if (cl->cl_vt > cl->cl_parent->cl_cvtmax)
+                               cl->cl_parent->cl_cvtmax = cl->cl_vt;
+
+                       /* remove this class from the vt list */
+                       actlist_remove(cl);
+
+                       update_cfmin(cl->cl_parent);
+
+                       continue;
+               }
+
+               /*
+                * update vt and f
+                */
+               cl->cl_vt = rtsc_y2x(&cl->cl_virtual, cl->cl_total)
+                   - cl->cl_vtoff + cl->cl_vtadj;
+
+               /*
+                * if vt of the class is smaller than cvtmin,
+                * the class was skipped in the past due to non-fit.
+                * if so, we need to adjust vtadj.
+                */
+               if (cl->cl_vt < cl->cl_parent->cl_cvtmin) {
+                       cl->cl_vtadj += cl->cl_parent->cl_cvtmin - cl->cl_vt;
+                       cl->cl_vt = cl->cl_parent->cl_cvtmin;
+               }
+
+               /* update the vt list */
+               actlist_update(cl);
+
+               if (cl->cl_usc != NULL) {
+                       cl->cl_myf = cl->cl_myfadj
+                           + rtsc_y2x(&cl->cl_ulimit, cl->cl_total);
+
+                       /*
+                        * if myf lags behind by more than one clock tick
+                        * from the current time, adjust myfadj to prevent
+                        * a rate-limited class from going greedy.
+                        * in a steady state under rate-limiting, myf
+                        * fluctuates within one clock tick.
+                        */
+                       myf_bound = cur_time - machclk_per_tick;
+                       if (cl->cl_myf < myf_bound) {
+                               delta = cur_time - cl->cl_myf;
+                               cl->cl_myfadj += delta;
+                               cl->cl_myf += delta;
+                       }
+               }
+
+               /* cl_f is max(cl_myf, cl_cfmin) */
+               if (cl->cl_myf > cl->cl_cfmin)
+                       f = cl->cl_myf;
+               else
+                       f = cl->cl_cfmin;
+               if (f != cl->cl_f) {
+                       cl->cl_f = f;
+                       update_cfmin(cl->cl_parent);
+               }
+       }
+}
+
+static void
+update_cfmin(struct hfsc_class *cl)
+{
+       struct hfsc_class *p;
+       uint64_t cfmin;
+
+       if (TAILQ_EMPTY(cl->cl_actc)) {
+               cl->cl_cfmin = 0;
+               return;
+       }
+       cfmin = HT_INFINITY;
+       TAILQ_FOREACH(p, cl->cl_actc, cl_actlist) {
+               if (p->cl_f == 0) {
+                       cl->cl_cfmin = 0;
+                       return;
+               }
+               if (p->cl_f < cfmin)
+                       cfmin = p->cl_f;
+       }
+       cl->cl_cfmin = cfmin;
+}
+
+/*
+ * TAILQ based ellist and actlist implementation
+ * (ion wanted to make a calendar queue based implementation)
+ */
+/*
+ * the eligible list holds backlogged classes sorted by their eligible times.
+ * there is one eligible list per interface.
+ */
+
+static ellist_t *
+ellist_alloc(void)
+{
+       ellist_t *head;
+
+       head = malloc(sizeof(*head), M_ALTQ, M_WAITOK);
+       TAILQ_INIT(head);
+       return (head);
+}
+
+static void
+ellist_destroy(ellist_t *head)
+{
+       free(head, M_ALTQ);
+}
+
+static void
+ellist_insert(struct hfsc_class *cl)
+{
+       struct hfsc_if *hif = cl->cl_hif;
+       struct hfsc_class *p;
+
+       /* check the last entry first */
+       if ((p = TAILQ_LAST(hif->hif_eligible, _eligible)) == NULL ||
+           p->cl_e <= cl->cl_e) {
+               TAILQ_INSERT_TAIL(hif->hif_eligible, cl, cl_ellist);
+               return;
+       }
+
+       TAILQ_FOREACH(p, hif->hif_eligible, cl_ellist) {
+               if (cl->cl_e < p->cl_e) {
+                       TAILQ_INSERT_BEFORE(p, cl, cl_ellist);
+                       return;
+               }
+       }
+       KKASSERT(0); /* should not reach here */
+}
+
+static void
+ellist_remove(struct hfsc_class *cl)
+{
+       struct hfsc_if *hif = cl->cl_hif;
+
+       TAILQ_REMOVE(hif->hif_eligible, cl, cl_ellist);
+}
+
+static void
+ellist_update(struct hfsc_class *cl)
+{
+       struct hfsc_if *hif = cl->cl_hif;
+       struct hfsc_class *p, *last;
+
+       /*
+        * the eligible time of a class increases monotonically.
+        * if the next entry has a larger eligible time, nothing to do.
+        */
+       p = TAILQ_NEXT(cl, cl_ellist);
+       if (p == NULL || cl->cl_e <= p->cl_e)
+               return;
+
+       /* check the last entry */
+       last = TAILQ_LAST(hif->hif_eligible, _eligible);
+       KKASSERT(last != NULL);
+       if (last->cl_e <= cl->cl_e) {
+               TAILQ_REMOVE(hif->hif_eligible, cl, cl_ellist);
+               TAILQ_INSERT_TAIL(hif->hif_eligible, cl, cl_ellist);
+               return;
+       }
+
+       /*
+        * the new position must be between the next entry
+        * and the last entry
+        */
+       while ((p = TAILQ_NEXT(p, cl_ellist)) != NULL) {
+               if (cl->cl_e < p->cl_e) {
+                       TAILQ_REMOVE(hif->hif_eligible, cl, cl_ellist);
+                       TAILQ_INSERT_BEFORE(p, cl, cl_ellist);
+                       return;
+               }
+       }
+       KKASSERT(0); /* should not reach here */
+}
+
+/* find the class with the minimum deadline among the eligible classes */
+struct hfsc_class *
+ellist_get_mindl(ellist_t *head, uint64_t cur_time)
+{
+       struct hfsc_class *p, *cl = NULL;
+
+       TAILQ_FOREACH(p, head, cl_ellist) {
+               if (p->cl_e > cur_time)
+                       break;
+               if (cl == NULL || p->cl_d < cl->cl_d)
+                       cl = p;
+       }
+       return (cl);
+}
+
+/*
+ * an active children list holds backlogged child classes sorted
+ * by their virtual time.
+ * each intermediate class has one active children list.
+ */
+static actlist_t *
+actlist_alloc(void)
+{
+       actlist_t *head;
+
+       head = malloc(sizeof(*head), M_ALTQ, M_WAITOK);
+       TAILQ_INIT(head);
+       return (head);
+}
+
+static void
+actlist_destroy(actlist_t *head)
+{
+       free(head, M_ALTQ);
+}
+
+static void
+actlist_insert(struct hfsc_class *cl)
+{
+       struct hfsc_class *p;
+
+       /* check the last entry first */
+       if ((p = TAILQ_LAST(cl->cl_parent->cl_actc, _active)) == NULL
+           || p->cl_vt <= cl->cl_vt) {
+               TAILQ_INSERT_TAIL(cl->cl_parent->cl_actc, cl, cl_actlist);
+               return;
+       }
+
+       TAILQ_FOREACH(p, cl->cl_parent->cl_actc, cl_actlist) {
+               if (cl->cl_vt < p->cl_vt) {
+                       TAILQ_INSERT_BEFORE(p, cl, cl_actlist);
+                       return;
+               }
+       }
+       KKASSERT(0); /* should not reach here */
+}
+
+static void
+actlist_remove(struct hfsc_class *cl)
+{
+       TAILQ_REMOVE(cl->cl_parent->cl_actc, cl, cl_actlist);
+}
+
+static void
+actlist_update(struct hfsc_class *cl)
+{
+       struct hfsc_class *p, *last;
+
+       /*
+        * the virtual time of a class increases monotonically during its
+        * backlogged period.
+        * if the next entry has a larger virtual time, nothing to do.
+        */
+       p = TAILQ_NEXT(cl, cl_actlist);
+       if (p == NULL || cl->cl_vt < p->cl_vt)
+               return;
+
+       /* check the last entry */
+       last = TAILQ_LAST(cl->cl_parent->cl_actc, _active);
+       KKASSERT(last != NULL);
+       if (last->cl_vt <= cl->cl_vt) {
+               TAILQ_REMOVE(cl->cl_parent->cl_actc, cl, cl_actlist);
+               TAILQ_INSERT_TAIL(cl->cl_parent->cl_actc, cl, cl_actlist);
+               return;
+       }
+
+       /*
+        * the new position must be between the next entry
+        * and the last entry
+        */
+       while ((p = TAILQ_NEXT(p, cl_actlist)) != NULL) {
+               if (cl->cl_vt < p->cl_vt) {
+                       TAILQ_REMOVE(cl->cl_parent->cl_actc, cl, cl_actlist);
+                       TAILQ_INSERT_BEFORE(p, cl, cl_actlist);
+                       return;
+               }
+       }
+       KKASSERT(0); /* should not reach here */
+}
+
+static struct hfsc_class *
+actlist_firstfit(struct hfsc_class *cl, uint64_t cur_time)
+{
+       struct hfsc_class *p;
+
+       TAILQ_FOREACH(p, cl->cl_actc, cl_actlist) {
+               if (p->cl_f <= cur_time)
+                       return (p);
+       }
+       return (NULL);
+}
+
+/*
+ * service curve support functions
+ *
+ *  external service curve parameters
+ *     m: bits/sec
+ *     d: msec
+ *  internal service curve parameters
+ *     sm: (bytes/tsc_interval) << SM_SHIFT
+ *     ism: (tsc_count/byte) << ISM_SHIFT
+ *     dx: tsc_count
+ *
+ * SM_SHIFT and ISM_SHIFT are scaled in order to keep effective digits.
+ * we should be able to handle 100K-1Gbps linkspeed with 200MHz-1GHz CPU
+ * speed.  SM_SHIFT and ISM_SHIFT are selected to have at least 3 effective
+ * digits in decimal using the following table.
+ *
+ *  bits/sec    100Kbps     1Mbps     10Mbps     100Mbps    1Gbps
+ *  ----------+-------------------------------------------------------
+ *  bytes/nsec  12.5e-6    125e-6     1250e-6    12500e-6   125000e-6
+ *  sm(500MHz)  25.0e-6    250e-6     2500e-6    25000e-6   250000e-6
+ *  sm(200MHz)  62.5e-6    625e-6     6250e-6    62500e-6   625000e-6
+ *
+ *  nsec/byte   80000      8000       800        80         8
+ *  ism(500MHz) 40000      4000       400        40         4
+ *  ism(200MHz) 16000      1600       160        16         1.6
+ */
+#define        SM_SHIFT        24
+#define        ISM_SHIFT       10
+
+#define        SM_MASK         ((1LL << SM_SHIFT) - 1)
+#define        ISM_MASK        ((1LL << ISM_SHIFT) - 1)
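+
+/*
+ * worked example, assuming machclk_freq = 500MHz: for m = 10Mbps,
+ *	sm  = (10^7 << SM_SHIFT) / 8 / (5 * 10^8)
+ *	    = (10^7 * 2^24) / (4 * 10^9) = 41943
+ * i.e. 2500e-6 bytes/count scaled by 2^24, matching the table above;
+ *	ism = ((5 * 10^8) << ISM_SHIFT) * 8 / 10^7 = 409600
+ * i.e. 400 counts/byte scaled by 2^10.
+ */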
+
+static __inline uint64_t
+seg_x2y(uint64_t x, uint64_t sm)
+{
+       uint64_t y;
+
+       /*
+        * compute
+        *      y = x * sm >> SM_SHIFT
+        * but split the multiply into upper and lower bits to avoid overflow
+        */
+       y = (x >> SM_SHIFT) * sm + (((x & SM_MASK) * sm) >> SM_SHIFT);
+       return (y);
+}
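+
+/*
+ * the split works because, with xh = x >> SM_SHIFT and xl = x & SM_MASK,
+ *	x * sm >> SM_SHIFT == xh * sm + (xl * sm >> SM_SHIFT)
+ * holds exactly, and the partial products stay within 64 bits for
+ * the sm magnitudes produced by m2sm() even when the direct x * sm
+ * would overflow.
+ */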
+
+static __inline uint64_t
+seg_y2x(uint64_t y, uint64_t ism)
+{
+       uint64_t x;
+
+       if (y == 0)
+               x = 0;
+       else if (ism == HT_INFINITY)
+               x = HT_INFINITY;
+       else
+               x = (y >> ISM_SHIFT) * ism + (((y & ISM_MASK) * ism) >> ISM_SHIFT);
+
+       return (x);
+}
+
+static __inline uint64_t
+m2sm(u_int m)
+{
+       uint64_t sm;
+
+       sm = ((uint64_t)m << SM_SHIFT) / 8 / machclk_freq;
+       return (sm);
+}
+
+static __inline uint64_t
+m2ism(u_int m)
+{
+       uint64_t ism;
+
+       if (m == 0)
+               ism = HT_INFINITY;
+       else
+               ism = ((uint64_t)machclk_freq << ISM_SHIFT) * 8 / m;
+       return (ism);
+}
+
+static __inline uint64_t
+d2dx(u_int d)
+{
+       uint64_t dx;
+
+       dx = ((uint64_t)d * machclk_freq) / 1000;
+       return (dx);
+}
+
+static u_int
+sm2m(uint64_t sm)
+{
+       uint64_t m;
+
+       m = (sm * 8 * machclk_freq) >> SM_SHIFT;
+       return ((u_int)m);
+}
+
+static u_int
+dx2d(uint64_t dx)
+{
+       uint64_t d;
+
+       d = dx * 1000 / machclk_freq;
+       return ((u_int)d);
+}
+
+static void
+sc2isc(struct service_curve *sc, struct internal_sc *isc)
+{
+       isc->sm1 = m2sm(sc->m1);
+       isc->ism1 = m2ism(sc->m1);
+       isc->dx = d2dx(sc->d);
+       isc->dy = seg_x2y(isc->dx, isc->sm1);
+       isc->sm2 = m2sm(sc->m2);
+       isc->ism2 = m2ism(sc->m2);
+}
+
+/*
+ * initialize the runtime service curve with the given internal
+ * service curve starting at (x, y).
+ */
+static void
+rtsc_init(struct runtime_sc *rtsc, struct internal_sc *isc, uint64_t x, uint64_t y)
+{
+       rtsc->x = x;
+       rtsc->y = y;
+       rtsc->sm1 = isc->sm1;
+       rtsc->ism1 = isc->ism1;
+       rtsc->dx = isc->dx;
+       rtsc->dy = isc->dy;
+       rtsc->sm2 = isc->sm2;
+       rtsc->ism2 = isc->ism2;
+}
+
+/*
+ * calculate the y-projection of the runtime service curve by the
+ * given x-projection value
+ */
+static uint64_t
+rtsc_y2x(struct runtime_sc *rtsc, uint64_t y)
+{
+       uint64_t x;
+
+       if (y < rtsc->y) {
+               x = rtsc->x;
+       } else if (y <= rtsc->y + rtsc->dy) {
+               /* x belongs to the 1st segment */
+               if (rtsc->dy == 0)
+                       x = rtsc->x + rtsc->dx;
+               else
+                       x = rtsc->x + seg_y2x(y - rtsc->y, rtsc->ism1);
+       } else {
+               /* x belongs to the 2nd segment */
+               x = rtsc->x + rtsc->dx
+                   + seg_y2x(y - rtsc->y - rtsc->dy, rtsc->ism2);
+       }
+       return (x);
+}
+
+static uint64_t
+rtsc_x2y(struct runtime_sc *rtsc, uint64_t x)
+{
+       uint64_t y;
+
+       if (x <= rtsc->x) {
+               y = rtsc->y;
+       } else if (x <= rtsc->x + rtsc->dx) {
+               /* y belongs to the 1st segment */
+               y = rtsc->y + seg_x2y(x - rtsc->x, rtsc->sm1);
+       } else
+               /* y belongs to the 2nd segment */
+               y = rtsc->y + rtsc->dy
+                   + seg_x2y(x - rtsc->x - rtsc->dx, rtsc->sm2);
+       return (y);
+}
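+
+/*
+ * worked example (illustrative, rounding ignored): for a runtime
+ * curve starting at (0, 0) with first-segment slope sm1, projections
+ * (dx, dy) and second-segment slope sm2 = sm1 / 2,
+ *	rtsc_x2y(rtsc, dx)     == dy
+ *	rtsc_x2y(rtsc, 2 * dx) == dy + seg_x2y(dx, sm2) == dy * 3 / 2
+ * since the slope halves once the first segment is exhausted.
+ */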
+
+/*
+ * update the runtime service curve by taking the minimum of the current
+ * runtime service curve and the service curve starting at (x, y).
+ */
+static void
+rtsc_min(struct runtime_sc *rtsc, struct internal_sc *isc, uint64_t x, uint64_t y)
+{
+       uint64_t y1, y2, dx, dy;
+
+       if (isc->sm1 <= isc->sm2) {
+               /* service curve is convex */
+               y1 = rtsc_x2y(rtsc, x);
+               if (y1 < y)
+                       /* the current rtsc is smaller */
+                       return;
+               rtsc->x = x;
+               rtsc->y = y;
+               return;
+       }
+
+       /*
+        * service curve is concave
+        * compute the two y values of the current rtsc
+        *      y1: at x
+        *      y2: at (x + dx)
+        */
+       y1 = rtsc_x2y(rtsc, x);
+       if (y1 <= y) {
+               /* rtsc is below isc, no change to rtsc */
+               return;
+       }
+
+       y2 = rtsc_x2y(rtsc, x + isc->dx);
+       if (y2 >= y + isc->dy) {
+               /* rtsc is above isc, replace rtsc by isc */
+               rtsc->x = x;
+               rtsc->y = y;
+               rtsc->dx = isc->dx;
+               rtsc->dy = isc->dy;
+               return;
+       }
+
+       /*
+        * the two curves intersect.
+        * compute the offsets (dx, dy) using the reverse
+        * function of seg_x2y():
+        *      seg_x2y(dx, sm1) == seg_x2y(dx, sm2) + (y1 - y)
+        * i.e. dx * (sm1 - sm2) >> SM_SHIFT == y1 - y, which gives
+        * the expression for dx below
+        */
+       dx = ((y1 - y) << SM_SHIFT) / (isc->sm1 - isc->sm2);
+       /*
+        * check if (x, y1) belongs to the 1st segment of rtsc.
+        * if so, add the offset.
+        */
+       if (rtsc->x + rtsc->dx > x)
+               dx += rtsc->x + rtsc->dx - x;
+       dy = seg_x2y(dx, isc->sm1);
+
+       rtsc->x = x;
+       rtsc->y = y;
+       rtsc->dx = dx;
+       rtsc->dy = dy;
+}
+
+static void
+get_class_stats(struct hfsc_classstats *sp, struct hfsc_class *cl)
+{
+       sp->class_id = cl->cl_id;
+       sp->class_handle = cl->cl_handle;
+
+       if (cl->cl_rsc != NULL) {
+               sp->rsc.m1 = sm2m(cl->cl_rsc->sm1);
+               sp->rsc.d = dx2d(cl->cl_rsc->dx);
+               sp->rsc.m2 = sm2m(cl->cl_rsc->sm2);
+       } else {
+               sp->rsc.m1 = 0;
+               sp->rsc.d = 0;
+               sp->rsc.m2 = 0;
+       }
+       if (cl->cl_fsc != NULL) {
+               sp->fsc.m1 = sm2m(cl->cl_fsc->sm1);
+               sp->fsc.d = dx2d(cl->cl_fsc->dx);
+               sp->fsc.m2 = sm2m(cl->cl_fsc->sm2);
+       } else {
+               sp->fsc.m1 = 0;
+               sp->fsc.d = 0;
+               sp->fsc.m2 = 0;
+       }
+       if (cl->cl_usc != NULL) {
+               sp->usc.m1 = sm2m(cl->cl_usc->sm1);
+               sp->usc.d = dx2d(cl->cl_usc->dx);
+               sp->usc.m2 = sm2m(cl->cl_usc->sm2);
+       } else {
+               sp->usc.m1 = 0;
+               sp->usc.d = 0;
+               sp->usc.m2 = 0;
+       }
+
+       sp->total = cl->cl_total;
+       sp->cumul = cl->cl_cumul;
+
+       sp->d = cl->cl_d;
+       sp->e = cl->cl_e;
+       sp->vt = cl->cl_vt;
+       sp->f = cl->cl_f;
+
+       sp->initvt = cl->cl_initvt;
+       sp->vtperiod = cl->cl_vtperiod;
+       sp->parentperiod = cl->cl_parentperiod;
+       sp->nactive = cl->cl_nactive;
+       sp->vtoff = cl->cl_vtoff;
+       sp->cvtmax = cl->cl_cvtmax;
+       sp->myf = cl->cl_myf;
+       sp->cfmin = cl->cl_cfmin;
+       sp->cvtmin = cl->cl_cvtmin;
+       sp->myfadj = cl->cl_myfadj;
+       sp->vtadj = cl->cl_vtadj;
+
+       sp->cur_time = read_machclk();
+       sp->machclk_freq = machclk_freq;
+
+       sp->qlength = qlen(cl->cl_q);
+       sp->qlimit = qlimit(cl->cl_q);
+       sp->xmit_cnt = cl->cl_stats.xmit_cnt;
+       sp->drop_cnt = cl->cl_stats.drop_cnt;
+       sp->period = cl->cl_stats.period;
+
+       sp->qtype = qtype(cl->cl_q);
+#ifdef ALTQ_RED
+       if (q_is_red(cl->cl_q))
+               red_getstats(cl->cl_red, &sp->red[0]);
+#endif
+#ifdef ALTQ_RIO
+       if (q_is_rio(cl->cl_q))
+               rio_getstats((rio_t *)cl->cl_red, &sp->red[0]);
+#endif
+}
+
+/* convert a class handle to the corresponding class pointer */
+static struct hfsc_class *
+clh_to_clp(struct hfsc_if *hif, uint32_t chandle)
+{
+       int i;
+       struct hfsc_class *cl;
+
+       if (chandle == 0)
+               return (NULL);
+       /*
+        * first, optimistically try the slot matching the lower bits of
+        * the handle.  if that fails, fall back to the linear table search.
+        */
+       i = chandle % HFSC_MAX_CLASSES;
+       if ((cl = hif->hif_class_tbl[i]) != NULL && cl->cl_handle == chandle)
+               return (cl);
+       for (i = 0; i < HFSC_MAX_CLASSES; i++)
+               if ((cl = hif->hif_class_tbl[i]) != NULL &&
+                   cl->cl_handle == chandle)
+                       return (cl);
+       return (NULL);
+}
+
+#endif /* ALTQ_HFSC */
diff --git a/sys/net/altq/altq_hfsc.h b/sys/net/altq/altq_hfsc.h
new file mode 100644 (file)
index 0000000..256ae60
--- /dev/null
@@ -0,0 +1,248 @@
+/*     $KAME: altq_hfsc.h,v 1.12 2003/12/05 05:40:46 kjc Exp $ */
+/*     $DragonFly: src/sys/net/altq/altq_hfsc.h,v 1.1 2005/02/11 22:25:57 joerg Exp $ */
+
+/*
+ * Copyright (c) 1997-1999 Carnegie Mellon University. All Rights Reserved.
+ *
+ * Permission to use, copy, modify, and distribute this software and
+ * its documentation is hereby granted (including for commercial or
+ * for-profit use), provided that both the copyright notice and this
+ * permission notice appear in all copies of the software, derivative
+ * works, or modified versions, and any portions thereof.
+ *
+ * THIS SOFTWARE IS EXPERIMENTAL AND IS KNOWN TO HAVE BUGS, SOME OF
+ * WHICH MAY HAVE SERIOUS CONSEQUENCES.  CARNEGIE MELLON PROVIDES THIS
+ * SOFTWARE IN ITS ``AS IS'' CONDITION, AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+ * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ * Carnegie Mellon encourages (but does not require) users of this
+ * software to return any improvements or extensions that they make,
+ * and to grant Carnegie Mellon the rights to redistribute these
+ * changes without encumbrance.
+ */
+#ifndef _ALTQ_ALTQ_HFSC_H_
+#define        _ALTQ_ALTQ_HFSC_H_
+
+#include <net/altq/altq.h>
+#include <net/altq/altq_classq.h>
+#include <net/altq/altq_red.h>
+#include <net/altq/altq_rio.h>
+
+struct service_curve {
+       u_int   m1;     /* slope of the first segment in bits/sec */
+       u_int   d;      /* the x-projection of the first segment in msec */
+       u_int   m2;     /* slope of the second segment in bits/sec */
+};
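+
+/*
+ * example (illustrative values): {m1 = 2000000, d = 50, m2 = 1000000}
+ * describes a curve that serves 2Mbps for the first 50 msec of a
+ * backlogged period and 1Mbps afterwards.  m1 > m2 yields a concave
+ * (decreasing) curve, m1 < m2 a convex one, and m1 == m2 a plain
+ * linear rate.
+ */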
+
+/* special class handles */
+#define        HFSC_NULLCLASS_HANDLE   0
+#define        HFSC_MAX_CLASSES        64
+
+/* hfsc class flags */
+#define        HFCF_RED                0x0001  /* use RED */
+#define        HFCF_ECN                0x0002  /* use RED/ECN */
+#define        HFCF_RIO                0x0004  /* use RIO */
+#define        HFCF_CLEARDSCP          0x0010  /* clear diffserv codepoint */
+#define        HFCF_DEFAULTCLASS       0x1000  /* default class */
+
+/* service curve types */
+#define        HFSC_REALTIMESC         1
+#define        HFSC_LINKSHARINGSC      2
+#define        HFSC_UPPERLIMITSC       4
+#define        HFSC_DEFAULTSC          (HFSC_REALTIMESC|HFSC_LINKSHARINGSC)
+
+struct hfsc_classstats {
+       u_int                   class_id;
+       uint32_t                class_handle;
+       struct service_curve    rsc;
+       struct service_curve    fsc;
+       struct service_curve    usc;    /* upper limit service curve */
+
+       uint64_t                total;  /* total work in bytes */
+       uint64_t                cumul;  /* cumulative work in bytes
+                                          done by real-time criteria */
+       uint64_t                d;              /* deadline */
+       uint64_t                e;              /* eligible time */
+       uint64_t                vt;             /* virtual time */
+       uint64_t                f;              /* fit time for upper-limit */
+
+       /* info helpful for debugging */
+       uint64_t                initvt;         /* init virtual time */
+       uint64_t                vtoff;          /* cl_vtoff */
+       uint64_t                cvtmax;         /* cl_cvtmax */
+       uint64_t                myf;            /* cl_myf */
+       uint64_t                cfmin;          /* cl_cfmin */
+       uint64_t                cvtmin;         /* cl_cvtmin */
+       uint64_t                myfadj;         /* cl_myfadj */
+       uint64_t                vtadj;          /* cl_vtadj */
+       uint64_t                cur_time;
+       uint32_t                machclk_freq;
+
+       u_int                   qlength;
+       u_int                   qlimit;
+       struct pktcntr          xmit_cnt;
+       struct pktcntr          drop_cnt;
+       u_int                   period;
+
+       u_int                   vtperiod;       /* vt period sequence no */
+       u_int                   parentperiod;   /* parent's vt period seqno */
+       int                     nactive;        /* number of active children */
+
+       /* red and rio related info */
+       int                     qtype;
+       struct redstats         red[3];
+};
+
+#ifdef _KERNEL
+/*
+ * kernel internal service curve representation
+ *     coordinates are given by 64 bit unsigned integers.
+ *     x-axis: unit is clock count.  for the intel x86 architecture,
+ *             the raw Pentium TSC (Timestamp Counter) value is used.
+ *             virtual time is also calculated in this time scale.
+ *     y-axis: unit is byte.
+ *
+ *     the service curve parameters are converted to the internal
+ *     representation.
+ *     the slope values are scaled to avoid overflow.
+ *     the inverse slope values as well as the y-projection of the 1st
+ *     segment are kept in order to avoid 64-bit divide operations
+ *     that are expensive on 32-bit architectures.
+ *
+ *  note: the Intel Pentium TSC does not wrap around for several thousand years.
+ *     x-axis doesn't wrap around for 1089 years with 1GHz clock.
+ *      y-axis doesn't wrap around for 4358 years with 1Gbps bandwidth.
+ */
+
+/* kernel internal representation of a service curve */
+struct internal_sc {
+       uint64_t        sm1;    /* scaled slope of the 1st segment */
+       uint64_t        ism1;   /* scaled inverse-slope of the 1st segment */
+       uint64_t        dx;     /* the x-projection of the 1st segment */
+       uint64_t        dy;     /* the y-projection of the 1st segment */
+       uint64_t        sm2;    /* scaled slope of the 2nd segment */
+       uint64_t        ism2;   /* scaled inverse-slope of the 2nd segment */
+};
+
+/* runtime service curve */
+struct runtime_sc {
+       uint64_t        x;      /* current starting position on x-axis */
+       uint64_t        y;      /* current starting position on y-axis */
+       uint64_t        sm1;    /* scaled slope of the 1st segment */
+       uint64_t        ism1;   /* scaled inverse-slope of the 1st segment */
+       uint64_t        dx;     /* the x-projection of the 1st segment */
+       uint64_t        dy;     /* the y-projection of the 1st segment */
+       uint64_t        sm2;    /* scaled slope of the 2nd segment */
+       uint64_t        ism2;   /* scaled inverse-slope of the 2nd segment */
+};
+
+/* for TAILQ based ellist and actlist implementation */
+struct hfsc_class;
+typedef TAILQ_HEAD(_eligible, hfsc_class) ellist_t;
+typedef TAILQ_ENTRY(hfsc_class) elentry_t;
+typedef TAILQ_HEAD(_active, hfsc_class) actlist_t;
+typedef TAILQ_ENTRY(hfsc_class) actentry_t;
+#define        ellist_first(s)         TAILQ_FIRST(s)
+#define        actlist_first(s)        TAILQ_FIRST(s)
+#define        actlist_last(s)         TAILQ_LAST(s, _active)
+
+struct hfsc_class {
+       u_int           cl_id;          /* class id (just for debug) */
+       uint32_t        cl_handle;      /* class handle */
+       struct hfsc_if  *cl_hif;        /* back pointer to struct hfsc_if */
+       int             cl_flags;       /* misc flags */
+
+       struct hfsc_class *cl_parent;   /* parent class */
+       struct hfsc_class *cl_siblings; /* sibling classes */
+       struct hfsc_class *cl_children; /* child classes */
+
+       class_queue_t   *cl_q;          /* class queue structure */
+       struct red      *cl_red;        /* RED state */
+       struct altq_pktattr *cl_pktattr; /* saved header used by ECN */
+
+       uint64_t        cl_total;       /* total work in bytes */
+       uint64_t        cl_cumul;       /* cumulative work in bytes
+                                          done by real-time criteria */
+       uint64_t        cl_d;           /* deadline */
+       uint64_t        cl_e;           /* eligible time */
+       uint64_t        cl_vt;          /* virtual time */
+       uint64_t        cl_f;           /* time when this class will fit for
+                                          link-sharing, max(myf, cfmin) */
+       uint64_t        cl_myf;         /* my fit-time (as calculated from this
+                                          class's own upperlimit curve) */
+       uint64_t        cl_myfadj;      /* my fit-time adjustment
+                                          (to cancel history dependence) */
+       uint64_t        cl_cfmin;       /* earliest children's fit-time (used
+                                          with cl_myf to obtain cl_f) */
+       uint64_t        cl_cvtmin;      /* minimal virtual time among the
+                                          children fit for link-sharing
+                                          (monotonic within a period) */
+       uint64_t        cl_vtadj;       /* intra-period cumulative vt
+                                          adjustment */
+       uint64_t        cl_vtoff;       /* inter-period cumulative vt offset */
+       uint64_t        cl_cvtmax;      /* max child's vt in the last period */
+
+       uint64_t        cl_initvt;      /* init virtual time (for debugging) */
+
+       struct internal_sc *cl_rsc;     /* internal real-time service curve */
+       struct internal_sc *cl_fsc;     /* internal fair service curve */
+       struct internal_sc *cl_usc;     /* internal upperlimit service curve */
+       struct runtime_sc  cl_deadline; /* deadline curve */
+       struct runtime_sc  cl_eligible; /* eligible curve */
+       struct runtime_sc  cl_virtual;  /* virtual curve */
+       struct runtime_sc  cl_ulimit;   /* upperlimit curve */
+
+       u_int           cl_vtperiod;    /* vt period sequence no */
+       u_int           cl_parentperiod;  /* parent's vt period seqno */
+       int             cl_nactive;     /* number of active children */
+       actlist_t       *cl_actc;       /* active children list */
+
+       actentry_t      cl_actlist;     /* active children list entry */
+       elentry_t       cl_ellist;      /* eligible list entry */
+
+       struct {
+               struct pktcntr  xmit_cnt;
+               struct pktcntr  drop_cnt;
+               u_int period;
+       } cl_stats;
+};
+
+/*
+ * hfsc interface state
+ */
+struct hfsc_if {
+       struct hfsc_if          *hif_next;      /* interface state list */
+       struct ifaltq           *hif_ifq;       /* backpointer to ifaltq */
+       struct hfsc_class       *hif_rootclass;         /* root class */
+       struct hfsc_class       *hif_defaultclass;      /* default class */
+       struct hfsc_class       *hif_class_tbl[HFSC_MAX_CLASSES];
+       struct hfsc_class       *hif_pollcache; /* cache for poll operation */
+
+       u_int   hif_classes;                    /* # of classes in the tree */
+       u_int   hif_packets;                    /* # of packets in the tree */
+       u_int   hif_classid;                    /* class id sequence number */
+
+       ellist_t *hif_eligible;                 /* eligible list */
+};
+
+#endif /* _KERNEL */
+
+#endif /* _ALTQ_ALTQ_HFSC_H_ */
diff --git a/sys/net/altq/altq_priq.c b/sys/net/altq/altq_priq.c
new file mode 100644 (file)
index 0000000..b759ffb
--- /dev/null
@@ -0,0 +1,559 @@
+/*     $KAME: altq_priq.c,v 1.12 2004/04/17 10:54:48 kjc Exp $ */
+/*     $DragonFly: src/sys/net/altq/altq_priq.c,v 1.1 2005/02/11 22:25:57 joerg Exp $ */
+
+/*
+ * Copyright (C) 2000-2003
+ *     Sony Computer Science Laboratories Inc.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * priority queue
+ */
+
+#include "opt_altq.h"
+#include "opt_inet.h"
+#include "opt_inet6.h"
+
+#ifdef ALTQ_PRIQ  /* priq is enabled by ALTQ_PRIQ option in opt_altq.h */
+
+#include <sys/param.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/socket.h>
+#include <sys/sockio.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/errno.h>
+#include <sys/kernel.h>
+#include <sys/queue.h>
+
+#include <net/if.h>
+#include <net/ifq_var.h>
+#include <netinet/in.h>
+
+#include <net/pf/pfvar.h>
+#include <net/altq/altq.h>
+#include <net/altq/altq_priq.h>
+
+/*
+ * function prototypes
+ */
+static int     priq_clear_interface(struct priq_if *);
+static int     priq_request(struct ifaltq *, int, void *);
+static void    priq_purge(struct priq_if *);
+static struct priq_class *priq_class_create(struct priq_if *, int, int, int, int);
+static int     priq_class_destroy(struct priq_class *);
+static int     priq_enqueue(struct ifaltq *, struct mbuf *, struct altq_pktattr *);
+static struct mbuf *priq_dequeue(struct ifaltq *, int);
+
+static int     priq_addq(struct priq_class *, struct mbuf *);
+static struct mbuf *priq_getq(struct priq_class *);
+static struct mbuf *priq_pollq(struct priq_class *);
+static void    priq_purgeq(struct priq_class *);
+
+static void    get_class_stats(struct priq_classstats *, struct priq_class *);
+static struct priq_class *clh_to_clp(struct priq_if *, uint32_t);
+
+int
+priq_pfattach(struct pf_altq *a)
+{
+       struct ifnet *ifp;
+       int s, error;
+
+       if ((ifp = ifunit(a->ifname)) == NULL || a->altq_disc == NULL)
+               return (EINVAL);
+       s = splimp();
+       error = altq_attach(&ifp->if_snd, ALTQT_PRIQ, a->altq_disc,
+           priq_enqueue, priq_dequeue, priq_request, NULL, NULL);
+       splx(s);
+       return (error);
+}
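+
+/*
+ * a sketch of the expected call order, assumed from the pf_altq
+ * interface rather than spelled out in this file:
+ *
+ *	priq_add_altq(a);	- allocate priq_if, store in a->altq_disc
+ *	priq_add_queue(a);	- once per queue/priority
+ *	priq_pfattach(a);	- register enqueue/dequeue with ifaltq
+ */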
+
+int
+priq_add_altq(struct pf_altq *a)
+{
+       struct priq_if *pif;
+       struct ifnet *ifp;
+
+       if ((ifp = ifunit(a->ifname)) == NULL)
+               return (EINVAL);
+       if (!ifq_is_ready(&ifp->if_snd))
+               return (ENODEV);
+
+       pif = malloc(sizeof(*pif), M_ALTQ, M_WAITOK | M_ZERO);
+       pif->pif_bandwidth = a->ifbandwidth;
+       pif->pif_maxpri = -1;
+       pif->pif_ifq = &ifp->if_snd;
+
+       /* keep the state in pf_altq */
+       a->altq_disc = pif;
+
+       return (0);
+}
+
+int
+priq_remove_altq(struct pf_altq *a)
+{
+       struct priq_if *pif;
+
+       if ((pif = a->altq_disc) == NULL)
+               return (EINVAL);
+       a->altq_disc = NULL;
+
+       priq_clear_interface(pif);
+
+       free(pif, M_ALTQ);
+       return (0);
+}
+
+int
+priq_add_queue(struct pf_altq *a)
+{
+       struct priq_if *pif;
+       struct priq_class *cl;
+
+       if ((pif = a->altq_disc) == NULL)
+               return (EINVAL);
+
+       /* check parameters */
+       if (a->priority >= PRIQ_MAXPRI)
+               return (EINVAL);
+       if (a->qid == 0)
+               return (EINVAL);
+       if (pif->pif_classes[a->priority] != NULL)
+               return (EBUSY);
+       if (clh_to_clp(pif, a->qid) != NULL)
+               return (EBUSY);
+
+       cl = priq_class_create(pif, a->priority, a->qlimit,
+                              a->pq_u.priq_opts.flags, a->qid);
+       if (cl == NULL)
+               return (ENOMEM);
+
+       return (0);
+}
+
+int
+priq_remove_queue(struct pf_altq *a)
+{
+       struct priq_if *pif;
+       struct priq_class *cl;
+
+       if ((pif = a->altq_disc) == NULL)
+               return (EINVAL);
+
+       if ((cl = clh_to_clp(pif, a->qid)) == NULL)
+               return (EINVAL);
+
+       return (priq_class_destroy(cl));
+}
+
+int
+priq_getqstats(struct pf_altq *a, void *ubuf, int *nbytes)
+{
+       struct priq_if *pif;
+       struct priq_class *cl;
+       struct priq_classstats stats;
+       int error = 0;
+
+       if ((pif = altq_lookup(a->ifname, ALTQT_PRIQ)) == NULL)
+               return (EBADF);
+
+       if ((cl = clh_to_clp(pif, a->qid)) == NULL)
+               return (EINVAL);
+
+       if (*nbytes < sizeof(stats))
+               return (EINVAL);
+
+       get_class_stats(&stats, cl);
+
+       if ((error = copyout((caddr_t)&stats, ubuf, sizeof(stats))) != 0)
+               return (error);
+       *nbytes = sizeof(stats);
+       return (0);
+}
+
+/*
+ * bring the interface back to the initial state by discarding
+ * all the filters and classes.
+ */
+static int
+priq_clear_interface(struct priq_if *pif)
+{
+       struct priq_class *cl;
+       int pri;
+
+       /* clear out the classes */
+       for (pri = 0; pri <= pif->pif_maxpri; pri++) {
+               if ((cl = pif->pif_classes[pri]) != NULL)
+                       priq_class_destroy(cl);
+       }
+
+       return (0);
+}
+
+static int
+priq_request(struct ifaltq *ifq, int req, void *arg)
+{
+       struct priq_if *pif = (struct priq_if *)ifq->altq_disc;
+
+       switch (req) {
+       case ALTRQ_PURGE:
+               priq_purge(pif);
+               break;
+       }
+       return (0);
+}
+
+/* discard all the queued packets on the interface */
+static void
+priq_purge(struct priq_if *pif)
+{
+       struct priq_class *cl;
+       int pri;
+
+       for (pri = 0; pri <= pif->pif_maxpri; pri++) {
+               if ((cl = pif->pif_classes[pri]) != NULL && !qempty(cl->cl_q))
+                       priq_purgeq(cl);
+       }
+       if (ifq_is_enabled(pif->pif_ifq))
+               pif->pif_ifq->ifq_len = 0;
+}
+
+static struct priq_class *
+priq_class_create(struct priq_if *pif, int pri, int qlimit, int flags, int qid)
+{
+       struct priq_class *cl;
+       int s;
+
+#ifndef ALTQ_RED
+       if (flags & PRCF_RED) {
+#ifdef ALTQ_DEBUG
+               printf("priq_class_create: RED not configured for PRIQ!\n");
+#endif
+               return (NULL);
+       }
+#endif
+
+       if ((cl = pif->pif_classes[pri]) != NULL) {
+               /* modify the class instead of creating a new one */
+               s = splimp();
+               if (!qempty(cl->cl_q))
+                       priq_purgeq(cl);
+               splx(s);
+#ifdef ALTQ_RIO
+               if (q_is_rio(cl->cl_q))
+                       rio_destroy((rio_t *)cl->cl_red);
+#endif
+#ifdef ALTQ_RED
+               if (q_is_red(cl->cl_q))
+                       red_destroy(cl->cl_red);
+#endif
+       } else {
+               cl = malloc(sizeof(*cl), M_ALTQ, M_WAITOK | M_ZERO);
+               cl->cl_q = malloc(sizeof(*cl->cl_q), M_ALTQ, M_WAITOK | M_ZERO);
+       }
+
+       pif->pif_classes[pri] = cl;
+       if (flags & PRCF_DEFAULTCLASS)
+               pif->pif_default = cl;
+       if (qlimit == 0)
+               qlimit = 50;  /* use default */
+       qlimit(cl->cl_q) = qlimit;
+       qtype(cl->cl_q) = Q_DROPTAIL;
+       qlen(cl->cl_q) = 0;
+       cl->cl_flags = flags;
+       cl->cl_pri = pri;
+       if (pri > pif->pif_maxpri)
+               pif->pif_maxpri = pri;
+       cl->cl_pif = pif;
+       cl->cl_handle = qid;
+
+#ifdef ALTQ_RED
+       if (flags & (PRCF_RED|PRCF_RIO)) {
+               int red_flags, red_pkttime;
+
+               red_flags = 0;
+               if (flags & PRCF_ECN)
+                       red_flags |= REDF_ECN;
+#ifdef ALTQ_RIO
+               if (flags & PRCF_CLEARDSCP)
+                       red_flags |= RIOF_CLEARDSCP;
+#endif
+               if (pif->pif_bandwidth < 8)
+                       red_pkttime = 1000 * 1000 * 1000; /* 1 sec */
+               else
+                       red_pkttime = (int64_t)pif->pif_ifq->altq_ifp->if_mtu
+                         * 1000 * 1000 * 1000 / (pif->pif_bandwidth / 8);
+#ifdef ALTQ_RIO
+               if (flags & PRCF_RIO) {
+                       cl->cl_red = (red_t *)rio_alloc(0, NULL,
+                                               red_flags, red_pkttime);
+                       if (cl->cl_red != NULL)
+                               qtype(cl->cl_q) = Q_RIO;
+               } else
+#endif
+               if (flags & PRCF_RED) {
+                       cl->cl_red = red_alloc(0, 0,
+                           qlimit(cl->cl_q) * 10/100,
+                           qlimit(cl->cl_q) * 30/100,
+                           red_flags, red_pkttime);
+                       if (cl->cl_red != NULL)
+                               qtype(cl->cl_q) = Q_RED;
+               }
+       }
+#endif /* ALTQ_RED */
+
+       return (cl);
+}
+
+static int
+priq_class_destroy(struct priq_class *cl)
+{
+       struct priq_if *pif;
+       int s, pri;
+
+       s = splimp();
+
+       if (!qempty(cl->cl_q))
+               priq_purgeq(cl);
+
+       pif = cl->cl_pif;
+       pif->pif_classes[cl->cl_pri] = NULL;
+       if (pif->pif_maxpri == cl->cl_pri) {
+               for (pri = cl->cl_pri; pri >= 0; pri--)
+                       if (pif->pif_classes[pri] != NULL) {
+                               pif->pif_maxpri = pri;
+                               break;
+                       }
+               if (pri < 0)
+                       pif->pif_maxpri = -1;
+       }
+       splx(s);
+
+       if (cl->cl_red != NULL) {
+#ifdef ALTQ_RIO
+               if (q_is_rio(cl->cl_q))
+                       rio_destroy((rio_t *)cl->cl_red);
+#endif
+#ifdef ALTQ_RED
+               if (q_is_red(cl->cl_q))
+                       red_destroy(cl->cl_red);
+#endif
+       }
+       free(cl->cl_q, M_ALTQ);
+       free(cl, M_ALTQ);
+       return (0);
+}
+
+/*
+ * priq_enqueue is an enqueue function to be registered to
+ * (*altq_enqueue) in struct ifaltq.
+ */
+static int
+priq_enqueue(struct ifaltq *ifq, struct mbuf *m, struct altq_pktattr *pktattr)
+{
+       struct priq_if *pif = (struct priq_if *)ifq->altq_disc;
+       struct priq_class *cl;
+       int len;
+
+       /* grab class set by classifier */
+       if ((m->m_flags & M_PKTHDR) == 0) {
+               /* should not happen */
+               if_printf(ifq->altq_ifp, "altq: packet does not have pkthdr\n");
+               m_freem(m);
+               return (ENOBUFS);
+       }
+       if (m->m_pkthdr.fw_flags & ALTQ_MBUF_TAGGED)
+               cl = clh_to_clp(pif, m->m_pkthdr.altq_qid);
+       else
+               cl = NULL;
+       if (cl == NULL) {
+               cl = pif->pif_default;
+               if (cl == NULL) {
+                       m_freem(m);
+                       return (ENOBUFS);
+               }
+       }
+       cl->cl_pktattr = NULL;
+       len = m_pktlen(m);
+       if (priq_addq(cl, m) != 0) {
+               /* drop occurred.  mbuf was freed in priq_addq. */
+               PKTCNTR_ADD(&cl->cl_dropcnt, len);
+               return (ENOBUFS);
+       }
+       ifq->ifq_len++;
+
+       /* successfully queued. */
+       return (0);
+}
+
+/*
+ * priq_dequeue is a dequeue function to be registered to
+ * (*altq_dequeue) in struct ifaltq.
+ *
+ * note: ALTDQ_POLL returns the next packet without removing the packet
+ *     from the queue.  ALTDQ_REMOVE is a normal dequeue operation.
+ *     ALTDQ_REMOVE must return the same packet if called immediately
+ *     after ALTDQ_POLL.
+ */
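+/*
+ * illustrative caller-side sketch: a driver that wants to inspect the
+ * next packet before committing transmit resources could do
+ *
+ *     m = (*ifq->altq_dequeue)(ifq, ALTDQ_POLL);
+ *     if (m != NULL && drv_has_txroom(sc, m_pktlen(m)))
+ *             m = (*ifq->altq_dequeue)(ifq, ALTDQ_REMOVE);
+ *
+ * and rely on ALTDQ_REMOVE handing back exactly the mbuf that
+ * ALTDQ_POLL reported.  drv_has_txroom() is a hypothetical helper.
+ */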
+static struct mbuf *
+priq_dequeue(struct ifaltq *ifq, int op)
+{
+       struct priq_if *pif = (struct priq_if *)ifq->altq_disc;
+       struct priq_class *cl;
+       struct mbuf *m;
+       int pri;
+
+       if (ifq_is_empty(ifq)) {
+               /* no packet in the queue */
+               return (NULL);
+       }
+
+       for (pri = pif->pif_maxpri;  pri >= 0; pri--) {
+               if ((cl = pif->pif_classes[pri]) != NULL && !qempty(cl->cl_q)) {
+                       if (op == ALTDQ_POLL)
+                               return (priq_pollq(cl));
+
+                       m = priq_getq(cl);
+                       if (m != NULL) {
+                               ifq->ifq_len--;
+                               if (qempty(cl->cl_q))
+                                       cl->cl_period++;
+                               PKTCNTR_ADD(&cl->cl_xmitcnt, m_pktlen(m));
+                       }
+                       return (m);
+               }
+       }
+       return (NULL);
+}
+
+static int
+priq_addq(struct priq_class *cl, struct mbuf *m)
+{
+#ifdef ALTQ_RIO
+       if (q_is_rio(cl->cl_q))
+               return rio_addq((rio_t *)cl->cl_red, cl->cl_q, m,
+                               cl->cl_pktattr);
+#endif
+#ifdef ALTQ_RED
+       if (q_is_red(cl->cl_q))
+               return red_addq(cl->cl_red, cl->cl_q, m, cl->cl_pktattr);
+#endif
+       if (qlen(cl->cl_q) >= qlimit(cl->cl_q)) {
+               m_freem(m);
+               return (-1);
+       }
+
+       if (cl->cl_flags & PRCF_CLEARDSCP)
+               write_dsfield(m, cl->cl_pktattr, 0);
+
+       _addq(cl->cl_q, m);
+
+       return (0);
+}
+
+static struct mbuf *
+priq_getq(struct priq_class *cl)
+{
+#ifdef ALTQ_RIO
+       if (q_is_rio(cl->cl_q))
+               return rio_getq((rio_t *)cl->cl_red, cl->cl_q);
+#endif
+#ifdef ALTQ_RED
+       if (q_is_red(cl->cl_q))
+               return red_getq(cl->cl_red, cl->cl_q);
+#endif
+       return _getq(cl->cl_q);
+}
+
+static struct mbuf *
+priq_pollq(struct priq_class *cl)
+{
+       return qhead(cl->cl_q);
+}
+
+static void
+priq_purgeq(struct priq_class *cl)
+{
+       struct mbuf *m;
+
+       if (qempty(cl->cl_q))
+               return;
+
+       while ((m = _getq(cl->cl_q)) != NULL) {
+               PKTCNTR_ADD(&cl->cl_dropcnt, m_pktlen(m));
+               m_freem(m);
+       }
+       KKASSERT(qlen(cl->cl_q) == 0);
+}
+
+static void
+get_class_stats(struct priq_classstats *sp, struct priq_class *cl)
+{
+       sp->class_handle = cl->cl_handle;
+       sp->qlength = qlen(cl->cl_q);
+       sp->qlimit = qlimit(cl->cl_q);
+       sp->period = cl->cl_period;
+       sp->xmitcnt = cl->cl_xmitcnt;
+       sp->dropcnt = cl->cl_dropcnt;
+
+       sp->qtype = qtype(cl->cl_q);
+#ifdef ALTQ_RED
+       if (q_is_red(cl->cl_q))
+               red_getstats(cl->cl_red, &sp->red[0]);
+#endif
+#ifdef ALTQ_RIO
+       if (q_is_rio(cl->cl_q))
+               rio_getstats((rio_t *)cl->cl_red, &sp->red[0]);
+#endif
+}
+
+/* convert a class handle to the corresponding class pointer */
+static struct priq_class *
+clh_to_clp(struct priq_if *pif, uint32_t chandle)
+{
+       struct priq_class *cl;
+       int idx;
+
+       if (chandle == 0)
+               return (NULL);
+
+       for (idx = pif->pif_maxpri; idx >= 0; idx--)
+               if ((cl = pif->pif_classes[idx]) != NULL &&
+                   cl->cl_handle == chandle)
+                       return (cl);
+
+       return (NULL);
+}
+
+#endif /* ALTQ_PRIQ */
diff --git a/sys/net/altq/altq_priq.h b/sys/net/altq/altq_priq.h
new file mode 100644 (file)
index 0000000..79b1b5a
--- /dev/null
@@ -0,0 +1,95 @@
+/*     $KAME: altq_priq.h,v 1.7 2003/10/03 05:05:15 kjc Exp $  */
+/*     $DragonFly: src/sys/net/altq/altq_priq.h,v 1.1 2005/02/11 22:25:57 joerg Exp $ */
+
+/*
+ * Copyright (C) 2000-2003
+ *     Sony Computer Science Laboratories Inc.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _ALTQ_ALTQ_PRIQ_H_
+#define        _ALTQ_ALTQ_PRIQ_H_
+
+#include <net/altq/altq.h>
+#include <net/altq/altq_classq.h>
+#include <net/altq/altq_red.h>
+#include <net/altq/altq_rio.h>
+
+#define        PRIQ_MAXPRI     16      /* upper limit of the number of priorities */
+
+/* priq class flags */
+#define        PRCF_RED                0x0001  /* use RED */
+#define        PRCF_ECN                0x0002  /* use RED/ECN */
+#define        PRCF_RIO                0x0004  /* use RIO */
+#define        PRCF_CLEARDSCP          0x0010  /* clear diffserv codepoint */
+#define        PRCF_DEFAULTCLASS       0x1000  /* default class */
+
+/* special class handles */
+#define        PRIQ_NULLCLASS_HANDLE   0
+
+struct priq_classstats {
+       uint32_t                class_handle;
+
+       u_int                   qlength;
+       u_int                   qlimit;
+       u_int                   period;
+       struct pktcntr          xmitcnt;  /* transmitted packet counter */
+       struct pktcntr          dropcnt;  /* dropped packet counter */
+
+       /* red and rio related info */
+       int                     qtype;
+       struct redstats         red[3]; /* rio has 3 red stats */
+};
+
+#ifdef _KERNEL
+
+struct priq_class {
+       uint32_t        cl_handle;      /* class handle */
+       class_queue_t   *cl_q;          /* class queue structure */
+       struct red      *cl_red;        /* RED state */
+       int             cl_pri;         /* priority */
+       int             cl_flags;       /* class flags */
+       struct priq_if  *cl_pif;        /* back pointer to pif */
+       struct altq_pktattr *cl_pktattr; /* saved header used by ECN */
+
+       /* statistics */
+       u_int           cl_period;      /* backlog period */
+       struct pktcntr  cl_xmitcnt;     /* transmitted packet counter */
+       struct pktcntr  cl_dropcnt;     /* dropped packet counter */
+};
+
+/*
+ * priq interface state
+ */
+struct priq_if {
+       struct priq_if          *pif_next;      /* interface state list */
+       struct ifaltq           *pif_ifq;       /* backpointer to ifaltq */
+       u_int                   pif_bandwidth;  /* link bandwidth in bps */
+       int                     pif_maxpri;     /* max priority in use */
+       struct priq_class       *pif_default;   /* default class */
+       struct priq_class       *pif_classes[PRIQ_MAXPRI]; /* classes */
+};
+
+#endif /* _KERNEL */
+
+#endif /* _ALTQ_ALTQ_PRIQ_H_ */
diff --git a/sys/net/altq/altq_red.c b/sys/net/altq/altq_red.c
new file mode 100644 (file)
index 0000000..f93287f
--- /dev/null
@@ -0,0 +1,601 @@
+/*     $KAME: altq_red.c,v 1.19 2004/04/17 10:54:49 kjc Exp $  */
+/*     $DragonFly: src/sys/net/altq/altq_red.c,v 1.1 2005/02/11 22:25:57 joerg Exp $ */
+
+/*
+ * Copyright (C) 1997-2003
+ *     Sony Computer Science Laboratories Inc.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+/*
+ * Copyright (c) 1990-1994 Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *     This product includes software developed by the Computer Systems
+ *     Engineering Group at Lawrence Berkeley Laboratory.
+ * 4. Neither the name of the University nor of the Laboratory may be used
+ *    to endorse or promote products derived from this software without
+ *    specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "opt_altq.h"
+#include "opt_inet.h"
+#include "opt_inet6.h"
+
+#ifdef ALTQ_RED        /* red is enabled by ALTQ_RED option in opt_altq.h */
+
+#include <sys/param.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/socket.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+
+#include <net/if.h>
+
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/ip.h>
+#ifdef INET6
+#include <netinet/ip6.h>
+#endif
+
+#include <net/pf/pfvar.h>
+#include <net/altq/altq.h>
+#include <net/altq/altq_red.h>
+
+/*
+ * ALTQ/RED (Random Early Detection) implementation using 32-bit
+ * fixed-point calculation.
+ *
+ * written by kjc using the ns code as a reference.
+ * you can learn more about red and ns from Sally's home page at
+ * http://www-nrg.ee.lbl.gov/floyd/
+ *
+ * most of the red parameter values are fixed in this implementation
+ * to prevent fixed-point overflow/underflow.
+ * if you change the parameters, watch out for overflow/underflow!
+ *
+ * the parameters used are recommended values by Sally.
+ * the corresponding ns config looks like:
+ *     q_weight=0.00195
+ *     minthresh=5 maxthresh=15 queue-size=60
+ *     linterm=30
+ *     dropmech=drop-tail
+ *     bytes=false (can't be handled by 32-bit fixed-point)
+ *     doubleq=false dqthresh=false
+ *     wait=true
+ */
+/*
+ * alternative red parameters for a slow link.
+ *
+ * assume the queue length grows from zero to L and then stays at L;
+ * it takes N packets for q_avg to reach 63% of L.
+ * when q_weight is 0.002, N is about 500 packets.
+ * for a slow link like dial-up, 500 packets can take more than 1 minute!
+ * when q_weight is 0.008, N is about 127 packets.
+ * when q_weight is 0.016, N is about 63 packets.
+ * bursts of 50 packets are allowed for 0.002, bursts of 25 packets
+ * are allowed for 0.016.
+ * see Sally's paper for more details.
+ */
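+/*
+ * sketch of where those numbers come from: after N packets the
+ * average is avg(N) = L * (1 - (1 - q_weight)^N), and because
+ * (1 - 1/w)^w is roughly 1/e, N = 1/q_weight leaves avg at about
+ * (1 - 1/e) = 63% of L.  hence q_weight 0.002 gives N near 500,
+ * 0.008 near 127 and 0.016 near 63.
+ */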
+/* normal red parameters */
+#define        W_WEIGHT        512     /* inverse of weight of EWMA (511/512) */
+                               /* q_weight = 0.00195 */
+
+/* red parameters for a slow link */
+#define        W_WEIGHT_1      128     /* inverse of weight of EWMA (127/128) */
+                               /* q_weight = 0.0078125 */
+
+/* red parameters for a very slow link (e.g., dialup) */
+#define        W_WEIGHT_2      64      /* inverse of weight of EWMA (63/64) */
+                               /* q_weight = 0.015625 */
+
+/* fixed-point uses 12-bit decimal places */
+#define        FP_SHIFT        12      /* fixed-point shift */
+
+/* red parameters for drop probability */
+#define        INV_P_MAX       10      /* inverse of max drop probability */
+#define        TH_MIN          5       /* min threshold */
+#define        TH_MAX          15      /* max threshold */
+
+#define        RED_LIMIT       60      /* default max queue length */
+#define        RED_STATS               /* collect statistics */
+
+/*
+ * our default policy for forced-drop is drop-tail.
+ * (in altq-1.1.2 or earlier, the default was random-drop.
+ * but it makes more sense to punish the cause of the surge.)
+ * to switch to the random-drop policy, define "RED_RANDOM_DROP".
+ */
+
+/* default red parameter values */
+static int default_th_min = TH_MIN;
+static int default_th_max = TH_MAX;
+static int default_inv_pmax = INV_P_MAX;
+
+/*
+ * red support routines
+ */
+red_t *
+red_alloc(int weight, int inv_pmax, int th_min, int th_max, int flags, int pkttime)
+{
+       red_t *rp;
+       int w, i;
+       int npkts_per_sec;
+
+       rp = malloc(sizeof(*rp), M_ALTQ, M_WAITOK | M_ZERO);
+       rp->red_avg = 0;
+       rp->red_idle = 1;
+
+       if (weight == 0)
+               rp->red_weight = W_WEIGHT;
+       else
+               rp->red_weight = weight;
+       if (inv_pmax == 0)
+               rp->red_inv_pmax = default_inv_pmax;
+       else
+               rp->red_inv_pmax = inv_pmax;
+       if (th_min == 0)
+               rp->red_thmin = default_th_min;
+       else
+               rp->red_thmin = th_min;
+       if (th_max == 0)
+               rp->red_thmax = default_th_max;
+       else
+               rp->red_thmax = th_max;
+
+       rp->red_flags = flags;
+
+       if (pkttime == 0)
+               /* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */
+               rp->red_pkttime = 800;
+       else
+               rp->red_pkttime = pkttime;
+
+       if (weight == 0) {
+               /* when the link is very slow, adjust red parameters */
+               npkts_per_sec = 1000000 / rp->red_pkttime;
+               if (npkts_per_sec < 50) {
+                       /* up to about 400Kbps */
+                       rp->red_weight = W_WEIGHT_2;
+               } else if (npkts_per_sec < 300) {
+                       /* up to about 2.4Mbps */
+                       rp->red_weight = W_WEIGHT_1;
+               }
+       }
+
+       /* calculate wshift.  weight must be a power of 2 */
+       w = rp->red_weight;
+       for (i = 0; w > 1; i++)
+               w = w >> 1;
+       rp->red_wshift = i;
+       w = 1 << rp->red_wshift;
+       if (w != rp->red_weight) {
+               printf("invalid weight value %d for red! use %d\n",
+                      rp->red_weight, w);
+               rp->red_weight = w;
+       }
+
+       /*
+        * thmin_s and thmax_s are scaled versions of th_min and th_max
+        * to be compared with avg.
+        */
+       rp->red_thmin_s = rp->red_thmin << (rp->red_wshift + FP_SHIFT);
+       rp->red_thmax_s = rp->red_thmax << (rp->red_wshift + FP_SHIFT);
+
+       /*
+        * precompute probability denominator
+        *  probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point
+        */
+       rp->red_probd = (2 * (rp->red_thmax - rp->red_thmin)
+                        * rp->red_inv_pmax) << FP_SHIFT;
+
+       /* allocate weight table */
+       rp->red_wtab = wtab_alloc(rp->red_weight);
+
+       microtime(&rp->red_last);
+       return (rp);
+}
+
+void
+red_destroy(red_t *rp)
+{
+       wtab_destroy(rp->red_wtab);
+       free(rp, M_ALTQ);
+}
+
+void
+red_getstats(red_t *rp, struct redstats *sp)
+{
+       sp->q_avg = rp->red_avg >> rp->red_wshift;
+       sp->xmit_cnt = rp->red_stats.xmit_cnt;
+       sp->drop_cnt = rp->red_stats.drop_cnt;
+       sp->drop_forced = rp->red_stats.drop_forced;
+       sp->drop_unforced = rp->red_stats.drop_unforced;
+       sp->marked_packets = rp->red_stats.marked_packets;
+}
+
+int
+red_addq(red_t *rp, class_queue_t *q, struct mbuf *m, struct altq_pktattr *pktattr)
+{
+       int avg, droptype;
+       int n;
+
+       avg = rp->red_avg;
+
+       /*
+        * if we were idle, we pretend that n packets arrived during
+        * the idle period.
+        */
+       if (rp->red_idle) {
+               struct timeval now;
+               int t;
+
+               rp->red_idle = 0;
+               microtime(&now);
+               t = (now.tv_sec - rp->red_last.tv_sec);
+               if (t > 60) {
+                       /*
+                        * idle for more than 1 minute: set avg to zero.
+                        * this prevents t from overflowing.
+                        */
+                       avg = 0;
+               } else {
+                       t = t * 1000000 + (now.tv_usec - rp->red_last.tv_usec);
+                       n = t / rp->red_pkttime - 1;
+
+                       /* the following line does (avg = (1 - Wq)^n * avg) */
+                       if (n > 0)
+                               avg = (avg >> FP_SHIFT) *
+                                   pow_w(rp->red_wtab, n);
+               }
+       }
+
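+       /*
+        * the update below is the EWMA avg = (1-Wq)*avg + Wq*qlen with
+        * Wq = 1/2^wshift; avg is kept scaled by 2^(wshift+FP_SHIFT),
+        * which lets a couple of shifts and adds do the whole job:
+        *   avg' = avg + (qlen << FP_SHIFT) - (avg >> wshift)
+        * dividing through by 2^(wshift+FP_SHIFT) recovers the
+        * textbook form.
+        */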
+       /* run estimator. (note: avg is scaled by WEIGHT in fixed-point) */
+       avg += (qlen(q) << FP_SHIFT) - (avg >> rp->red_wshift);
+       rp->red_avg = avg;              /* save the new value */
+
+       /*
+        * red_count keeps a tally of arriving traffic that has not
+        * been dropped.
+        */
+       rp->red_count++;
+
+       /* see if we drop early */
+       droptype = DTYPE_NODROP;
+       if (avg >= rp->red_thmin_s && qlen(q) > 1) {
+               if (avg >= rp->red_thmax_s) {
+                       /* avg >= th_max: forced drop */
+                       droptype = DTYPE_FORCED;
+               } else if (rp->red_old == 0) {
+                       /* first exceeds th_min */
+                       rp->red_count = 1;
+                       rp->red_old = 1;
+               } else if (drop_early((avg - rp->red_thmin_s) >> rp->red_wshift,
+                                     rp->red_probd, rp->red_count)) {
+                       /* mark or drop by red */
+                       if ((rp->red_flags & REDF_ECN) &&
+                           mark_ecn(m, pktattr, rp->red_flags)) {
+                               /* successfully marked.  do not drop. */
+                               rp->red_count = 0;
+#ifdef RED_STATS
+                               rp->red_stats.marked_packets++;
+#endif
+                       } else {
+                               /* unforced drop by red */
+                               droptype = DTYPE_EARLY;
+                       }
+               }
+       } else {
+               /* avg < th_min */
+               rp->red_old = 0;
+       }
+
+       /*
+        * if the queue length hits the hard limit, it's a forced drop.
+        */
+       if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q))
+               droptype = DTYPE_FORCED;
+
+#ifdef RED_RANDOM_DROP
+       /* if successful or forced drop, enqueue this packet. */
+       if (droptype != DTYPE_EARLY)
+               _addq(q, m);
+#else
+       /* if successful, enqueue this packet. */
+       if (droptype == DTYPE_NODROP)
+               _addq(q, m);
+#endif
+       if (droptype != DTYPE_NODROP) {
+               if (droptype == DTYPE_EARLY) {
+                       /* drop the incoming packet */
+#ifdef RED_STATS
+                       rp->red_stats.drop_unforced++;
+#endif
+               } else {
+                       /* forced drop, select a victim packet in the queue. */
+#ifdef RED_RANDOM_DROP
+                       m = _getq_random(q);
+#endif
+#ifdef RED_STATS
+                       rp->red_stats.drop_forced++;
+#endif
+               }
+#ifdef RED_STATS
+               PKTCNTR_ADD(&rp->red_stats.drop_cnt, m_pktlen(m));
+#endif
+               rp->red_count = 0;
+               m_freem(m);
+               return (-1);
+       }
+       /* successfully queued */
+#ifdef RED_STATS
+       PKTCNTR_ADD(&rp->red_stats.xmit_cnt, m_pktlen(m));
+#endif
+       return (0);
+}
+
+/*
+ * early-drop probability is calculated as follows:
+ *   prob = p_max * (avg - th_min) / (th_max - th_min)
+ *   prob_a = prob / (2 - count*prob)
+ *         = (avg-th_min) / (2*(th_max-th_min)*inv_p_max - count*(avg-th_min))
+ * here prob_a increases as the count of successive undropped packets grows.
+ * (prob_a starts from prob/2, becomes prob when (count == (1 / prob)),
+ * becomes 1 when (count >= (2 / prob))).
+ */
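+/*
+ * worked example with the default parameters (th_min=5, th_max=15,
+ * inv_p_max=10, i.e. p_max = 0.1): fp_probd = (2*10*10) << FP_SHIFT.
+ * with avg at the midpoint (avg - th_min = 5), prob = 0.1 * 5/10 =
+ * 0.05 and d = (200 - 5*count) << FP_SHIFT, so the drop probability
+ * 5/(200 - 5*count) starts at 0.025 (= prob/2) for count = 0, reaches
+ * 0.05 at count = 20 (= 1/prob) and forces a drop (d <= 0) at
+ * count = 40 (= 2/prob).
+ */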
+int
+drop_early(int fp_len, int fp_probd, int count)
+{
+       int d;          /* denominator of drop-probability */
+
+       d = fp_probd - count * fp_len;
+       if (d <= 0) {
+               /* count exceeds the hard limit: drop or mark */
+               return (1);
+       }
+
+       /*
+        * now the range of d is [1..600] in fixed-point. (when
+        * th_max-th_min=10 and p_max=1/30)
+        * drop probability = (avg - TH_MIN) / d
+        */
+
+       if ((arc4random() % d) < fp_len) {
+               /* drop or mark */
+               return (1);
+       }
+       /* no drop/mark */
+       return (0);
+}
+
+/*
+ * try to mark CE bit to the packet.
+ *    returns 1 if successfully marked, 0 otherwise.
+ */
+int
+mark_ecn(struct mbuf *m, struct altq_pktattr *pktattr, int flags)
+{
+       struct mbuf *m0;
+       void *hdr;
+       int  af;
+
+       if ((m->m_pkthdr.fw_flags & ALTQ_MBUF_TAGGED) == 0)
+               return (0);
+       af = m->m_pkthdr.ecn_af;
+       hdr = m->m_pkthdr.header;
+
+       if (af != AF_INET && af != AF_INET6)
+               return (0);
+
+       /* verify that pattr_hdr is within the mbuf data */
+       for (m0 = m; m0 != NULL; m0 = m0->m_next) {
+               if (((caddr_t)hdr >= m0->m_data) &&
+                   ((caddr_t)hdr < m0->m_data + m0->m_len))
+                       break;
+       }
+       if (m0 == NULL) {
+               /* ick, tag info is stale */
+               return (0);
+       }
+
+       switch (af) {
+       case AF_INET:
+               if (flags & REDF_ECN4) {
+                       struct ip *ip = hdr;
+                       uint8_t otos;
+                       int sum;
+
+                       if (ip->ip_v != 4)
+                               return (0);     /* version mismatch! */
+
+                       if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT)
+                               return (0);     /* not-ECT */
+                       if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
+                               return (1);     /* already marked */
+
+                       /*
+                        * ecn-capable but not marked,
+                        * mark CE and update checksum
+                        */
+                       otos = ip->ip_tos;
+                       ip->ip_tos |= IPTOS_ECN_CE;
+                       /*
+                        * update checksum (from RFC1624)
+                        *         HC' = ~(~HC + ~m + m')
+                        */
+                       sum = ~ntohs(ip->ip_sum) & 0xffff;
+                       sum += (~otos & 0xffff) + ip->ip_tos;
+                       sum = (sum >> 16) + (sum & 0xffff);
+                       sum += (sum >> 16);  /* add carry */
+                       ip->ip_sum = htons(~sum & 0xffff);
+                       return (1);
+               }
+               break;
+#ifdef INET6
+       case AF_INET6:
+               if (flags & REDF_ECN6) {
+                       struct ip6_hdr *ip6 = hdr;
+                       uint32_t flowlabel;
+
+                       flowlabel = ntohl(ip6->ip6_flow);
+                       if ((flowlabel >> 28) != 6)
+                               return (0);     /* version mismatch! */
+                       if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
+                           (IPTOS_ECN_NOTECT << 20))
+                               return (0);     /* not-ECT */
+                       if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
+                           (IPTOS_ECN_CE << 20))
+                               return (1);     /* already marked */
+                       /*
+                        * ecn-capable but not marked,  mark CE
+                        */
+                       flowlabel |= (IPTOS_ECN_CE << 20);
+                       ip6->ip6_flow = htonl(flowlabel);
+                       return (1);
+               }
+               break;
+#endif  /* INET6 */
+       }
+
+       /* not marked */
+       return (0);
+}
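+/*
+ * worked example of the RFC1624 update above: with ip_tos going from
+ * ECT(0) = 0x02 to CE = 0x03 and a stored checksum of 0xbeef,
+ * sum = ~0xbeef = 0x4110, plus ~0x0002 + 0x0003 = 0x10000, giving
+ * 0x14110; folding the carry yields 0x4111 and ~0x4111 = 0xbeee.
+ * the stored checksum drops by one as the data word grows by one,
+ * as one's complement arithmetic demands.
+ */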
+
+struct mbuf *
+red_getq(red_t *rp, class_queue_t *q)
+{
+       struct mbuf *m;
+
+       if ((m = _getq(q)) == NULL) {
+               if (rp->red_idle == 0) {
+                       rp->red_idle = 1;
+                       microtime(&rp->red_last);
+               }
+               return NULL;
+       }
+
+       rp->red_idle = 0;
+       return (m);
+}
+
+/*
+ * helper routine to calibrate avg during idle.
+ * pow_w(wtab, n) returns (1 - Wq)^n in fixed-point
+ * here Wq = 1/weight and the code assumes Wq is close to zero.
+ *
+ * w_tab[n] holds ((1 - Wq)^(2^n)) in fixed-point.
+ */
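+/*
+ * e.g. pow_w(w, 13) multiplies w_tab[0], w_tab[2] and w_tab[3]
+ * together (13 = 1 + 4 + 8), rescaling by FP_SHIFT after each
+ * multiply.  wtab_alloc() records in w_param_max the first power of
+ * two whose table entry underflows to 0, so pow_w() can short-circuit
+ * larger n straight to zero.
+ */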
+static SLIST_HEAD(, wtab) wtab_list = SLIST_HEAD_INITIALIZER(&wtab_list);
+
+struct wtab *
+wtab_alloc(int weight)
+{
+       struct wtab *w;
+       int i;
+
+       SLIST_FOREACH(w, &wtab_list, w_link) {
+               if (w->w_weight == weight) {
+                       w->w_refcount++;
+                       return (w);
+               }
+       }
+
+       w = malloc(sizeof(*w), M_ALTQ, M_WAITOK | M_ZERO);
+       w->w_weight = weight;
+       w->w_refcount = 1;
+       SLIST_INSERT_HEAD(&wtab_list, w, w_link);
+
+       /* initialize the weight table */
+       w->w_tab[0] = ((weight - 1) << FP_SHIFT) / weight;
+       for (i = 1; i < 32; i++) {
+               w->w_tab[i] = (w->w_tab[i-1] * w->w_tab[i-1]) >> FP_SHIFT;
+               if (w->w_tab[i] == 0 && w->w_param_max == 0)
+                       w->w_param_max = 1 << i;
+       }
+
+       return (w);
+}
+
+int
+wtab_destroy(struct wtab *w)
+{
+       if (--w->w_refcount > 0)
+               return (0);
+
+       SLIST_REMOVE(&wtab_list, w, wtab, w_link);
+       free(w, M_ALTQ);
+
+       return (0);
+}
+
+int32_t
+pow_w(struct wtab *w, int n)
+{
+       int i, bit;
+       int32_t val;
+
+       if (n >= w->w_param_max)
+               return (0);
+
+       val = 1 << FP_SHIFT;
+       if (n <= 0)
+               return (val);
+
+       bit = 1;
+       i = 0;
+       while (n) {
+               if (n & bit) {
+                       val = (val * w->w_tab[i]) >> FP_SHIFT;
+                       n &= ~bit;
+               }
+               i++;
+               bit <<= 1;
+       }
+       return (val);
+}
+
+#endif /* ALTQ_RED */
diff --git a/sys/net/altq/altq_red.h b/sys/net/altq/altq_red.h
new file mode 100644 (file)
index 0000000..dffe5dd
--- /dev/null
@@ -0,0 +1,123 @@
+/*     $KAME: altq_red.h,v 1.8 2003/07/10 12:07:49 kjc Exp $   */
+/*     $DragonFly: src/sys/net/altq/altq_red.h,v 1.1 2005/02/11 22:25:57 joerg Exp $ */
+
+/*
+ * Copyright (C) 1997-2003
+ *     Sony Computer Science Laboratories Inc.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _ALTQ_ALTQ_RED_H_
+#define        _ALTQ_ALTQ_RED_H_
+
+#include <net/altq/altq_classq.h>
+
+/* red flags */
+#define        REDF_ECN4       0x01    /* use packet marking for IPv4 packets */
+#define        REDF_ECN6       0x02    /* use packet marking for IPv6 packets */
+#define        REDF_ECN        (REDF_ECN4 | REDF_ECN6)
+
+/*
+ * simpler versions of red parameters and statistics used by other
+ * disciplines (e.g., CBQ)
+ */
+struct redparams {
+       int th_min;             /* red min threshold */
+       int th_max;             /* red max threshold */
+       int inv_pmax;           /* inverse of max drop probability */
+};
+
+struct redstats {
+       int             q_avg;
+       struct pktcntr  xmit_cnt;
+       struct pktcntr  drop_cnt;
+       u_int           drop_forced;
+       u_int           drop_unforced;
+       u_int           marked_packets;
+};
+
+#ifdef _KERNEL
+
+/* weight table structure for idle time calibration */
+struct wtab {
+       SLIST_ENTRY(wtab) w_link;
+       int              w_weight;
+       int              w_param_max;
+       int              w_refcount;
+       int32_t          w_tab[32];
+};
+
+typedef struct red {
+       int             red_pkttime;    /* average packet time in micro sec
+                                          used for idle calibration */
+       int             red_flags;      /* red flags */
+
+       /* red parameters */
+       int             red_weight;     /* weight for EWMA */
+       int             red_inv_pmax;   /* inverse of max drop probability */
+       int             red_thmin;      /* red min threshold */
+       int             red_thmax;      /* red max threshold */
+
+       /* variables for internal use */
+       int             red_wshift;     /* log(red_weight) */
+       int             red_thmin_s;    /* th_min scaled by avgshift */
+       int             red_thmax_s;    /* th_max scaled by avgshift */
+       int             red_probd;      /* drop probability denominator */
+
+       int             red_avg;        /* queue len avg scaled by avgshift */
+       int             red_count;      /* packet count since last dropped/
+                                          marked packet */
+       int             red_idle;       /* queue was empty */
+       int             red_old;        /* avg is above th_min */
+       struct wtab     *red_wtab;      /* weight table */
+       struct timeval   red_last;      /* time when the queue becomes idle */
+
+       struct {
+               struct pktcntr  xmit_cnt;
+               struct pktcntr  drop_cnt;
+               u_int           drop_forced;
+               u_int           drop_unforced;
+               u_int           marked_packets;
+       } red_stats;
+} red_t;
+
+/* red drop types */
+#define        DTYPE_NODROP    0       /* no drop */
+#define        DTYPE_FORCED    1       /* a "forced" drop */
+#define        DTYPE_EARLY     2       /* an "unforced" (early) drop */
+
+red_t          *red_alloc(int, int, int, int, int, int);
+void            red_destroy(red_t *);
+void            red_getstats(red_t *, struct redstats *);
+int             red_addq(red_t *, class_queue_t *, struct mbuf *,
+                         struct altq_pktattr *);
+struct mbuf    *red_getq(red_t *, class_queue_t *);
+int             drop_early(int, int, int);
+int             mark_ecn(struct mbuf *, struct altq_pktattr *, int);
+struct wtab    *wtab_alloc(int);
+int             wtab_destroy(struct wtab *);
+int32_t                 pow_w(struct wtab *, int);
+
+#endif /* _KERNEL */
+
+#endif /* _ALTQ_ALTQ_RED_H_ */
diff --git a/sys/net/altq/altq_rio.c b/sys/net/altq/altq_rio.c
new file mode 100644 (file)
index 0000000..5f98b3f
--- /dev/null
@@ -0,0 +1,424 @@
+/*     $KAME: altq_rio.c,v 1.17 2003/07/10 12:07:49 kjc Exp $  */
+/*     $DragonFly: src/sys/net/altq/altq_rio.c,v 1.1 2005/02/11 22:25:57 joerg Exp $ */
+
+/*
+ * Copyright (C) 1998-2003
+ *     Sony Computer Science Laboratories Inc.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * Copyright (c) 1990-1994 Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *     This product includes software developed by the Computer Systems
+ *     Engineering Group at Lawrence Berkeley Laboratory.
+ * 4. Neither the name of the University nor of the Laboratory may be used
+ *    to endorse or promote products derived from this software without
+ *    specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "opt_altq.h"
+#include "opt_inet.h"
+#include "opt_inet6.h"
+
+#ifdef ALTQ_RIO        /* rio is enabled by ALTQ_RIO option in opt_altq.h */
+
+#include <sys/param.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/socket.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+
+#include <net/if.h>
+
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/ip.h>
+#ifdef INET6
+#include <netinet/ip6.h>
+#endif
+
+#include <net/pf/pfvar.h>
+#include <net/altq/altq.h>
+#include <net/altq/altq_red.h>
+#include <net/altq/altq_rio.h>
+
+/*
+ * RIO: RED with IN/OUT bit
+ *   described in
+ *     "Explicit Allocation of Best Effort Packet Delivery Service"
+ *     David D. Clark and Wenjia Fang, MIT Lab for Computer Science
+ *     http://diffserv.lcs.mit.edu/Papers/exp-alloc-ddc-wf.{ps,pdf}
+ *
+ * this implementation is extended to support more than 2 drop precedence
+ * values as described in RFC2597 (Assured Forwarding PHB Group).
+ *
+ */
+/*
+ * AF DS (differentiated service) codepoints.
+ * (classes can be mapped to CBQ or H-FSC classes.)
+ *
+ *      0   1   2   3   4   5   6   7
+ *    +---+---+---+---+---+---+---+---+
+ *    |   CLASS   |DropPre| 0 |  CU   |
+ *    +---+---+---+---+---+---+---+---+
+ *
+ *    class 1: 001
+ *    class 2: 010
+ *    class 3: 011
+ *    class 4: 100
+ *
+ *    low drop prec:    01
+ *    medium drop prec: 10
+ *    high drop prec:   11
+ */
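+/*
+ * e.g. with the DSCP in the upper six bits of the DS byte: AF11 is
+ * DSCP 001010, DS byte 0x28; 0x28 & AF_DROPPRECMASK (0x18) = 0x08,
+ * which dscp2index() below maps to index 0 (low drop precedence).
+ * AF13 (DSCP 001110, DS byte 0x38) masks to 0x18 and maps to
+ * index 2 (high drop precedence).
+ */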
+
+/* normal red parameters */
+#define        W_WEIGHT        512     /* inverse of weight of EWMA (511/512) */
+                               /* q_weight = 0.00195 */
+
+/* red parameters for a slow link */
+#define        W_WEIGHT_1      128     /* inverse of weight of EWMA (127/128) */
+                               /* q_weight = 0.0078125 */
+
+/* red parameters for a very slow link (e.g., dialup) */
+#define        W_WEIGHT_2      64      /* inverse of weight of EWMA (63/64) */
+                               /* q_weight = 0.015625 */
+
+/* fixed-point uses 12-bit decimal places */
+#define        FP_SHIFT        12      /* fixed-point shift */
+
+/* red parameters for drop probability */
+#define        INV_P_MAX       10      /* inverse of max drop probability */
+#define        TH_MIN           5      /* min threshold */
+#define        TH_MAX          15      /* max threshold */
+
+#define        RIO_LIMIT       60      /* default max queue length */
+#define        RIO_STATS               /* collect statistics */
+
+/* default rio parameter values */
+static struct redparams default_rio_params[RIO_NDROPPREC] = {
+  /* th_min,            th_max,     inv_pmax */
+  { TH_MAX * 2 + TH_MIN, TH_MAX * 3, INV_P_MAX }, /* low drop precedence */
+  { TH_MAX + TH_MIN,    TH_MAX * 2, INV_P_MAX }, /* medium drop precedence */
+  { TH_MIN,             TH_MAX,     INV_P_MAX }  /* high drop precedence */
+};
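+/*
+ * with TH_MIN = 5 and TH_MAX = 15 this staggers the thresholds to
+ * [35, 45] for low, [20, 30] for medium and [5, 15] for high drop
+ * precedence, so high-precedence traffic is dropped while the lower
+ * precedences are still untouched.
+ */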
+
+/* internal function prototypes */
+static int     dscp2index(uint8_t);
+/* YYY Do we really need this? */
+#define        AF_DROPPRECMASK         0x18
+#define        DSCP_MASK       0xfc
+
+rio_t *
+rio_alloc(int weight, struct redparams *params, int flags, int pkttime)
+{
+       rio_t *rp;
+       int w, i;
+       int npkts_per_sec;
+
+       rp = malloc(sizeof(*rp), M_ALTQ, M_WAITOK | M_ZERO);
+
+       rp->rio_flags = flags;
+       if (pkttime == 0)
+               /* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */
+               rp->rio_pkttime = 800;
+       else
+               rp->rio_pkttime = pkttime;
+
+       if (weight != 0)
+               rp->rio_weight = weight;
+       else {
+               /* use default */
+               rp->rio_weight = W_WEIGHT;
+
+               /* when the link is very slow, adjust red parameters */
+               npkts_per_sec = 1000000 / rp->rio_pkttime;
+               if (npkts_per_sec < 50) {
+                       /* up to about 400Kbps */
+                       rp->rio_weight = W_WEIGHT_2;
+               } else if (npkts_per_sec < 300) {
+                       /* up to about 2.4Mbps */
+                       rp->rio_weight = W_WEIGHT_1;
+               }
+       }
+
+       /* calculate wshift.  weight must be a power of 2 */
+       w = rp->rio_weight;
+       for (i = 0; w > 1; i++)
+               w = w >> 1;
+       rp->rio_wshift = i;
+       w = 1 << rp->rio_wshift;
+       if (w != rp->rio_weight) {
+               printf("invalid weight value %d for red! use %d\n",
+                      rp->rio_weight, w);
+               rp->rio_weight = w;
+       }
+
+       /* allocate weight table */
+       rp->rio_wtab = wtab_alloc(rp->rio_weight);
+
+       for (i = 0; i < RIO_NDROPPREC; i++) {
+               struct dropprec_state *prec = &rp->rio_precstate[i];
+
+               prec->avg = 0;
+               prec->idle = 1;
+
+               if (params == NULL || params[i].inv_pmax == 0)
+                       prec->inv_pmax = default_rio_params[i].inv_pmax;
+               else
+                       prec->inv_pmax = params[i].inv_pmax;
+               if (params == NULL || params[i].th_min == 0)
+                       prec->th_min = default_rio_params[i].th_min;
+               else
+                       prec->th_min = params[i].th_min;
+               if (params == NULL || params[i].th_max == 0)
+                       prec->th_max = default_rio_params[i].th_max;
+               else
+                       prec->th_max = params[i].th_max;
+
+               /*
+                * th_min_s and th_max_s are scaled versions of th_min
+                * and th_max to be compared with avg.
+                */
+               prec->th_min_s = prec->th_min << (rp->rio_wshift + FP_SHIFT);
+               prec->th_max_s = prec->th_max << (rp->rio_wshift + FP_SHIFT);
+
+               /*
+                * precompute probability denominator
+                *  probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point
+                */
+               prec->probd = (2 * (prec->th_max - prec->th_min)
+                              * prec->inv_pmax) << FP_SHIFT;
+
+               microtime(&prec->last);
+       }
+
+       return (rp);
+}
+
+void
+rio_destroy(rio_t *rp)
+{
+       wtab_destroy(rp->rio_wtab);
+       free(rp, M_ALTQ);
+}
+
+void
+rio_getstats(rio_t *rp, struct redstats *sp)
+{
+       int i;
+
+       for (i = 0; i < RIO_NDROPPREC; i++) {
+               bcopy(&rp->q_stats[i], sp, sizeof(struct redstats));
+               sp->q_avg = rp->rio_precstate[i].avg >> rp->rio_wshift;
+               sp++;
+       }
+}
+
+#if (RIO_NDROPPREC == 3)
+/*
+ * internally, a drop precedence value is converted to an index
+ * starting from 0.
+ */
+static int
+dscp2index(uint8_t dscp)
+{
+       int dpindex = dscp & AF_DROPPRECMASK;
+
+       if (dpindex == 0)
+               return (0);
+       return ((dpindex >> 3) - 1);
+}
+#endif
+
+#if 1
+/*
+ * kludge: when a packet is dequeued, we need to know its drop precedence
+ * in order to maintain the queue length of each drop precedence.
+ * use m_pkthdr.rcvif to pass this info.
+ */
+#define        RIOM_SET_PRECINDEX(m, idx)      \
+       do { (m)->m_pkthdr.rcvif = (struct ifnet *)((long)(idx)); } while (0)
+#define        RIOM_GET_PRECINDEX(m)   \
+       ({ long idx; idx = (long)((m)->m_pkthdr.rcvif); \
+       (m)->m_pkthdr.rcvif = NULL; idx; })
+#endif
+
+int
+rio_addq(rio_t *rp, class_queue_t *q, struct mbuf *m, struct altq_pktattr *pktattr)
+{
+       int avg, droptype;
+       uint8_t dsfield, odsfield;
+       int dpindex, i, n, t;
+       struct timeval now;
+       struct dropprec_state *prec;
+
+       dsfield = odsfield = read_dsfield(m, pktattr);
+       dpindex = dscp2index(dsfield);
+
+       /*
+        * update avg of the precedence states whose drop precedence
+        * is larger than or equal to the drop precedence of the packet
+        */
+       now.tv_sec = 0;
+       for (i = dpindex; i < RIO_NDROPPREC; i++) {
+               prec = &rp->rio_precstate[i];
+               avg = prec->avg;
+               if (prec->idle) {
+                       prec->idle = 0;
+                       if (now.tv_sec == 0)
+                               microtime(&now);
+                       t = (now.tv_sec - prec->last.tv_sec);
+                       if (t > 60)
+                               avg = 0;
+                       else {
+                               t = t * 1000000 +
+                                       (now.tv_usec - prec->last.tv_usec);
+                               n = t / rp->rio_pkttime;
+                               /* calculate (avg = (1 - Wq)^n * avg) */
+                               if (n > 0)
+                                       avg = (avg >> FP_SHIFT) *
+                                               pow_w(rp->rio_wtab, n);
+                       }
+               }
+
+               /* run estimator. (avg is scaled by WEIGHT in fixed-point) */
+               avg += (prec->qlen << FP_SHIFT) - (avg >> rp->rio_wshift);
+               prec->avg = avg;                /* save the new value */
+               /*
+                * count keeps a tally of arriving traffic that has not
+                * been dropped.
+                */
+               prec->count++;
+       }
+
+       prec = &rp->rio_precstate[dpindex];
+       avg = prec->avg;
+
+       /* see if we drop early */
+       droptype = DTYPE_NODROP;
+       if (avg >= prec->th_min_s && prec->qlen > 1) {
+               if (avg >= prec->th_max_s) {
+                       /* avg >= th_max: forced drop */
+                       droptype = DTYPE_FORCED;
+               } else if (prec->old == 0) {
+                       /* first exceeds th_min */
+                       prec->count = 1;
+                       prec->old = 1;
+               } else if (drop_early((avg - prec->th_min_s) >> rp->rio_wshift,
+                                     prec->probd, prec->count)) {
+                       /* unforced drop by red */
+                       droptype = DTYPE_EARLY;
+               }
+       } else {
+               /* avg < th_min */
+               prec->old = 0;
+       }
+
+       /*
+        * if the queue length hits the hard limit, it's a forced drop.
+        */
+       if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q))
+               droptype = DTYPE_FORCED;
+
+       if (droptype != DTYPE_NODROP) {
+               /* always drop the incoming packet (as opposed to random-drop) */
+               for (i = dpindex; i < RIO_NDROPPREC; i++)
+                       rp->rio_precstate[i].count = 0;
+#ifdef RIO_STATS
+               if (droptype == DTYPE_EARLY)
+                       rp->q_stats[dpindex].drop_unforced++;
+               else
+                       rp->q_stats[dpindex].drop_forced++;
+               PKTCNTR_ADD(&rp->q_stats[dpindex].drop_cnt, m_pktlen(m));
+#endif
+               m_freem(m);
+               return (-1);
+       }
+
+       for (i = dpindex; i < RIO_NDROPPREC; i++)
+               rp->rio_precstate[i].qlen++;
+
+       /* save drop precedence index in mbuf hdr */
+       RIOM_SET_PRECINDEX(m, dpindex);
+
+       if (rp->rio_flags & RIOF_CLEARDSCP)
+               dsfield &= ~DSCP_MASK;
+
+       if (dsfield != odsfield)
+               write_dsfield(m, pktattr, dsfield);
+
+       _addq(q, m);
+
+#ifdef RIO_STATS
+       PKTCNTR_ADD(&rp->q_stats[dpindex].xmit_cnt, m_pktlen(m));
+#endif
+       return (0);
+}
+
+struct mbuf *
+rio_getq(rio_t *rp, class_queue_t *q)
+{
+       struct mbuf *m;
+       int dpindex, i;
+
+       if ((m = _getq(q)) == NULL)
+               return (NULL);
+
+       dpindex = RIOM_GET_PRECINDEX(m);
+       for (i = dpindex; i < RIO_NDROPPREC; i++) {
+               if (--rp->rio_precstate[i].qlen == 0) {
+                       if (rp->rio_precstate[i].idle == 0) {
+                               rp->rio_precstate[i].idle = 1;
+                               microtime(&rp->rio_precstate[i].last);
+                       }
+               }
+       }
+       return (m);
+}
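+/*
+ * note: rio_addq() bumped rio_precstate[i].qlen for every precedence
+ * i >= the packet's own, so the decrement loop above must walk the
+ * same range; the index saved by RIOM_SET_PRECINDEX() at enqueue time
+ * tells it where to start.
+ */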
+
+#endif /* ALTQ_RIO */
diff --git a/sys/net/altq/altq_rio.h b/sys/net/altq/altq_rio.h
new file mode 100644 (file)
index 0000000..d5c23c9
--- /dev/null
@@ -0,0 +1,94 @@
+/*     $KAME: altq_rio.h,v 1.9 2003/07/10 12:07:49 kjc Exp $   */
+/*     $DragonFly: src/sys/net/altq/altq_rio.h,v 1.1 2005/02/11 22:25:57 joerg Exp $ */
+
+/*
+ * Copyright (C) 1998-2003
+ *     Sony Computer Science Laboratories Inc.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _ALTQ_ALTQ_RIO_H_
+#define        _ALTQ_ALTQ_RIO_H_
+
+#include <net/altq/altq_classq.h>
+
+/*
+ * RIO: RED with IN/OUT bit
+ * (extended to support more than 2 drop precedence values)
+ */
+#define        RIO_NDROPPREC   3       /* number of drop precedence values */
+
+/* rio flags */
+#define        RIOF_ECN4       0x01    /* use packet marking for IPv4 packets */
+#define        RIOF_ECN6       0x02    /* use packet marking for IPv6 packets */
+#define        RIOF_ECN        (RIOF_ECN4 | RIOF_ECN6)
+#define        RIOF_CLEARDSCP  0x200   /* clear diffserv codepoint */
+
+#ifdef _KERNEL
+
+typedef struct rio {
+       /* per drop precedence structure */
+       struct dropprec_state {
+               /* red parameters */
+               int     inv_pmax;       /* inverse of max drop probability */
+               int     th_min;         /* red min threshold */
+               int     th_max;         /* red max threshold */
+
+               /* variables for internal use */
+               int     th_min_s;       /* th_min scaled by avgshift */
+               int     th_max_s;       /* th_max scaled by avgshift */
+               int     probd;          /* drop probability denominator */
+
+               int     qlen;           /* queue length */
+               int     avg;            /* (scaled) queue length average */
+               int     count;          /* packet count since the last dropped/
+                                          marked packet */
+               int     idle;           /* queue was empty */
+               int     old;            /* avg is above th_min */
+               struct timeval  last;   /* timestamp when queue becomes idle */
+       } rio_precstate[RIO_NDROPPREC];
+
+       int              rio_wshift;    /* log(red_weight) */
+       int              rio_weight;    /* weight for EWMA */
+       struct wtab     *rio_wtab;      /* weight table */
+
+       int              rio_pkttime;   /* average packet time in micro sec
+                                          used for idle calibration */
+       int              rio_flags;     /* rio flags */
+
+       uint8_t          rio_codepoint; /* codepoint value to tag packets */
+       uint8_t          rio_codepointmask;     /* codepoint mask bits */
+
+       struct redstats q_stats[RIO_NDROPPREC]; /* statistics */
+} rio_t;
+
+rio_t          *rio_alloc(int, struct redparams *, int, int);
+void            rio_destroy(rio_t *);
+void            rio_getstats(rio_t *, struct redstats *);
+int             rio_addq(rio_t *, class_queue_t *, struct mbuf *,
+                         struct altq_pktattr *);
+struct mbuf    *rio_getq(rio_t *, class_queue_t *);
+
+#endif /* _KERNEL */
+
+#endif /* _ALTQ_ALTQ_RIO_H_ */
diff --git a/sys/net/altq/altq_rmclass.c b/sys/net/altq/altq_rmclass.c
new file mode 100644 (file)
index 0000000..f800618
--- /dev/null
@@ -0,0 +1,1652 @@
+/*     $KAME: altq_rmclass.c,v 1.18 2003/11/06 06:32:53 kjc Exp $      */
+/*     $DragonFly: src/sys/net/altq/altq_rmclass.c,v 1.1 2005/02/11 22:25:57 joerg Exp $ */
+
+/*
+ * Copyright (c) 1991-1997 Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *      This product includes software developed by the Network Research
+ *      Group at Lawrence Berkeley Laboratory.
+ * 4. Neither the name of the University nor of the Laboratory may be used
+ *    to endorse or promote products derived from this software without
+ *    specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * LBL code modified by speer@eng.sun.com, May 1997.
+ * For questions and/or comments, please send mail to cbq@ee.lbl.gov
+ */
+
+#ident "@(#)rm_class.c  1.48     97/12/05 SMI"
+
+#include "opt_altq.h"
+#include "opt_inet.h"
+#include "opt_inet6.h"
+
+#ifdef ALTQ_CBQ        /* cbq is enabled by ALTQ_CBQ option in opt_altq.h */
+
+#include <sys/param.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/socket.h>
+#include <sys/systm.h>
+#include <sys/callout.h>
+#include <sys/errno.h>
+#include <sys/time.h>
+
+#include <net/if.h>
+
+#include <net/altq/altq.h>
+#include <net/altq/altq_rmclass.h>
+#include <net/altq/altq_rmclass_debug.h>
+#include <net/altq/altq_red.h>
+#include <net/altq/altq_rio.h>
+
+#ifdef CBQ_TRACE
+static struct cbqtrace cbqtrace_buffer[NCBQTRACE+1];
+static struct cbqtrace *cbqtrace_ptr = NULL;
+static int cbqtrace_count;
+#endif
+
+/*
+ * Local Macros
+ */
+
+#define        reset_cutoff(ifd)       { ifd->cutoff_ = RM_MAXDEPTH; }
+
+/*
+ * Local routines.
+ */
+
+static int     rmc_satisfied(struct rm_class *, struct timeval *);
+static void    rmc_wrr_set_weights(struct rm_ifdat *);
+static void    rmc_depth_compute(struct rm_class *);
+static void    rmc_depth_recompute(rm_class_t *);
+
+static struct mbuf *_rmc_wrr_dequeue_next(struct rm_ifdat *, int);
+static struct mbuf *_rmc_prr_dequeue_next(struct rm_ifdat *, int);
+
+static int     _rmc_addq(rm_class_t *, struct mbuf *);
+static void    _rmc_dropq(rm_class_t *);
+static struct mbuf *_rmc_getq(rm_class_t *);
+static struct mbuf *_rmc_pollq(rm_class_t *);
+
+static int     rmc_under_limit(struct rm_class *, struct timeval *);
+static void    rmc_tl_satisfied(struct rm_ifdat *, struct timeval *);
+static void    rmc_drop_action(struct rm_class *);
+static void    rmc_restart(void *);
+static void    rmc_root_overlimit(struct rm_class *, struct rm_class *);
+
+#define        BORROW_OFFTIME
+/*
+ * BORROW_OFFTIME (experimental):
+ * borrow the offtime of the class being borrowed from.
+ * the reason is that when its own offtime is set, the class is unable
+ * to borrow much, especially when cutoff is taking effect.
+ * but when the borrowed-from class is overloaded (avgidle is close to
+ * minidle), use the borrowing class's own offtime to avoid overload.
+ */
+#define        ADJUST_CUTOFF
+/*
+ * ADJUST_CUTOFF (experimental):
+ * if no underlimit class is found due to cutoff, increase cutoff and
+ * retry the scheduling loop.
+ * also, don't invoke delay_actions while cutoff is taking effect,
+ * since a sleeping class won't have a chance to be scheduled in the
+ * next loop.
+ *
+ * the heuristics for setting the top-level variable (cutoff_) now become:
+ *     1. if a packet arrives for a not-overlimit class, set cutoff
+ *        to the depth of the class.
+ *     2. if cutoff is i, and a packet arrives for an overlimit class
+ *        with an underlimit ancestor at a lower level than i (say j),
+ *        then set cutoff to j.
+ *     3. when scheduling a packet, if there is no underlimit class
+ *        due to the current cutoff level, increase cutoff by 1 and
+ *        then try to schedule again.
+ */
+
+/*
+ * rm_class_t *
+ * rmc_newclass(...) - Create a new resource management class at priority
+ * 'pri' on the interface given by 'ifd'.
+ *
+ * nsecPerByte  is the data rate of the interface in nanoseconds/byte.
+ *              E.g., 800 for a 10Mb/s ethernet.  If the class gets less
+ *              than 100% of the bandwidth, this number should be the
+ *              'effective' rate for the class.  Let f be the
+ *              bandwidth fraction allocated to this class, and let
+ *              nsPerByte be the data rate of the output link in
+ *              nanoseconds/byte.  Then nsecPerByte is set to
+ *              nsPerByte / f.  E.g., 1600 (= 800 / .5)
+ *              for a class that gets 50% of an ethernet's bandwidth.
+ *
+ * action       the routine to call when the class is over limit.
+ *
+ * maxq         max allowable queue size for class (in packets).
+ *
+ * parent       parent class pointer.
+ *
+ * borrow       class to borrow from (should be either 'parent' or null).
+ *
+ * maxidle      max value allowed for class 'idle' time estimate (this
+ *              parameter determines how large an initial burst of packets
+ *              can be before the overlimit action is invoked).
+ *
+ * offtime      how long 'delay' action will delay when class goes over
+ *              limit (this parameter determines the steady-state burst
+ *              size when a class is running over its limit).
+ *
+ * Maxidle and offtime have to be computed from the following:  If the
+ * average packet size is s, the bandwidth fraction allocated to this
+ * class is f, we want to allow b packet bursts, and the gain of the
+ * averaging filter is g (= 1 - 2^(-RM_FILTER_GAIN)), then:
+ *
+ *   ptime = s * nsPerByte * (1 - f) / f
+ *   maxidle = ptime * (1 - g^b) / g^b
+ *   minidle = -ptime * (1 / (f - 1))
+ *   offtime = ptime * (1 + 1/(1 - g) * (1 - g^(b - 1)) / g^(b - 1))
+ *
+ * Operationally, it's convenient to specify maxidle & offtime in units
+ * independent of the link bandwidth so the maxidle & offtime passed to
+ * this routine are the above values multiplied by 8*f/(1000*nsPerByte).
+ * (The constant factor is a scale factor needed to make the parameters
+ * integers.  This scaling also means that the 'unscaled' values of
+ * maxidle*nsecPerByte/8 and offtime*nsecPerByte/8 will be in microseconds,
+ * not nanoseconds.)  Also note that the 'idle' filter computation keeps
+ * an estimate scaled upward by 2^RM_FILTER_GAIN so the passed value of
+ * maxidle also must be scaled upward by this value.  Thus, the passed
+ * values for maxidle and offtime can be computed as follows:
+ *
+ * maxidle = maxidle * 2^RM_FILTER_GAIN * 8 / (1000 * nsecPerByte)
+ * offtime = offtime * 8 / (1000 * nsecPerByte)
+ *
+ * When USE_HRTIME is employed, then maxidle and offtime become:
+ *     maxidle = maxidle * (8.0 / nsecPerByte);
+ *     offtime = offtime * (8.0 / nsecPerByte);
+ */
+struct rm_class *
+rmc_newclass(int pri, struct rm_ifdat *ifd, u_int nsecPerByte,
+            void (*action)(rm_class_t *, rm_class_t *), int maxq,
+            struct rm_class *parent, struct rm_class *borrow, u_int maxidle,
+            int minidle, u_int offtime, int pktsize, int flags)
+{
+       struct rm_class *cl;
+       struct rm_class *peer;
+       int s;
+
+       if (pri >= RM_MAXPRIO)
+               return (NULL);
+#ifndef ALTQ_RED
+       if (flags & RMCF_RED) {
+#ifdef ALTQ_DEBUG
+               printf("rmc_newclass: RED not configured for CBQ!\n");
+#endif
+               return (NULL);
+       }
+#endif
+#ifndef ALTQ_RIO
+       if (flags & RMCF_RIO) {
+#ifdef ALTQ_DEBUG
+               printf("rmc_newclass: RIO not configured for CBQ!\n");
+#endif
+               return (NULL);
+       }
+#endif
+
+       cl = malloc(sizeof(*cl), M_ALTQ, M_WAITOK | M_ZERO);
+       callout_init(&cl->callout_);
+       cl->q_ = malloc(sizeof(*cl->q_), M_ALTQ, M_WAITOK | M_ZERO);
+
+       /*
+        * Class initialization.
+        */
+       cl->children_ = NULL;
+       cl->parent_ = parent;
+       cl->borrow_ = borrow;
+       cl->leaf_ = 1;
+       cl->ifdat_ = ifd;
+       cl->pri_ = pri;
+       cl->allotment_ = RM_NS_PER_SEC / nsecPerByte; /* Bytes per sec */
+       cl->depth_ = 0;
+       cl->qthresh_ = 0;
+       cl->ns_per_byte_ = nsecPerByte;
+
+       qlimit(cl->q_) = maxq;
+       qtype(cl->q_) = Q_DROPHEAD;
+       qlen(cl->q_) = 0;
+       cl->flags_ = flags;
+
+#if 1 /* minidle is also scaled in ALTQ */
+       cl->minidle_ = (minidle * (int)nsecPerByte) / 8;
+       if (cl->minidle_ > 0)
+               cl->minidle_ = 0;
+#else
+       cl->minidle_ = minidle;
+#endif
+       cl->maxidle_ = (maxidle * nsecPerByte) / 8;
+       if (cl->maxidle_ == 0)
+               cl->maxidle_ = 1;
+#if 1 /* offtime is also scaled in ALTQ */
+       cl->avgidle_ = cl->maxidle_;
+       cl->offtime_ = ((offtime * nsecPerByte) / 8) >> RM_FILTER_GAIN;
+       if (cl->offtime_ == 0)
+               cl->offtime_ = 1;
+#else
+       cl->avgidle_ = 0;
+       cl->offtime_ = (offtime * nsecPerByte) / 8;
+#endif
+       cl->overlimit = action;
+
+#ifdef ALTQ_RED
+       if (flags & (RMCF_RED|RMCF_RIO)) {
+               int red_flags, red_pkttime;
+
+               red_flags = 0;
+               if (flags & RMCF_ECN)
+                       red_flags |= REDF_ECN;
+#ifdef ALTQ_RIO
+               if (flags & RMCF_CLEARDSCP)
+                       red_flags |= RIOF_CLEARDSCP;
+#endif
+               red_pkttime = nsecPerByte * pktsize  / 1000;
+
+               if (flags & RMCF_RED) {
+                       cl->red_ = red_alloc(0, 0,
+                           qlimit(cl->q_) * 10/100,
+                           qlimit(cl->q_) * 30/100,
+                           red_flags, red_pkttime);
+                       if (cl->red_ != NULL)
+                               qtype(cl->q_) = Q_RED;
+               }
+#ifdef ALTQ_RIO
+               else {
+                       cl->red_ = (red_t *)rio_alloc(0, NULL,
+                                                     red_flags, red_pkttime);
+                       if (cl->red_ != NULL)
+                               qtype(cl->q_) = Q_RIO;
+               }
+#endif
+       }
+#endif /* ALTQ_RED */
+
+       /*
+        * put the class into the class tree
+        */
+       s = splimp();
+       if ((peer = ifd->active_[pri]) != NULL) {
+               /* find the last class at this pri */
+               cl->peer_ = peer;
+               while (peer->peer_ != ifd->active_[pri])
+                       peer = peer->peer_;
+               peer->peer_ = cl;
+       } else {
+               ifd->active_[pri] = cl;
+               cl->peer_ = cl;
+       }
+
+       if (cl->parent_) {
+               cl->next_ = parent->children_;
+               parent->children_ = cl;
+               parent->leaf_ = 0;
+       }
+
+       /*
+        * Compute the depth of this class and its ancestors in the class
+        * hierarchy.
+        */
+       rmc_depth_compute(cl);
+
+       /*
+        * If CBQ's WRR is enabled, then initialize the class WRR state.
+        */
+       if (ifd->wrr_) {
+               ifd->num_[pri]++;
+               ifd->alloc_[pri] += cl->allotment_;
+               rmc_wrr_set_weights(ifd);
+       }
+       splx(s);
+       return (cl);
+}
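+
+/*
+ * A worked instance of the maxidle/offtime computations documented above
+ * (illustrative only; it assumes RM_FILTER_GAIN is 5, i.e. g = 31/32).
+ * For an average packet size s = 1000 bytes on a 10Mb/s link
+ * (nsPerByte = 800), a 50% bandwidth share (f = 0.5, so nsecPerByte =
+ * 1600) and a burst allowance b = 16:
+ *
+ *   ptime   = 1000 * 800 * (1 - 0.5) / 0.5            = 800000 ns
+ *   maxidle = 800000 * (1 - g^16) / g^16             ~= 529000 ns
+ *   offtime = 800000 * (1 + 32 * (1 - g^15) / g^15)  ~= 16.4 ms
+ *
+ * and the scaled values actually passed to rmc_newclass() come out as
+ * approximately
+ *
+ *   maxidle = 529000 * 2^5 * 8 / (1000 * 1600)       ~= 85
+ *   offtime = 16400000 * 8 / (1000 * 1600)           ~= 82
+ */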
+
+int
+rmc_modclass(struct rm_class *cl, u_int nsecPerByte, int maxq, u_int maxidle,
+            int minidle, u_int offtime, int pktsize)
+{
+       struct rm_ifdat *ifd;
+       u_int old_allotment;
+       int s;
+
+       ifd = cl->ifdat_;
+       old_allotment = cl->allotment_;
+
+       s = splimp();
+       cl->allotment_ = RM_NS_PER_SEC / nsecPerByte; /* Bytes per sec */
+       cl->qthresh_ = 0;
+       cl->ns_per_byte_ = nsecPerByte;
+
+       qlimit(cl->q_) = maxq;
+
+#if 1 /* minidle is also scaled in ALTQ */
+       cl->minidle_ = (minidle * (int)nsecPerByte) / 8;
+       if (cl->minidle_ > 0)
+               cl->minidle_ = 0;
+#else
+       cl->minidle_ = minidle;
+#endif
+       cl->maxidle_ = (maxidle * nsecPerByte) / 8;
+       if (cl->maxidle_ == 0)
+               cl->maxidle_ = 1;
+#if 1 /* offtime is also scaled in ALTQ */
+       cl->avgidle_ = cl->maxidle_;
+       cl->offtime_ = ((offtime * nsecPerByte) / 8) >> RM_FILTER_GAIN;
+       if (cl->offtime_ == 0)
+               cl->offtime_ = 1;
+#else
+       cl->avgidle_ = 0;
+       cl->offtime_ = (offtime * nsecPerByte) / 8;
+#endif
+
+       /*
+        * If CBQ's WRR is enabled, then initialize the class WRR state.
+        */
+       if (ifd->wrr_) {
+               ifd->alloc_[cl->pri_] += cl->allotment_ - old_allotment;
+               rmc_wrr_set_weights(ifd);
+       }
+       splx(s);
+       return (0);
+}
+
+/*
+ * static void
+ * rmc_wrr_set_weights(struct rm_ifdat *ifdat) - This function computes
+ *     the appropriate round-robin weights for the CBQ weighted round-robin
+ *     algorithm.
+ *
+ *     Returns: NONE
+ */
+
+static void
+rmc_wrr_set_weights(struct rm_ifdat *ifd)
+{
+       int i;
+       struct rm_class *cl, *clh;
+
+       for (i = 0; i < RM_MAXPRIO; i++) {
+               /*
+                * This is inverted from that of the simulator to
+                * maintain precision.
+                */
+               if (ifd->num_[i] == 0)
+                       ifd->M_[i] = 0;
+               else
+                       ifd->M_[i] = ifd->alloc_[i] /
+                               (ifd->num_[i] * ifd->maxpkt_);
+               /*
+                * Compute the weighted allotment for each class.
+                * This takes the expensive div instruction out
+                * of the main loop for the wrr scheduling path.
+                * These only get recomputed when a class comes or
+                * goes.
+                */
+               if (ifd->active_[i] != NULL) {
+                       clh = cl = ifd->active_[i];
+                       do {
+                               /* safe-guard for slow link or alloc_ == 0 */
+                               if (ifd->M_[i] == 0)
+                                       cl->w_allotment_ = 0;
+                               else
+                                       cl->w_allotment_ = cl->allotment_ /
+                                               ifd->M_[i];
+                               cl = cl->peer_;
+                       } while ((cl != NULL) && (cl != clh));
+               }
+       }
+}
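+
+/*
+ * Illustrative instance (not from the original source): two classes at
+ * one priority with allotments of 600000 and 400000 bytes/sec and
+ * maxpkt_ = 1500 give M = 1000000 / (2 * 1500) = 333, hence per-round
+ * weighted allotments of 600000 / 333 = 1801 and 400000 / 333 = 1201
+ * bytes respectively.
+ */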
+
+int
+rmc_get_weight(struct rm_ifdat *ifd, int pri)
+{
+       if ((pri >= 0) && (pri < RM_MAXPRIO))
+               return (ifd->M_[pri]);
+       else
+               return (0);
+}
+
+/*
+ * static void
+ * rmc_depth_compute(struct rm_class *cl) - This function computes the
+ *     appropriate depth of class 'cl' and its ancestors.
+ *
+ *     Returns:        NONE
+ */
+
+static void
+rmc_depth_compute(struct rm_class *cl)
+{
+       rm_class_t *t = cl, *p;
+
+       /*
+        * Recompute the depth for the branch of the tree.
+        */
+       while (t != NULL) {
+               p = t->parent_;
+               if (p && (t->depth_ >= p->depth_)) {
+                       p->depth_ = t->depth_ + 1;
+                       t = p;
+               } else
+                       t = NULL;
+       }
+}
+
+/*
+ * static void
+ * rmc_depth_recompute(struct rm_class *cl) - This function re-computes
+ *     the depth of the tree after a class has been deleted.
+ *
+ *     Returns:        NONE
+ */
+
+static void
+rmc_depth_recompute(rm_class_t *cl)
+{
+#if 1 /* ALTQ */
+       rm_class_t *p, *t;
+
+       p = cl;
+       while (p != NULL) {
+               if ((t = p->children_) == NULL) {
+                       p->depth_ = 0;
+               } else {
+                       int cdepth = 0;
+
+                       while (t != NULL) {
+                               if (t->depth_ > cdepth)
+                                       cdepth = t->depth_;
+                               t = t->next_;
+                       }
+
+                       if (p->depth_ == cdepth + 1)
+                               /* no change to this parent */
+                               return;
+
+                       p->depth_ = cdepth + 1;
+               }
+
+               p = p->parent_;
+       }
+#else
+       rm_class_t      *t;
+
+       if (cl->depth_ >= 1) {
+               if (cl->children_ == NULL) {
+                       cl->depth_ = 0;
+               } else if ((t = cl->children_) != NULL) {
+                       while (t != NULL) {
+                               if (t->children_ != NULL)
+                                       rmc_depth_recompute(t);
+                               t = t->next_;
+                       }
+               } else
+                       rmc_depth_compute(cl);
+       }
+#endif
+}
+
+/*
+ * void
+ * rmc_delete_class(struct rm_ifdat *ifdat, struct rm_class *cl) - This
+ *     function deletes a class from the link-sharing structure and frees
+ *     all resources associated with the class.
+ *
+ *     Returns: NONE
+ */
+
+void
+rmc_delete_class(struct rm_ifdat *ifd, struct rm_class *cl)
+{
+       struct rm_class *p, *head, *previous;
+       int s;
+
+       KKASSERT(cl->children_ == NULL);
+
+       if (cl->sleeping_)
+               callout_stop(&cl->callout_);
+
+       s = splimp();
+       /*
+        * Free packets in the packet queue.
+        * XXX - this may not be a desired behavior.  Packets should be
+        *              re-queued.
+        */
+       rmc_dropall(cl);
+
+       /*
+        * If the class has a parent, then remove the class from the
+        * parent's children chain.
+        */
+       if (cl->parent_ != NULL) {
+               head = cl->parent_->children_;
+               p = previous = head;
+               if (head->next_ == NULL) {
+                       KKASSERT(head == cl);
+                       cl->parent_->children_ = NULL;
+                       cl->parent_->leaf_ = 1;
+               } else while (p != NULL) {
+                       if (p == cl) {
+                               if (cl == head)
+                                       cl->parent_->children_ = cl->next_;
+                               else
+                                       previous->next_ = cl->next_;
+                               cl->next_ = NULL;
+                               p = NULL;
+                       } else {
+                               previous = p;
+                               p = p->next_;
+                       }
+               }
+       }
+
+       /*
+        * Delete class from class priority peer list.
+        */
+       if ((p = ifd->active_[cl->pri_]) != NULL) {
+               /*
+                * If there is more than one member of this priority
+                * level, then look for class(cl) in the priority level.
+                */
+               if (p != p->peer_) {
+                       while (p->peer_ != cl)
+                               p = p->peer_;
+                       p->peer_ = cl->peer_;
+
+                       if (ifd->active_[cl->pri_] == cl)
+                               ifd->active_[cl->pri_] = cl->peer_;
+               } else {
+                       KKASSERT(p == cl);
+                       ifd->active_[cl->pri_] = NULL;
+               }
+       }
+
+       /*
+        * Recompute the WRR weights.
+        */
+       if (ifd->wrr_) {
+               ifd->alloc_[cl->pri_] -= cl->allotment_;
+               ifd->num_[cl->pri_]--;
+               rmc_wrr_set_weights(ifd);
+       }
+
+       /*
+        * Re-compute the depth of the tree.
+        */
+#if 1 /* ALTQ */
+       rmc_depth_recompute(cl->parent_);
+#else
+       rmc_depth_recompute(ifd->root_);
+#endif
+
+       splx(s);
+
+       /*
+        * Free the class structure.
+        */
+       if (cl->red_ != NULL) {
+#ifdef ALTQ_RIO
+               if (q_is_rio(cl->q_))
+                       rio_destroy((rio_t *)cl->red_);
+#endif
+#ifdef ALTQ_RED
+               if (q_is_red(cl->q_))
+                       red_destroy(cl->red_);
+#endif
+       }
+       free(cl->q_, M_ALTQ);
+       free(cl, M_ALTQ);
+}
+
+/*
+ * void
+ * rmc_init(...) - Initialize the resource management data structures
+ *     associated with the output portion of interface 'ifp'.  'ifd' is
+ *     where the structures will be built (for backwards compatibility, the
+ *     structures aren't kept in the ifnet struct).  'nsecPerByte'
+ *     gives the link speed (inverse of bandwidth) in nanoseconds/byte.
+ *     'restart' is the driver-specific routine that the generic 'delay
+ *     until under limit' action will call to restart output.  `maxq'
+ *     is the queue size of the 'link' & 'default' classes.  'maxqueued'
+ *     is the maximum number of packets that the resource management
+ *     code will allow to be queued 'downstream' (this is typically 1).
+ *
+ *     Returns:        NONE
+ */
+
+void
+rmc_init(struct ifaltq *ifq, struct rm_ifdat *ifd, u_int nsecPerByte,
+         void (*restart)(struct ifaltq *), int maxq, int maxqueued, u_int maxidle,
+        int minidle, u_int offtime, int flags)
+{
+       int i, mtu;
+
+       /*
+        * Initialize the CBQ tracing/debug facility.
+        */
+       CBQTRACEINIT();
+
+       bzero(ifd, sizeof (*ifd));
+       mtu = ifq->altq_ifp->if_mtu;
+       ifd->ifq_ = ifq;
+       ifd->restart = restart;
+       ifd->maxqueued_ = maxqueued;
+       ifd->ns_per_byte_ = nsecPerByte;
+       ifd->maxpkt_ = mtu;
+       ifd->wrr_ = (flags & RMCF_WRR) ? 1 : 0;
+       ifd->efficient_ = (flags & RMCF_EFFICIENT) ? 1 : 0;
+#if 1
+       ifd->maxiftime_ = mtu * nsecPerByte / 1000 * 16;
+       if (mtu * nsecPerByte > 10 * 1000000)
+               ifd->maxiftime_ /= 4;
+#endif
+
+       reset_cutoff(ifd);
+       CBQTRACE(rmc_init, 'INIT', ifd->cutoff_);
+
+       /*
+        * Initialize the CBQ's WRR state.
+        */
+       for (i = 0; i < RM_MAXPRIO; i++) {
+               ifd->alloc_[i] = 0;
+               ifd->M_[i] = 0;
+               ifd->num_[i] = 0;
+               ifd->na_[i] = 0;
+               ifd->active_[i] = NULL;
+       }
+
+       /*
+        * Initialize current packet state.
+        */
+       ifd->qi_ = 0;
+       ifd->qo_ = 0;
+       for (i = 0; i < RM_MAXQUEUED; i++) {
+               ifd->class_[i] = NULL;
+               ifd->curlen_[i] = 0;
+               ifd->borrowed_[i] = NULL;
+       }
+
+       /*
+        * Create the root class of the link-sharing structure.
+        */
+       ifd->root_ = rmc_newclass(0, ifd, nsecPerByte, rmc_root_overlimit,
+                                 maxq, 0, 0, maxidle, minidle, offtime, 0, 0);
+       if (ifd->root_ == NULL) {
+               printf("rmc_init: root class not allocated\n");
+               return;
+       }
+       ifd->root_->depth_ = 0;
+}
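+
+/*
+ * Minimal usage sketch (hypothetical; 'my_ifq' and 'my_restart' are
+ * placeholder names, and the idle parameters would be computed as
+ * documented above rmc_newclass()).  A CBQ-style discipline on a
+ * 10Mb/s link (800 ns/byte) might bring up its link-sharing structure
+ * roughly like this:
+ *
+ *     struct rm_ifdat ifd;
+ *
+ *     rmc_init(my_ifq, &ifd, 800, my_restart, 50, 1,
+ *              maxidle, minidle, offtime, RMCF_WRR);
+ *     if (ifd.root_ == NULL)
+ *             return;         / * root class not allocated * /
+ */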
+
+/*
+ * void
+ * rmc_queue_packet(struct rm_class *cl, struct mbuf *m) - Add packet given by
+ *     mbuf 'm' to queue for resource class 'cl'.  This routine is called
+ *     by a driver's if_output routine.  This routine must be called with
+ *     output packet completion interrupts locked out (to avoid racing with
+ *     rmc_dequeue_next).
+ *
+ *     Returns:        0 on successful queueing
+ *                     -1 when packet drop occurs
+ */
+int
+rmc_queue_packet(struct rm_class *cl, struct mbuf *m)
+{
+       struct timeval now;
+       struct rm_ifdat *ifd = cl->ifdat_;
+       int cpri = cl->pri_;
+       int is_empty = qempty(cl->q_);
+
+       RM_GETTIME(now);
+       if (ifd->cutoff_ > 0) {
+               if (TV_LT(&cl->undertime_, &now)) {
+                       if (ifd->cutoff_ > cl->depth_)
+                               ifd->cutoff_ = cl->depth_;
+                       CBQTRACE(rmc_queue_packet, 'ffoc', cl->depth_);
+               }
+#if 1 /* ALTQ */
+               else {
+                       /*
+                        * the class is overlimit. if the class has
+                        * underlimit ancestors, set cutoff to the lowest
+                        * depth among them.
+                        */
+                       struct rm_class *borrow = cl->borrow_;
+
+                       while (borrow != NULL &&
+                              borrow->depth_ < ifd->cutoff_) {
+                               if (TV_LT(&borrow->undertime_, &now)) {
+                                       ifd->cutoff_ = borrow->depth_;
+                                       CBQTRACE(rmc_queue_packet, 'ffob', ifd->cutoff_);
+                                       break;
+                               }
+                               borrow = borrow->borrow_;
+                       }
+               }
+#else /* !ALTQ */
+               else if ((ifd->cutoff_ > 1) && cl->borrow_) {
+                       if (TV_LT(&cl->borrow_->undertime_, &now)) {
+                               ifd->cutoff_ = cl->borrow_->depth_;
+                               CBQTRACE(rmc_queue_packet, 'ffob',
+                                        cl->borrow_->depth_);
+                       }
+               }
+#endif /* !ALTQ */
+       }
+
+       if (_rmc_addq(cl, m) < 0)
+               /* failed */
+               return (-1);
+
+       if (is_empty) {
+               CBQTRACE(rmc_queue_packet, 'ytpe', cl->stats_.handle);
+               ifd->na_[cpri]++;
+       }
+
+       if (qlen(cl->q_) > qlimit(cl->q_)) {
+               /* note: qlimit can be set to 0 or 1 */
+               rmc_drop_action(cl);
+               return (-1);
+       }
+       return (0);
+}
+
+/*
+ * void
+ * rmc_tl_satisfied(struct rm_ifdat *ifd, struct timeval *now) - Check all
+ *     classes to see if they are satisfied.
+ */
+
+static void
+rmc_tl_satisfied(struct rm_ifdat *ifd, struct timeval *now)
+{
+       int i;
+       rm_class_t *p, *bp;
+
+       for (i = RM_MAXPRIO - 1; i >= 0; i--) {
+               if ((bp = ifd->active_[i]) != NULL) {
+                       p = bp;
+                       do {
+                               if (!rmc_satisfied(p, now)) {
+                                       ifd->cutoff_ = p->depth_;
+                                       return;
+                               }
+                               p = p->peer_;
+                       } while (p != bp);
+               }
+       }
+
+       reset_cutoff(ifd);
+}
+
+/*
+ * rmc_satisfied - Return 1 if the class is satisfied, 0 otherwise.
+ */
+
+static int
+rmc_satisfied(struct rm_class *cl, struct timeval *now)
+{
+       rm_class_t *p;
+
+       if (cl == NULL)
+               return (1);
+       if (TV_LT(now, &cl->undertime_))
+               return (1);
+       if (cl->depth_ == 0) {
+               if (!cl->sleeping_ && (qlen(cl->q_) > cl->qthresh_))
+                       return (0);
+               else
+                       return (1);
+       }
+       if (cl->children_ != NULL) {
+               p = cl->children_;
+               while (p != NULL) {
+                       if (!rmc_satisfied(p, now))
+                               return (0);
+                       p = p->next_;
+               }
+       }
+
+       return (1);
+}
+
+/*
+ * Return 1 if class 'cl' is under limit or can borrow from a parent,
+ * 0 if overlimit.  As a side-effect, this routine will invoke the
+ * class overlimit action if the class is overlimit.
+ */
+
+static int
+rmc_under_limit(struct rm_class *cl, struct timeval *now)
+{
+       rm_class_t *p = cl;
+       rm_class_t *top;
+       struct rm_ifdat *ifd = cl->ifdat_;
+
+       ifd->borrowed_[ifd->qi_] = NULL;
+       /*
+        * If cl is the root class, then always return that it is
+        * underlimit.  Otherwise, check to see if the class is underlimit.
+        */
+       if (cl->parent_ == NULL)
+               return (1);
+
+       if (cl->sleeping_) {
+               if (TV_LT(now, &cl->undertime_))
+                       return (0);
+
+               callout_stop(&cl->callout_);
+               cl->sleeping_ = 0;
+               cl->undertime_.tv_sec = 0;
+               return (1);
+       }
+
+       top = NULL;
+       while (cl->undertime_.tv_sec && TV_LT(now, &cl->undertime_)) {
+               if (((cl = cl->borrow_) == NULL) ||
+                   (cl->depth_ > ifd->cutoff_)) {
+#ifdef ADJUST_CUTOFF
+                       if (cl != NULL)
+                               /* cutoff is taking effect, just
+                                  return false without calling
+                                  the delay action. */
+                               return (0);
+#endif
+#ifdef BORROW_OFFTIME
+                       /*
+                        * check if the class can borrow offtime too.
+                        * borrow offtime from the top of the borrow
+                        * chain if the top class is not overloaded.
+                        */
+                       if (cl != NULL) {
+                               /* cutoff is taking effect, use this class as top. */
+                               top = cl;
+                               CBQTRACE(rmc_under_limit, 'ffou', ifd->cutoff_);
+                       }
+                       if (top != NULL && top->avgidle_ == top->minidle_)
+                               top = NULL;
+                       p->overtime_ = *now;
+                       (p->overlimit)(p, top);
+#else
+                       p->overtime_ = *now;
+                       (p->overlimit)(p, NULL);
+#endif
+                       return (0);
+               }
+               top = cl;
+       }
+
+       if (cl != p)
+               ifd->borrowed_[ifd->qi_] = cl;
+       return (1);
+}
+
+/*
+ * _rmc_wrr_dequeue_next() - This is the scheduler for WRR, as opposed
+ *     to packet-by-packet round robin.
+ *
+ * The heart of the weighted round-robin scheduler, which decides which
+ * class next gets to send a packet.  Highest priority first, then
+ * weighted round-robin within priorities.
+ *
+ * Each able-to-send class gets to send until its byte allocation is
+ * exhausted.  Thus, the active pointer is only changed after a class has
+ * exhausted its allocation.
+ *
+ * If the scheduler finds no class that is underlimit or able to borrow,
+ * then the first class found that had a nonzero queue and is allowed to
+ * borrow gets to send.
+ */
+
+static struct mbuf *
+_rmc_wrr_dequeue_next(struct rm_ifdat *ifd, int op)
+{
+       struct rm_class *cl = NULL, *first = NULL;
+       u_int deficit;
+       int cpri;
+       struct mbuf *m;
+       struct timeval now;
+
+       RM_GETTIME(now);
+
+       /*
+        * if the driver polls the top of the queue and then removes
+        * the polled packet, we must return the same packet.
+        */
+       if (op == ALTDQ_REMOVE && ifd->pollcache_) {
+               cl = ifd->pollcache_;
+               cpri = cl->pri_;
+               if (ifd->efficient_) {
+                       /* check if this class is overlimit */
+                       if (cl->undertime_.tv_sec != 0 &&
+                           rmc_under_limit(cl, &now) == 0)
+                               first = cl;
+               }
+               ifd->pollcache_ = NULL;
+               goto _wrr_out;
+       } else {
+               /* mode == ALTDQ_POLL || pollcache == NULL */
+               ifd->pollcache_ = NULL;
+               ifd->borrowed_[ifd->qi_] = NULL;
+       }
+#ifdef ADJUST_CUTOFF
+ _again:
+#endif
+       for (cpri = RM_MAXPRIO - 1; cpri >= 0; cpri--) {
+               if (ifd->na_[cpri] == 0)
+                       continue;
+               deficit = 0;
+               /*
+                * Loop through twice for a priority level, if some class
+                * was unable to send a packet the first round because
+                * of the weighted round-robin mechanism.
+                * During the second loop at this level, deficit==2.
+                * (This second loop is not needed if for every class,
+                * "M[cl->pri_]" times "cl->allotment" is greater than
+                * the byte size for the largest packet in the class.)
+                */
+ _wrr_loop:
+               cl = ifd->active_[cpri];
+               KKASSERT(cl != NULL);
+               do {
+                       if ((deficit < 2) && (cl->bytes_alloc_ <= 0))
+                               cl->bytes_alloc_ += cl->w_allotment_;
+                       if (!qempty(cl->q_)) {
+                               if ((cl->undertime_.tv_sec == 0) ||
+                                   rmc_under_limit(cl, &now)) {
+                                       if (cl->bytes_alloc_ > 0 || deficit > 1)
+                                               goto _wrr_out;
+
+                                       /* underlimit but no alloc */
+                                       deficit = 1;
+#if 1
+                                       ifd->borrowed_[ifd->qi_] = NULL;
+#endif
+                               }
+                               else if (first == NULL && cl->borrow_ != NULL)
+                                       first = cl; /* borrowing candidate */
+                       }
+
+                       cl->bytes_alloc_ = 0;
+                       cl = cl->peer_;
+               } while (cl != ifd->active_[cpri]);
+
+               if (deficit == 1) {
+                       /* first loop found an underlimit class with deficit */
+                       /* Loop on same priority level, with new deficit.  */
+                       deficit = 2;
+                       goto _wrr_loop;
+               }
+       }
+
+#ifdef ADJUST_CUTOFF
+       /*
+        * no underlimit class found.  if cutoff is taking effect,
+        * increase cutoff and try again.
+        */
+       if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) {
+               ifd->cutoff_++;
+               CBQTRACE(_rmc_wrr_dequeue_next, 'ojda', ifd->cutoff_);
+               goto _again;
+       }
+#endif /* ADJUST_CUTOFF */
+       /*
+        * If LINK_EFFICIENCY is turned on, then the first overlimit
+        * class we encounter will send a packet if all the classes
+        * of the link-sharing structure are overlimit.
+        */
+       reset_cutoff(ifd);
+       CBQTRACE(_rmc_wrr_dequeue_next, 'otsr', ifd->cutoff_);
+
+       if (!ifd->efficient_ || first == NULL)
+               return (NULL);
+
+       cl = first;
+       cpri = cl->pri_;
+#if 0  /* too time-consuming for nothing */
+       if (cl->sleeping_)
+               callout_stop(&cl->callout_);
+       cl->sleeping_ = 0;
+       cl->undertime_.tv_sec = 0;
+#endif
+       ifd->borrowed_[ifd->qi_] = cl->borrow_;
+       ifd->cutoff_ = cl->borrow_->depth_;
+
+       /*
+        * Dequeue the packet and do the bookkeeping...
+        */
+ _wrr_out:
+       if (op == ALTDQ_REMOVE) {
+               m = _rmc_getq(cl);
+               if (m == NULL)
+                       panic("_rmc_wrr_dequeue_next");
+               if (qempty(cl->q_))
+                       ifd->na_[cpri]--;
+
+               /*
+                * Update class statistics and link data.
+                */
+               if (cl->bytes_alloc_ > 0)
+                       cl->bytes_alloc_ -= m_pktlen(m);
+
+               if ((cl->bytes_alloc_ <= 0) || first == cl)
+                       ifd->active_[cl->pri_] = cl->peer_;
+               else
+                       ifd->active_[cl->pri_] = cl;
+
+               ifd->class_[ifd->qi_] = cl;
+               ifd->curlen_[ifd->qi_] = m_pktlen(m);
+               ifd->now_[ifd->qi_] = now;
+               ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_;
+               ifd->queued_++;
+       } else {
+               /* mode == ALTDQ_POLL */
+               m = _rmc_pollq(cl);
+               ifd->pollcache_ = cl;
+       }
+       return (m);
+}
+
+/*
+ * Dequeue & return next packet from the highest priority class that
+ * has a packet to send & has enough allocation to send it.  This
+ * routine is called by a driver whenever it needs a new packet to
+ * output.
+ */
+static struct mbuf *
+_rmc_prr_dequeue_next(struct rm_ifdat *ifd, int op)
+{
+       struct mbuf *m;
+       int cpri;
+       struct rm_class *cl, *first = NULL;
+       struct timeval now;
+
+       RM_GETTIME(now);
+
+       /*
+        * if the driver polls the top of the queue and then removes
+        * the polled packet, we must return the same packet.
+        */
+       if (op == ALTDQ_REMOVE && ifd->pollcache_) {
+               cl = ifd->pollcache_;
+               cpri = cl->pri_;
+               ifd->pollcache_ = NULL;
+               goto _prr_out;
+       } else {
+               /* mode == ALTDQ_POLL || pollcache == NULL */
+               ifd->pollcache_ = NULL;
+               ifd->borrowed_[ifd->qi_] = NULL;
+       }
+#ifdef ADJUST_CUTOFF
+ _again:
+#endif
+       for (cpri = RM_MAXPRIO - 1; cpri >= 0; cpri--) {
+               if (ifd->na_[cpri] == 0)
+                       continue;
+               cl = ifd->active_[cpri];
+               KKASSERT(cl != NULL);
+               do {
+                       if (!qempty(cl->q_)) {
+                               if ((cl->undertime_.tv_sec == 0) ||
+                                   rmc_under_limit(cl, &now))
+                                       goto _prr_out;
+                               if (first == NULL && cl->borrow_ != NULL)
+                                       first = cl;
+                       }
+                       cl = cl->peer_;
+               } while (cl != ifd->active_[cpri]);
+       }
+
+#ifdef ADJUST_CUTOFF
+       /*
+        * no underlimit class found.  if cutoff is taking effect, increase
+        * cutoff and try again.
+        */
+       if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) {
+               ifd->cutoff_++;
+               goto _again;
+       }
+#endif /* ADJUST_CUTOFF */
+       /*
+        * If LINK_EFFICIENCY is turned on, then the first overlimit
+        * class we encounter will send a packet if all the classes
+        * of the link-sharing structure are overlimit.
+        */
+       reset_cutoff(ifd);
+       if (!ifd->efficient_ || first == NULL)
+               return (NULL);
+
+       cl = first;
+       cpri = cl->pri_;
+#if 0  /* too time-consuming for nothing */
+       if (cl->sleeping_)
+               callout_stop(&cl->callout_);
+       cl->sleeping_ = 0;
+       cl->undertime_.tv_sec = 0;
+#endif
+       ifd->borrowed_[ifd->qi_] = cl->borrow_;
+       ifd->cutoff_ = cl->borrow_->depth_;
+
+       /*
+        * Dequeue the packet and do the bookkeeping...
+        */
+ _prr_out:
+       if (op == ALTDQ_REMOVE) {
+               m = _rmc_getq(cl);
+               if (m == NULL)
+                       panic("_rmc_prr_dequeue_next");
+               if (qempty(cl->q_))
+                       ifd->na_[cpri]--;
+
+               ifd->active_[cpri] = cl->peer_;
+
+               ifd->class_[ifd->qi_] = cl;
+               ifd->curlen_[ifd->qi_] = m_pktlen(m);
+               ifd->now_[ifd->qi_] = now;
+               ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_;
+               ifd->queued_++;
+       } else {
+               /* mode == ALTDQ_POLL */
+               m = _rmc_pollq(cl);
+               ifd->pollcache_ = cl;
+       }
+       return (m);
+}
+
+/*
+ * struct mbuf *
+ * rmc_dequeue_next(struct rm_ifdat *ifd, struct timeval *now) - this function
+ *     is invoked by the packet driver to get the next packet to be
+ *     dequeued and output on the link.  If WRR is enabled, then the
+ *     WRR dequeue next routine will determine the next packet to sent.
+ *     WRR dequeue next routine will determine the next packet to be sent.
+ *
+ *     Returns:        NULL, if a packet is not available or if all
+ *                     classes are overlimit.
+ *
+ *                     Otherwise, a pointer to the next packet.
+ */
+
+struct mbuf *
+rmc_dequeue_next(struct rm_ifdat *ifd, int mode)
+{
+       if (ifd->queued_ >= ifd->maxqueued_)
+               return (NULL);
+       else if (ifd->wrr_)
+               return (_rmc_wrr_dequeue_next(ifd, mode));
+       else
+               return (_rmc_prr_dequeue_next(ifd, mode));
+}
+
+/*
+ * Update the utilization estimate for the packet that just completed.
+ * The packet's class & the parent(s) of that class all get their
+ * estimators updated.  This routine is called by the driver's output-
+ * packet-completion interrupt service routine.
+ */
+
+/*
+ * a macro to approximate "divide by 1000" that gives 0.000999,
+ * if a value has enough effective digits.
+ * (on pentium, mul takes 9 cycles but div takes 46!)
+ */
+#define        NSEC_TO_USEC(t) (((t) >> 10) + ((t) >> 16) + ((t) >> 17))
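+/*
+ * NSEC_TO_USEC works because 1/1024 + 1/65536 + 1/131072 = 0.00099945...,
+ * so the three shift-and-add terms track t/1000 to within about 0.06%
+ * once t carries enough significant bits.
+ */
+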
+void
+rmc_update_class_util(struct rm_ifdat *ifd)
+{
+       int idle, avgidle, pktlen;
+       int pkt_time, tidle;
+       rm_class_t *cl, *borrowed;
+       rm_class_t *borrows;
+       struct timeval *nowp;
+
+       /*
+        * Get the most recent completed class.
+        */
+       if ((cl = ifd->class_[ifd->qo_]) == NULL)
+               return;
+
+       pktlen = ifd->curlen_[ifd->qo_];
+       borrowed = ifd->borrowed_[ifd->qo_];
+       borrows = borrowed;
+
+       PKTCNTR_ADD(&cl->stats_.xmit_cnt, pktlen);
+
+       /*
+        * Run estimator on class and its ancestors.
+        */
+       /*
+        * rmc_update_class_util is designed to be called when the
+        * transfer is completed from an xmit complete interrupt,
+        * but most drivers don't implement an upcall for that.
+        * so, just use the estimated completion time.
+        * as a result, ifd->qi_ and ifd->qo_ are always synced.
+        */
+       nowp = &ifd->now_[ifd->qo_];
+       /* get pkt_time (for link) in usec */
+#if 1  /* use approximation */
+       pkt_time = ifd->curlen_[ifd->qo_] * ifd->ns_per_byte_;
+       pkt_time = NSEC_TO_USEC(pkt_time);
+#else
+       pkt_time = ifd->curlen_[ifd->qo_] * ifd->ns_per_byte_ / 1000;
+#endif
+#if 1 /* ALTQ4PPP */
+       if (TV_LT(nowp, &ifd->ifnow_)) {
+               int iftime;
+
+               /*
+                * make sure the estimated completion time does not go
+                * too far.  it can happen when the link layer supports
+                * data compression or the interface speed is set to
+                * a much lower value.
+                */
+               TV_DELTA(&ifd->ifnow_, nowp, iftime);
+               if (iftime+pkt_time < ifd->maxiftime_) {
+                       TV_ADD_DELTA(&ifd->ifnow_, pkt_time, &ifd->ifnow_);
+               } else {
+                       TV_ADD_DELTA(nowp, ifd->maxiftime_, &ifd->ifnow_);
+               }
+       } else {
+               TV_ADD_DELTA(nowp, pkt_time, &ifd->ifnow_);
+       }
+#else
+       if (TV_LT(nowp, &ifd->ifnow_)) {
+               TV_ADD_DELTA(&ifd->ifnow_, pkt_time, &ifd->ifnow_);
+       } else {
+               TV_ADD_DELTA(nowp, pkt_time, &ifd->ifnow_);
+       }
+#endif
+
+       while (cl != NULL) {
+               TV_DELTA(&ifd->ifnow_, &cl->last_, idle);
+               if (idle >= 2000000)
+                       /*
+                        * this class is idle enough, reset avgidle.
+                        * (TV_DELTA returns 2000000 us when delta is large.)
+                        */
+                       cl->avgidle_ = cl->maxidle_;
+
+               /* get pkt_time (for class) in usec */
+#if 1  /* use approximation */
+               pkt_time = pktlen * cl->ns_per_byte_;
+               pkt_time = NSEC_TO_USEC(pkt_time);
+#else
+               pkt_time = pktlen * cl->ns_per_byte_ / 1000;
+#endif
+               idle -= pkt_time;
+
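+               /*
+                * avgidle_ holds the idle-time average scaled up by
+                * 2^RM_FILTER_GAIN; with w = 2^-RM_FILTER_GAIN, the
+                * update below is the EWMA  avg = (1 - w) * avg + w * idle
+                * on the unscaled value.
+                */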
+               avgidle = cl->avgidle_;
+               avgidle += idle - (avgidle >> RM_FILTER_GAIN);
+               cl->avgidle_ = avgidle;
+
+               /* Are we overlimit ? */
+               if (avgidle <= 0) {
+                       CBQTRACE(rmc_update_class_util, 'milo', cl->stats_.handle);
+#if 1 /* ALTQ */
+                       /*
+                        * need some lower bound for avgidle, otherwise
+                        * a borrowing class gets unbounded penalty.
+                        */
+                       if (avgidle < cl->minidle_)
+                               avgidle = cl->avgidle_ = cl->minidle_;
+#endif
+                       /* set next idle to make avgidle 0 */
+                       tidle = pkt_time +
+                               (((1 - RM_POWER) * avgidle) >> RM_FILTER_GAIN);
+                       TV_ADD_DELTA(nowp, tidle, &cl->undertime_);
+                       ++cl->stats_.over;
+               } else {
+                       cl->avgidle_ =
+                           (avgidle > cl->maxidle_) ? cl->maxidle_ : avgidle;
+                       cl->undertime_.tv_sec = 0;
+                       if (cl->sleeping_) {
+                               callout_stop(&cl->callout_);
+                               cl->sleeping_ = 0;
+                       }
+               }
+
+               if (borrows != NULL) {
+                       if (borrows != cl)
+                               ++cl->stats_.borrows;
+                       else
+                               borrows = NULL;
+               }
+               cl->last_ = ifd->ifnow_;
+               cl->last_pkttime_ = pkt_time;
+
+#if 1
+               if (cl->parent_ == NULL) {
+                       /* take stats of root class */
+                       PKTCNTR_ADD(&cl->stats_.xmit_cnt, pktlen);
+               }
+#endif
+
+               cl = cl->parent_;
+       }
+
+       /*
+        * Check to see if cutoff needs to be set to a new level.
+        */
+       cl = ifd->class_[ifd->qo_];
+       if (borrowed && (ifd->cutoff_ >= borrowed->depth_)) {
+#if 1 /* ALTQ */
+               if ((qlen(cl->q_) <= 0) || TV_LT(nowp, &borrowed->undertime_)) {
+                       rmc_tl_satisfied(ifd, nowp);
+                       CBQTRACE(rmc_update_class_util, 'broe', ifd->cutoff_);
+               } else {
+                       ifd->cutoff_ = borrowed->depth_;
+                       CBQTRACE(rmc_update_class_util, 'ffob', borrowed->depth_);
+               }
+#else /* !ALTQ */
+               if ((qlen(cl->q_) <= 1) || TV_LT(&now, &borrowed->undertime_)) {
+                       reset_cutoff(ifd);
+#ifdef notdef
+                       rmc_tl_satisfied(ifd, &now);
+#endif
+                       CBQTRACE(rmc_update_class_util, 'broe', ifd->cutoff_);
+               } else {
+                       ifd->cutoff_ = borrowed->depth_;
+                       CBQTRACE(rmc_update_class_util, 'ffob', borrowed->depth_);
+               }
+#endif /* !ALTQ */
+       }
+
+       /*
+        * Release class slot
+        */
+       ifd->borrowed_[ifd->qo_] = NULL;
+       ifd->class_[ifd->qo_] = NULL;
+       ifd->qo_ = (ifd->qo_ + 1) % ifd->maxqueued_;
+       ifd->queued_--;
+}
+
+/*
+ * void
+ * rmc_drop_action(struct rm_class *cl) - Generic (not protocol-specific)
+ *     over-limit action routines.  These get invoked by rmc_under_limit()
+ *     if a class with packets to send is over its bandwidth limit & can't
+ *     borrow from a parent class.
+ *
+ *     Returns: NONE
+ */
+
+static void
+rmc_drop_action(struct rm_class *cl)
+{
+       struct rm_ifdat *ifd = cl->ifdat_;
+
+       KKASSERT(qlen(cl->q_) > 0);
+       _rmc_dropq(cl);
+       if (qempty(cl->q_))
+               ifd->na_[cl->pri_]--;
+}
+
+void
+rmc_dropall(struct rm_class *cl)
+{
+       struct rm_ifdat *ifd = cl->ifdat_;
+
+       if (!qempty(cl->q_)) {
+               _flushq(cl->q_);
+
+               ifd->na_[cl->pri_]--;
+       }
+}
+
+/*
+ * void
+ * rmc_delay_action(struct rm_class *cl) - This function is the generic CBQ
+ *     delay action routine.  It is invoked via rmc_under_limit when the
+ *     class is discovered to be overlimit.
+ *
+ *     If the delay action is the result of a borrowed class being
+ *     overlimit, then delay for the offtime of that borrowed class.
+ *
+ *     Returns: NONE
+ */
+
+void
+rmc_delay_action(struct rm_class *cl, struct rm_class *borrow)
+{
+       int delay, t, extradelay;
+
+       cl->stats_.overactions++;
+       TV_DELTA(&cl->undertime_, &cl->overtime_, delay);
+#ifndef BORROW_OFFTIME
+       delay += cl->offtime_;
+#endif
+
+       if (!cl->sleeping_) {
+               CBQTRACE(rmc_delay_action, 'yled', cl->stats_.handle);
+#ifdef BORROW_OFFTIME
+               if (borrow != NULL)
+                       extradelay = borrow->offtime_;
+               else
+#endif
+                       extradelay = cl->offtime_;
+
+#ifdef ALTQ
+               /*
+                * XXX recalculate suspend time:
+                * current undertime is (tidle + pkt_time) calculated
+                * from the last transmission.
+                *      tidle: time required to bring avgidle back to 0
+                *      pkt_time: target waiting time for this class
+                * we need to replace pkt_time by offtime
+                */
+               extradelay -= cl->last_pkttime_;
+#endif
+               if (extradelay > 0) {
+                       TV_ADD_DELTA(&cl->undertime_, extradelay, &cl->undertime_);
+                       delay += extradelay;
+               }
+
+               cl->sleeping_ = 1;
+               cl->stats_.delays++;
+
+               /*
+                * Since packets are phased randomly with respect to the
+                * clock, 1 tick (the next clock tick) can be an arbitrarily
+                * short time so we have to wait for at least two ticks.
+                * NOTE:  If there's no other traffic, we need the timer as
+                * a 'backstop' to restart this class.
+                */
+               if (delay > tick * 2)
+                       t = (delay + tick - 1) / tick;
+               else
+                       t = 2;
+               callout_reset(&cl->callout_, t, rmc_restart, cl);
+       }
+}
+
+/*
+ * void
+ * rmc_restart() - is just a helper routine for rmc_delay_action -- it is
+ *     called by the system timer code & is responsible for checking if the
+ *     class is still sleeping (it might have been restarted as a side
+ *     effect of the queue scan on a packet arrival) and, if so, restarting
+ *     output for the class.  Inspecting the class state & restarting output
+ *     require locking the class structure.  In general the driver is
+ *     responsible for locking but this is the only routine that is not
+ *     called directly or indirectly from the interface driver so it has
+ *     to know about system locking conventions.  Under BSD, locking is done
+ *     by raising IPL to splimp so that's what's implemented here.  On a
+ *     different system this would probably need to be changed.
+ *
+ *     Returns:        NONE
+ */
+
+static void
+rmc_restart(void *arg)
+{
+       struct rm_class *cl = arg;
+       struct rm_ifdat *ifd = cl->ifdat_;
+       int s;
+
+       s = splimp();
+       if (cl->sleeping_) {
+               cl->sleeping_ = 0;
+               cl->undertime_.tv_sec = 0;
+
+               if (ifd->queued_ < ifd->maxqueued_ && ifd->restart != NULL) {
+                       CBQTRACE(rmc_restart, 'trts', cl->stats_.handle);
+                       (ifd->restart)(ifd->ifq_);
+               }
+       }
+       splx(s);
+}
+
+/*
+ * void
+ * rmc_root_overlimit(struct rm_class *cl) - This is the generic overlimit
+ *     handling routine for the root class of the link sharing structure.
+ *
+ *     Returns: NONE
+ */
+
+static void
+rmc_root_overlimit(struct rm_class *cl, struct rm_class *borrow)
+{
+        panic("rmc_root_overlimit");
+}
+
+/*
+ * Packet queue handling routines.  Eventually, these are to localize
+ *     the effects on the code of whether queues are RED queues or
+ *     droptail queues.
+ */
+
+static int
+_rmc_addq(rm_class_t *cl, struct mbuf *m)
+{
+#ifdef ALTQ_RIO
+       if (q_is_rio(cl->q_))
+               return rio_addq((rio_t *)cl->red_, cl->q_, m, cl->pktattr_);
+#endif
+#ifdef ALTQ_RED
+       if (q_is_red(cl->q_))
+               return red_addq(cl->red_, cl->q_, m, cl->pktattr_);
+#endif /* ALTQ_RED */
+
+       if (cl->flags_ & RMCF_CLEARDSCP)
+               write_dsfield(m, cl->pktattr_, 0);
+
+       _addq(cl->q_, m);
+       return (0);
+}
+
+/* note: _rmc_dropq is not called for red */
+static void
+_rmc_dropq(rm_class_t *cl)
+{
+       struct mbuf *m;
+
+       if ((m = _getq(cl->q_)) != NULL)
+               m_freem(m);
+}
+
+static struct mbuf *
+_rmc_getq(rm_class_t *cl)
+{
+#ifdef ALTQ_RIO
+       if (q_is_rio(cl->q_))
+               return rio_getq((rio_t *)cl->red_, cl->q_);
+#endif
+#ifdef ALTQ_RED
+       if (q_is_red(cl->q_))
+               return red_getq(cl->red_, cl->q_);
+#endif
+       return _getq(cl->q_);
+}
+
+static struct mbuf *
+_rmc_pollq(rm_class_t *cl)
+{
+       return qhead(cl->q_);
+}
+
+#ifdef CBQ_TRACE
+/*
+ * DDB hook to trace cbq events:
+ *  the last 1024 events are held in a circular buffer.
+ *  use "call cbqtrace_dump(N)" to display 20 events starting at the Nth event.
+ */
+void           cbqtrace_dump(int);
+static char    *rmc_funcname(void *);
+
+static struct rmc_funcs {
+       void    *func;
+       char    *name;
+} rmc_funcs[] = {
+       rmc_init,               "rmc_init",
+       rmc_queue_packet,       "rmc_queue_packet",
+       rmc_under_limit,        "rmc_under_limit",
+       rmc_update_class_util,  "rmc_update_class_util",
+       rmc_delay_action,       "rmc_delay_action",
+       rmc_restart,            "rmc_restart",
+       _rmc_wrr_dequeue_next,  "_rmc_wrr_dequeue_next",
+       NULL,                   NULL
+};
+
+static char *
+rmc_funcname(void *func)
+{
+       struct rmc_funcs *fp;
+
+       for (fp = rmc_funcs; fp->func != NULL; fp++) {
+               if (fp->func == func)
+                       return (fp->name);
+       }
+
+       return ("unknown");
+}
+
+void
+cbqtrace_dump(int counter)
+{
+       int i, *p;
+       char *cp;
+
+       counter = counter % NCBQTRACE;
+       p = (int *)&cbqtrace_buffer[counter];
+
+       for (i=0; i<20; i++) {
+               printf("[0x%x] ", *p++);
+               printf("%s: ", rmc_funcname((void *)*p++));
+               cp = (char *)p++;
+               printf("%c%c%c%c: ", cp[0], cp[1], cp[2], cp[3]);
+               printf("%d\n",*p++);
+
+               if (p >= (int *)&cbqtrace_buffer[NCBQTRACE])
+                       p = (int *)cbqtrace_buffer;
+       }
+}
+#endif /* CBQ_TRACE */
+#endif /* ALTQ_CBQ */
diff --git a/sys/net/altq/altq_rmclass.h b/sys/net/altq/altq_rmclass.h
new file mode 100644 (file)
index 0000000..fa20242
--- /dev/null
@@ -0,0 +1,255 @@
+/*     $KAME: altq_rmclass.h,v 1.10 2003/08/20 23:30:23 itojun Exp $   */
+/*     $DragonFly: src/sys/net/altq/altq_rmclass.h,v 1.1 2005/02/11 22:25:57 joerg Exp $ */
+
+/*
+ * Copyright (c) 1991-1997 Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *     This product includes software developed by the Network Research
+ *     Group at Lawrence Berkeley Laboratory.
+ * 4. Neither the name of the University nor of the Laboratory may be used
+ *    to endorse or promote products derived from this software without
+ *    specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _ALTQ_ALTQ_RMCLASS_H_
+#define        _ALTQ_ALTQ_RMCLASS_H_
+
+#include <net/altq/altq_classq.h>
+
+/* #pragma ident "@(#)rm_class.h  1.20     97/10/23 SMI" */
+
+#define        RM_MAXPRIO      8       /* Max priority */
+
+#ifdef _KERNEL
+
+typedef struct rm_ifdat                rm_ifdat_t;
+typedef struct rm_class                rm_class_t;
+
+struct red;
+
+/*
+ * Macros for dealing with time values.  We assume all times are
+ * 'timevals'.  `microtime' is used to get the best available clock
+ * resolution.  If `microtime' *doesn't* return a value that's about
+ * ten times smaller than the average packet time on the fastest
+ * link that will use these routines, a slightly different clock
+ * scheme than this one should be used.
+ * (Bias due to truncation error in this scheme will overestimate utilization
+ * and discriminate against high bandwidth classes.  To remove this bias an
+ * integrator needs to be added.  The simplest integrator uses a history of
+ * 10 * avg.packet.time / min.tick.time packet completion entries.  This is
+ * straightforward to add, but we don't want to pay the extra memory
+ * traffic to maintain it if it's not necessary (occasionally a vendor
+ * accidentally builds a workstation with a decent clock - e.g., Sun & HP).)
+ */
+
+#define        RM_GETTIME(now) microtime(&now)
+
+#define        TV_LT(a, b) (((a)->tv_sec < (b)->tv_sec) ||  \
+       (((a)->tv_usec < (b)->tv_usec) && ((a)->tv_sec <= (b)->tv_sec)))
+
+#define        TV_DELTA(a, b, delta) { \
+       register int    xxs;    \
+                                                       \
+       delta = (a)->tv_usec - (b)->tv_usec; \
+       if ((xxs = (a)->tv_sec - (b)->tv_sec)) { \
+               switch (xxs) { \
+               default: \
+                       /* if (xxs < 0) \
+                               printf("rm_class: bogus time values\n"); */ \
+                       delta = 0; \
+                       /* fall through */ \
+               case 2: \
+                       delta += 1000000; \
+                       /* fall through */ \
+               case 1: \
+                       delta += 1000000; \
+                       break; \
+               } \
+       } \
+}
+
+#define        TV_ADD_DELTA(a, delta, res) { \
+       register int xxus = (a)->tv_usec + (delta); \
+       \
+       (res)->tv_sec = (a)->tv_sec; \
+       while (xxus >= 1000000) { \
+               ++((res)->tv_sec); \
+               xxus -= 1000000; \
+       } \
+       (res)->tv_usec = xxus; \
+}
+
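+/*
+ * Illustrative fragment (not part of the original header) showing how
+ * the time macros fit together; 'last' would normally come from the
+ * class state and 'offtime' is a hypothetical delay in microseconds:
+ *
+ *     struct timeval now, last, undertime;
+ *     int delta, offtime = 4000;
+ *
+ *     RM_GETTIME(now);
+ *     TV_DELTA(&now, &last, delta);   (delta = now - last, in usec)
+ *     TV_ADD_DELTA(&now, offtime, &undertime);
+ *     if (TV_LT(&now, &undertime))
+ *             ;                       (too early to send again)
+ */
+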
+#define        RM_TIMEOUT      2       /* 1 Clock tick. */
+
+#if 1
+#define        RM_MAXQUEUED    1       /* this isn't used in ALTQ/CBQ */
+#else
+#define        RM_MAXQUEUED    16      /* Max number of packets downstream of CBQ */
+#endif
+#define        RM_MAXQUEUE     64      /* Max queue length */
+#define        RM_FILTER_GAIN  5       /* log2 of gain, e.g., 5 => 31/32 */
+#define        RM_POWER        (1 << RM_FILTER_GAIN)
+#define        RM_MAXDEPTH     32
+#define        RM_NS_PER_SEC   (1000000000)
+
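+/*
+ * Illustrative note (not in the original header): RM_FILTER_GAIN is the
+ * log2 of the gain of the exponential filter CBQ uses to average a
+ * class' idle time.  Each update decays the previous average by
+ * (2^n - 1)/2^n before adding the new sample, so RM_FILTER_GAIN = 5
+ * gives a decay factor of 31/32.  A sketch of the update (the real code
+ * lives in altq_rmclass.c):
+ *
+ *     avgidle += idle - (avgidle >> RM_FILTER_GAIN);
+ */
+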
+typedef struct _rm_class_stats_ {
+       u_int           handle;
+       u_int           depth;
+
+       struct pktcntr  xmit_cnt;       /* packets sent in this class */
+       struct pktcntr  drop_cnt;       /* dropped packets */
+       u_int           over;           /* # times went over limit */
+       u_int           borrows;        /* # times tried to borrow */
+       u_int           overactions;    /* # times invoked overlimit action */
+       u_int           delays;         /* # times invoked delay actions */
+} rm_class_stats_t;
+
+/*
+ * CBQ Class state structure
+ */
+struct rm_class {
+       class_queue_t   *q_;            /* Queue of packets */
+       rm_ifdat_t      *ifdat_;
+       int             pri_;           /* Class priority. */
+       int             depth_;         /* Class depth */
+       u_int           ns_per_byte_;   /* NanoSeconds per byte. */
+       u_int           maxrate_;       /* Bytes per second for this class. */
+       u_int           allotment_;     /* Fraction of link bandwidth. */
+       u_int           w_allotment_;   /* Weighted allotment for WRR */
+       int             bytes_alloc_;   /* Allocation for round of WRR */
+
+       int             avgidle_;
+       int             maxidle_;
+       int             minidle_;
+       int             offtime_;
+       int             sleeping_;      /* != 0 if delaying */
+       int             qthresh_;       /* Queue threshold for formal link sharing */
+       int             leaf_;          /* Nonzero if this is a leaf class. */
+
+       rm_class_t      *children_;     /* Children of this class */
+       rm_class_t      *next_;         /* Next pointer, used if child */
+
+       rm_class_t      *peer_;         /* Peer class */
+       rm_class_t      *borrow_;       /* Borrow class */
+       rm_class_t      *parent_;       /* Parent class */
+
+       void    (*overlimit)(struct rm_class *, struct rm_class *);
+       void    (*drop)(struct rm_class *);       /* Class drop action. */
+
+       struct red      *red_;          /* RED state pointer */
+       struct altq_pktattr *pktattr_;  /* saved hdr used by RED/ECN */
+       int             flags_;
+
+       int             last_pkttime_;  /* saved pkt_time */
+       struct timeval  undertime_;     /* time can next send */
+       struct timeval  last_;          /* time last packet sent */
+       struct timeval  overtime_;
+       struct callout  callout_;       /* for timeout() calls */
+
+       rm_class_stats_t stats_;        /* Class Statistics */
+};
+
+/*
+ * CBQ Interface state
+ */
+struct rm_ifdat {
+       int             queued_;        /* # pkts queued downstream */
+       int             efficient_;     /* Link Efficiency bit */
+       int             wrr_;           /* Enable Weighted Round-Robin */
+       u_long          ns_per_byte_;   /* Link byte speed. */
+       int             maxqueued_;     /* Max packets to queue */
+       int             maxpkt_;        /* Max packet size. */
+       int             qi_;            /* In/out pointers for downstream */
+       int             qo_;            /* packets */
+
+       /*
+        * Active class state and WRR state.
+        */
+       rm_class_t      *active_[RM_MAXPRIO];   /* Active cl's in each pri */
+       int             na_[RM_MAXPRIO];        /* # of active cl's in a pri */
+       int             num_[RM_MAXPRIO];       /* # of cl's per pri */
+       int             alloc_[RM_MAXPRIO];     /* Byte Allocation */
+       u_long          M_[RM_MAXPRIO];         /* WRR weights. */
+
+       /*
+        * Network Interface/Solaris Queue state pointer.
+        */
+       struct ifaltq   *ifq_;
+       rm_class_t      *default_;      /* Default Pkt class, BE */
+       rm_class_t      *root_;         /* Root Link class. */
+       rm_class_t      *ctl_;          /* Control Traffic class. */
+       void            (*restart)(struct ifaltq *);    /* Restart routine. */
+
+       /*
+        * Current downstream packet state and dynamic state.
+        */
+       rm_class_t      *borrowed_[RM_MAXQUEUED]; /* Class borrowed last */
+       rm_class_t      *class_[RM_MAXQUEUED];  /* class sending */
+       int             curlen_[RM_MAXQUEUED];  /* Current pktlen */
+       struct timeval  now_[RM_MAXQUEUED];     /* Current packet time. */
+       int             is_overlimit_[RM_MAXQUEUED]; /* Current overlimit state. */
+
+       int             cutoff_;        /* Cut-off depth for borrowing */
+
+       struct timeval  ifnow_;         /* expected xmit completion time */
+#if 1 /* ALTQ4PPP */
+       int             maxiftime_;     /* max delay inside interface */
+#endif
+        rm_class_t     *pollcache_;    /* rm_class cached by the poll operation */
+};
+
+/* flags for rmc_init and rmc_newclass */
+/* class flags */
+#define        RMCF_RED                0x0001
+#define        RMCF_ECN                0x0002
+#define        RMCF_RIO                0x0004
+#define        RMCF_CLEARDSCP          0x0008  /* clear diffserv codepoint */
+
+/* flags for rmc_init */
+#define        RMCF_WRR                0x0100
+#define        RMCF_EFFICIENT          0x0200
+
+#define        is_a_parent_class(cl)   ((cl)->children_ != NULL)
+
+rm_class_t *rmc_newclass(int, struct rm_ifdat *, u_int,
+                        void (*)(struct rm_class *, struct rm_class *),
+                        int, struct rm_class *, struct rm_class *,
+                        u_int, int, u_int, int, int);
+void   rmc_delete_class(struct rm_ifdat *, struct rm_class *);
+int    rmc_modclass(struct rm_class *, u_int, int, u_int, int, u_int, int);
+void   rmc_init(struct ifaltq *, struct rm_ifdat *, u_int,
+                void (*)(struct ifaltq *), int, int, u_int, int, u_int, int);
+int    rmc_queue_packet(struct rm_class *, struct mbuf *);
+struct mbuf *rmc_dequeue_next(struct rm_ifdat *, int);
+void   rmc_update_class_util(struct rm_ifdat *);
+void   rmc_delay_action(struct rm_class *, struct rm_class *);
+void   rmc_dropall(struct rm_class *);
+int    rmc_get_weight(struct rm_ifdat *, int);
+
+#endif /* _KERNEL */
+
+#endif /* _ALTQ_ALTQ_RMCLASS_H_ */
diff --git a/sys/net/altq/altq_rmclass_debug.h b/sys/net/altq/altq_rmclass_debug.h
new file mode 100644 (file)
index 0000000..d46daaf
--- /dev/null
@@ -0,0 +1,101 @@
+/*     $KAME: altq_rmclass_debug.h,v 1.3 2002/11/29 04:36:24 kjc Exp $ */
+/*     $DragonFly: src/sys/net/altq/altq_rmclass_debug.h,v 1.1 2005/02/11 22:25:57 joerg Exp $ */
+
+/*
+ * Copyright (c) Sun Microsystems, Inc. 1998 All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *      This product includes software developed by the SMCC Technology
+ *      Development Group at Sun Microsystems, Inc.
+ *
+ * 4. The name of the Sun Microsystems, Inc nor may not be used to endorse or
+ *      promote products derived from this software without specific prior
+ *      written permission.
+ *
+ * SUN MICROSYSTEMS DOES NOT CLAIM MERCHANTABILITY OF THIS SOFTWARE OR THE
+ * SUITABILITY OF THIS SOFTWARE FOR ANY PARTICULAR PURPOSE.  The software is
+ * provided "as is" without express or implied warranty of any kind.
+ *
+ * These notices must be retained in any copies of any part of this software.
+ */
+
+#ifndef _ALTQ_ALTQ_RMCLASS_DEBUG_H_
+#define        _ALTQ_ALTQ_RMCLASS_DEBUG_H_
+
+/* #pragma ident       "@(#)rm_class_debug.h   1.7     98/05/04 SMI" */
+
+/*
+ * Cbq debugging macros
+ */
+
+#ifdef CBQ_TRACE
+#ifndef NCBQTRACE
+#define        NCBQTRACE (16 * 1024)
+#endif
+
+/*
+ * To view the trace output, using adb, type:
+ *     adb -k /dev/ksyms /dev/mem <cr>, then type
+ *     cbqtrace_count/D to get the count, then type
+ *     cbqtrace_buffer,0tcount/Dp4C" "Xn
+ *     This will dump the trace buffer from 0 to count.
+ */
+/*
+ * In ALTQ, use "call cbqtrace_dump(N)" from DDB to display 20 events
+ * starting from the Nth event in the circular buffer.
+ */
+
+struct cbqtrace {
+       int count;
+       int function;           /* address of function */
+       int trace_action;       /* descriptive 4 characters */
+       int object;             /* object operated on */
+};
+
+#define        CBQTRACEINIT() {                                \
+       if (cbqtrace_ptr == NULL)               \
+               cbqtrace_ptr = cbqtrace_buffer; \
+       else { \
+               cbqtrace_ptr = cbqtrace_buffer; \
+               bzero((void *)cbqtrace_ptr, sizeof(cbqtrace_buffer)); \
+               cbqtrace_count = 0; \
+       } \
+}
+
+#define        LOCK_TRACE()    splimp()
+#define        UNLOCK_TRACE(x) splx(x)
+
+#define        CBQTRACE(func, act, obj) {              \
+       int __s = LOCK_TRACE();                 \
+       int *_p = &cbqtrace_ptr->count; \
+       *_p++ = ++cbqtrace_count;               \
+       *_p++ = (int)(func);                    \
+       *_p++ = (int)(act);                     \
+       *_p++ = (int)(obj);                     \
+       if ((struct cbqtrace *)(void *)_p >= &cbqtrace_buffer[NCBQTRACE])\
+               cbqtrace_ptr = cbqtrace_buffer; \
+       else                                    \
+               cbqtrace_ptr = (struct cbqtrace *)(void *)_p; \
+       UNLOCK_TRACE(__s);                      \
+       }
+#else
+
+/* If no tracing, define no-ops */
+#define        CBQTRACEINIT()
+#define        CBQTRACE(a, b, c)
+
+#endif /* !CBQ_TRACE */
+
+#endif /* _ALTQ_ALTQ_RMCLASS_DEBUG_H_ */
diff --git a/sys/net/altq/altq_subr.c b/sys/net/altq/altq_subr.c
new file mode 100644 (file)
index 0000000..46cfbe3
--- /dev/null
@@ -0,0 +1,785 @@
+/*     $KAME: altq_subr.c,v 1.23 2004/04/20 16:10:06 itojun Exp $      */
+/*     $DragonFly: src/sys/net/altq/altq_subr.c,v 1.1 2005/02/11 22:25:57 joerg Exp $ */
+
+/*
+ * Copyright (C) 1997-2003
+ *     Sony Computer Science Laboratories Inc.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "opt_altq.h"
+#include "opt_inet.h"
+#include "opt_inet6.h"
+
+#include <sys/param.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/kernel.h>
+#include <sys/callout.h>
+#include <sys/errno.h>
+#include <sys/syslog.h>
+#include <sys/sysctl.h>
+#include <sys/queue.h>
+
+#include <net/if.h>
+#include <net/if_dl.h>
+#include <net/if_types.h>
+#include <net/ifq_var.h>
+
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/ip.h>
+#ifdef INET6
+#include <netinet/ip6.h>
+#endif
+#include <netinet/tcp.h>
+#include <netinet/udp.h>
+
+#include <net/pf/pfvar.h>
+#include <net/altq/altq.h>
+
+/* machine dependent clock related includes */
+#if defined(__i386__)
+#include <machine/clock.h>             /* for tsc_freq */
+#include <machine/md_var.h>            /* for cpu_feature */
+#include <machine/specialreg.h>                /* for CPUID_TSC */
+#endif /* __i386__ */
+
+/*
+ * internal function prototypes
+ */
+static void    tbr_timeout(void *);
+int (*altq_input)(struct mbuf *, int) = NULL;
+static int tbr_timer = 0;      /* token bucket regulator timer */
+static struct callout tbr_callout;
+
+int pfaltq_running;    /* keep track of running state */
+
+MALLOC_DEFINE(M_ALTQ, "altq", "ALTQ structures");
+
+/*
+ * alternate queueing support routines
+ */
+
+/* look up the queue state by the interface name and the queueing type. */
+void *
+altq_lookup(const char *name, int type)
+{
+       struct ifnet *ifp;
+
+       if ((ifp = ifunit(name)) != NULL) {
+               if (type != ALTQT_NONE && ifp->if_snd.altq_type == type)
+                       return (ifp->if_snd.altq_disc);
+       }
+
+       return (NULL);
+}
+
+int
+altq_attach(struct ifaltq *ifq, int type, void *discipline,
+           int (*enqueue)(struct ifaltq *, struct mbuf *, struct altq_pktattr *),
+           struct mbuf *(*dequeue)(struct ifaltq *, int),
+           int (*request)(struct ifaltq *, int, void *),
+           void *clfier,
+           void *(*classify)(struct ifaltq *, struct mbuf *,
+                             struct altq_pktattr *))
+{
+       if (!ifq_is_ready(ifq))
+               return ENXIO;
+
+       ifq->altq_type     = type;
+       ifq->altq_disc     = discipline;
+       ifq->altq_enqueue  = enqueue;
+       ifq->altq_dequeue  = dequeue;
+       ifq->altq_request  = request;
+       ifq->altq_clfier   = clfier;
+       ifq->altq_classify = classify;
+       ifq->altq_flags &= (ALTQF_CANTCHANGE|ALTQF_ENABLED);
+       return 0;
+}
+
+int
+altq_detach(struct ifaltq *ifq)
+{
+       if (!ifq_is_ready(ifq))
+               return ENXIO;
+       if (ifq_is_enabled(ifq))
+               return EBUSY;
+       if (!ifq_is_attached(ifq))
+               return (0);
+
+       ifq->altq_type     = ALTQT_NONE;
+       ifq->altq_disc     = NULL;
+       ifq->altq_enqueue  = NULL;
+       ifq->altq_dequeue  = NULL;
+       ifq->altq_request  = NULL;
+       ifq->altq_clfier   = NULL;
+       ifq->altq_classify = NULL;
+       ifq->altq_flags &= ALTQF_CANTCHANGE;
+       return 0;
+}
+
+int
+altq_enable(struct ifaltq *ifq)
+{
+       int s;
+
+       if (!ifq_is_ready(ifq))
+               return ENXIO;
+       if (ifq_is_enabled(ifq))
+               return 0;
+
+       s = splimp();
+       ifq_purge(ifq);
+       KKASSERT(ifq->ifq_len == 0);
+       ifq->altq_flags |= ALTQF_ENABLED;
+       if (ifq->altq_clfier != NULL)
+               ifq->altq_flags |= ALTQF_CLASSIFY;
+       splx(s);
+
+       return 0;
+}
+
+int
+altq_disable(struct ifaltq *ifq)
+{
+       int s;
+
+       if (!ifq_is_enabled(ifq))
+               return 0;
+
+       s = splimp();
+       ifq_purge(ifq);
+       KKASSERT(ifq->ifq_len == 0);
+       ifq->altq_flags &= ~(ALTQF_ENABLED|ALTQF_CLASSIFY);
+       splx(s);
+       return 0;
+}
+
+/*
+ * internal representation of token bucket parameters
+ *     rate:   byte_per_unittime << 32
+ *             (((bits_per_sec) / 8) << 32) / machclk_freq
+ *     depth:  byte << 32
+ *
+ */
+#define        TBR_SHIFT       32
+#define        TBR_SCALE(x)    ((int64_t)(x) << TBR_SHIFT)
+#define        TBR_UNSCALE(x)  ((x) >> TBR_SHIFT)
+
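+/*
+ * Worked example (illustrative, not part of the original source): with
+ * the emulated 256MHz clock (machclk_freq = 256000000), a 10Mbps
+ * profile scales to
+ *
+ *     tbr_rate = TBR_SCALE(10000000 / 8) / 256000000 = 20971520
+ *
+ * i.e. TBR_UNSCALE(tbr_rate) ~= 0.00488 bytes may be sent per machclk
+ * tick; tbr_dequeue() below charges TBR_SCALE(pktlen) tokens per packet.
+ */
+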
+struct mbuf *
+tbr_dequeue(struct ifaltq *ifq, int op)
+{
+       struct tb_regulator *tbr;
+       struct mbuf *m;
+       int64_t interval;
+       uint64_t now;
+
+       tbr = ifq->altq_tbr;
+       if (op == ALTDQ_REMOVE && tbr->tbr_lastop == ALTDQ_POLL) {
+               /* if this is a remove after poll, bypass tbr check */
+       } else {
+               /* update token only when it is negative */
+               if (tbr->tbr_token <= 0) {
+                       now = read_machclk();
+                       interval = now - tbr->tbr_last;
+                       if (interval >= tbr->tbr_filluptime)
+                               tbr->tbr_token = tbr->tbr_depth;
+                       else {
+                               tbr->tbr_token += interval * tbr->tbr_rate;
+                               if (tbr->tbr_token > tbr->tbr_depth)
+                                       tbr->tbr_token = tbr->tbr_depth;
+                       }
+                       tbr->tbr_last = now;
+               }
+               /* if token is still negative, don't allow dequeue */
+               if (tbr->tbr_token <= 0)
+                       return (NULL);
+       }
+
+       if (ifq_is_enabled(ifq))
+               m = (*ifq->altq_dequeue)(ifq, op);
+       else if (op == ALTDQ_POLL)
+               IF_POLL(ifq, m);
+       else
+               IF_DEQUEUE(ifq, m);
+
+       if (m != NULL && op == ALTDQ_REMOVE)
+               tbr->tbr_token -= TBR_SCALE(m_pktlen(m));
+       tbr->tbr_lastop = op;
+       return (m);
+}
+
+/*
+ * set a token bucket regulator.
+ * if the specified rate is zero, the token bucket regulator is deleted.
+ */
+int
+tbr_set(struct ifaltq *ifq, struct tb_profile *profile)
+{
+       struct tb_regulator *tbr, *otbr;
+
+       if (machclk_freq == 0)
+               init_machclk();
+       if (machclk_freq == 0) {
+               printf("tbr_set: no cpu clock available!\n");
+               return (ENXIO);
+       }
+
+       if (profile->rate == 0) {
+               /* delete this tbr */
+               if ((tbr = ifq->altq_tbr) == NULL)
+                       return (ENOENT);
+               ifq->altq_tbr = NULL;
+               free(tbr, M_ALTQ);
+               return (0);
+       }
+
+       tbr = malloc(sizeof(*tbr), M_ALTQ, M_WAITOK | M_ZERO);
+       tbr->tbr_rate = TBR_SCALE(profile->rate / 8) / machclk_freq;
+       tbr->tbr_depth = TBR_SCALE(profile->depth);
+       if (tbr->tbr_rate > 0)
+               tbr->tbr_filluptime = tbr->tbr_depth / tbr->tbr_rate;
+       else
+               tbr->tbr_filluptime = 0xffffffffffffffffLL;
+       tbr->tbr_token = tbr->tbr_depth;
+       tbr->tbr_last = read_machclk();
+       tbr->tbr_lastop = ALTDQ_REMOVE;
+
+       otbr = ifq->altq_tbr;
+       ifq->altq_tbr = tbr;    /* set the new tbr */
+
+       if (otbr != NULL)
+               free(otbr, M_ALTQ);
+       else if (tbr_timer == 0) {
+               callout_reset(&tbr_callout, 1, tbr_timeout, NULL);
+               tbr_timer = 1;
+       }
+       return (0);
+}
+
+/*
+ * tbr_timeout goes through the interface list, and kicks the drivers
+ * if necessary.
+ */
+static void
+tbr_timeout(void *arg)
+{
+       struct ifnet *ifp;
+       int active, s;
+
+       active = 0;
+       s = splimp();
+       for (ifp = TAILQ_FIRST(&ifnet); ifp; ifp = TAILQ_NEXT(ifp, if_list)) {
+               if (ifp->if_snd.altq_tbr == NULL)
+                       continue;
+               active++;
+               if (!ifq_is_empty(&ifp->if_snd) && ifp->if_start != NULL)
+                       (*ifp->if_start)(ifp);
+       }
+       splx(s);
+       if (active > 0)
+               callout_reset(&tbr_callout, 1, tbr_timeout, NULL);
+       else
+               tbr_timer = 0;  /* don't need tbr_timer anymore */
+}
+
+/*
+ * get token bucket regulator profile
+ */
+int
+tbr_get(struct ifaltq *ifq, struct tb_profile *profile)
+{
+       struct tb_regulator *tbr;
+
+       if ((tbr = ifq->altq_tbr) == NULL) {
+               profile->rate = 0;
+               profile->depth = 0;
+       } else {
+               profile->rate =
+                   (u_int)TBR_UNSCALE(tbr->tbr_rate * 8 * machclk_freq);
+               profile->depth = (u_int)TBR_UNSCALE(tbr->tbr_depth);
+       }
+       return (0);
+}
+
+/*
+ * attach a discipline to the interface.  if one already exists, it is
+ * overridden.
+ */
+int
+altq_pfattach(struct pf_altq *a)
+{
+       struct ifnet *ifp;
+       struct tb_profile tb;
+       int s, error = 0;
+
+       switch (a->scheduler) {
+       case ALTQT_NONE:
+               break;
+#ifdef ALTQ_CBQ
+       case ALTQT_CBQ:
+               error = cbq_pfattach(a);
+               break;
+#endif
+#ifdef ALTQ_PRIQ
+       case ALTQT_PRIQ:
+               error = priq_pfattach(a);
+               break;
+#endif
+#ifdef ALTQ_HFSC
+       case ALTQT_HFSC:
+               error = hfsc_pfattach(a);
+               break;
+#endif
+       default:
+               error = ENXIO;
+       }
+
+       ifp = ifunit(a->ifname);
+
+       /* if the state is running, enable altq */
+       if (error == 0 && pfaltq_running &&
+           ifp != NULL && ifp->if_snd.altq_type != ALTQT_NONE &&
+           !ifq_is_enabled(&ifp->if_snd))
+                       error = altq_enable(&ifp->if_snd);
+
+       /* if altq is already enabled, reset the token bucket regulator */
+       if (error == 0 && ifp != NULL && ifq_is_enabled(&ifp->if_snd)) {
+               tb.rate = a->ifbandwidth;
+               tb.depth = a->tbrsize;
+               s = splimp();
+               error = tbr_set(&ifp->if_snd, &tb);
+               splx(s);
+       }
+
+       return (error);
+}
+
+/*
+ * detach a discipline from the interface.
+ * it is possible that the discipline was already overridden by another
+ * discipline.
+ */
+int
+altq_pfdetach(struct pf_altq *a)
+{
+       struct ifnet *ifp;
+       int s, error = 0;
+
+       if ((ifp = ifunit(a->ifname)) == NULL)
+               return (EINVAL);
+
+       /* if this discipline is no longer referenced, just return */
+       if (a->altq_disc == NULL || a->altq_disc != ifp->if_snd.altq_disc)
+               return (0);
+
+       s = splimp();
+       if (ifq_is_enabled(&ifp->if_snd))
+               error = altq_disable(&ifp->if_snd);
+       if (error == 0)
+               error = altq_detach(&ifp->if_snd);
+       splx(s);
+
+       return (error);
+}
+
+/*
+ * add a discipline or a queue
+ */
+int
+altq_add(struct pf_altq *a)
+{
+       int error = 0;
+
+       if (a->qname[0] != 0)
+               return (altq_add_queue(a));
+
+       if (machclk_freq == 0)
+               init_machclk();
+       if (machclk_freq == 0)
+               panic("altq_add: no cpu clock");
+
+       switch (a->scheduler) {
+#ifdef ALTQ_CBQ
+       case ALTQT_CBQ:
+               error = cbq_add_altq(a);
+               break;
+#endif
+#ifdef ALTQ_PRIQ
+       case ALTQT_PRIQ:
+               error = priq_add_altq(a);
+               break;
+#endif
+#ifdef ALTQ_HFSC
+       case ALTQT_HFSC:
+               error = hfsc_add_altq(a);
+               break;
+#endif
+       default:
+               error = ENXIO;
+       }
+
+       return (error);
+}
+
+/*
+ * remove a discipline or a queue
+ */
+int
+altq_remove(struct pf_altq *a)
+{
+       int error = 0;
+
+       if (a->qname[0] != 0)
+               return (altq_remove_queue(a));
+
+       switch (a->scheduler) {
+#ifdef ALTQ_CBQ
+       case ALTQT_CBQ:
+               error = cbq_remove_altq(a);
+               break;
+#endif
+#ifdef ALTQ_PRIQ
+       case ALTQT_PRIQ:
+               error = priq_remove_altq(a);
+               break;
+#endif
+#ifdef ALTQ_HFSC
+       case ALTQT_HFSC:
+               error = hfsc_remove_altq(a);
+               break;
+#endif
+       default:
+               error = ENXIO;
+       }
+
+       return (error);
+}
+
+/*
+ * add a queue to the discipline
+ */
+int
+altq_add_queue(struct pf_altq *a)
+{
+       int error = 0;
+
+       switch (a->scheduler) {
+#ifdef ALTQ_CBQ
+       case ALTQT_CBQ:
+               error = cbq_add_queue(a);
+               break;
+#endif
+#ifdef ALTQ_PRIQ
+       case ALTQT_PRIQ:
+               error = priq_add_queue(a);
+               break;
+#endif
+#ifdef ALTQ_HFSC
+       case ALTQT_HFSC:
+               error = hfsc_add_queue(a);
+               break;
+#endif
+       default:
+               error = ENXIO;
+       }
+
+       return (error);
+}
+
+/*
+ * remove a queue from the discipline
+ */
+int
+altq_remove_queue(struct pf_altq *a)
+{
+       int error = 0;
+
+       switch (a->scheduler) {
+#ifdef ALTQ_CBQ
+       case ALTQT_CBQ:
+               error = cbq_remove_queue(a);
+               break;
+#endif
+#ifdef ALTQ_PRIQ
+       case ALTQT_PRIQ:
+               error = priq_remove_queue(a);
+               break;
+#endif
+#ifdef ALTQ_HFSC
+       case ALTQT_HFSC:
+               error = hfsc_remove_queue(a);
+               break;
+#endif
+       default:
+               error = ENXIO;
+       }
+
+       return (error);
+}
+
+/*
+ * get queue statistics
+ */
+int
+altq_getqstats(struct pf_altq *a, void *ubuf, int *nbytes)
+{
+       int error = 0;
+
+       switch (a->scheduler) {
+#ifdef ALTQ_CBQ
+       case ALTQT_CBQ:
+               error = cbq_getqstats(a, ubuf, nbytes);
+               break;
+#endif
+#ifdef ALTQ_PRIQ
+       case ALTQT_PRIQ:
+               error = priq_getqstats(a, ubuf, nbytes);
+               break;
+#endif
+#ifdef ALTQ_HFSC
+       case ALTQT_HFSC:
+               error = hfsc_getqstats(a, ubuf, nbytes);
+               break;
+#endif
+       default:
+               error = ENXIO;
+       }
+
+       return (error);
+}
+
+/*
+ * read and write diffserv field in IPv4 or IPv6 header
+ */
+uint8_t
+read_dsfield(struct mbuf *m, struct altq_pktattr *pktattr)
+{
+       struct mbuf *m0;
+       uint8_t ds_field = 0;
+
+       if (pktattr == NULL ||
+           (pktattr->pattr_af != AF_INET && pktattr->pattr_af != AF_INET6))
+               return ((uint8_t)0);
+
+       /* verify that pattr_hdr is within the mbuf data */
+       for (m0 = m; m0 != NULL; m0 = m0->m_next) {
+               if ((pktattr->pattr_hdr >= m0->m_data) &&
+                   (pktattr->pattr_hdr < m0->m_data + m0->m_len))
+                       break;
+       }
+       if (m0 == NULL) {
+               /* ick, pattr_hdr is stale */
+               pktattr->pattr_af = AF_UNSPEC;
+#ifdef ALTQ_DEBUG
+               printf("read_dsfield: can't locate header!\n");
+#endif
+               return ((uint8_t)0);
+       }
+
+       if (pktattr->pattr_af == AF_INET) {
+               struct ip *ip = (struct ip *)pktattr->pattr_hdr;
+
+               if (ip->ip_v != 4)
+                       return ((uint8_t)0);    /* version mismatch! */
+               ds_field = ip->ip_tos;
+       }
+#ifdef INET6
+       else if (pktattr->pattr_af == AF_INET6) {
+               struct ip6_hdr *ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
+               uint32_t flowlabel;
+
+               flowlabel = ntohl(ip6->ip6_flow);
+               if ((flowlabel >> 28) != 6)
+                       return ((uint8_t)0);    /* version mismatch! */
+               ds_field = (flowlabel >> 20) & 0xff;
+       }
+#endif
+       return (ds_field);
+}
+
+void
+write_dsfield(struct mbuf *m, struct altq_pktattr *pktattr, uint8_t dsfield)
+{
+       struct mbuf *m0;
+
+       if (pktattr == NULL ||
+           (pktattr->pattr_af != AF_INET && pktattr->pattr_af != AF_INET6))
+               return;
+
+       /* verify that pattr_hdr is within the mbuf data */
+       for (m0 = m; m0 != NULL; m0 = m0->m_next) {
+               if ((pktattr->pattr_hdr >= m0->m_data) &&
+                   (pktattr->pattr_hdr < m0->m_data + m0->m_len))
+                       break;
+       }
+       if (m0 == NULL) {
+               /* ick, pattr_hdr is stale */
+               pktattr->pattr_af = AF_UNSPEC;
+#ifdef ALTQ_DEBUG
+               printf("write_dsfield: can't locate header!\n");
+#endif
+               return;
+       }
+
+       if (pktattr->pattr_af == AF_INET) {
+               struct ip *ip = (struct ip *)pktattr->pattr_hdr;
+               uint8_t old;
+               int32_t sum;
+
+               if (ip->ip_v != 4)
+                       return;         /* version mismatch! */
+               old = ip->ip_tos;
+               dsfield |= old & 3;     /* leave CU bits */
+               if (old == dsfield)
+                       return;
+               ip->ip_tos = dsfield;
+               /*
+                * update checksum (from RFC1624)
+                *         HC' = ~(~HC + ~m + m')
+                */
+               sum = ~ntohs(ip->ip_sum) & 0xffff;
+               sum += 0xff00 + (~old & 0xff) + dsfield;
+               sum = (sum >> 16) + (sum & 0xffff);
+               sum += (sum >> 16);  /* add carry */
+
+               ip->ip_sum = htons(~sum & 0xffff);
+       }
+#ifdef INET6
+       else if (pktattr->pattr_af == AF_INET6) {
+               struct ip6_hdr *ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
+               uint32_t flowlabel;
+
+               flowlabel = ntohl(ip6->ip6_flow);
+               if ((flowlabel >> 28) != 6)
+                       return;         /* version mismatch! */
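+               /*
+                * The mask clears bits 22-27 of the flow field (the DSCP
+                * part of the traffic class) and keeps the version, the
+                * two low CU bits and the 20-bit flow label intact.
+                */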
+               flowlabel = (flowlabel & 0xf03fffff) | (dsfield << 20);
+               ip6->ip6_flow = htonl(flowlabel);
+       }
+#endif
+}
+
+/*
+ * high resolution clock support taking advantage of a machine dependent
+ * high resolution time counter (e.g., the timestamp counter of the Intel Pentium).
+ * we assume
+ *  - 64-bit-long monotonically-increasing counter
+ *  - frequency range is 100M-4GHz (CPU speed)
+ */
+/* if pcc is not available or disabled, emulate 256MHz using microtime() */
+#define        MACHCLK_SHIFT   8
+
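+/*
+ * Worked example (illustrative): in the emulated case,
+ * machclk_freq = 1000000 << MACHCLK_SHIFT = 256000000, i.e. the
+ * microsecond count from microtime() shifted left by 8 behaves like a
+ * 256MHz counter.
+ */
+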
+int machclk_usepcc;
+uint32_t machclk_freq = 0;
+uint32_t machclk_per_tick = 0;
+
+void
+init_machclk(void)
+{
+       callout_init(&tbr_callout);
+
+       machclk_usepcc = 1;
+
+#if !defined(__i386__) || defined(ALTQ_NOPCC)
+       machclk_usepcc = 0;
+#elif defined(__DragonFly__) && defined(SMP)
+       machclk_usepcc = 0;
+#elif defined(__i386__)
+       /* check if TSC is available */
+       if (machclk_usepcc == 1 && (cpu_feature & CPUID_TSC) == 0)
+               machclk_usepcc = 0;
+#endif
+
+       if (machclk_usepcc == 0) {
+               /* emulate 256MHz using microtime() */
+               machclk_freq = 1000000 << MACHCLK_SHIFT;
+               machclk_per_tick = machclk_freq / hz;
+#ifdef ALTQ_DEBUG
+               printf("altq: emulate %uHz cpu clock\n", machclk_freq);
+#endif
+               return;
+       }
+
+       /*
+        * if the clock frequency (of Pentium TSC or Alpha PCC) is
+        * accessible, just use it.
+        */
+#ifdef __i386__
+       machclk_freq = tsc_freq;
+#else
+#error "machclk_freq interface not implemented"
+#endif
+
+       /*
+        * if we don't know the clock frequency, measure it.
+        */
+       if (machclk_freq == 0) {
+               static int      wait;
+               struct timeval  tv_start, tv_end;
+               uint64_t        start, end, diff;
+               int             timo;
+
+               microtime(&tv_start);
+               start = read_machclk();
+               timo = hz;      /* 1 sec */
+               tsleep(&wait, PCATCH, "init_machclk", timo);
+               microtime(&tv_end);
+               end = read_machclk();
+               diff = (uint64_t)(tv_end.tv_sec - tv_start.tv_sec) * 1000000
+                   + tv_end.tv_usec - tv_start.tv_usec;
+               if (diff != 0)
+                       machclk_freq = (u_int)((end - start) * 1000000 / diff);
+       }
+
+       machclk_per_tick = machclk_freq / hz;
+
+#ifdef ALTQ_DEBUG
+       printf("altq: CPU clock: %uHz\n", machclk_freq);
+#endif
+}
+
+uint64_t
+read_machclk(void)
+{
+       uint64_t val;
+
+       if (machclk_usepcc) {
+#if defined(__i386__)
+               val = rdtsc();
+#else
+               panic("read_machclk");
+#endif
+       } else {
+               struct timeval tv;
+
+               microtime(&tv);
+               val = (((uint64_t)(tv.tv_sec - boottime.tv_sec) * 1000000
+                   + tv.tv_usec) << MACHCLK_SHIFT);
+       }
+       return (val);
+}
diff --git a/sys/net/altq/altq_var.h b/sys/net/altq/altq_var.h
new file mode 100644 (file)
index 0000000..7f2b679
--- /dev/null
@@ -0,0 +1,96 @@
+/*     $KAME: altq_var.h,v 1.17 2004/04/20 05:09:08 kjc Exp $  */
+/*     $DragonFly: src/sys/net/altq/altq_var.h,v 1.1 2005/02/11 22:25:57 joerg Exp $ */
+
+/*
+ * Copyright (C) 1998-2003
+ *     Sony Computer Science Laboratories Inc.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#ifndef _ALTQ_ALTQ_VAR_H_
+#define        _ALTQ_ALTQ_VAR_H_
+
+#ifdef _KERNEL
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/queue.h>
+
+MALLOC_DECLARE(M_ALTQ);
+
+/*
+ * machine dependent clock
+ * a 64bit high resolution time counter.
+ */
+extern int     machclk_usepcc;
+extern uint32_t        machclk_freq;
+extern uint32_t        machclk_per_tick;
+
+void           init_machclk(void);
+uint64_t       read_machclk(void);
+
+#define        m_pktlen(m)             ((m)->m_pkthdr.len)
+
+extern int pfaltq_running;
+
+struct ifnet;
+struct mbuf;
+struct pf_altq;
+
+void   *altq_lookup(const char *, int);
+uint8_t        read_dsfield(struct mbuf *, struct altq_pktattr *);
+void   write_dsfield(struct mbuf *, struct altq_pktattr *, uint8_t);
+int    tbr_set(struct ifaltq *, struct tb_profile *);
+int    tbr_get(struct ifaltq *, struct tb_profile *);
+
+int    altq_pfattach(struct pf_altq *);
+int    altq_pfdetach(struct pf_altq *);
+int    altq_add(struct pf_altq *);
+int    altq_remove(struct pf_altq *);
+int    altq_add_queue(struct pf_altq *);
+int    altq_remove_queue(struct pf_altq *);
+int    altq_getqstats(struct pf_altq *, void *, int *);
+
+int    cbq_pfattach(struct pf_altq *);
+int    cbq_add_altq(struct pf_altq *);
+int    cbq_remove_altq(struct pf_altq *);
+int    cbq_add_queue(struct pf_altq *);
+int    cbq_remove_queue(struct pf_altq *);
+int    cbq_getqstats(struct pf_altq *, void *, int *);
+
+int    priq_pfattach(struct pf_altq *);
+int    priq_add_altq(struct pf_altq *);
+int    priq_remove_altq(struct pf_altq *);
+int    priq_add_queue(struct pf_altq *);
+int    priq_remove_queue(struct pf_altq *);
+int    priq_getqstats(struct pf_altq *, void *, int *);
+
+int    hfsc_pfattach(struct pf_altq *);
+int    hfsc_add_altq(struct pf_altq *);
+int    hfsc_remove_altq(struct pf_altq *);
+int    hfsc_add_queue(struct pf_altq *);
+int    hfsc_remove_queue(struct pf_altq *);
+int    hfsc_getqstats(struct pf_altq *, void *, int *);
+
+#endif /* _KERNEL */
+#endif /* _ALTQ_ALTQ_VAR_H_ */
diff --git a/sys/net/altq/if_altq.h b/sys/net/altq/if_altq.h
new file mode 100644 (file)
index 0000000..56af605
--- /dev/null
@@ -0,0 +1,144 @@
+/*     $KAME: if_altq.h,v 1.11 2003/07/10 12:07:50 kjc Exp $   */
+/*     $DragonFly: src/sys/net/altq/if_altq.h,v 1.1 2005/02/11 22:25:57 joerg Exp $ */
+
+/*
+ * Copyright (C) 1997-2003
+ *     Sony Computer Science Laboratories Inc.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#ifndef _ALTQ_IF_ALTQ_H_
+#define        _ALTQ_IF_ALTQ_H_
+
+struct altq_pktattr;
+
+/*
+ * Structure defining a queue for a network interface.
+ */
+struct ifaltq {
+       /* fields compatible with struct ifqueue */
+       struct  mbuf *ifq_head;
+       struct  mbuf *ifq_tail;
+       int     ifq_len;
+       int     ifq_maxlen;
+       int     ifq_drops;
+
+       /* alternate queueing related fields */
+       int     altq_type;              /* discipline type */
+       int     altq_flags;             /* flags (e.g. ready, in-use) */
+       void    *altq_disc;             /* for discipline-specific use */
+       struct  ifnet *altq_ifp;        /* back pointer to interface */
+
+       int     (*altq_enqueue)(struct ifaltq *, struct mbuf *,
+                               struct altq_pktattr *);
+       struct  mbuf *(*altq_dequeue)(struct ifaltq *, int);
+       int     (*altq_request)(struct ifaltq *, int, void *);
+
+       /* classifier fields */
+       void    *altq_clfier;           /* classifier-specific use */
+       void    *(*altq_classify)(struct ifaltq *, struct mbuf *,
+                                 struct altq_pktattr *);
+
+       /* token bucket regulator */
+       struct  tb_regulator *altq_tbr;
+};
+
+
+#ifdef _KERNEL
+
+/*
+ * packet attributes used by queueing disciplines.
+ * pattr_class is a discipline-dependent scheduling class that is
+ * set by a classifier.
+ * pattr_hdr and pattr_af may be used by a discipline to access
+ * the header within an mbuf.  (e.g. ECN needs to update the CE bit)
+ * note that pattr_hdr could be stale after m_pullup, though link
+ * layer output routines usually don't use m_pullup.  link-level
+ * compression also invalidates these fields.  thus, pattr_hdr needs
+ * to be verified when a discipline touches the header.
+ */
+struct altq_pktattr {
+       void    *pattr_class;           /* sched class set by classifier */
+       int     pattr_af;               /* address family */
+       caddr_t pattr_hdr;              /* saved header position in mbuf */
+};
+
+/*
+ * a token-bucket regulator limits the rate that a network driver can
+ * dequeue packets from the output queue.
+ * modern cards are able to buffer a large number of packets and dequeue
+ * too many packets at a time.  this bursty dequeue behavior makes it
+ * impossible for queueing disciplines to schedule packets.
+ * a token-bucket is used to control the burst size in a device
+ * independent manner.
+ */
+struct tb_regulator {
+       int64_t         tbr_rate;       /* (scaled) token bucket rate */
+       int64_t         tbr_depth;      /* (scaled) token bucket depth */
+
+       int64_t         tbr_token;      /* (scaled) current token */
+       int64_t         tbr_filluptime; /* (scaled) time to fill up bucket */
+       uint64_t        tbr_last;       /* last time token was updated */
+
+       int             tbr_lastop;     /* last dequeue operation type
+                                          needed for poll-and-dequeue */
+};
+
+/* if_altqflags */
+#define        ALTQF_READY      0x01   /* driver supports alternate queueing */
+#define        ALTQF_ENABLED    0x02   /* altq is in use */
+#define        ALTQF_CLASSIFY   0x04   /* classify packets */
+#define        ALTQF_DRIVER1    0x40   /* driver specific */
+
+/* if_altqflags set internally only: */
+#define        ALTQF_CANTCHANGE        (ALTQF_READY)
+
+/* altq_dequeue 2nd arg */
+#define        ALTDQ_REMOVE            1       /* dequeue mbuf from the queue */
+#define        ALTDQ_POLL              2       /* don't dequeue mbuf from the queue */
+
+/* altq request types (currently only purge is defined) */
+#define        ALTRQ_PURGE             1       /* purge all packets */
+
+#define        ALTQ_ENQUEUE(ifq, m, pa, err)                                   \
+       (err) = (*(ifq)->altq_enqueue)((ifq),(m),(pa))
+#define        ALTQ_DEQUEUE(ifq, m)                                            \
+       (m) = (*(ifq)->altq_dequeue)((ifq), ALTDQ_REMOVE)
+#define        ALTQ_POLL(ifq, m)                                               \
+       (m) = (*(ifq)->altq_dequeue)((ifq), ALTDQ_POLL)
+#define        ALTQ_PURGE(ifq)                                                 \
+       (void)(*(ifq)->altq_request)((ifq), ALTRQ_PURGE, (void *)0)
+
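+/*
+ * Illustrative sketch (hypothetical driver, not part of this header):
+ * the poll-and-dequeue pattern that tbr_lastop above exists to support.
+ * The driver peeks at the next packet with ALTQ_POLL, checks that the
+ * hardware can take it, and only then commits with ALTQ_DEQUEUE.
+ * XX_TXFREE() and xx_transmit() are made-up names; a real driver would
+ * go through the ifq_* wrappers in net/ifq_var.h.
+ */
+#if 0
+static void
+xx_start(struct ifnet *ifp)
+{
+       struct ifaltq *ifq = &ifp->if_snd;
+       struct mbuf *m;
+
+       for (;;) {
+               ALTQ_POLL(ifq, m);      /* peek, don't remove */
+               if (m == NULL)
+                       break;
+               if (XX_TXFREE(ifp) < m->m_pkthdr.len)
+                       break;          /* wait for a tx interrupt */
+               ALTQ_DEQUEUE(ifq, m);   /* now actually remove it */
+               xx_transmit(ifp, m);
+       }
+}
+#endif
+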
+int    altq_attach(struct ifaltq *, int, void *,
+                   int (*)(struct ifaltq *, struct mbuf *, struct altq_pktattr *),
+                   struct mbuf *(*)(struct ifaltq *, int),
+                   int (*)(struct ifaltq *, int, void *),
+                   void *, void *(*)(struct ifaltq *, struct mbuf *,
+                                     struct altq_pktattr *));
+int    altq_detach(struct ifaltq *);
+int    altq_enable(struct ifaltq *);
+int    altq_disable(struct ifaltq *);
+struct mbuf *tbr_dequeue(struct ifaltq *, int);
+extern int     (*altq_input)(struct mbuf *, int);
+#endif /* _KERNEL */
+
+#endif /* _ALTQ_IF_ALTQ_H_ */
index f520889..0e52e99 100644 (file)
@@ -25,7 +25,7 @@
  * SUCH DAMAGE.
  *
  * $FreeBSD: src/sys/net/bridge.c,v 1.16.2.25 2003/01/23 21:06:44 sam Exp $
- * $DragonFly: src/sys/net/bridge/Attic/bridge.c,v 1.12 2005/01/23 13:47:24 joerg Exp $
+ * $DragonFly: src/sys/net/bridge/Attic/bridge.c,v 1.13 2005/02/11 22:25:57 joerg Exp $
  */
 
 /*
 
 #include <net/if.h>
 #include <net/if_types.h>
+#include <net/if_llc.h>
 #include <net/if_var.h>
+#include <net/ifq_var.h>
 #include <net/pfil.h>
 
 #include <netinet/in.h> /* for struct arpcom */
@@ -777,7 +779,7 @@ bdg_forward(struct mbuf *m0, struct ether_header *const eh, struct ifnet *dst)
     struct ifnet *src;
     struct ifnet *ifp, *last;
     int shared = bdg_copy ; /* someone else is using the mbuf */
-    int once = 0;      /* loop only once */
+    int error, once = 0;      /* loop only once */
     struct ifnet *real_dst = dst ; /* real dst from ether_output */
     struct ip_fw_args args;
 
@@ -978,6 +980,8 @@ forward:
     for (;;) {
        if (last) { /* need to forward packet leftover from previous loop */
            struct mbuf *m ;
+           struct altq_pktattr pktattr;
+
            if (shared == 0 && once ) { /* no need to copy */
                m = m0 ;
                m0 = NULL ; /* original is gone */
@@ -988,6 +992,26 @@ forward:
                    return m0 ; /* the original is still there... */
                }
            }
+           if (ifq_is_enabled(&last->if_snd)) {
+                   uint16_t ether_type;
+                   int af;
+
+                   /*
+                    * If the queueing discipline needs packet classification,
+                    * do it before prepending link headers.
+                    */
+                   ether_type = ntohs(eh->ether_type);
+                   if (ether_type == ETHERTYPE_IP)
+                           af = AF_INET;
+#ifdef INET6
+                   else if (ether_type == ETHERTYPE_IPV6)
+                           af = AF_INET6;
+#endif
+                   else
+                           af = AF_UNSPEC;
+                   ifq_classify(&last->if_snd, m, af, &pktattr);
+           }
+
            /*
             * Add header (optimized for the common case of eh pointing
             * already into the mbuf) and execute last part of ether_output:
@@ -1006,7 +1030,8 @@ forward:
                    return m0;
                bcopy(&save_eh, mtod(m, struct ether_header *), ETHER_HDR_LEN);
            }
-           if (!IF_HANDOFF(&last->if_snd, m, last)) {
+           error = ifq_handoff(last, m, &pktattr);
+           if (error != 0) {
 #if 0
                BDG_MUTE(last); /* should I also mute ? */
 #endif
@@ -1023,7 +1048,10 @@ forward:
         * up and running, is not the source interface, and belongs to
         * the same cluster as the 'real_dst', then send here.
         */
-       if ( BDG_USED(ifp) && !BDG_MUTED(ifp) && !IF_QFULL(&ifp->if_snd)  &&
+       if ( BDG_USED(ifp) && !BDG_MUTED(ifp) &&
+#ifndef ALTQ
+            !IF_QFULL(&ifp->if_snd) &&
+#endif
             (ifp->if_flags & (IFF_UP|IFF_RUNNING)) == (IFF_UP|IFF_RUNNING) &&
             ifp != src && BDG_SAMECLUSTER(ifp, real_dst) )
            last = ifp ;
index 15154e4..62f805a 100644 (file)
@@ -2,7 +2,7 @@
  * Fundamental constants relating to ethernet.
  *
  * $FreeBSD: src/sys/net/ethernet.h,v 1.12.2.8 2002/12/01 14:03:09 sobomax Exp $
- * $DragonFly: src/sys/net/ethernet.h,v 1.10 2004/12/21 02:54:14 hsu Exp $
+ * $DragonFly: src/sys/net/ethernet.h,v 1.11 2005/02/11 22:25:57 joerg Exp $
  *
  */
 
@@ -376,6 +376,11 @@ extern     int (*vlan_input_tag_p)(struct mbuf *m, uint16_t t);
        /* XXX: unlock */                               \
 } while (0)
 
+struct altq_pktattr;
+struct ifaltq;
+
+void   altq_etherclassify(struct ifaltq *, struct mbuf *, struct altq_pktattr *);
+
 #else /* _KERNEL */
 
 #include <sys/cdefs.h>
index 3dde7d8..327bcf9 100644 (file)
@@ -32,7 +32,7 @@
  *
  *     @(#)if.c        8.3 (Berkeley) 1/4/94
  * $FreeBSD: src/sys/net/if.c,v 1.185 2004/03/13 02:35:03 brooks Exp $
- * $DragonFly: src/sys/net/if.c,v 1.27 2005/02/01 16:09:37 hrs Exp $
+ * $DragonFly: src/sys/net/if.c,v 1.28 2005/02/11 22:25:57 joerg Exp $
  */
 
 #include "opt_compat.h"
@@ -60,6 +60,7 @@
 #include <net/if_dl.h>
 #include <net/if_types.h>
 #include <net/if_var.h>
+#include <net/ifq_var.h>
 #include <net/radix.h>
 #include <net/route.h>
 #include <machine/stdarg.h>
@@ -88,7 +89,6 @@ static void   if_attachdomain(void *);
 static void    if_attachdomain1(struct ifnet *);
 static int ifconf (u_long, caddr_t, struct thread *);
 static void ifinit (void *);
-static void if_qflush (struct ifqueue *);
 static void if_slowtimo (void *);
 static void link_rtrequest (int, struct rtentry *, struct rt_addrinfo *);
 static int  if_rtdel (struct radix_node *, void *);
@@ -240,6 +240,12 @@ if_attach(struct ifnet *ifp)
 
        EVENTHANDLER_INVOKE(ifnet_attach_event, ifp);
 
+       ifp->if_snd.altq_type = 0;
+       ifp->if_snd.altq_disc = NULL;
+       ifp->if_snd.altq_flags &= ALTQF_CANTCHANGE;
+       ifp->if_snd.altq_tbr = NULL;
+       ifp->if_snd.altq_ifp = ifp;
+
        if (domains)
                if_attachdomain1(ifp);
 
@@ -300,6 +306,11 @@ if_detach(struct ifnet *ifp)
        s = splnet();
        if_down(ifp);
 
+       if (ifq_is_enabled(&ifp->if_snd))
+               altq_disable(&ifp->if_snd);
+       if (ifq_is_attached(&ifp->if_snd))
+               altq_detach(&ifp->if_snd);
+
        /*
         * Remove address from ifnet_addrs[] and maybe decrement if_index.
         * Clean up all addresses.
@@ -877,7 +888,7 @@ if_unroute(struct ifnet *ifp, int flag, int fam)
        TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
                if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family))
                        pfctlinput(PRC_IFDOWN, ifa->ifa_addr);
-       if_qflush(&ifp->if_snd);
+       ifq_purge(&ifp->if_snd);
        rt_ifmsg(ifp);
 }
 
@@ -931,24 +942,6 @@ if_up(struct ifnet *ifp)
        if_route(ifp, IFF_UP, AF_UNSPEC);
 }
 
-/*
- * Flush an interface queue.
- */
-static void
-if_qflush(struct ifqueue *ifq)
-{
-       struct mbuf *m, *n;
-
-       n = ifq->ifq_head;
-       while ((m = n) != 0) {
-               n = m->m_nextpkt;
-               m_freem(m);
-       }
-       ifq->ifq_head = 0;
-       ifq->ifq_tail = 0;
-       ifq->ifq_len = 0;
-}
-
 /*
  * Handle interface watchdog timer routines.  Called
  * from softclock, we decrement timers (if set) and
index 0e88f75..d417a4f 100644 (file)
@@ -1,6 +1,6 @@
 /*     $NetBSD: if_arcsubr.c,v 1.36 2001/06/14 05:44:23 itojun Exp $   */
 /*     $FreeBSD: src/sys/net/if_arcsubr.c,v 1.1.2.5 2003/02/05 18:42:15 fjoe Exp $ */
-/*     $DragonFly: src/sys/net/Attic/if_arcsubr.c,v 1.13 2005/01/23 20:23:22 joerg Exp $ */
+/*     $DragonFly: src/sys/net/Attic/if_arcsubr.c,v 1.14 2005/02/11 22:25:57 joerg Exp $ */
 
 /*
  * Copyright (c) 1994, 1995 Ignatios Souvatzis
@@ -63,6 +63,7 @@
 #include <net/if_types.h>
 #include <net/if_arc.h>
 #include <net/if_arp.h>
+#include <net/ifq_var.h>
 #include <net/bpf.h>
 
 #if defined(INET) || defined(INET6)
@@ -113,10 +114,17 @@ arc_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst,
        u_int8_t                atype, adst;
        int                     loop_copy = 0;
        int                     isphds;
+       struct altq_pktattr pktattr;
 
        if ((ifp->if_flags & (IFF_UP | IFF_RUNNING)) != (IFF_UP | IFF_RUNNING))
                return (ENETDOWN);      /* m, m1 aren't initialized yet */
 
+       /*
+        * If the queueing discipline needs packet classification,
+        * do it before prepending link headers.
+        */
+       ifq_classify(&ifp->if_snd, m, dst->sa_family, &pktattr);
+
        switch (dst->sa_family) {
 #ifdef INET
        case AF_INET:
@@ -207,12 +215,8 @@ arc_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst,
 
        BPF_MTAP(ifp, m);
 
-       if (!IF_HANDOFF(&ifp->if_snd, m, ifp)) {
-               m = NULL;
-               gotoerr(ENOBUFS);
-       }
-
-       return (0);
+       error = ifq_handoff(ifp, m, &pktattr);
+       return (error);
 
 bad:
        if (m != NULL)
index 0db7d5a..1b7445d 100644 (file)
@@ -32,7 +32,7 @@
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * $FreeBSD: src/sys/net/if_atmsubr.c,v 1.10.2.1 2001/03/06 00:29:26 obrien Exp $
- * $DragonFly: src/sys/net/if_atmsubr.c,v 1.11 2005/01/06 09:14:13 hsu Exp $
+ * $DragonFly: src/sys/net/if_atmsubr.c,v 1.12 2005/02/11 22:25:57 joerg Exp $
  */
 
 /*
@@ -51,6 +51,8 @@
 #include <sys/errno.h>
 
 #include <net/if.h>
+#include <net/if_var.h>
+#include <net/ifq_var.h>
 #include <net/bpf.h>
 #include <net/netisr.h>
 #include <net/route.h>
@@ -101,10 +103,18 @@ atm_output(struct ifnet *ifp, struct mbuf *m0, struct sockaddr *dst,
        struct atmllc *atmllc;
        struct atmllc *llc_hdr = NULL;
        u_int32_t atm_flags;
+       struct altq_pktattr pktattr;
 
        if ((ifp->if_flags & (IFF_UP|IFF_RUNNING)) != (IFF_UP|IFF_RUNNING))
                gotoerr(ENETDOWN);
 
+       /*
+        * If the queueing discipline needs packet classification,
+        * do it before prepending link headers.
+        */
+       ifq_classify(&ifp->if_snd, m,
+                    (dst !=