ifq/staging: Initial implementation of IFQ packet staging mechanism
author     Sepherosa Ziehau <sephe@dragonflybsd.org>
           Sun, 23 Dec 2012 12:31:32 +0000 (20:31 +0800)
committer  Sepherosa Ziehau <sephe@dragonflybsd.org>
           Thu, 27 Dec 2012 07:57:44 +0000 (15:57 +0800)
Packets enqueued into the IFQ are staged until a certain amount has
accumulated before the ifnet's if_start is called.  This way the driver
can avoid writing to hardware registers for every packet; instead, the
hardware registers are written once a certain number of packets have
been put onto the hardware TX ring.  Measurements on several modern
NICs (emx(4), igb(4), bnx(4), bge(4), jme(4)) show that aggregating the
hardware register writes saves ~20% CPU time when 18-byte UDP datagrams
are transmitted at 1.48 Mpps.
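
For illustration, the win comes from the driver's if_start seeing a
batch of staged packets at once and issuing a single doorbell write for
the whole batch.  A minimal sketch of such an if_start follows;
example_encap(), CSR_WRITE_4() and the EXAMPLE_TX_DOORBELL register are
hypothetical and not taken from any of the drivers above:

    static void
    example_if_start(struct ifnet *ifp)
    {
            struct example_softc *sc = ifp->if_softc;
            struct mbuf *m;
            int enq = 0;

            /* Drain everything the IFQ has staged into TX descriptors. */
            while ((m = ifq_dequeue(&ifp->if_snd, NULL)) != NULL) {
                    example_encap(sc, m);   /* set up one TX descriptor */
                    enq++;
            }

            /*
             * One register write covers the whole batch; without
             * staging this write would happen once per packet.
             */
            if (enq > 0)
                    CSR_WRITE_4(sc, EXAMPLE_TX_DOORBELL, sc->example_tx_prod);
    }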

IFQ packet staging is only performed on the direct ifnet if_start call
path, i.e. ifq_try_ifstart().

IFQ packet staging is stopped upon any of the following conditions (the
count and length checks are sketched after this list):
- The count of packets enqueued on the current CPU is greater than or
  equal to ifq_stage_cntmax.
- The total length of the packets enqueued on the current CPU is
  greater than or equal to the hardware's MTU - max_protohdr.
  max_protohdr is subtracted from the hardware's MTU mainly because a
  full TCP segment's size is usually less than the hardware's MTU.
- The if_start interlock (if_snd.altq_started) is not released.
- if_start_rollup(), which is registered as a low priority netisr
  rollup function, is called; this probably means that no more work is
  pending for the netisr.
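
A sketch of the count/length stop conditions (field and sysctl names
are as in the diff below; the helper function itself is hypothetical):

    static int
    ifq_stage_may_defer(const struct ifaltq_stage *stage,
        const struct ifnet *ifp)
    {
            /* Flush once enough packets or bytes have accumulated. */
            if (stage->ifqs_cnt >= ifq_stage_cntmax)
                    return 0;
            if (stage->ifqs_len >= ifp->if_mtu - max_protohdr)
                    return 0;
            return 1;       /* keep staging; defer ifnet.if_start */
    }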

Currently IFQ packet staging is only performed in netisr threads.

Inspired-by: Luigi Rizzo's netmap paper
    (http://info.iet.unipi.it/~luigi/netmap/)
Also-Suggested-by: dillon@
sys/net/altq/if_altq.h
sys/net/if.c

diff --git a/sys/net/altq/if_altq.h b/sys/net/altq/if_altq.h
index 94b189d..d4bb667 100644
--- a/sys/net/altq/if_altq.h
+++ b/sys/net/altq/if_altq.h
 
 struct altq_pktattr;
 
+struct ifaltq;
+
+struct ifaltq_stage {
+       struct ifaltq   *ifqs_altq;
+       int             ifqs_cnt;
+       int             ifqs_len;
+       uint32_t        ifqs_flags;
+       TAILQ_ENTRY(ifaltq_stage) ifqs_link;
+} __cachealign;
+
+#define IFQ_STAGE_FLAG_QUED    0x1
+
 /*
  * Structure defining a queue for a network interface.
  */
@@ -67,6 +79,7 @@ struct        ifaltq {
        struct  lwkt_serialize altq_lock;
        struct  mbuf *altq_prepended;   /* mbuf dequeued, but not yet xmit */
        int     altq_started;           /* ifnet.if_start interlock */
+       struct ifaltq_stage *altq_stage;
 };
 
 #define ALTQ_ASSERT_LOCKED(ifq)        ASSERT_SERIALIZED(&(ifq)->altq_lock)
diff --git a/sys/net/if.c b/sys/net/if.c
index caf5872..09d9010 100644
--- a/sys/net/if.c
+++ b/sys/net/if.c
@@ -103,6 +103,10 @@ struct netmsg_ifaddr {
        int             tail;
 };
 
+struct ifaltq_stage_head {
+       TAILQ_HEAD(, ifaltq_stage)      ifqs_head;
+} __cachealign;
+
 /*
  * System initialization
  */
@@ -126,6 +130,17 @@ extern void        nd6_setmtu(struct ifnet *);
 SYSCTL_NODE(_net, PF_LINK, link, CTLFLAG_RW, 0, "Link layers");
 SYSCTL_NODE(_net_link, 0, generic, CTLFLAG_RW, 0, "Generic link-management");
 
+static u_long if_staged;
+SYSCTL_ULONG(_net_link, OID_AUTO, staged, CTLFLAG_RW, &if_staged, 0, "");
+
+static u_long if_staged_start;
+SYSCTL_ULONG(_net_link, OID_AUTO, staged_start, CTLFLAG_RW,
+    &if_staged_start, 0, "");
+
+static int ifq_stage_cntmax = 4;
+SYSCTL_INT(_net_link, OID_AUTO, stage_cntmax, CTLFLAG_RW,
+    &ifq_stage_cntmax, 0, "ifq staging packet count max");
+
 SYSINIT(interfaces, SI_SUB_PROTO_IF, SI_ORDER_FIRST, ifinit, NULL)
 /* Must be after netisr_init */
 SYSINIT(ifnet, SI_SUB_PRE_DRIVERS, SI_ORDER_SECOND, ifnetinit, NULL)
@@ -146,6 +161,8 @@ int                 if_index = 0;
 struct ifnet           **ifindex2ifnet = NULL;
 static struct thread   ifnet_threads[MAXCPU];
 
+static struct ifaltq_stage_head        ifq_stage_heads[MAXCPU];
+
 #define IFQ_KTR_STRING         "ifq=%p"
 #define IFQ_KTR_ARGS   struct ifaltq *ifq
 #ifndef KTR_IFQ
@@ -557,6 +574,12 @@ if_attach(struct ifnet *ifp, lwkt_serialize_t serializer)
        ALTQ_LOCK_INIT(ifq);
        ifq_set_classic(ifq);
 
+       ifq->altq_stage =
+           kmalloc_cachealign(ncpus * sizeof(struct ifaltq_stage),
+           M_DEVBUF, M_WAITOK | M_ZERO);
+       for (i = 0; i < ncpus; ++i)
+               ifq->altq_stage[i].ifqs_altq = ifq;
+
        if (!SLIST_EMPTY(&domains))
                if_attachdomain1(ifp);
 
@@ -2385,11 +2408,75 @@ ifq_classic_request(struct ifaltq *ifq, int req, void *arg)
        return(0);
 }
 
+static void
+ifq_try_ifstart(struct ifaltq *ifq)
+{
+       struct ifnet *ifp = ifq->altq_ifp;
+       int running = 0, need_sched;
+
+       /*
+        * Try to do direct ifnet.if_start first, if there is
+        * contention on ifnet's serializer, ifnet.if_start will
+        * be scheduled on ifnet's CPU.
+        */
+       if (!ifnet_tryserialize_tx(ifp)) {
+               /*
+                * ifnet serializer contention happened,
+                * ifnet.if_start is scheduled on ifnet's
+                * CPU, and we keep going.
+                */
+               logifstart(contend_sched, ifp);
+               if_start_schedule(ifp);
+               return;
+       }
+
+       if ((ifp->if_flags & (IFF_OACTIVE | IFF_RUNNING)) == IFF_RUNNING) {
+               logifstart(run, ifp);
+               ifp->if_start(ifp);
+               if ((ifp->if_flags & (IFF_OACTIVE | IFF_RUNNING)) ==
+                   IFF_RUNNING)
+                       running = 1;
+       }
+       need_sched = if_start_need_schedule(ifq, running);
+
+       ifnet_deserialize_tx(ifp);
+
+       if (need_sched) {
+               /*
+                * More data need to be transmitted, ifnet.if_start is
+                * scheduled on ifnet's CPU, and we keep going.
+                * NOTE: ifnet.if_start interlock is not released.
+                */
+               logifstart(sched, ifp);
+               if_start_schedule(ifp);
+       }
+}
+
+static __inline void
+ifq_stage_remove(struct ifaltq_stage_head *head, struct ifaltq_stage *stage)
+{
+       KKASSERT(stage->ifqs_flags & IFQ_STAGE_FLAG_QUED);
+       TAILQ_REMOVE(&head->ifqs_head, stage, ifqs_link);
+       stage->ifqs_flags &= ~IFQ_STAGE_FLAG_QUED;
+       stage->ifqs_cnt = 0;
+       stage->ifqs_len = 0;
+}
+
+static __inline void
+ifq_stage_insert(struct ifaltq_stage_head *head, struct ifaltq_stage *stage)
+{
+       KKASSERT((stage->ifqs_flags & IFQ_STAGE_FLAG_QUED) == 0);
+       stage->ifqs_flags |= IFQ_STAGE_FLAG_QUED;
+       TAILQ_INSERT_TAIL(&head->ifqs_head, stage, ifqs_link);
+}
+
 int
 ifq_dispatch(struct ifnet *ifp, struct mbuf *m, struct altq_pktattr *pa)
 {
        struct ifaltq *ifq = &ifp->if_snd;
-       int running = 0, error, start = 0, need_sched, mcast = 0, len;
+       int error, start = 0, len, mcast = 0, avoid_start = 0;
+       struct ifaltq_stage_head *head = NULL;
+       struct ifaltq_stage *stage = NULL;
 
        ASSERT_IFNET_NOT_SERIALIZED_TX(ifp);
 
@@ -2397,6 +2484,19 @@ ifq_dispatch(struct ifnet *ifp, struct mbuf *m, struct altq_pktattr *pa)
        if (m->m_flags & M_MCAST)
                mcast = 1;
 
+       if (curthread->td_type == TD_TYPE_NETISR) {
+               int cpuid = mycpuid;
+
+               head = &ifq_stage_heads[cpuid];
+               stage = &ifq->altq_stage[cpuid];
+
+               stage->ifqs_cnt++;
+               stage->ifqs_len += len;
+               if (stage->ifqs_cnt < ifq_stage_cntmax &&
+                   stage->ifqs_len < (ifp->if_mtu - max_protohdr))
+                       avoid_start = 1;
+       }
+
        ALTQ_LOCK(ifq);
        error = ifq_enqueue_locked(ifq, m, pa);
        if (error) {
@@ -2404,8 +2504,24 @@ ifq_dispatch(struct ifnet *ifp, struct mbuf *m, struct altq_pktattr *pa)
                        ALTQ_UNLOCK(ifq);
                        return error;
                }
+               avoid_start = 0;
        }
        if (!ifq->altq_started) {
+               if (avoid_start) {
+                       ALTQ_UNLOCK(ifq);
+
+                       KKASSERT(!error);
+                       if ((stage->ifqs_flags & IFQ_STAGE_FLAG_QUED) == 0)
+                               ifq_stage_insert(head, stage);
+
+                       ifp->if_obytes += len;
+                       if (mcast)
+                               ifp->if_omcasts++;
+
+                       /* atomic_add_long(&if_staged, 1); */
+                       return error;
+               }
+
                /*
                 * Hold the interlock of ifnet.if_start
                 */
@@ -2420,47 +2536,21 @@ ifq_dispatch(struct ifnet *ifp, struct mbuf *m, struct altq_pktattr *pa)
                        ifp->if_omcasts++;
        }
 
-       if (!start) {
-               logifstart(avoid, ifp);
-               return error;
+       if (stage != NULL) {
+               if (stage->ifqs_flags & IFQ_STAGE_FLAG_QUED) {
+                       ifq_stage_remove(head, stage);
+               } else {
+                       stage->ifqs_cnt = 0;
+                       stage->ifqs_len = 0;
+               }
        }
 
-       /*
-        * Try to do direct ifnet.if_start first, if there is
-        * contention on ifnet's serializer, ifnet.if_start will
-        * be scheduled on ifnet's CPU.
-        */
-       if (!ifnet_tryserialize_tx(ifp)) {
-               /*
-                * ifnet serializer contention happened,
-                * ifnet.if_start is scheduled on ifnet's
-                * CPU, and we keep going.
-                */
-               logifstart(contend_sched, ifp);
-               if_start_schedule(ifp);
+       if (!start) {
+               logifstart(avoid, ifp);
                return error;
        }
 
-       if ((ifp->if_flags & (IFF_OACTIVE | IFF_RUNNING)) == IFF_RUNNING) {
-               logifstart(run, ifp);
-               ifp->if_start(ifp);
-               if ((ifp->if_flags & (IFF_OACTIVE | IFF_RUNNING)) ==
-                   IFF_RUNNING)
-                       running = 1;
-       }
-       need_sched = if_start_need_schedule(ifq, running);
-
-       ifnet_deserialize_tx(ifp);
-
-       if (need_sched) {
-               /*
-                * More data need to be transmitted, ifnet.if_start is
-                * scheduled on ifnet's CPU, and we keep going.
-                * NOTE: ifnet.if_start interlock is not released.
-                */
-               logifstart(sched, ifp);
-               if_start_schedule(ifp);
-       }
+       ifq_try_ifstart(ifq);
        return error;
 }
 
@@ -2664,13 +2754,33 @@ ifnet_service_loop(void *arg __unused)
        }
 }
 
-#ifdef notyet
 static void
 if_start_rollup(void)
 {
-       /* TODO */
+       struct ifaltq_stage_head *head = &ifq_stage_heads[mycpuid];
+       struct ifaltq_stage *stage;
+
+       while ((stage = TAILQ_FIRST(&head->ifqs_head)) != NULL) {
+               struct ifaltq *ifq = stage->ifqs_altq;
+               int start = 0;
+
+               ifq_stage_remove(head, stage);
+
+               ALTQ_LOCK(ifq);
+               if (!ifq->altq_started) {
+                       /*
+                        * Hold the interlock of ifnet.if_start
+                        */
+                       ifq->altq_started = 1;
+                       start = 1;
+               }
+               ALTQ_UNLOCK(ifq);
+
+               if (start)
+                       ifq_try_ifstart(ifq);
+               /* atomic_add_long(&if_staged_start, 1); */
+       }
 }
-#endif
 
 static void
 ifnetinit(void *dummy __unused)
@@ -2686,9 +2796,10 @@ ifnetinit(void *dummy __unused)
                netmsg_service_port_init(&thr->td_msgport);
                lwkt_schedule(thr);
        }
-#ifdef notyet
+
+       for (i = 0; i < ncpus; ++i)
+               TAILQ_INIT(&ifq_stage_heads[i].ifqs_head);
        netisr_register_rollup(if_start_rollup, NETISR_ROLLUP_PRIO_IFSTART);
-#endif
 }
 
 struct ifnet *