From: Sepherosa Ziehau
Date: Sat, 8 Jun 2013 05:47:43 +0000 (+0800)
Subject: altq: Implement two level "rough" priority queue for plain sub-queue
X-Git-Tag: v3.7.0~967
X-Git-Url: https://gitweb.dragonflybsd.org/dragonfly.git/commitdiff_plain/4cc8caef016d5b3dbf5eb396e27f7eb1e8a6afce

altq: Implement two level "rough" priority queue for plain sub-queue

The "rough" part comes from two sources:
- The hardware queue could be deep, normally 512 descriptors or more,
  even for GigE.
- Round-robin on the transmission queues is used by all of the multiple
  transmission queue capable hardware supported by DragonFly as of this
  commit.
These two sources affect the packet priority set by DragonFly.

DragonFly's "rough" priority queue has only two levels, i.e. high
priority and normal priority, which should be enough.  Each queue has
its own head and tail.  The normal priority queue is dequeued only when
there are no packets in the high priority queue.  During enqueue, if
the sub-queue is full and the high priority queue holds less than half
of the sub-queue's limits (both packet count and byte count), drop-head
is applied to the normal priority queue.

The M_PRIO mbuf flag is added to mark that the mbuf is destined for the
high priority queue.  Currently TCP uses it to prioritize SYN, SYN|ACK,
and pure ACK w/o FIN and RST.  This behaviour can be turned off via the
net.inet.tcp.prio_synack sysctl, which is on by default.

The performance improvement!

The test environment (all three boxes are using Intel i7-2600 w/ HT
enabled):

                         +-----+
                         |     |
                +->- emx1|  B  |   TCP_MAERTS
                |        |     |
                |        +-----+
+-----+         |
|     |         |
|  A  | bnx0 ---+
|     |         |
+-----+         |
                |        +-----+
                |        |     |
                +-<- emx1|  C  |   TCP_STREAM/TCP_RR
                         |     |
                         +-----+

A's kernel has this commit compiled.  bnx0 has all four transmission
queues enabled.  For bnx0, the hardware's transmission queue
round-robin is on TSO segment boundary.

Some baseline measurements:

B<--A TCP_MAERTS (raw stats) (128 clients): 984 Mbps
    (tcp_stream -H A -l 15 -i 128 -r)
C-->A TCP_STREAM (128 clients): 942 Mbps
    (tcp_stream -H A -l 15 -i 128)
C-->A TCP_CC (768 clients): 221199 conns/s
    (tcp_cc -H A -l 15 -i 768)

To effectively measure TCP_CC, the prefix route's MSL is changed to
10ms:
    route change 10.1.0.0/24 -msl 10

All stats gathered in the following measurements are below the baseline
(well, they should be).

C-->A TCP_CC improvement, while B<--A TCP_MAERTS is running:

                            TCP_MAERTS(raw)   TCP_CC
    TSO     prio_synack=1   948 Mbps          15988 conns/s
    TSO     prio_synack=0   965 Mbps          8867 conns/s
    non-TSO prio_synack=1   943 Mbps          18128 conns/s
    non-TSO prio_synack=0   959 Mbps          11371 conns/s

    * 80% TCP_CC performance improvement w/ TSO and 60% w/o TSO!

C-->A TCP_STREAM improvement, while B<--A TCP_MAERTS is running:

                            TCP_MAERTS(raw)   TCP_STREAM
    TSO     prio_synack=1   969 Mbps          920 Mbps
    TSO     prio_synack=0   969 Mbps          865 Mbps
    non-TSO prio_synack=1   969 Mbps          920 Mbps
    non-TSO prio_synack=0   969 Mbps          879 Mbps

    * 6% TCP_STREAM performance improvement w/ TSO and 4% w/o TSO.
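For reference, here is a minimal, self-contained userland sketch of the
policy described above (compiles stand-alone with cc -c).  It is only an
illustration: struct subq, fifo_enq() and friends are made-up names, and
only the decision logic mirrors ifsq_classic_enqueue() and
ifsq_classic_dequeue() from the patch below.

#include <stdlib.h>

struct pkt {
        struct pkt *next;
        int len;                        /* payload bytes */
        int prio;                       /* models the M_PRIO mbuf flag */
};

struct subq {
        struct pkt *prio_head, *prio_tail;      /* high priority queue */
        struct pkt *norm_head, *norm_tail;      /* normal priority queue */
        int len, bcnt;                          /* totals over both queues */
        int prio_len, prio_bcnt;                /* high priority queue only */
        int maxlen, maxbcnt;
};

static void
fifo_enq(struct pkt **head, struct pkt **tail, struct pkt *p)
{
        p->next = NULL;
        if (*tail == NULL)
                *head = p;
        else
                (*tail)->next = p;
        *tail = p;
}

static struct pkt *
fifo_deq(struct pkt **head, struct pkt **tail)
{
        struct pkt *p = *head;

        if (p != NULL && (*head = p->next) == NULL)
                *tail = NULL;
        return (p);
}

/* Enqueue; returns 0 on success, -1 when p had to be dropped. */
static int
subq_enqueue(struct subq *q, struct pkt *p)
{
        if (q->len >= q->maxlen || q->bcnt >= q->maxbcnt) {
                struct pkt *drop;

                /*
                 * Sub-queue is full.  A high priority packet may still
                 * get in through drop-head on the normal queue, but only
                 * while the high priority queue holds less than half of
                 * the packet and byte limits.
                 */
                if (!p->prio || q->prio_len >= q->maxlen / 2 ||
                    q->prio_bcnt >= q->maxbcnt / 2 ||
                    (drop = fifo_deq(&q->norm_head, &q->norm_tail)) == NULL) {
                        free(p);
                        return (-1);
                }
                q->len--;
                q->bcnt -= drop->len;
                free(drop);
        }
        if (p->prio) {
                fifo_enq(&q->prio_head, &q->prio_tail, p);
                q->prio_len++;
                q->prio_bcnt += p->len;
        } else {
                fifo_enq(&q->norm_head, &q->norm_tail, p);
        }
        q->len++;
        q->bcnt += p->len;
        return (0);
}

/* Dequeue; the normal queue is served only when the prio queue is empty. */
static struct pkt *
subq_dequeue(struct subq *q)
{
        struct pkt *p;

        if ((p = fifo_deq(&q->prio_head, &q->prio_tail)) != NULL) {
                q->prio_len--;
                q->prio_bcnt -= p->len;
        } else {
                p = fifo_deq(&q->norm_head, &q->norm_tail);
        }
        if (p != NULL) {
                q->len--;
                q->bcnt -= p->len;
        }
        return (p);
}

The interesting part is the full-queue path of subq_enqueue(): a normal
packet is always dropped, while a high priority packet steals the slot
of the oldest normal packet, as long as the high priority queue stays
under half of both limits.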
---

diff --git a/sys/net/altq/if_altq.h b/sys/net/altq/if_altq.h
index 42f8037c94..ea0c7a3914 100644
--- a/sys/net/altq/if_altq.h
+++ b/sys/net/altq/if_altq.h
@@ -67,8 +67,12 @@ struct ifaltq_subque {
 	struct ifnet *ifsq_ifp;
 	void *ifsq_hw_priv;	/* hw private data */
 
-	struct mbuf *ifsq_head;
-	struct mbuf *ifsq_tail;
+	struct mbuf *ifsq_prio_head;
+	struct mbuf *ifsq_prio_tail;
+	struct mbuf *ifsq_norm_head;
+	struct mbuf *ifsq_norm_tail;
+	int ifsq_prio_len;
+	int ifsq_prio_bcnt;
 	int ifsq_len;		/* packet counter */
 	int ifsq_maxlen;
 	int ifsq_bcnt;		/* byte counter */
@@ -112,7 +116,7 @@ do { \
 do { \
 	KASSERT((ifsq)->ifsq_len > 0, ("invalid packet count")); \
 	(ifsq)->ifsq_len--; \
-	KASSERT((ifsq)->ifsq_bcnt >= bcnt, ("invalid byte count")); \
+	KASSERT((ifsq)->ifsq_bcnt >= (bcnt), ("invalid byte count")); \
 	(ifsq)->ifsq_bcnt -= (bcnt); \
 } while (0)
 
@@ -122,6 +126,22 @@ do { \
 	(ifsq)->ifsq_bcnt = 0; \
 } while (0)
 
+#define ALTQ_SQ_PRIO_CNTR_INC(ifsq, bcnt) \
+do { \
+	(ifsq)->ifsq_prio_len++; \
+	(ifsq)->ifsq_prio_bcnt += (bcnt); \
+} while (0)
+
+#define ALTQ_SQ_PRIO_CNTR_DEC(ifsq, bcnt) \
+do { \
+	KASSERT((ifsq)->ifsq_prio_len > 0, \
+	    ("invalid prio packet count")); \
+	(ifsq)->ifsq_prio_len--; \
+	KASSERT((ifsq)->ifsq_prio_bcnt >= (bcnt), \
+	    ("invalid prio byte count")); \
+	(ifsq)->ifsq_prio_bcnt -= (bcnt); \
+} while (0)
+
 #endif	/* _KERNEL */
 
 /*
diff --git a/sys/net/if.c b/sys/net/if.c
index f3e58de160..d927474f1c 100644
--- a/sys/net/if.c
+++ b/sys/net/if.c
@@ -2522,6 +2522,62 @@ ifq_set_methods(struct ifaltq *ifq, altq_mapsubq_t mapsubq,
 	}
 }
 
+static void
+ifsq_norm_enqueue(struct ifaltq_subque *ifsq, struct mbuf *m)
+{
+	m->m_nextpkt = NULL;
+	if (ifsq->ifsq_norm_tail == NULL)
+		ifsq->ifsq_norm_head = m;
+	else
+		ifsq->ifsq_norm_tail->m_nextpkt = m;
+	ifsq->ifsq_norm_tail = m;
+	ALTQ_SQ_CNTR_INC(ifsq, m->m_pkthdr.len);
+}
+
+static void
+ifsq_prio_enqueue(struct ifaltq_subque *ifsq, struct mbuf *m)
+{
+	m->m_nextpkt = NULL;
+	if (ifsq->ifsq_prio_tail == NULL)
+		ifsq->ifsq_prio_head = m;
+	else
+		ifsq->ifsq_prio_tail->m_nextpkt = m;
+	ifsq->ifsq_prio_tail = m;
+	ALTQ_SQ_CNTR_INC(ifsq, m->m_pkthdr.len);
+	ALTQ_SQ_PRIO_CNTR_INC(ifsq, m->m_pkthdr.len);
+}
+
+static struct mbuf *
+ifsq_norm_dequeue(struct ifaltq_subque *ifsq)
+{
+	struct mbuf *m;
+
+	m = ifsq->ifsq_norm_head;
+	if (m != NULL) {
+		if ((ifsq->ifsq_norm_head = m->m_nextpkt) == NULL)
+			ifsq->ifsq_norm_tail = NULL;
+		m->m_nextpkt = NULL;
+		ALTQ_SQ_CNTR_DEC(ifsq, m->m_pkthdr.len);
+	}
+	return m;
+}
+
+static struct mbuf *
+ifsq_prio_dequeue(struct ifaltq_subque *ifsq)
+{
+	struct mbuf *m;
+
+	m = ifsq->ifsq_prio_head;
+	if (m != NULL) {
+		if ((ifsq->ifsq_prio_head = m->m_nextpkt) == NULL)
+			ifsq->ifsq_prio_tail = NULL;
+		m->m_nextpkt = NULL;
+		ALTQ_SQ_CNTR_DEC(ifsq, m->m_pkthdr.len);
+		ALTQ_SQ_PRIO_CNTR_DEC(ifsq, m->m_pkthdr.len);
+	}
+	return m;
+}
+
 int
 ifsq_classic_enqueue(struct ifaltq_subque *ifsq, struct mbuf *m,
     struct altq_pktattr *pa __unused)
@@ -2529,16 +2585,29 @@ ifsq_classic_enqueue(struct ifaltq_subque *ifsq, struct mbuf *m,
 	M_ASSERTPKTHDR(m);
 	if (ifsq->ifsq_len >= ifsq->ifsq_maxlen ||
 	    ifsq->ifsq_bcnt >= ifsq->ifsq_maxbcnt) {
+		if ((m->m_flags & M_PRIO) &&
+		    ifsq->ifsq_prio_len < (ifsq->ifsq_maxlen / 2) &&
+		    ifsq->ifsq_prio_bcnt < (ifsq->ifsq_maxbcnt / 2)) {
+			struct mbuf *m_drop;
+
+			/*
+			 * Perform drop-head on normal queue
+			 */
+			m_drop = ifsq_norm_dequeue(ifsq);
+			if (m_drop != NULL) {
+				m_freem(m_drop);
+				ifsq_prio_enqueue(ifsq, m);
+				return 0;
+			}
+			/* XXX nothing could be dropped? */
+		}
 		m_freem(m);
 		return ENOBUFS;
 	} else {
-		m->m_nextpkt = NULL;
-		if (ifsq->ifsq_tail == NULL)
-			ifsq->ifsq_head = m;
+		if (m->m_flags & M_PRIO)
+			ifsq_prio_enqueue(ifsq, m);
 		else
-			ifsq->ifsq_tail->m_nextpkt = m;
-		ifsq->ifsq_tail = m;
-		ALTQ_SQ_CNTR_INC(ifsq, m->m_pkthdr.len);
+			ifsq_norm_enqueue(ifsq, m);
 		return 0;
 	}
 }
@@ -2550,17 +2619,15 @@
 
 	switch (op) {
 	case ALTDQ_POLL:
-		m = ifsq->ifsq_head;
+		m = ifsq->ifsq_prio_head;
+		if (m == NULL)
+			m = ifsq->ifsq_norm_head;
 		break;
 
 	case ALTDQ_REMOVE:
-		m = ifsq->ifsq_head;
-		if (m != NULL) {
-			if ((ifsq->ifsq_head = m->m_nextpkt) == NULL)
-				ifsq->ifsq_tail = NULL;
-			m->m_nextpkt = NULL;
-			ALTQ_SQ_CNTR_DEC(ifsq, m->m_pkthdr.len);
-		}
+		m = ifsq_prio_dequeue(ifsq);
+		if (m == NULL)
+			m = ifsq_norm_dequeue(ifsq);
 		break;
 
 	default:
diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c
index 36df3ea644..4bf3688eaa 100644
--- a/sys/netinet/tcp_output.c
+++ b/sys/netinet/tcp_output.c
@@ -139,6 +139,10 @@ int tcp_autosndbuf_max = 2*1024*1024;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_RW,
     &tcp_autosndbuf_max, 0, "Max size of automatic send buffer");
 
+int tcp_prio_synack = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, prio_synack, CTLFLAG_RW,
+    &tcp_prio_synack, 0, "Prioritize SYN, SYN|ACK and pure ACK");
+
 static int tcp_idle_cwv = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, idle_cwv, CTLFLAG_RW,
     &tcp_idle_cwv, 0,
@@ -934,6 +938,13 @@ send:
 		else
 			m->m_data += max_linkhdr;
 		m->m_len = hdrlen;
+
+		/*
+		 * Prioritize SYN, SYN|ACK and pure ACK.
+		 * Leave FIN and RST as they are.
+		 */
+		if (tcp_prio_synack && (flags & (TH_FIN | TH_RST)) == 0)
+			m->m_flags |= M_PRIO;
 	}
 	m->m_pkthdr.rcvif = NULL;
 	if (isipv6) {
diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c
index 089c5e97bd..e673cac7f3 100644
--- a/sys/netinet/tcp_syncache.c
+++ b/sys/netinet/tcp_syncache.c
@@ -1200,6 +1200,8 @@ syncache_respond(struct syncache *sc, struct mbuf *m)
 	m->m_len = tlen;
 	m->m_pkthdr.len = tlen;
 	m->m_pkthdr.rcvif = NULL;
+	if (tcp_prio_synack)
+		m->m_flags |= M_PRIO;
 
 	if (isipv6) {
 		ip6 = mtod(m, struct ip6_hdr *);
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index 85c155bf9a..90922ec349 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -90,6 +90,7 @@ extern	int tcp_do_rfc6675;
 extern	int tcp_rfc6675_rxt;
 extern	int tcp_aggregate_acks;
 extern	int tcp_eifel_rtoinc;
+extern	int tcp_prio_synack;
 
 /* TCP segment queue entry */
 struct tseg_qent {
diff --git a/sys/netproto/802_11/ieee80211_dragonfly.h b/sys/netproto/802_11/ieee80211_dragonfly.h
index 75f2a4ec92..5a5310502a 100644
--- a/sys/netproto/802_11/ieee80211_dragonfly.h
+++ b/sys/netproto/802_11/ieee80211_dragonfly.h
@@ -41,11 +41,12 @@
 
 #ifndef IF_PREPEND_LIST
 
+/* XXX all are prepended to normal queue */
 #define _IF_PREPEND_LIST(ifq, mhead, mtail, mcount, bcnt) do { \
-	(mtail)->m_nextpkt = (ifq)->ifsq_head; \
-	if ((ifq)->ifsq_tail == NULL) \
-		(ifq)->ifsq_tail = (mtail); \
-	(ifq)->ifsq_head = (mhead); \
+	(mtail)->m_nextpkt = (ifq)->ifsq_norm_head; \
+	if ((ifq)->ifsq_norm_tail == NULL) \
+		(ifq)->ifsq_norm_tail = (mtail); \
+	(ifq)->ifsq_norm_head = (mhead); \
 	(ifq)->ifsq_len += (mcount); \
 	(ifq)->ifsq_bcnt += (bcnt); \
 } while (0)
diff --git a/sys/netproto/802_11/wlan/ieee80211_dragonfly.c b/sys/netproto/802_11/wlan/ieee80211_dragonfly.c
index 922ab44ba8..0f0d95c8d8 100644
--- a/sys/netproto/802_11/wlan/ieee80211_dragonfly.c
+++ b/sys/netproto/802_11/wlan/ieee80211_dragonfly.c
@@ -419,12 +419,37 @@ ieee80211_flush_ifq(struct ifaltq *ifq, struct ieee80211vap *vap)
 
 	ALTQ_SQ_LOCK(ifsq);
-	mprev = &ifsq->ifsq_head;
+	/*
+	 * Fix normal queue
+	 */
+	mprev = &ifsq->ifsq_norm_head;
+	while ((m = *mprev) != NULL) {
+		ni = (struct ieee80211_node *)m->m_pkthdr.rcvif;
+		if (ni != NULL && ni->ni_vap == vap) {
+			*mprev = m->m_nextpkt;		/* remove from list */
+			ALTQ_SQ_CNTR_DEC(ifsq, m->m_pkthdr.len);
+
+			m_freem(m);
+			ieee80211_free_node(ni);	/* reclaim ref */
+		} else
+			mprev = &m->m_nextpkt;
+	}
+	/* recalculate tail ptr */
+	m = ifsq->ifsq_norm_head;
+	for (; m != NULL && m->m_nextpkt != NULL; m = m->m_nextpkt)
+		;
+	ifsq->ifsq_norm_tail = m;
+
+	/*
+	 * Fix priority queue
+	 */
+	mprev = &ifsq->ifsq_prio_head;
 	while ((m = *mprev) != NULL) {
 		ni = (struct ieee80211_node *)m->m_pkthdr.rcvif;
 		if (ni != NULL && ni->ni_vap == vap) {
 			*mprev = m->m_nextpkt;		/* remove from list */
 			ALTQ_SQ_CNTR_DEC(ifsq, m->m_pkthdr.len);
+			ALTQ_SQ_PRIO_CNTR_DEC(ifsq, m->m_pkthdr.len);
 
 			m_freem(m);
 			ieee80211_free_node(ni);	/* reclaim ref */
@@ -432,10 +457,10 @@ ieee80211_flush_ifq(struct ifaltq *ifq, struct ieee80211vap *vap)
 		mprev = &m->m_nextpkt;
 	}
 	/* recalculate tail ptr */
-	m = ifsq->ifsq_head;
+	m = ifsq->ifsq_prio_head;
 	for (; m != NULL && m->m_nextpkt != NULL; m = m->m_nextpkt)
 		;
-	ifsq->ifsq_tail = m;
+	ifsq->ifsq_prio_tail = m;
 
 	ALTQ_SQ_UNLOCK(ifsq);
 }
diff --git a/sys/sys/mbuf.h b/sys/sys/mbuf.h
index 43587e057d..077c4d7862 100644
--- a/sys/sys/mbuf.h
+++ b/sys/sys/mbuf.h
@@ -261,6 +261,7 @@ struct mbuf {
 #define	M_PROTO7	0x400000/* protocol-specific */
 #define	M_PROTO8	0x800000/* protocol-specific */
 #define	M_CKHASH	0x1000000/* hash needs software verification */
+#define	M_PRIO		0x2000000/* high priority mbuf */
 
 /*
  * Flags copied when copying m_pkthdr.
@@ -269,7 +270,7 @@ struct mbuf {
     M_PROTO4|M_PROTO5|M_PROTO6|M_PROTO7|M_PROTO8 | \
     M_BCAST|M_MCAST|M_FRAG|M_FIRSTFRAG|M_LASTFRAG | \
     M_VLANTAG|M_MPLSLABELED | \
-    M_LENCHECKED|M_HASH|M_CKHASH)
+    M_LENCHECKED|M_HASH|M_CKHASH|M_PRIO)
 
 /*
  * Flags indicating hw checksum support and sw checksum requirements.
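As a closing illustration (not part of the commit), the M_PRIO marking
rule added to tcp_output() and syncache_respond() can be exercised in a
few lines of userland C.  The TH_* values are the standard ones from
<netinet/tcp.h>; is_prio() is a made-up helper that mirrors the test in
the patch.

#include <stdio.h>

#define TH_FIN  0x01
#define TH_SYN  0x02
#define TH_RST  0x04
#define TH_ACK  0x10

static int tcp_prio_synack = 1;         /* models the new sysctl */

/* Mirrors: tcp_prio_synack && (flags & (TH_FIN | TH_RST)) == 0 */
static int
is_prio(int thflags)
{
        return (tcp_prio_synack && (thflags & (TH_FIN | TH_RST)) == 0);
}

int
main(void)
{
        static const struct {
                const char *name;
                int flags;
        } seg[] = {
                { "SYN",        TH_SYN },
                { "SYN|ACK",    TH_SYN | TH_ACK },
                { "pure ACK",   TH_ACK },
                { "FIN|ACK",    TH_FIN | TH_ACK },
                { "RST",        TH_RST },
        };
        int i;

        for (i = 0; i < (int)(sizeof(seg) / sizeof(seg[0])); i++) {
                printf("%-8s -> %s\n", seg[i].name,
                    is_prio(seg[i].flags) ? "M_PRIO" : "normal");
        }
        return (0);
}

SYN, SYN|ACK and pure ACK land in the high priority queue; FIN and RST
carrying segments stay in the normal queue.  Setting the sysctl to 0
(sysctl net.inet.tcp.prio_synack=0) disables the marking entirely,
which corresponds to the prio_synack=0 rows in the tables above.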