vkernel - Fix deadlocks with cothread locks
authorMatthew Dillon <dillon@apollo.backplane.com>
Sat, 28 Aug 2010 00:12:01 +0000 (17:12 -0700)
committerMatthew Dillon <dillon@apollo.backplane.com>
Sat, 28 Aug 2010 00:12:01 +0000 (17:12 -0700)
* Callbacks into the main kernel are not allowed when holding a cothread
  lock as this could cause a deadlock between cpus.  If the callback into
  the kernel blocks, the cothread lock, being a pthreads lock,
  remains locked.

* Refactor the network and disk pipeline to hold the cothread lock for
  a much shorter period of time, allowing data to be pipelined without
  any stall conditions.

* For vknet if the tx mbuf return fifo is full we wait until it isn't.

sys/dev/virtual/disk/vdisk.c
sys/dev/virtual/net/if_vke.c
sys/kern/kern_intr.c
sys/platform/vkernel/platform/cothread.c
sys/platform/vkernel64/platform/cothread.c

index 4f0291e..70f225f 100644 (file)
@@ -187,17 +187,26 @@ vkd_io_intr(cothread_t cotd)
 {
        struct vkd_softc *sc;
        struct bio *bio;
+       TAILQ_HEAD(, bio) tmpq;
 
        sc = cotd->arg;
 
+       /*
+        * We can't call back into the kernel while holding cothread!
+        */
+       TAILQ_INIT(&tmpq);
        cothread_lock(cotd, 0);
-       while (!TAILQ_EMPTY(&sc->cotd_done)) {
-               bio = TAILQ_FIRST(&sc->cotd_done);
+       while ((bio = TAILQ_FIRST(&sc->cotd_done)) != NULL) {
                TAILQ_REMOVE(&sc->cotd_done, bio, bio_act);
+               TAILQ_INSERT_TAIL(&tmpq, bio, bio_act);
+       }
+       cothread_unlock(cotd, 0);
+
+       while ((bio = TAILQ_FIRST(&tmpq)) != NULL) {
+               TAILQ_REMOVE(&tmpq, bio, bio_act);
                devstat_end_transaction_buf(&sc->stats, bio->bio_buf);
                biodone(bio);
        }
-       cothread_unlock(sc->cotd, 0);
 }
 
 /*
index 1c24c3e..8d4336d 100644 (file)
@@ -155,8 +155,9 @@ vke_txfifo_done_enqueue(struct vke_softc *sc, struct mbuf *m)
 {
        fifo_t fifo = sc->sc_txfifo_done;
 
-       if (NETFIFOINDEX(fifo->windex + 1) == NETFIFOINDEX(fifo->rindex))
-               return (-1);
+       while (NETFIFOINDEX(fifo->windex + 1) == NETFIFOINDEX(fifo->rindex)) {
+               usleep(20000);
+       }
 
        fifo->array[NETFIFOINDEX(fifo->windex)] = m;
        cpu_sfence();
@@ -184,21 +185,18 @@ vke_txfifo_done_dequeue(struct vke_softc *sc, struct mbuf *nm)
 }
 
 /*
- * vke_txfifo_enqueue() - Add an mbuf to the transmit fifo.  Wake up the
- * cothread via cothread_signal().
+ * vke_txfifo_enqueue() - Add an mbuf to the transmit fifo.
  */
 static int
 vke_txfifo_enqueue(struct vke_softc *sc, struct mbuf *m)
 {
        fifo_t fifo = sc->sc_txfifo;
-       cothread_t cotd = sc->cotd_tx;
 
        if (NETFIFOINDEX(fifo->windex + 1) == NETFIFOINDEX(fifo->rindex))
                return (-1);
 
        fifo->array[NETFIFOINDEX(fifo->windex)] = m;
        cpu_sfence();
-       cothread_signal(cotd);
        ++fifo->windex;
 
        return (0);
@@ -225,6 +223,16 @@ vke_txfifo_dequeue(struct vke_softc *sc)
        return (m);
 }
 
+static int
+vke_txfifo_empty(struct vke_softc *sc)
+{
+       fifo_t fifo = sc->sc_txfifo;
+
+       if (NETFIFOINDEX(fifo->rindex) == NETFIFOINDEX(fifo->windex))
+               return (1);
+       return(0);
+}
+
 /*
  * vke_rxfifo_dequeue() - Return next mbuf on the receice fifo if one
  * exists replacing it with newm which should point to a newly allocated
@@ -272,7 +280,6 @@ vke_init(void *xsc)
 
        ASSERT_SERIALIZED(ifp->if_serializer);
 
-
        vke_stop(sc);
 
        ifp->if_flags |= IFF_RUNNING;
@@ -311,6 +318,13 @@ vke_init(void *xsc)
 
 }
 
+/*
+ * Called from kernel.
+ *
+ * NOTE: We can't make any kernel callbacks while holding cothread lock
+ *      because the cothread lock is not governed by the kernel scheduler
+ *      (so mplock, tokens, etc will not bbe released).
+ */
 static void
 vke_start(struct ifnet *ifp)
 {
@@ -324,25 +338,24 @@ vke_start(struct ifnet *ifp)
        if ((ifp->if_flags & (IFF_RUNNING | IFF_OACTIVE)) != IFF_RUNNING)
                return;
 
-       cothread_lock(cotd, 0);
        count = 0;
-
        while ((m = ifq_dequeue(&ifp->if_snd, NULL)) != NULL) {
                if (vke_txfifo_enqueue(sc, m) != -1) {
                        if (count++ == VKE_CHUNK) {
+                               cothread_lock(cotd, 0);
                                cothread_signal(cotd);
+                               cothread_unlock(cotd, 0);
                                count = 0;
                        }
                } else {
                        m_freem(m);
                }
        }
-
        if (count) {
+               cothread_lock(cotd, 0);
                cothread_signal(cotd);
+               cothread_unlock(cotd, 0);
        }
-
-       cothread_unlock(cotd, 0);
 }
 
 static int
@@ -480,6 +493,7 @@ vke_rx_intr(cothread_t cotd)
                ifnet_deserialize_all(ifp);
                return;
        }
+       cothread_unlock(cotd, 0);
 
        while ((m = vke_rxfifo_sniff(sc)) != NULL) {
                nm = m_getcl(MB_DONTWAIT, MT_DATA, M_PKTHDR);
@@ -487,7 +501,9 @@ vke_rx_intr(cothread_t cotd)
                        vke_rxfifo_dequeue(sc, nm);
                        ifp->if_input(ifp, m);
                        if (count++ == VKE_CHUNK) {
+                               cothread_lock(cotd, 0);
                                cothread_signal(cotd);
+                               cothread_unlock(cotd, 0);
                                count = 0;
                        }
                } else {
@@ -495,10 +511,11 @@ vke_rx_intr(cothread_t cotd)
                }
        }
 
-       if (count)
+       if (count) {
+               cothread_lock(cotd, 0);
                cothread_signal(cotd);
-
-       cothread_unlock(cotd, 0);
+               cothread_unlock(cotd, 0);
+       }
        ifnet_deserialize_all(ifp);
 }
 
@@ -515,22 +532,23 @@ vke_tx_intr(cothread_t cotd)
 
        ifnet_serialize_all(ifp);
        cothread_lock(cotd, 0);
-
        if (sc->cotd_tx_exit != VKE_COTD_RUN) {
                cothread_unlock(cotd, 0);
                ifnet_deserialize_all(ifp);
                return;
        }
+       cothread_unlock(cotd, 0);
 
-       if ((ifp->if_flags & IFF_RUNNING) == 0)
-               ifp->if_start(ifp);
-
-       /* Free TX mbufs that have been processed */
+       /*
+        * Free TX mbufs that have been processed before starting new
+        * ones, to be pipeline friendly.
+        */
        while ((m = vke_txfifo_done_dequeue(sc, NULL)) != NULL) {
                m_freem(m);
        }
 
-       cothread_unlock(cotd, 0);
+       if ((ifp->if_flags & IFF_RUNNING) == 0)
+               ifp->if_start(ifp);
 
        ifnet_deserialize_all(ifp);
 }
@@ -544,10 +562,11 @@ vke_rx_thread(cothread_t cotd)
        struct mbuf *m;
        struct vke_softc *sc = cotd->arg;
        struct ifnet *ifp = &sc->arpcom.ac_if;
-       int count;
        fifo_t fifo = sc->sc_rxfifo;
        fd_set fdset;
        struct timeval tv;
+       int count;
+       int n;
 
        /* Select timeout cannot be infinite since we need to check for
         * the exit flag sc->cotd_rx_exit.
@@ -556,58 +575,52 @@ vke_rx_thread(cothread_t cotd)
        tv.tv_usec = 500000;
 
        FD_ZERO(&fdset);
+       count = 0;
 
-       cothread_lock(cotd, 1);
+       while (sc->cotd_rx_exit == VKE_COTD_RUN) {
+               /*
+                * Wait for the RX FIFO to be loaded with
+                * empty mbufs.
+                */
+               if (NETFIFOINDEX(fifo->windex + 1) ==
+                   NETFIFOINDEX(fifo->rindex)) {
+                       usleep(20000);
+                       continue;
+               }
 
-       for (;;) {
-               int n;
-               count = 0;
-               while (sc->cotd_rx_exit == VKE_COTD_RUN) {
-                       /* Wait for the RX FIFO to drain */
-                       while (NETFIFOINDEX(fifo->windex + 1) ==
-                                       NETFIFOINDEX(fifo->rindex)) {
-                                       usleep(20000);
+               /*
+                * Load data into the rx fifo
+                */
+               m = fifo->array[NETFIFOINDEX(fifo->windex)];
+               if (m == NULL)
+                       continue;
+               n = read(sc->sc_fd, mtod(m, void *), MCLBYTES);
+               if (n > 0) {
+                       ifp->if_ipackets++;
+                       m->m_pkthdr.rcvif = ifp;
+                       m->m_pkthdr.len = m->m_len = n;
+                       cpu_sfence();
+                       ++fifo->windex;
+                       if (count++ == VKE_CHUNK) {
+                               cothread_intr(cotd);
+                               count = 0;
                        }
-
-                       if ((m = fifo->array[NETFIFOINDEX(fifo->windex)]) !=
-                                       NULL) {
-                               cothread_unlock(cotd, 1);
-                               n = read(sc->sc_fd, mtod(m, void *), MCLBYTES);
-                               cothread_lock(cotd, 1);
-                               if (n <= 0)
-                                       break;
-                               ifp->if_ipackets++;
-                               m->m_pkthdr.rcvif = ifp;
-                               m->m_pkthdr.len = m->m_len = n;
-                               ++fifo->windex;
-                               cpu_sfence();
-                               if (count++ == VKE_CHUNK) {
-                                       cothread_intr(cotd);
-                                       count = 0;
-                               }
+               } else {
+                       if (count) {
+                               cothread_intr(cotd);
+                               count = 0;
                        }
-               }
+                       FD_SET(sc->sc_fd, &fdset);
 
-               if (count) {
-                       cothread_intr(cotd);
+                       if (select(sc->sc_fd + 1, &fdset, NULL, NULL, &tv) == -1) {
+                               kprintf(VKE_DEVNAME "%d: select failed for "
+                                       "TAP device\n", sc->sc_unit);
+                               usleep(1000000);
+                       }
                }
-
-               if (sc->cotd_rx_exit != VKE_COTD_RUN)
-                       break;
-
-               cothread_unlock(cotd, 1);
-
-               /* Set up data for select() call */
-               FD_SET(sc->sc_fd, &fdset);
-
-               if (select(sc->sc_fd + 1, &fdset, NULL, NULL, &tv) == -1)
-                       kprintf(VKE_DEVNAME "%d: select failed for TAP device\n", sc->sc_unit);
-
-               cothread_lock(cotd, 1);
        }
-
+       cpu_sfence();
        sc->cotd_rx_exit = VKE_COTD_DEAD;
-       cothread_unlock(cotd, 1);
 }
 
 /*
@@ -621,40 +634,41 @@ vke_tx_thread(cothread_t cotd)
        struct ifnet *ifp = &sc->arpcom.ac_if;
        int count = 0;
 
-       cothread_lock(cotd, 1);
-
        while (sc->cotd_tx_exit == VKE_COTD_RUN) {
-               /* Write outgoing packets to the TAP interface */
-               while ((m = vke_txfifo_dequeue(sc)) != NULL) {
+               /*
+                * Write outgoing packets to the TAP interface
+                */
+               m = vke_txfifo_dequeue(sc);
+               if (m) {
                        if (m->m_pkthdr.len <= MCLBYTES) {
                                m_copydata(m, 0, m->m_pkthdr.len, sc->sc_txbuf);
                                sc->sc_txbuf_len = m->m_pkthdr.len;
-                               cothread_unlock(cotd, 1);
 
-                               if (write(sc->sc_fd, sc->sc_txbuf, sc->sc_txbuf_len) < 0) {
-                                       cothread_lock(cotd, 1);
+                               if (write(sc->sc_fd, sc->sc_txbuf,
+                                         sc->sc_txbuf_len) < 0) {
                                        ifp->if_oerrors++;
                                } else {
-                                       cothread_lock(cotd, 1);
-                                       vke_txfifo_done_enqueue(sc, m);
                                        ifp->if_opackets++;
-                                       if (count++ == VKE_CHUNK) {
-                                               cothread_intr(cotd);
-                                               count = 0;
-                                       }
                                }
                        }
+                       if (count++ == VKE_CHUNK) {
+                               cothread_intr(cotd);
+                               count = 0;
+                       }
+                       vke_txfifo_done_enqueue(sc, m);
+               } else {
+                       if (count) {
+                               cothread_intr(cotd);
+                               count = 0;
+                       }
+                       cothread_lock(cotd, 1);
+                       if (vke_txfifo_empty(sc))
+                               cothread_wait(cotd);
+                       cothread_unlock(cotd, 1);
                }
-
-               if (count) {
-                       cothread_intr(cotd);
-               }
-
-               cothread_wait(cotd);    /* interlocks cothread lock */
        }
-       /* NOT REACHED */
+       cpu_sfence();
        sc->cotd_tx_exit = VKE_COTD_DEAD;
-       cothread_unlock(cotd, 1);
 }
 
 static int
index a63bcc7..c0a46dd 100644 (file)
@@ -720,8 +720,7 @@ ithread_fast_handler(struct intrframe *frame)
 /*
  * Interrupt threads run this as their main loop.
  *
- * The handler begins execution outside a critical section and with the BGL
- * held.
+ * The handler begins execution outside a critical section and no MP lock.
  *
  * The i_running state starts at 0.  When an interrupt occurs, the hardware
  * interrupt is disabled and sched_ithd() The HW interrupt remains disabled
index c432de5..b74b8ee 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2008 The DragonFly Project.  All rights reserved.
+ * Copyright (c) 2008-2010 The DragonFly Project.  All rights reserved.
  *
  * This code is derived from software contributed to The DragonFly Project
  * by Matthew Dillon <dillon@backplane.com>
@@ -150,17 +150,17 @@ cothread_intr(cothread_t cotd)
 
 /*
  * Called by the vkernel to wakeup a cothread.
+ * The cothread must be locked.
  */
 void
 cothread_signal(cothread_t cotd)
 {
-       crit_enter();
        pthread_cond_signal(&cotd->cond);
-       crit_exit();
 }
 
 /*
  * Called by the cothread to wait for the vkernel to call cothread_signal().
+ * The cothread must be locked.
  */
 void
 cothread_wait(cothread_t cotd)
@@ -170,6 +170,13 @@ cothread_wait(cothread_t cotd)
 
 /*
  * Typically called by kernel thread or cothread
+ *
+ * These must be a matched pair.  We will acquire a critical
+ * section in cothread_lock() and release it in cothread_unlock().
+ *
+ * We do this to simplify cothread operation to prevent an
+ * interrupt (e.g. vkd_io_intr()) from preempting a vkd_strategy()
+ * call and creating a recursion in the pthread.
  */
 void
 cothread_lock(cothread_t cotd, int is_cotd)
@@ -177,9 +184,8 @@ cothread_lock(cothread_t cotd, int is_cotd)
        if (is_cotd) {
                pthread_mutex_lock(&cotd->mutex);
        } else {
-               crit_enter();
+               crit_enter_id("cothread");
                pthread_mutex_lock(&cotd->mutex);
-               crit_exit();
        }
 }
 
@@ -189,9 +195,8 @@ cothread_unlock(cothread_t cotd, int is_cotd)
        if (is_cotd) {
                pthread_mutex_unlock(&cotd->mutex);
        } else {
-               crit_enter();
                pthread_mutex_unlock(&cotd->mutex);
-               crit_exit();
+               crit_exit_id("cothread");
        }
 }
 
index dc81faa..58ea26d 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2008 The DragonFly Project.  All rights reserved.
+ * Copyright (c) 2008-2010 The DragonFly Project.  All rights reserved.
  *
  * This code is derived from software contributed to The DragonFly Project
  * by Matthew Dillon <dillon@backplane.com>
@@ -150,17 +150,17 @@ cothread_intr(cothread_t cotd)
 
 /*
  * Called by the vkernel to wakeup a cothread.
+ * The cothread must be locked.
  */
 void
 cothread_signal(cothread_t cotd)
 {
-       crit_enter();
        pthread_cond_signal(&cotd->cond);
-       crit_exit();
 }
 
 /*
  * Called by the cothread to wait for the vkernel to call cothread_signal().
+ * The cothread must be locked.
  */
 void
 cothread_wait(cothread_t cotd)
@@ -170,6 +170,13 @@ cothread_wait(cothread_t cotd)
 
 /*
  * Typically called by kernel thread or cothread
+ *
+ * These must be a matched pair.  We will acquire a critical
+ * section in cothread_lock() and release it in cothread_unlock().
+ *
+ * We do this to simplify cothread operation to prevent an
+ * interrupt (e.g. vkd_io_intr()) from preempting a vkd_strategy()
+ * call and creating a recursion in the pthread.
  */
 void
 cothread_lock(cothread_t cotd, int is_cotd)
@@ -177,9 +184,8 @@ cothread_lock(cothread_t cotd, int is_cotd)
        if (is_cotd) {
                pthread_mutex_lock(&cotd->mutex);
        } else {
-               crit_enter();
+               crit_enter_id("cothread");
                pthread_mutex_lock(&cotd->mutex);
-               crit_exit();
        }
 }
 
@@ -189,8 +195,7 @@ cothread_unlock(cothread_t cotd, int is_cotd)
        if (is_cotd) {
                pthread_mutex_unlock(&cotd->mutex);
        } else {
-               crit_enter();
                pthread_mutex_unlock(&cotd->mutex);
-               crit_exit();
+               crit_exit_id("cothread");
        }
 }