kernel - Major MPSAFE Infrastructure 2
author Matthew Dillon <dillon@apollo.backplane.com>
Fri, 27 Aug 2010 08:55:46 +0000 (01:55 -0700)
committer Matthew Dillon <dillon@apollo.backplane.com>
Sat, 28 Aug 2010 00:06:41 +0000 (17:06 -0700)
* Refactor buffer cache code that assumed buffer contents remain stable
  across a non-blocking BUF_LOCK().  This is no longer true; the contents
  must be reverified after the BUF_LOCK() succeeds.

* Make setting and clearing B_DELWRI atomic with buffer reassignment.

* Release the cached mplock when looping in the scheduler and run
  splz_check() to avoid livelocking cpus.

* Refactor the mplock contention handling code to handle both mplock
  and token contention.  Generate a 2uS delay for all but one cpu to
  try to avoid livelocks (a minimal userland sketch of the scheme
  follows this list).

* Do not splz() from inside a spinlock; it will just panic.

* Fix the token description field for 'systat -pv 1'.

* Optimize MP_LOCK macros a bit.
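
The contention-mask scheme can be pictured with a short userland model.  This is an illustrative sketch only, not the kernel code: the atomic mask, __builtin_ctz() standing in for bsfl(), and a busy loop standing in for DELAY(2) are all assumptions made for the example; the real logic lives in handle_cpu_contention_mask() in the per-platform files below.

/*
 * Illustrative userland model of the cpu_contention_mask idea (not the
 * kernel code).  Each "cpu" that loses a contest sets its bit; every
 * cpu except the lowest-numbered contender backs off for roughly 2uS
 * so that one contender can make forward progress.
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_uint contention_mask;             /* one bit per cpu */

static void
set_contention(int cpu)
{
        atomic_fetch_or(&contention_mask, 1u << cpu);
}

static void
clr_contention(int cpu)
{
        atomic_fetch_and(&contention_mask, ~(1u << cpu));
}

static void
handle_contention(int cpu)
{
        unsigned int mask = atomic_load(&contention_mask);

        /* __builtin_ctz() stands in for bsfl(), the loop for DELAY(2) */
        if (mask != 0 && __builtin_ctz(mask) != cpu) {
                for (volatile int i = 0; i < 2000; ++i)
                        ;
        }
}

int
main(void)
{
        set_contention(1);
        set_contention(3);
        handle_contention(3);   /* cpu 3 backs off; cpu 1 owns the low bit */
        handle_contention(1);   /* cpu 1 returns immediately */
        clr_contention(1);
        clr_contention(3);
        printf("mask now %#x\n", atomic_load(&contention_mask));
        return 0;
}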

15 files changed:
sys/kern/kern_mplock.c
sys/kern/kern_spinlock.c
sys/kern/kern_umtx.c
sys/kern/lwkt_thread.c
sys/kern/lwkt_token.c
sys/kern/vfs_bio.c
sys/kern/vfs_subr.c
sys/kern/vfs_vm.c
sys/platform/pc32/i386/machdep.c
sys/platform/pc32/isa/clock.c
sys/platform/pc64/isa/clock.c
sys/platform/pc64/x86_64/machdep.c
sys/platform/vkernel/i386/cpu_regs.c
sys/platform/vkernel64/x86_64/cpu_regs.c
sys/sys/mplock2.h

diff --git a/sys/kern/kern_mplock.c b/sys/kern/kern_mplock.c
index 12c2705..ea5eac3 100644
@@ -85,7 +85,7 @@ KTR_INFO(KTR_GIANT_CONTENTION, giant, end, 1,
                file, line)
 
 int    mp_lock;
-int    mp_lock_contention_mask;
+int    cpu_contention_mask;
 const char *mp_lock_holder_file;       /* debugging */
 int    mp_lock_holder_line;            /* debugging */
 
@@ -220,7 +220,7 @@ yield_mplock(thread_t td)
 
 /*
  * The rel_mplock() code will call this function after releasing the
- * last reference on the MP lock if mp_lock_contention_mask is non-zero.
+ * last reference on the MP lock if cpu_contention_mask is non-zero.
  *
  * We then chain an IPI to a single other cpu potentially needing the
  * lock.  This is a bit heuristical and we can wind up with IPIs flying
@@ -240,7 +240,7 @@ lwkt_mp_lock_uncontested(void)
     if (chain_mplock) {
        gd = mycpu;
        clr_mplock_contention_mask(gd);
-       mask = mp_lock_contention_mask;
+       mask = cpu_contention_mask;
        tmpmask = ~((1 << gd->gd_cpuid) - 1);
 
        if (mask) {
@@ -248,7 +248,7 @@ lwkt_mp_lock_uncontested(void)
                    cpuid = bsfl(mask & tmpmask);
            else
                    cpuid = bsfl(mask);
-           atomic_clear_int(&mp_lock_contention_mask, 1 << cpuid);
+           atomic_clear_int(&cpu_contention_mask, 1 << cpuid);
            dgd = globaldata_find(cpuid);
            lwkt_send_ipiq(dgd, lwkt_mp_lock_uncontested_remote, NULL);
        }
diff --git a/sys/kern/kern_spinlock.c b/sys/kern/kern_spinlock.c
index fbb1c4f..c44a850 100644
@@ -319,7 +319,6 @@ exponential_backoff(struct exponential_backoff *bo)
 #endif
                if (bo->nsec == 60)
                        panic("spin_lock: %p, indefinite wait!\n", bo->mtx);
-               splz();
                bo->base = count;
        }
        return (FALSE);
diff --git a/sys/kern/kern_umtx.c b/sys/kern/kern_umtx.c
index cc07959..2705e8c 100644
@@ -172,7 +172,9 @@ done:
 static void
 umtx_sleep_page_action_cow(vm_page_t m, vm_page_action_t action)
 {
+    lwkt_gettoken(&vm_token);
     wakeup_domain(action->data, PDOMAIN_UMTX);
+    lwkt_reltoken(&vm_token);
 }
 
 /*
diff --git a/sys/kern/lwkt_thread.c b/sys/kern/lwkt_thread.c
index eb6d553..346ffb0 100644
@@ -576,7 +576,7 @@ lwkt_switch(void)
      * (but, of course, another cpu may own or release the lock so the
      * actual value of mp_lock is not stable).
      */
-    mpheld = MP_LOCK_HELD();
+    mpheld = MP_LOCK_HELD(gd);
 #ifdef INVARIANTS
     if (td->td_cscount) {
        kprintf("Diagnostic: attempt to switch while mastering cpusync: %p\n",
@@ -656,10 +656,15 @@ lwkt_switch(void)
                if (gd->gd_trap_nesting_level == 0 && panicstr == NULL)
                    panic("Idle thread %p was holding the BGL!", ntd);
                if (mpheld == 0) {
+                   set_cpu_contention_mask(gd);
+                   handle_cpu_contention_mask();
+                   cpu_try_mplock();
+                   mpheld = MP_LOCK_HELD(gd);
                    cpu_pause();
                    continue;
                }
            }
+           clr_cpu_contention_mask(gd);
 #endif
            cpu_time.cp_msg[0] = 0;
            cpu_time.cp_stallpc = 0;
@@ -679,7 +684,7 @@ lwkt_switch(void)
            (!TD_TOKS_HELD(ntd) || lwkt_getalltokens(ntd, &lmsg, &laddr))
        ) {
 #ifdef SMP
-           clr_mplock_contention_mask(gd);
+           clr_cpu_contention_mask(gd);
 #endif
            goto havethread;
        }
@@ -688,8 +693,10 @@ lwkt_switch(void)
        laddr = NULL;
 
 #ifdef SMP
+       if (ntd->td_fairq_accum >= 0)
+               set_cpu_contention_mask(gd);
        /* Reload mpheld (it become stale after mplock/token ops) */
-       mpheld = MP_LOCK_HELD();
+       mpheld = MP_LOCK_HELD(gd);
        if (ntd->td_mpcount && mpheld == 0) {
            lmsg = "mplock";
            laddr = ntd->td_mplock_stallpc;
@@ -720,11 +727,19 @@ lwkt_switch(void)
             * nlast keeps track of the last element prior to any moves.
             */
            if (ntd->td_fairq_accum < 0) {
-               xtd = TAILQ_NEXT(ntd, td_threadq);
                lwkt_fairq_accumulate(gd, ntd);
                didaccumulate = 1;
+
+               /*
+                * Move to end
+                */
+               xtd = TAILQ_NEXT(ntd, td_threadq);
                TAILQ_REMOVE(&gd->gd_tdrunq, ntd, td_threadq);
                TAILQ_INSERT_TAIL(&gd->gd_tdrunq, ntd, td_threadq);
+
+               /*
+                * Set terminal element (nlast)
+                */
                if (nlast == NULL) {
                    nlast = ntd;
                    if (xtd == NULL)
@@ -747,13 +762,15 @@ lwkt_switch(void)
                ntd = &gd->gd_idlethread;
                ntd->td_flags |= TDF_IDLE_NOHLT;
 #ifdef SMP
-               set_mplock_contention_mask(gd);
-               cpu_mplock_contested();
                if (ntd->td_mpcount) {
-                   mpheld = MP_LOCK_HELD();
+                   mpheld = MP_LOCK_HELD(gd);
                    if (gd->gd_trap_nesting_level == 0 && panicstr == NULL)
                        panic("Idle thread %p was holding the BGL!", ntd);
                    if (mpheld == 0) {
+                       set_cpu_contention_mask(gd);
+                       handle_cpu_contention_mask();
+                       cpu_try_mplock();
+                       mpheld = MP_LOCK_HELD(gd);
                        cpu_pause();
                        break;          /* try again from the top, almost */
                    }
@@ -787,22 +804,48 @@ lwkt_switch(void)
                (!TD_TOKS_HELD(ntd) || lwkt_getalltokens(ntd, &lmsg, &laddr))
            ) {
 #ifdef SMP
-                   clr_mplock_contention_mask(gd);
+                   clr_cpu_contention_mask(gd);
 #endif
                    goto havethread;
            }
 #ifdef SMP
-           /* Reload mpheld (it become stale after mplock/token ops) */
-           mpheld = MP_LOCK_HELD();
+           if (ntd->td_fairq_accum >= 0)
+                   set_cpu_contention_mask(gd);
+           /*
+            * Reload mpheld (it become stale after mplock/token ops).
+            */
+           mpheld = MP_LOCK_HELD(gd);
            if (ntd->td_mpcount && mpheld == 0) {
                lmsg = "mplock";
                laddr = ntd->td_mplock_stallpc;
            }
-
            if (ntd->td_pri >= TDPRI_KERN_LPSCHED && ntd->td_fairq_accum >= 0)
                nquserok = 0;
 #endif
        }
+
+       /*
+        * All threads exhausted but we can loop due to a negative
+        * accumulator.
+        *
+        * While we are looping in the scheduler be sure to service
+        * any interrupts which were made pending due to our critical
+        * section, otherwise we could livelock (e.g.) IPIs.
+        *
+        * NOTE: splz can enter and exit the mplock so mpheld is
+        * stale after this call.
+        */
+       splz_check();
+
+#ifdef SMP
+       /*
+        * Our mplock can be cached and cause other cpus to livelock
+        * if we loop due to e.g. not being able to acquire tokens.
+        */
+       if (MP_LOCK_HELD(gd))
+           cpu_rel_mplock(gd->gd_cpuid);
+       mpheld = 0;
+#endif
     }
 
     /*
@@ -838,8 +881,8 @@ haveidle:
            ("priority problem in lwkt_switch %d %d", td->td_pri, ntd->td_pri));
 #ifdef SMP
     if (ntd->td_mpcount == 0 ) {
-       if (MP_LOCK_HELD())
-           cpu_rel_mplock();
+       if (MP_LOCK_HELD(gd))
+           cpu_rel_mplock(gd->gd_cpuid);
     } else {
        ASSERT_MP_LOCK_HELD(ntd);
     }
@@ -970,7 +1013,7 @@ lwkt_preempt(thread_t ntd, int critcount)
      * or not.
      */
     savecnt = td->td_mpcount;
-    mpheld = MP_LOCK_HELD();
+    mpheld = MP_LOCK_HELD(gd);
     ntd->td_mpcount += td->td_mpcount;
     if (mpheld == 0 && ntd->td_mpcount && !cpu_try_mplock()) {
        ntd->td_mpcount -= td->td_mpcount;
@@ -993,9 +1036,9 @@ lwkt_preempt(thread_t ntd, int critcount)
     KKASSERT(ntd->td_preempted && (td->td_flags & TDF_PREEMPT_DONE));
 #ifdef SMP
     KKASSERT(savecnt == td->td_mpcount);
-    mpheld = MP_LOCK_HELD();
+    mpheld = MP_LOCK_HELD(gd);
     if (mpheld && td->td_mpcount == 0)
-       cpu_rel_mplock();
+       cpu_rel_mplock(gd->gd_cpuid);
     else if (mpheld == 0 && td->td_mpcount)
        panic("lwkt_preempt(): MP lock was not held through");
 #endif
@@ -1102,7 +1145,7 @@ lwkt_user_yield(void)
      * has a chaining effect since if the interrupt is blocked, so is
      * the event, so normal scheduling will not pick up on the problem.
      */
-    if (mp_lock_contention_mask && td->td_mpcount) {
+    if (cpu_contention_mask && td->td_mpcount) {
        yield_mplock(td);
     }
 #endif
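
The comment block added to lwkt_switch() above carries the key reasoning: when the scheduler loops without finding a runnable thread it must service pending interrupt work (splz_check()) and drop any cached mplock, otherwise it can livelock itself or other cpus.  A rough userland analogue of that spin discipline, with invented names (ready, pending, drain_pending) standing in for the real machinery:

#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>

static atomic_int ready;        /* condition another thread will set */
static atomic_int pending;      /* stands in for work splz_check() drains */

/* stand-in for splz_check(): run whatever became pending while spinning */
static void
drain_pending(void)
{
        if (atomic_exchange(&pending, 0) != 0) {
                /* ... service the deferred event here ... */
        }
}

/*
 * Spin until 'ready' is set, but drain pending work and yield on every
 * pass.  A spin loop that does neither can starve the very event it is
 * waiting for, which is the livelock the scheduler change avoids.
 */
static void
wait_ready(void)
{
        while (!atomic_load(&ready)) {
                drain_pending();
                sched_yield();
        }
}

static void *
setter(void *arg)
{
        (void)arg;
        atomic_store(&pending, 1);
        atomic_store(&ready, 1);
        return NULL;
}

int
main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, setter, NULL);
        wait_ready();
        pthread_join(t, NULL);
        return 0;
}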
diff --git a/sys/kern/lwkt_token.c b/sys/kern/lwkt_token.c
index aed1be2..2597df5 100644
@@ -532,6 +532,7 @@ lwkt_token_init(lwkt_token_t tok, int mpsafe, const char *desc)
        tok->t_ref = NULL;
        tok->t_flags = mpsafe ? LWKT_TOKEN_MPSAFE : 0;
        tok->t_collisions = 0;
+       tok->t_desc = desc;
 }
 
 void
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index a0d7533..2ad6754 100644
@@ -1163,8 +1163,10 @@ bdirty(struct buf *bp)
        bp->b_flags &= ~B_RELBUF;
 
        if ((bp->b_flags & B_DELWRI) == 0) {
+               lwkt_gettoken(&bp->b_vp->v_token);
                bp->b_flags |= B_DELWRI;
                reassignbuf(bp);
+               lwkt_reltoken(&bp->b_vp->v_token);
 
                spin_lock_wr(&bufcspin);
                ++dirtybufcount;
@@ -1215,8 +1217,10 @@ void
 bundirty(struct buf *bp)
 {
        if (bp->b_flags & B_DELWRI) {
+               lwkt_gettoken(&bp->b_vp->v_token);
                bp->b_flags &= ~B_DELWRI;
                reassignbuf(bp);
+               lwkt_reltoken(&bp->b_vp->v_token);
 
                spin_lock_wr(&bufcspin);
                --dirtybufcount;
@@ -2535,6 +2539,9 @@ buf_daemon_hw(void)
  *     B_RELBUF may only be set by VFSs.  We do set B_AGE to indicate
  *     that we really want to try to get the buffer out and reuse it
  *     due to the write load on the machine.
+ *
+ *     We must lock the buffer in order to check its validity before we
+ *     can mess with its contents.  bufqspin isn't enough.
  */
 static int
 flushbufqueues(bufq_type_t q)
@@ -2548,65 +2555,72 @@ flushbufqueues(bufq_type_t q)
 
        bp = TAILQ_FIRST(&bufqueues[q]);
        while (bp) {
-               KASSERT((bp->b_flags & B_DELWRI),
-                       ("unexpected clean buffer %p", bp));
+               if ((bp->b_flags & B_DELWRI) == 0) {
+                       kprintf("Unexpected clean buffer %p\n", bp);
+                       bp = TAILQ_NEXT(bp, b_freelist);
+                       continue;
+               }
+               if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
+                       bp = TAILQ_NEXT(bp, b_freelist);
+                       continue;
+               }
+               KKASSERT(bp->b_qindex == q);
 
-               if (bp->b_flags & B_DELWRI) {
-                       if (bp->b_flags & B_INVAL) {
-                               if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
-                                       bp = TAILQ_NEXT(bp, b_freelist);
-                                       continue;
-                               }
-                               _bremfree(bp);
-                               spin_unlock_wr(&bufqspin);
-                               spun = 0;
-                               brelse(bp);
-                               ++r;
-                               break;
-                       }
-                       if (LIST_FIRST(&bp->b_dep) != NULL &&
-                           (bp->b_flags & B_DEFERRED) == 0 &&
-                           buf_countdeps(bp, 0)) {
-                               TAILQ_REMOVE(&bufqueues[q], bp, b_freelist);
-                               TAILQ_INSERT_TAIL(&bufqueues[q], bp,
-                                                 b_freelist);
-                               bp->b_flags |= B_DEFERRED;
-                               bp = TAILQ_FIRST(&bufqueues[q]);
-                               continue;
-                       }
+               /*
+                * Must recheck B_DELWRI after successfully locking
+                * the buffer.
+                */
+               if ((bp->b_flags & B_DELWRI) == 0) {
+                       BUF_UNLOCK(bp);
+                       bp = TAILQ_NEXT(bp, b_freelist);
+                       continue;
+               }
 
-                       /*
-                        * Only write it out if we can successfully lock
-                        * it.  If the buffer has a dependancy,
-                        * buf_checkwrite must also return 0 for us to
-                        * be able to initate the write.
-                        *
-                        * If the buffer is flagged B_ERROR it may be
-                        * requeued over and over again, we try to
-                        * avoid a live lock.
-                        *
-                        * NOTE: buf_checkwrite is MPSAFE.
-                        */
-                       if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
-                               spin_unlock_wr(&bufqspin);
-                               spun = 0;
-                               if (LIST_FIRST(&bp->b_dep) != NULL &&
-                                   buf_checkwrite(bp)) {
-                                       bremfree(bp);
-                                       brelse(bp);
-                               } else if (bp->b_flags & B_ERROR) {
-                                       tsleep(bp, 0, "bioer", 1);
-                                       bp->b_flags &= ~B_AGE;
-                                       vfs_bio_awrite(bp);
-                               } else {
-                                       bp->b_flags |= B_AGE;
-                                       vfs_bio_awrite(bp);
-                               }
-                               ++r;
-                               break;
-                       }
+               if (bp->b_flags & B_INVAL) {
+                       _bremfree(bp);
+                       spin_unlock_wr(&bufqspin);
+                       spun = 0;
+                       brelse(bp);
+                       ++r;
+                       break;
+               }
+
+               if (LIST_FIRST(&bp->b_dep) != NULL &&
+                   (bp->b_flags & B_DEFERRED) == 0 &&
+                   buf_countdeps(bp, 0)) {
+                       TAILQ_REMOVE(&bufqueues[q], bp, b_freelist);
+                       TAILQ_INSERT_TAIL(&bufqueues[q], bp, b_freelist);
+                       bp->b_flags |= B_DEFERRED;
+                       BUF_UNLOCK(bp);
+                       bp = TAILQ_FIRST(&bufqueues[q]);
+                       continue;
+               }
+
+               /*
+                * If the buffer has a dependancy, buf_checkwrite() must
+                * also return 0 for us to be able to initate the write.
+                *
+                * If the buffer is flagged B_ERROR it may be requeued
+                * over and over again, we try to avoid a live lock.
+                *
+                * NOTE: buf_checkwrite is MPSAFE.
+                */
+               spin_unlock_wr(&bufqspin);
+               spun = 0;
+
+               if (LIST_FIRST(&bp->b_dep) != NULL && buf_checkwrite(bp)) {
+                       bremfree(bp);
+                       brelse(bp);
+               } else if (bp->b_flags & B_ERROR) {
+                       tsleep(bp, 0, "bioer", 1);
+                       bp->b_flags &= ~B_AGE;
+                       vfs_bio_awrite(bp);
+               } else {
+                       bp->b_flags |= B_AGE;
+                       vfs_bio_awrite(bp);
                }
-               bp = TAILQ_NEXT(bp, b_freelist);
+               ++r;
+               break;
        }
        if (spun)
                spin_unlock_wr(&bufqspin);
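
The flushbufqueues() rewrite above is a concrete instance of the pattern the commit message opens with: take the non-blocking per-buffer lock first, then recheck B_DELWRI and the queue index under that lock, because neither bufqspin nor an unlocked flag test is authoritative on its own.  A minimal pthreads sketch of that discipline, with an invented struct item and dirty flag standing in for struct buf and B_DELWRI, and with the queue-level lock elided:

#include <pthread.h>
#include <stddef.h>
#include <stdio.h>

struct item {
        pthread_mutex_t lock;
        int             dirty;          /* stands in for B_DELWRI */
        struct item     *next;
};

/*
 * Flush the first item that is still dirty.  The unlocked test is only
 * advisory; the flag is trusted only after the per-item trylock succeeds.
 */
static struct item *
flush_one(struct item *head)
{
        struct item *ip;

        for (ip = head; ip != NULL; ip = ip->next) {
                if (!ip->dirty)                 /* advisory, unlocked check */
                        continue;
                if (pthread_mutex_trylock(&ip->lock) != 0)
                        continue;               /* busy, skip it */
                if (!ip->dirty) {               /* recheck while locked */
                        pthread_mutex_unlock(&ip->lock);
                        continue;
                }
                ip->dirty = 0;                  /* "write it out" */
                pthread_mutex_unlock(&ip->lock);
                return ip;
        }
        return NULL;
}

int
main(void)
{
        struct item b = { PTHREAD_MUTEX_INITIALIZER, 1, NULL };
        struct item a = { PTHREAD_MUTEX_INITIALIZER, 0, &b };

        printf("flushed %p (expected %p)\n",
               (void *)flush_one(&a), (void *)&b);
        return 0;
}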
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index 6694c92..f619030 100644
@@ -283,6 +283,7 @@ struct vinvalbuf_bp_info {
        int slptimeo;
        int lkflags;
        int flags;
+       int clean;
 };
 
 int
@@ -331,9 +332,11 @@ vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo)
         */
        while (!RB_EMPTY(&vp->v_rbclean_tree) || 
               !RB_EMPTY(&vp->v_rbdirty_tree)) {
+               info.clean = 1;
                error = RB_SCAN(buf_rb_tree, &vp->v_rbclean_tree, NULL,
                                vinvalbuf_bp, &info);
                if (error == 0) {
+                       info.clean = 0;
                        error = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, NULL,
                                        vinvalbuf_bp, &info);
                }
@@ -376,8 +379,10 @@ vinvalbuf_bp(struct buf *bp, void *data)
        int error;
 
        if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
+               atomic_add_int(&bp->b_refs, 1);
                error = BUF_TIMELOCK(bp, info->lkflags,
                                     "vinvalbuf", info->slptimeo);
+               atomic_subtract_int(&bp->b_refs, 1);
                if (error == 0) {
                        BUF_UNLOCK(bp);
                        error = ENOLCK;
@@ -386,32 +391,32 @@ vinvalbuf_bp(struct buf *bp, void *data)
                        return(0);
                return (-error);
        }
-
        KKASSERT(bp->b_vp == info->vp);
 
        /*
-        * XXX Since there are no node locks for NFS, I
-        * believe there is a slight chance that a delayed
-        * write will occur while sleeping just above, so
-        * check for it.  Note that vfs_bio_awrite expects
-        * buffers to reside on a queue, while bwrite() and
-        * brelse() do not.
+        * Must check clean/dirty status after successfully locking as
+        * it may race.
+        */
+       if ((info->clean && (bp->b_flags & B_DELWRI)) ||
+           (info->clean == 0 && (bp->b_flags & B_DELWRI) == 0)) {
+               BUF_UNLOCK(bp);
+               return(0);
+       }
+
+       /*
+        * Note that vfs_bio_awrite expects buffers to reside
+        * on a queue, while bwrite() and brelse() do not.
         *
         * NOTE:  NO B_LOCKED CHECK.  Also no buf_checkwrite()
         * check.  This code will write out the buffer, period.
         */
        if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
            (info->flags & V_SAVE)) {
-               if (bp->b_vp == info->vp) {
-                       if (bp->b_flags & B_CLUSTEROK) {
-                               vfs_bio_awrite(bp);
-                       } else {
-                               bremfree(bp);
-                               bawrite(bp);
-                       }
+               if (bp->b_flags & B_CLUSTEROK) {
+                       vfs_bio_awrite(bp);
                } else {
                        bremfree(bp);
-                       bwrite(bp);
+                       bawrite(bp);
                }
        } else if (info->flags & V_SAVE) {
                /*
@@ -442,10 +447,16 @@ static int vtruncbuf_bp_trunc(struct buf *bp, void *data);
 static int vtruncbuf_bp_metasync_cmp(struct buf *bp, void *data);
 static int vtruncbuf_bp_metasync(struct buf *bp, void *data);
 
+struct vtruncbuf_info {
+       struct vnode *vp;
+       off_t   truncloffset;
+       int     clean;
+};
+
 int
 vtruncbuf(struct vnode *vp, off_t length, int blksize)
 {
-       off_t truncloffset;
+       struct vtruncbuf_info info;
        const char *filename;
        int count;
 
@@ -455,18 +466,21 @@ vtruncbuf(struct vnode *vp, off_t length, int blksize)
         * scan count to determine whether a loop is necessary.
         */
        if ((count = (int)(length % blksize)) != 0)
-               truncloffset = length + (blksize - count);
+               info.truncloffset = length + (blksize - count);
        else
-               truncloffset = length;
+               info.truncloffset = length;
+       info.vp = vp;
 
        lwkt_gettoken(&vp->v_token);
        do {
+               info.clean = 1;
                count = RB_SCAN(buf_rb_tree, &vp->v_rbclean_tree, 
                                vtruncbuf_bp_trunc_cmp,
-                               vtruncbuf_bp_trunc, &truncloffset);
+                               vtruncbuf_bp_trunc, &info);
+               info.clean = 0;
                count += RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
                                vtruncbuf_bp_trunc_cmp,
-                               vtruncbuf_bp_trunc, &truncloffset);
+                               vtruncbuf_bp_trunc, &info);
        } while(count);
 
        /*
@@ -479,7 +493,7 @@ vtruncbuf(struct vnode *vp, off_t length, int blksize)
                do {
                        count = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
                                        vtruncbuf_bp_metasync_cmp,
-                                       vtruncbuf_bp_metasync, vp);
+                                       vtruncbuf_bp_metasync, &info);
                } while (count);
        }
 
@@ -507,12 +521,14 @@ vtruncbuf(struct vnode *vp, off_t length, int blksize)
         * to busy dirty VM pages being flushed out to disk.
         */
        do {
+               info.clean = 1;
                count = RB_SCAN(buf_rb_tree, &vp->v_rbclean_tree, 
                                vtruncbuf_bp_trunc_cmp,
-                               vtruncbuf_bp_trunc, &truncloffset);
+                               vtruncbuf_bp_trunc, &info);
+               info.clean = 0;
                count += RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
                                vtruncbuf_bp_trunc_cmp,
-                               vtruncbuf_bp_trunc, &truncloffset);
+                               vtruncbuf_bp_trunc, &info);
                if (count) {
                        kprintf("Warning: vtruncbuf():  Had to re-clean %d "
                               "left over buffers in %s\n", count, filename);
@@ -532,7 +548,9 @@ static
 int
 vtruncbuf_bp_trunc_cmp(struct buf *bp, void *data)
 {
-       if (bp->b_loffset >= *(off_t *)data)
+       struct vtruncbuf_info *info = data;
+
+       if (bp->b_loffset >= info->truncloffset)
                return(0);
        return(-1);
 }
@@ -541,14 +559,26 @@ static
 int 
 vtruncbuf_bp_trunc(struct buf *bp, void *data)
 {
+       struct vtruncbuf_info *info = data;
+
        /*
         * Do not try to use a buffer we cannot immediately lock, but sleep
         * anyway to prevent a livelock.  The code will loop until all buffers
         * can be acted upon.
+        *
+        * We must always revalidate the buffer after locking it to deal
+        * with MP races.
         */
        if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
+               atomic_add_int(&bp->b_refs, 1);
                if (BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL) == 0)
                        BUF_UNLOCK(bp);
+               atomic_subtract_int(&bp->b_refs, 1);
+       } else if ((info->clean && (bp->b_flags & B_DELWRI)) ||
+                  (info->clean == 0 && (bp->b_flags & B_DELWRI) == 0) ||
+                  bp->b_vp != info->vp ||
+                  vtruncbuf_bp_trunc_cmp(bp, data)) {
+               BUF_UNLOCK(bp);
        } else {
                bremfree(bp);
                bp->b_flags |= (B_INVAL | B_RELBUF | B_NOCACHE);
@@ -563,7 +593,7 @@ vtruncbuf_bp_trunc(struct buf *bp, void *data)
  * Note that the compare function must conform to the RB_SCAN's requirements.
  */
 static int
-vtruncbuf_bp_metasync_cmp(struct buf *bp, void *data)
+vtruncbuf_bp_metasync_cmp(struct buf *bp, void *data __unused)
 {
        if (bp->b_loffset < 0)
                return(0);
@@ -573,28 +603,25 @@ vtruncbuf_bp_metasync_cmp(struct buf *bp, void *data)
 static int
 vtruncbuf_bp_metasync(struct buf *bp, void *data)
 {
-       struct vnode *vp = data;
+       struct vtruncbuf_info *info = data;
 
-       if (bp->b_flags & B_DELWRI) {
-               /*
-                * Do not try to use a buffer we cannot immediately lock,
-                * but sleep anyway to prevent a livelock.  The code will
-                * loop until all buffers can be acted upon.
-                */
-               if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
-                       if (BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL) == 0)
-                               BUF_UNLOCK(bp);
-               } else {
-                       bremfree(bp);
-                       if (bp->b_vp == vp)
-                               bawrite(bp);
-                       else
-                               bwrite(bp);
-               }
-               return(1);
+       if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
+               atomic_add_int(&bp->b_refs, 1);
+               if (BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL) == 0)
+                       BUF_UNLOCK(bp);
+               atomic_subtract_int(&bp->b_refs, 1);
+       } else if ((bp->b_flags & B_DELWRI) == 0 ||
+                  bp->b_vp != info->vp ||
+                  vtruncbuf_bp_metasync_cmp(bp, data)) {
+               BUF_UNLOCK(bp);
        } else {
-               return(0);
+               bremfree(bp);
+               if (bp->b_vp == info->vp)
+                       bawrite(bp);
+               else
+                       bwrite(bp);
        }
+       return(1);
 }
 
 /*
@@ -611,6 +638,7 @@ vtruncbuf_bp_metasync(struct buf *bp, void *data)
  */
 static int vfsync_wait_output(struct vnode *vp, 
                            int (*waitoutput)(struct vnode *, struct thread *));
+static int vfsync_dummy_cmp(struct buf *bp __unused, void *data __unused);
 static int vfsync_data_only_cmp(struct buf *bp, void *data);
 static int vfsync_meta_only_cmp(struct buf *bp, void *data);
 static int vfsync_lazy_range_cmp(struct buf *bp, void *data);
@@ -624,6 +652,7 @@ struct vfsync_info {
        int lazylimit;
        int skippedbufs;
        int (*checkdef)(struct buf *);
+       int (*cmpfunc)(struct buf *, void *);
 };
 
 int
@@ -650,10 +679,12 @@ vfsync(struct vnode *vp, int waitfor, int passes,
                 */
                info.lazylimit = 1024 * 1024;
                info.syncdeps = 1;
+               info.cmpfunc = vfsync_lazy_range_cmp;
                error = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, 
                                vfsync_lazy_range_cmp, vfsync_bp, &info);
+               info.cmpfunc = vfsync_meta_only_cmp;
                RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, 
-                               vfsync_meta_only_cmp, vfsync_bp, &info);
+                       vfsync_meta_only_cmp, vfsync_bp, &info);
                if (error == 0)
                        vp->v_lazyw = 0;
                else if (!RB_EMPTY(&vp->v_rbdirty_tree))
@@ -665,8 +696,10 @@ vfsync(struct vnode *vp, int waitfor, int passes,
                 * Asynchronous.  Do a data-only pass and a meta-only pass.
                 */
                info.syncdeps = 1;
+               info.cmpfunc = vfsync_data_only_cmp;
                RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, vfsync_data_only_cmp, 
                        vfsync_bp, &info);
+               info.cmpfunc = vfsync_meta_only_cmp;
                RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, vfsync_meta_only_cmp, 
                        vfsync_bp, &info);
                error = 0;
@@ -677,11 +710,13 @@ vfsync(struct vnode *vp, int waitfor, int passes,
                 * pass, then additional integrated passes to try to get
                 * all the dependancies flushed.
                 */
+               info.cmpfunc = vfsync_data_only_cmp;
                RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, vfsync_data_only_cmp,
                        vfsync_bp, &info);
                error = vfsync_wait_output(vp, waitoutput);
                if (error == 0) {
                        info.skippedbufs = 0;
+                       info.cmpfunc = vfsync_dummy_cmp;
                        RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, NULL,
                                vfsync_bp, &info);
                        error = vfsync_wait_output(vp, waitoutput);
@@ -695,8 +730,9 @@ vfsync(struct vnode *vp, int waitfor, int passes,
                                info.synchronous = 1;
                                info.syncdeps = 1;
                        }
+                       info.cmpfunc = vfsync_dummy_cmp;
                        error = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, NULL,
-                               vfsync_bp, &info);
+                                       vfsync_bp, &info);
                        if (error < 0)
                                error = -error;
                        info.syncdeps = 1;
@@ -721,6 +757,12 @@ vfsync_wait_output(struct vnode *vp,
        return(error);
 }
 
+static int
+vfsync_dummy_cmp(struct buf *bp __unused, void *data __unused)
+{
+       return(0);
+}
+
 static int
 vfsync_data_only_cmp(struct buf *bp, void *data)
 {
@@ -741,6 +783,7 @@ static int
 vfsync_lazy_range_cmp(struct buf *bp, void *data)
 {
        struct vfsync_info *info = data;
+
        if (bp->b_loffset < info->vp->v_lazyw)
                return(-1);
        return(0);
@@ -754,24 +797,32 @@ vfsync_bp(struct buf *bp, void *data)
        int error;
 
        /*
-        * if syncdeps is not set we do not try to write buffers which have
-        * dependancies.
+        * Ignore buffers that we cannot immediately lock.
+        */
+       if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
+               kprintf("Warning: vfsync_bp skipping dirty buffer %p\n", bp);
+               ++info->skippedbufs;
+               return(0);
+       }
+
+       /*
+        * We must revalidate the buffer after locking.
         */
-       if (!info->synchronous && info->syncdeps == 0 && info->checkdef(bp))
+       if ((bp->b_flags & B_DELWRI) == 0 ||
+           bp->b_vp != info->vp ||
+           info->cmpfunc(bp, data)) {
+               BUF_UNLOCK(bp);
                return(0);
+       }
 
        /*
-        * Ignore buffers that we cannot immediately lock.  XXX
+        * If syncdeps is not set we do not try to write buffers which have
+        * dependancies.
         */
-       if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
-               kprintf("Warning: vfsync_bp skipping dirty buffer %p\n", bp);
-               ++info->skippedbufs;
+       if (!info->synchronous && info->syncdeps == 0 && info->checkdef(bp)) {
+               BUF_UNLOCK(bp);
                return(0);
        }
-       if ((bp->b_flags & B_DELWRI) == 0)
-               panic("vfsync_bp: buffer not dirty");
-       if (vp != bp->b_vp)
-               panic("vfsync_bp: buffer vp mismatch");
 
        /*
         * B_NEEDCOMMIT (primarily used by NFS) is a state where the buffer
@@ -785,7 +836,8 @@ vfsync_bp(struct buf *bp, void *data)
        }
 
        /*
-        * Ask bioops if it is ok to sync 
+        * Ask bioops if it is ok to sync.  If not the VFS may have
+        * set B_LOCKED so we have to cycle the buffer.
         */
        if (LIST_FIRST(&bp->b_dep) != NULL && buf_checkwrite(bp)) {
                bremfree(bp);
@@ -837,6 +889,7 @@ bgetvp(struct vnode *vp, struct buf *bp, int testsize)
         * Insert onto list for new vnode.
         */
        lwkt_gettoken(&vp->v_token);
+
        if (buf_rb_hash_RB_INSERT(&vp->v_rbhash_tree, bp)) {
                lwkt_reltoken(&vp->v_token);
                return (EEXIST);
@@ -926,6 +979,7 @@ brelvp(struct buf *bp)
  * Reassign the buffer to the proper clean/dirty list based on B_DELWRI.
  * This routine is called when the state of the B_DELWRI bit is changed.
  *
+ * Must be called with vp->v_token held.
  * MPSAFE
  */
 void
@@ -934,7 +988,7 @@ reassignbuf(struct buf *bp)
        struct vnode *vp = bp->b_vp;
        int delay;
 
-       KKASSERT(vp != NULL);
+       ASSERT_LWKT_TOKEN_HELD(&vp->v_token);
        ++reassignbufcalls;
 
        /*
@@ -944,7 +998,6 @@ reassignbuf(struct buf *bp)
        if (bp->b_flags & B_PAGING)
                panic("cannot reassign paging buffer");
 
-       lwkt_gettoken(&vp->v_token);
        if (bp->b_flags & B_DELWRI) {
                /*
                 * Move to the dirty list, add the vnode to the worklist
@@ -999,7 +1052,6 @@ reassignbuf(struct buf *bp)
                        vn_syncer_remove(vp);
                }
        }
-       lwkt_reltoken(&vp->v_token);
 }
 
 /*
@@ -1440,6 +1492,7 @@ vinitvmio(struct vnode *vp, off_t filesize, int blksize, int boff)
 
 retry:
        if ((object = vp->v_object) == NULL) {
+               lwkt_gettoken(&vm_token);
                object = vnode_pager_alloc(vp, filesize, 0, 0, blksize, boff);
                /*
                 * Dereference the reference we just created.  This assumes
@@ -1447,6 +1500,7 @@ retry:
                 */
                object->ref_count--;
                vrele(vp);
+               lwkt_reltoken(&vm_token);
        } else {
                if (object->flags & OBJ_DEAD) {
                        vn_unlock(vp);
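
One detail in the vfs_subr.c changes above is worth spelling out: before sleeping in BUF_TIMELOCK() or a blocking BUF_LOCK(), b_refs is bumped with atomic_add_int(), presumably so the buffer cannot be repurposed out from under the sleeper, and the buffer's vnode and dirty state are rechecked once the lock is finally held.  A small userland analogue of that pin-then-revalidate shape, using invented names (struct obj, gen):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct obj {
        pthread_mutex_t lock;
        atomic_int      refs;           /* stands in for b_refs */
        int             gen;            /* bumps when the object is recycled */
};

/*
 * Pin the object with a reference before sleeping on its lock, drop the
 * pin afterwards, then report whether the object is still the one the
 * caller wanted (mirrors the recheck done after the blocking lock).
 */
static int
obj_lock_if_current(struct obj *op, int expected_gen)
{
        atomic_fetch_add(&op->refs, 1);         /* like atomic_add_int(&bp->b_refs, 1) */
        pthread_mutex_lock(&op->lock);          /* may sleep */
        atomic_fetch_sub(&op->refs, 1);

        if (op->gen != expected_gen) {          /* revalidate after the sleep */
                pthread_mutex_unlock(&op->lock);
                return 0;
        }
        return 1;
}

int
main(void)
{
        struct obj o = { PTHREAD_MUTEX_INITIALIZER, 0, 42 };

        if (obj_lock_if_current(&o, 42)) {
                printf("still generation 42, safe to use\n");
                pthread_mutex_unlock(&o.lock);
        }
        return 0;
}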
diff --git a/sys/kern/vfs_vm.c b/sys/kern/vfs_vm.c
index 6975be4..b7383f2 100644
@@ -121,10 +121,17 @@ static int nvtruncbuf_bp_metasync(struct buf *bp, void *data);
  * bdwrite() will call BMAP on it again.  Some filesystems, like HAMMER,
  * never overwrite existing data blocks.
  */
+
+struct truncbuf_info {
+       struct vnode *vp;
+       off_t truncloffset;     /* truncation point */
+       int clean;              /* clean tree, else dirty tree */
+};
+
 int
 nvtruncbuf(struct vnode *vp, off_t length, int blksize, int boff)
 {
-       off_t truncloffset;
+       struct truncbuf_info info;
        off_t truncboffset;
        const char *filename;
        struct buf *bp;
@@ -141,18 +148,20 @@ nvtruncbuf(struct vnode *vp, off_t length, int blksize, int boff)
        if (boff < 0)
                boff = (int)(length % blksize);
        if (boff)
-               truncloffset = length + (blksize - boff);
+               info.truncloffset = length + (blksize - boff);
        else
-               truncloffset = length;
-
+               info.truncloffset = length;
+       info.vp = vp;
        lwkt_gettoken(&vp->v_token);
        do {
+               info.clean = 1;
                count = RB_SCAN(buf_rb_tree, &vp->v_rbclean_tree,
                                nvtruncbuf_bp_trunc_cmp,
-                               nvtruncbuf_bp_trunc, &truncloffset);
+                               nvtruncbuf_bp_trunc, &info);
+               info.clean = 0;
                count += RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
                                nvtruncbuf_bp_trunc_cmp,
-                               nvtruncbuf_bp_trunc, &truncloffset);
+                               nvtruncbuf_bp_trunc, &info);
        } while(count);
 
        nvnode_pager_setsize(vp, length, blksize, boff);
@@ -197,7 +206,7 @@ nvtruncbuf(struct vnode *vp, off_t length, int blksize, int boff)
                do {
                        count = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
                                        nvtruncbuf_bp_metasync_cmp,
-                                       nvtruncbuf_bp_metasync, vp);
+                                       nvtruncbuf_bp_metasync, &info);
                } while (count);
        }
 
@@ -222,12 +231,14 @@ nvtruncbuf(struct vnode *vp, off_t length, int blksize, int boff)
         * to busy dirty VM pages being flushed out to disk.
         */
        do {
+               info.clean = 1;
                count = RB_SCAN(buf_rb_tree, &vp->v_rbclean_tree,
                                nvtruncbuf_bp_trunc_cmp,
-                               nvtruncbuf_bp_trunc, &truncloffset);
+                               nvtruncbuf_bp_trunc, &info);
+               info.clean = 0;
                count += RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
                                nvtruncbuf_bp_trunc_cmp,
-                               nvtruncbuf_bp_trunc, &truncloffset);
+                               nvtruncbuf_bp_trunc, &info);
                if (count) {
                        kprintf("Warning: vtruncbuf():  Had to re-clean %d "
                               "left over buffers in %s\n", count, filename);
@@ -247,7 +258,9 @@ static
 int
 nvtruncbuf_bp_trunc_cmp(struct buf *bp, void *data)
 {
-       if (bp->b_loffset >= *(off_t *)data)
+       struct truncbuf_info *info = data;
+
+       if (bp->b_loffset >= info->truncloffset)
                return(0);
        return(-1);
 }
@@ -256,14 +269,23 @@ static
 int
 nvtruncbuf_bp_trunc(struct buf *bp, void *data)
 {
+       struct truncbuf_info *info = data;
+
        /*
-        * Do not try to use a buffer we cannot immediately lock, but sleep
-        * anyway to prevent a livelock.  The code will loop until all buffers
-        * can be acted upon.
+        * Do not try to use a buffer we cannot immediately lock,
+        * but sleep anyway to prevent a livelock.  The code will
+        * loop until all buffers can be acted upon.
         */
        if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
+               atomic_add_int(&bp->b_refs, 1);
                if (BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL) == 0)
                        BUF_UNLOCK(bp);
+               atomic_subtract_int(&bp->b_refs, 1);
+       } else if ((info->clean && (bp->b_flags & B_DELWRI)) ||
+                  (info->clean == 0 && (bp->b_flags & B_DELWRI) == 0) ||
+                  bp->b_vp != info->vp ||
+                  nvtruncbuf_bp_trunc_cmp(bp, data)) {
+               BUF_UNLOCK(bp);
        } else {
                bremfree(bp);
                bp->b_flags |= (B_INVAL | B_RELBUF | B_NOCACHE);
@@ -278,7 +300,7 @@ nvtruncbuf_bp_trunc(struct buf *bp, void *data)
  * Note that the compare function must conform to the RB_SCAN's requirements.
  */
 static int
-nvtruncbuf_bp_metasync_cmp(struct buf *bp, void *data)
+nvtruncbuf_bp_metasync_cmp(struct buf *bp, void *data __unused)
 {
        if (bp->b_loffset < 0)
                return(0);
@@ -288,28 +310,27 @@ nvtruncbuf_bp_metasync_cmp(struct buf *bp, void *data)
 static int
 nvtruncbuf_bp_metasync(struct buf *bp, void *data)
 {
-       struct vnode *vp = data;
+       struct truncbuf_info *info = data;
 
-       if (bp->b_flags & B_DELWRI) {
-               /*
-                * Do not try to use a buffer we cannot immediately lock,
-                * but sleep anyway to prevent a livelock.  The code will
-                * loop until all buffers can be acted upon.
-                */
-               if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
-                       if (BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL) == 0)
-                               BUF_UNLOCK(bp);
-               } else {
-                       bremfree(bp);
-                       if (bp->b_vp == vp)
-                               bawrite(bp);
-                       else
-                               bwrite(bp);
-               }
-               return(1);
+       /*
+        * Do not try to use a buffer we cannot immediately lock,
+        * but sleep anyway to prevent a livelock.  The code will
+        * loop until all buffers can be acted upon.
+        */
+       if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
+               atomic_add_int(&bp->b_refs, 1);
+               if (BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL) == 0)
+                       BUF_UNLOCK(bp);
+               atomic_subtract_int(&bp->b_refs, 1);
+       } else if ((bp->b_flags & B_DELWRI) == 0 ||
+                  bp->b_vp != info->vp ||
+                  nvtruncbuf_bp_metasync_cmp(bp, data)) {
+               BUF_UNLOCK(bp);
        } else {
-               return(0);
+               bremfree(bp);
+               bawrite(bp);
        }
+       return(1);
 }
 
 /*
diff --git a/sys/platform/pc32/i386/machdep.c b/sys/platform/pc32/i386/machdep.c
index 4d09c15..3a0c741 100644
@@ -920,17 +920,18 @@ cpu_idle(void)
                        __asm __volatile("cli");
                        splz();
                        if (!lwkt_runnable())
-                           cpu_idle_hook();
+                               cpu_idle_hook();
 #ifdef SMP
                        else
-                           __asm __volatile("pause");
+                               handle_cpu_contention_mask();
 #endif
                        ++cpu_idle_hltcnt;
                } else {
                        td->td_flags &= ~TDF_IDLE_NOHLT;
                        splz();
 #ifdef SMP
-                       __asm __volatile("sti; pause");
+                       __asm __volatile("sti");
+                       handle_cpu_contention_mask();
 #else
                        __asm __volatile("sti");
 #endif
@@ -947,9 +948,16 @@ cpu_idle(void)
  * we let the scheduler spin.
  */
 void
-cpu_mplock_contested(void)
+handle_cpu_contention_mask(void)
 {
-       cpu_pause();
+       cpumask_t mask;
+
+       mask = cpu_contention_mask;
+       cpu_ccfence();
+       if (mask && bsfl(mask) != mycpu->gd_cpuid) {
+               cpu_pause();
+               DELAY(2);
+       }
 }
 
 /*
diff --git a/sys/platform/pc32/isa/clock.c b/sys/platform/pc32/isa/clock.c
index 697fcc8..f404d9f 100644
@@ -442,6 +442,7 @@ DODELAY(int n, int doswitch)
                ticks_left -= delta;
                if (doswitch && ticks_left > 0)
                        lwkt_switch();
+               cpu_pause();
        }
 #ifdef DELAYDEBUG
        if (state == 1)
diff --git a/sys/platform/pc64/isa/clock.c b/sys/platform/pc64/isa/clock.c
index 6b78a7d..94ba26e 100644
@@ -445,6 +445,7 @@ DODELAY(int n, int doswitch)
                ticks_left -= delta;
                if (doswitch && ticks_left > 0)
                        lwkt_switch();
+               cpu_pause();
        }
 #ifdef DELAYDEBUG
        if (state == 1)
diff --git a/sys/platform/pc64/x86_64/machdep.c b/sys/platform/pc64/x86_64/machdep.c
index 944b3ec..22438fb 100644
@@ -945,17 +945,18 @@ cpu_idle(void)
                        __asm __volatile("cli");
                        splz();
                        if (!lwkt_runnable())
-                           cpu_idle_hook();
+                               cpu_idle_hook();
 #ifdef SMP
                        else
-                           __asm __volatile("pause");
+                               handle_cpu_contention_mask();
 #endif
                        ++cpu_idle_hltcnt;
                } else {
                        td->td_flags &= ~TDF_IDLE_NOHLT;
                        splz();
 #ifdef SMP
-                       __asm __volatile("sti; pause");
+                       __asm __volatile("sti");
+                       handle_cpu_contention_mask();
 #else
                        __asm __volatile("sti");
 #endif
@@ -972,7 +973,7 @@ cpu_idle(void)
  * we let the scheduler spin.
  */
 void
-cpu_mplock_contested(void)
+handle_cpu_contention_mask(void)
 {
        cpu_pause();
 }
diff --git a/sys/platform/vkernel/i386/cpu_regs.c b/sys/platform/vkernel/i386/cpu_regs.c
index ff53178..dc98099 100644
@@ -702,9 +702,13 @@ cpu_idle(void)
 {
        struct thread *td = curthread;
        struct mdglobaldata *gd = mdcpu;
+       int reqflags;
 
        crit_exit();
        KKASSERT(td->td_critcount == 0);
+#ifdef SMP
+       KKASSERT(td->td_mpcount == 0);
+#endif
        cpu_enable_intr();
        for (;;) {
                /*
@@ -712,6 +716,9 @@ cpu_idle(void)
                 */
                lwkt_switch();
 
+#ifdef SMP
+               KKASSERT(td->td_mpcount == 0);
+#endif
                /*
                 * The idle loop halts only if no threads are scheduleable
                 * and no signals have occured.
@@ -719,12 +726,19 @@ cpu_idle(void)
                if (cpu_idle_hlt && !lwkt_runnable() &&
                    (td->td_flags & TDF_IDLE_NOHLT) == 0) {
                        splz();
+#ifdef SMP
+                       KKASSERT(td->td_mpcount == 0);
+                       KKASSERT(MP_LOCK_HELD(td->td_gd) == 0);
+#endif
                        if (!lwkt_runnable()) {
 #ifdef DEBUGIDLE
                                struct timeval tv1, tv2;
                                gettimeofday(&tv1, NULL);
 #endif
-                               umtx_sleep(&gd->mi.gd_reqflags, 0, 1000000);
+                               reqflags = gd->mi.gd_reqflags &
+                                          ~RQF_IDLECHECK_MASK;
+                               umtx_sleep(&gd->mi.gd_reqflags, reqflags,
+                                          1000000);
 #ifdef DEBUGIDLE
                                gettimeofday(&tv2, NULL);
                                if (tv2.tv_usec - tv1.tv_usec +
@@ -739,7 +753,7 @@ cpu_idle(void)
                        }
 #ifdef SMP
                        else {
-                           __asm __volatile("pause");
+                               handle_cpu_contention_mask();
                        }
 #endif
                        ++cpu_idle_hltcnt;
@@ -747,10 +761,8 @@ cpu_idle(void)
                        td->td_flags &= ~TDF_IDLE_NOHLT;
                        splz();
 #ifdef SMP
-                       /*__asm __volatile("sti; pause");*/
+                       handle_cpu_contention_mask();
                        __asm __volatile("pause");
-#else
-                       /*__asm __volatile("sti");*/
 #endif
                        ++cpu_idle_spincnt;
                }
@@ -766,9 +778,17 @@ cpu_idle(void)
  * we sleep for a bit.
  */
 void
-cpu_mplock_contested(void)
+handle_cpu_contention_mask(void)
 {
-       usleep(1000);
+       cpumask_t mask;
+
+       mask = cpu_contention_mask;
+       cpu_ccfence();
+       if (mask && bsfl(mask) != mycpu->gd_cpuid) {
+               cpu_pause();
+               usleep(1000);
+       }
+       /* usleep(1000); */
 }
 
 /*
@@ -781,9 +801,11 @@ cpu_mplock_contested(void)
 void
 cpu_spinlock_contested(void)
 {
+       cpu_pause();
+       /*
        crit_enter();
        usleep(1000);
-       crit_exit();
+       crit_exit();*/
 }
 
 #endif
diff --git a/sys/platform/vkernel64/x86_64/cpu_regs.c b/sys/platform/vkernel64/x86_64/cpu_regs.c
index bd4f3ef..03e347b 100644
@@ -706,6 +706,7 @@ cpu_idle(void)
 {
        struct thread *td = curthread;
        struct mdglobaldata *gd = mdcpu;
+       int reqflags;
 
        crit_exit();
        KKASSERT(td->td_critcount == 0);
@@ -728,7 +729,10 @@ cpu_idle(void)
                                struct timeval tv1, tv2;
                                gettimeofday(&tv1, NULL);
 #endif
-                               umtx_sleep(&gd->mi.gd_reqflags, 0, 1000000);
+                               reqflags = gd->mi.gd_reqflags &
+                                          ~RQF_IDLECHECK_MASK;
+                               umtx_sleep(&gd->mi.gd_reqflags, reqflags,
+                                          1000000);
 #ifdef DEBUGIDLE
                                gettimeofday(&tv2, NULL);
                                if (tv2.tv_usec - tv1.tv_usec +
@@ -743,7 +747,7 @@ cpu_idle(void)
                        }
 #ifdef SMP
                        else {
-                           __asm __volatile("pause");
+                               handle_cpu_contention_mask();
                        }
 #endif
                        ++cpu_idle_hltcnt;
@@ -751,10 +755,8 @@ cpu_idle(void)
                        td->td_flags &= ~TDF_IDLE_NOHLT;
                        splz();
 #ifdef SMP
-                       /*__asm __volatile("sti; pause");*/
+                       handle_cpu_contention_mask();
                        __asm __volatile("pause");
-#else
-                       /*__asm __volatile("sti");*/
 #endif
                        ++cpu_idle_spincnt;
                }
@@ -770,9 +772,16 @@ cpu_idle(void)
  * we sleep for a bit.
  */
 void
-cpu_mplock_contested(void)
+handle_cpu_contention_mask(void)
 {
-       usleep(1000);
+       cpumask_t mask;
+
+       mask = cpu_contention_mask;
+       cpu_ccfence();
+       if (mask && bsfl(mask) != mycpu->gd_cpuid) {
+               cpu_pause();
+               usleep(1000);
+       }
 }
 
 /*
diff --git a/sys/sys/mplock2.h b/sys/sys/mplock2.h
index 1ea688a..9d8cfcc 100644
@@ -27,11 +27,11 @@ void _try_mplock_contested(const char *file, int line);
 void _cpu_try_mplock_contested(const char *file, int line);
 void _rel_mplock_contested(void);
 void cpu_get_initial_mplock(void);
-void cpu_mplock_contested(void);
+void handle_cpu_contention_mask(void);
 void yield_mplock(struct thread *td);
 
 extern int mp_lock;
-extern int mp_lock_contention_mask;
+extern int cpu_contention_mask;
 extern const char *mp_lock_holder_file;
 extern int mp_lock_holder_line;
 
@@ -153,9 +153,9 @@ cpu_try_mplock_debug(const char *file, int line)
  */
 static __inline
 void
-set_mplock_contention_mask(globaldata_t gd)
+set_cpu_contention_mask(globaldata_t gd)
 {
-       atomic_set_int(&mp_lock_contention_mask, gd->gd_cpumask);
+       atomic_set_int(&cpu_contention_mask, gd->gd_cpumask);
 }
 
 /*
@@ -167,9 +167,9 @@ set_mplock_contention_mask(globaldata_t gd)
  */
 static __inline
 void
-clr_mplock_contention_mask(globaldata_t gd)
+clr_cpu_contention_mask(globaldata_t gd)
 {
-       atomic_clear_int(&mp_lock_contention_mask, gd->gd_cpumask);
+       atomic_clear_int(&cpu_contention_mask, gd->gd_cpumask);
 }
 
 static __inline
@@ -187,15 +187,17 @@ owner_mplock(void)
  *         end up clearing someone else's lock.
  */
 static __inline void
-cpu_rel_mplock(void)
+cpu_rel_mplock(int cpu)
 {
-       mp_lock = -1;
+       (void)atomic_cmpset_int(&mp_lock, cpu, -1);
 }
 
-#define MP_LOCK_HELD()         \
-       (mp_lock == mycpu->gd_cpuid)
-#define ASSERT_MP_LOCK_HELD(td)        \
-       KASSERT(MP_LOCK_HELD(), ("MP_LOCK_HELD: Not held thread %p", td))
+#define MP_LOCK_HELD(gd)                       \
+       (mp_lock == gd->gd_cpuid)
+
+#define ASSERT_MP_LOCK_HELD(td)                        \
+       KASSERT(MP_LOCK_HELD(td->td_gd),        \
+               ("MP_LOCK_HELD: Not held thread %p", td))
 
 #else
 
@@ -206,7 +208,7 @@ cpu_rel_mplock(void)
 #define        rel_mplock()
 #define try_mplock()           1
 #define owner_mplock()         0
-#define MP_LOCK_HELD()         (!0)
+#define MP_LOCK_HELD(gd)       (!0)
 #define ASSERT_MP_LOCK_HELD(td)
 
 #endif
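
The cpu_rel_mplock() change at the end of mplock2.h is easy to miss: the release is now a compare-and-set keyed on the caller's cpu id, so a stale release can never clear a lock that has since been acquired by another cpu (the "end up clearing someone else's lock" concern visible in the context above).  A self-contained C11 illustration of the same conditional-release idea, with invented names (lock_owner, try_lock, rel_lock):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int lock_owner = -1;      /* -1 = unowned, else owner cpu id */

static int
try_lock(int cpu)
{
        int expected = -1;

        return atomic_compare_exchange_strong(&lock_owner, &expected, cpu);
}

/*
 * Conditional release in the spirit of the new cpu_rel_mplock(cpu):
 * clear ownership only if we are still recorded as the owner.
 */
static void
rel_lock(int cpu)
{
        int expected = cpu;

        (void)atomic_compare_exchange_strong(&lock_owner, &expected, -1);
}

int
main(void)
{
        if (try_lock(2))
                printf("cpu 2 owns the lock\n");
        rel_lock(5);            /* stale release from cpu 5: no effect */
        printf("owner after stale release: %d\n", atomic_load(&lock_owner));
        rel_lock(2);            /* real release */
        printf("owner after real release: %d\n", atomic_load(&lock_owner));
        return 0;
}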