kernel - Change lockmgr LK_SHARED behavior to fix improper recursion return
author     Matthew Dillon <dillon@apollo.backplane.com>
           Fri, 17 Aug 2012 00:40:38 +0000 (17:40 -0700)
committer  Matthew Dillon <dillon@apollo.backplane.com>
           Fri, 17 Aug 2012 00:40:38 +0000 (17:40 -0700)
* When obtaining a LK_SHARED lock on a lock that you already own
  LK_EXCLUSIVE, lockmgr would downgrade the lock to shared.

  This creates a very serious problem when a deep procedural recursion
  requests a shared lock on a lock that is already held exclusively.
  When the recursion returns, the original top level finds that its lock
  is no longer exclusive.
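
  For example, a minimal sketch of the recursion pattern (the lockinit()/
  lockmgr() sequence here is illustrative, not code from this change):

    struct lock lk;

    lockinit(&lk, "demo", 0, LK_CANRECURSE);

    lockmgr(&lk, LK_EXCLUSIVE);  /* top level takes the lock exclusively */
    /* ... deep recursion eventually does: */
    lockmgr(&lk, LK_SHARED);     /* old behavior: downgrades to shared */
    lockmgr(&lk, LK_RELEASE);
    /* ... recursion returns; the top level still believes it holds the
       lock exclusively, but it is now only shared */
    lockmgr(&lk, LK_RELEASE);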

* This problem occurred with vnode locks when a VOP_WRITE operation on
  mmap'd space caused a VM fault which then turned around and issued a
  read().  When the fault returned, the vnode wound up locked shared
  instead of exclusive.

* Fix the problem by NOT downgrading an exclusive lock to shared when
  recursing on LK_SHARED.  Simply add another count to the exclusive
  lock.
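
  With this change the same sequence keeps the lock exclusive, assuming the
  lock allows recursion (LK_CANRECURSE); otherwise the request now fails
  with EBUSY (LK_NOWAIT) or panics instead of silently downgrading.  A
  sketch of the intended accounting (field name as in the diff below, the
  sequence itself is illustrative):

    lockmgr(&lk, LK_EXCLUSIVE);  /* lk_exclusivecount == 1 */
    lockmgr(&lk, LK_SHARED);     /* same owner: lk_exclusivecount == 2 */
    lockmgr(&lk, LK_RELEASE);    /* lk_exclusivecount == 1, still exclusive */
    lockmgr(&lk, LK_RELEASE);    /* lock fully released */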

sys/kern/kern_lock.c
sys/vfs/ufs/ffs_rawread.c

diff --git a/sys/kern/kern_lock.c b/sys/kern/kern_lock.c
index 9453c0c..2e67007 100644
--- a/sys/kern/kern_lock.c
+++ b/sys/kern/kern_lock.c
@@ -234,13 +234,38 @@ debuglockmgr(struct lock *lkp, u_int flags,
                        COUNT(td, 1);
                        break;
                }
+
                /*
-                * We hold an exclusive lock, so downgrade it to shared.
-                * An alternative would be to fail with EDEADLK.
+                * If we already hold an exclusive lock we bump the
+                * exclusive count instead of downgrading to a shared
+                * lock.
+                *
+                * WARNING!  The old FreeBSD behavior was to downgrade,
+                *           but this creates a problem when recursions
+                *           return to the caller and the caller expects
+                *           its original exclusive lock to remain exclusively
+                *           locked.
+                */
+               if (extflags & LK_CANRECURSE) {
+                       lkp->lk_exclusivecount++;
+                       COUNT(td, 1);
+                       break;
+               }
+               if (extflags & LK_NOWAIT) {
+                       error = EBUSY;
+                       break;
+               }
+               spin_unlock(&lkp->lk_spinlock);
+               panic("lockmgr: locking against myself");
+#if 0
+               /*
+                * The old code queued a shared lock request and then
+                * fell into a downgrade.
                 */
                sharelock(lkp, 1);
                COUNT(td, 1);
                /* fall into downgrade */
+#endif
 
        case LK_DOWNGRADE:
                if (lkp->lk_lockholder != td || lkp->lk_exclusivecount == 0) {
diff --git a/sys/vfs/ufs/ffs_rawread.c b/sys/vfs/ufs/ffs_rawread.c
index 13550dc..e5fcdfa 100644
--- a/sys/vfs/ufs/ffs_rawread.c
+++ b/sys/vfs/ufs/ffs_rawread.c
@@ -88,7 +88,6 @@ static int
 ffs_rawread_sync(struct vnode *vp)
 {
        int error;
-       int upgraded;
 
        /*
         * Check for dirty mmap, pending writes and dirty buffers
@@ -97,13 +96,6 @@ ffs_rawread_sync(struct vnode *vp)
        if (bio_track_active(&vp->v_track_write) ||
            !RB_EMPTY(&vp->v_rbdirty_tree) ||
            (vp->v_flag & VOBJDIRTY) != 0) {
-               if (vn_islocked(vp) != LK_EXCLUSIVE) {
-                       upgraded = 1;
-                       /* Upgrade to exclusive lock, this might block */
-                       vn_lock(vp, LK_UPGRADE);
-               } else
-                       upgraded = 0;
-               
                /* Attempt to msync mmap() regions to clean dirty mmap */ 
                if ((vp->v_flag & VOBJDIRTY) != 0) {
                        struct vm_object *obj;
@@ -114,23 +106,17 @@ ffs_rawread_sync(struct vnode *vp)
                /* Wait for pending writes to complete */
                error = bio_track_wait(&vp->v_track_write, 0, 0);
                if (error != 0) {
-                       if (upgraded != 0)
-                               vn_lock(vp, LK_DOWNGRADE);
                        goto done;
                }
                /* Flush dirty buffers */
                if (!RB_EMPTY(&vp->v_rbdirty_tree)) {
                        if ((error = VOP_FSYNC(vp, MNT_WAIT, 0)) != 0) {
-                               if (upgraded != 0)
-                                       vn_lock(vp, LK_DOWNGRADE);
                                goto done;
                        }
                        if (bio_track_active(&vp->v_track_write) ||
                            !RB_EMPTY(&vp->v_rbdirty_tree))
                                panic("ffs_rawread_sync: dirty bufs");
                }
-               if (upgraded != 0)
-                       vn_lock(vp, LK_DOWNGRADE);
        } else {
                error = 0;
        }