kernel - Fix issues where tmpfs loses file data
Author:     Matthew Dillon <dillon@apollo.backplane.com>
AuthorDate: Fri, 7 Dec 2012 22:20:31 +0000 (14:20 -0800)
Commit:     Matthew Dillon <dillon@apollo.backplane.com>
CommitDate: Fri, 7 Dec 2012 22:20:31 +0000 (14:20 -0800)
* For TMPFS, UIO_NOCOPY writes must use bawrite() or bwrite() and must NEVER
  use buwrite(), because these operations are called from the VM page-cleaning
  code by the pageout daemon or by the vnode termination code (when maxvnodes
  is reached), and the underlying pages are about to be destroyed.
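  A minimal sketch of the resulting dispatch in tmpfs_write(), mirroring the
  tmpfs_vnops.c hunk below:

    bp->b_flags |= B_AGE;
    if (uio->uio_segflg == UIO_NOCOPY) {
            /*
             * Page-cleaning / vnode-reclaim path: the backing VM pages
             * may be freed as soon as the clean completes, so the data
             * must actually be written out now (bawrite or bwrite).
             */
            bawrite(bp);
    } else {
            /*
             * Normal write path: dirty the underlying VM pages and
             * release the buffer clean, so tmpfs can cache file data
             * in memory instead of flushing it to swap too quickly.
             */
            buwrite(bp);
    }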

* vm_object_terminate() must call vinvalbuf() both before (for normal
  filesystems) and after (for tmpfs-style filesystems) the page clean.
  Otherwise buffers that were not disposed of during the page cleaning
  might be left hanging.  This is a safety feature.
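  Sketch of the resulting call ordering in vm_object_terminate(), as in the
  vm_object.c hunk below:

    vp = (struct vnode *) object->handle;
    vinvalbuf(vp, V_SAVE, 0, 0);            /* normal filesystems */
    vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
    vinvalbuf(vp, V_SAVE, 0, 0);            /* tmpfs-style filesystems */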

* Remove the post-flush test code from vm_object_page_collect_flush()
  entirely.  The I/Os are still in progress at this point, so it makes
  no sense to set PG_CLEANCHK here.

* vm_page_need_commit() must make the object writeable and dirty, I think.
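  The helper now reads as follows (this mirrors the vm_page.c hunk below):

    void
    vm_page_need_commit(vm_page_t m)
    {
            vm_page_flag_set(m, PG_NEED_COMMIT);
            vm_object_set_writeable_dirty(m->object);
    }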

* Fix multiple places where m->dirty is tested and PG_NEED_COMMIT is not.
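  These tests now follow a common pattern, sketched here; the individual
  hunks in vm_object.c and vm_page.c below apply it case by case:

    vm_page_test_dirty(m);
    if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
            /* still dirty or pending commit; cannot free or cache */
            vm_page_wakeup(m);
            return(0);
    }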

Reported-by: vsrinivas, others
sys/vfs/tmpfs/tmpfs_subr.c
sys/vfs/tmpfs/tmpfs_vnops.c
sys/vm/vm_object.c
sys/vm/vm_page.c

diff --git a/sys/vfs/tmpfs/tmpfs_subr.c b/sys/vfs/tmpfs/tmpfs_subr.c
index cd18ca2..0bd807d 100644
--- a/sys/vfs/tmpfs/tmpfs_subr.c
+++ b/sys/vfs/tmpfs/tmpfs_subr.c
@@ -496,8 +496,8 @@ tmpfs_free_vp(struct vnode *vp)
        TMPFS_NODE_LOCK(node);
        KKASSERT(lockcount(TMPFS_NODE_MTX(node)) > 0);
        node->tn_vnode = NULL;
-       TMPFS_NODE_UNLOCK(node);
        vp->v_data = NULL;
+       TMPFS_NODE_UNLOCK(node);
 }
 
 /* --------------------------------------------------------------------- */
diff --git a/sys/vfs/tmpfs/tmpfs_vnops.c b/sys/vfs/tmpfs/tmpfs_vnops.c
index febc692..56a800b 100644
--- a/sys/vfs/tmpfs/tmpfs_vnops.c
+++ b/sys/vfs/tmpfs/tmpfs_vnops.c
@@ -595,19 +595,19 @@ tmpfs_write (struct vop_write_args *ap)
                kflags |= NOTE_WRITE;
 
                /*
-                * Always try to flush the page if the request is coming
-                * from the pageout daemon (IO_ASYNC), else buwrite() the
-                * buffer.
+                * Always try to flush the page in the UIO_NOCOPY case.  This
+                * can come from the pageout daemon or during vnode eviction.
+                * It is not necessarily going to be marked IO_ASYNC/IO_SYNC.
                 *
-                * buwrite() dirties the underlying VM pages instead of
-                * dirtying the buffer, releasing the buffer as a clean
-                * buffer.  This allows tmpfs to use essentially all
-                * available memory to cache file data.  If we used bdwrite()
-                * the buffer cache would wind up flushing the data to
-                * swap too quickly.
+                * For the normal case we buwrite(), dirtying the underlying
+                * VM pages instead of dirtying the buffer and releasing the
+                * buffer as a clean buffer.  This allows tmpfs to use
+                * essentially all available memory to cache file data.
+                * If we used bdwrite() the buffer cache would wind up
+                * flushing the data to swap too quickly.
                 */
                bp->b_flags |= B_AGE;
-               if (ap->a_ioflag & IO_ASYNC) {
+               if (uio->uio_segflg == UIO_NOCOPY) {
                        bawrite(bp);
                } else {
                        buwrite(bp);
@@ -1440,6 +1440,8 @@ tmpfs_inactive(struct vop_inactive_args *v)
 }
 
 /* --------------------------------------------------------------------- */
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
 
 int
 tmpfs_reclaim(struct vop_reclaim_args *v)
diff --git a/sys/vm/vm_object.c b/sys/vm/vm_object.c
index 5b007fa..06ecc10 100644
--- a/sys/vm/vm_object.c
+++ b/sys/vm/vm_object.c
@@ -908,11 +908,19 @@ vm_object_terminate(vm_object_t object)
 
                /*
                 * Clean pages and flush buffers.
+                *
+                * NOTE!  TMPFS buffer flushes do not typically flush the
+                *        actual page to swap as this would be highly
+                *        inefficient, and normal filesystems usually wrap
+                *        page flushes with buffer cache buffers.
+                *
+                *        To deal with this we have to call vinvalbuf() both
+                *        before and after the vm_object_page_clean().
                 */
-               vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
-
                vp = (struct vnode *) object->handle;
                vinvalbuf(vp, V_SAVE, 0, 0);
+               vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
+               vinvalbuf(vp, V_SAVE, 0, 0);
        }
 
        /*
@@ -1001,7 +1009,7 @@ vm_object_terminate_callback(vm_page_t p, void *data __unused)
                vm_page_wakeup(p);
        } else if (p->wire_count == 0) {
                /*
-                * NOTE: PG_NEED_COMMIT is ignored.
+                * NOTE: p->dirty and PG_NEED_COMMIT are ignored.
                 */
                vm_page_free(p);
                mycpu->gd_cnt.v_pfree++;
@@ -1195,7 +1203,8 @@ vm_object_page_clean_pass2(struct vm_page *p, void *data)
         * cases where the page cannot be dirty.
         */
        if (p->valid == 0 || (p->queue - p->pc) == PQ_CACHE) {
-               KKASSERT((p->dirty & p->valid) == 0);
+               KKASSERT((p->dirty & p->valid) == 0 &&
+                        (p->flags & PG_NEED_COMMIT) == 0);
                vm_page_wakeup(p);
                goto done;
        }
@@ -1206,7 +1215,7 @@ vm_object_page_clean_pass2(struct vm_page *p, void *data)
         * page.
         */
        vm_page_test_dirty(p);
-       if ((p->dirty & p->valid) == 0) {
+       if ((p->dirty & p->valid) == 0 && (p->flags & PG_NEED_COMMIT) == 0) {
                vm_page_flag_clear(p, PG_CLEANCHK);
                vm_page_wakeup(p);
                goto done;
@@ -1281,7 +1290,8 @@ vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags)
                        break;
                }
                vm_page_test_dirty(tp);
-               if ((tp->dirty & tp->valid) == 0) {
+               if ((tp->dirty & tp->valid) == 0 &&
+                   (tp->flags & PG_NEED_COMMIT) == 0) {
                        vm_page_flag_clear(tp, PG_CLEANCHK);
                        vm_page_wakeup(tp);
                        break;
@@ -1314,7 +1324,8 @@ vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags)
                        break;
                }
                vm_page_test_dirty(tp);
-               if ((tp->dirty & tp->valid) == 0) {
+               if ((tp->dirty & tp->valid) == 0 &&
+                   (tp->flags & PG_NEED_COMMIT) == 0) {
                        vm_page_flag_clear(tp, PG_CLEANCHK);
                        vm_page_wakeup(tp);
                        break;
@@ -1340,33 +1351,13 @@ vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags)
        }
        runlen = maxb + maxf + 1;
 
-       for (i = 0; i < runlen; i++)
+       for (i = 0; i < runlen; i++)    /* XXX need this any more? */
                vm_page_hold(ma[i]);
 
        vm_pageout_flush(ma, runlen, pagerflags);
 
-       /*
-        * WARNING: Related pages are still held but the BUSY was inherited
-        *          by the pageout I/O, so the pages might not be busy any
-        *          more.  We cannot re-protect the page without waiting
-        *          for the I/O to complete and then busying it again.
-        */
-       for (i = 0; i < runlen; i++) {
-               if (ma[i]->valid & ma[i]->dirty) {
-                       /*vm_page_protect(ma[i], VM_PROT_READ);*/
-                       vm_page_flag_set(ma[i], PG_CLEANCHK);
-
-                       /*
-                        * maxf will end up being the actual number of pages
-                        * we wrote out contiguously, non-inclusive of the
-                        * first page.  We do not count look-behind pages.
-                        */
-                       if (i >= maxb + 1 && (maxf > i - maxb - 1))
-                               maxf = i - maxb - 1;
-               }
+       for (i = 0; i < runlen; i++)    /* XXX need this any more? */
                vm_page_unhold(ma[i]);
-       }
-       /*return(maxf + 1);*/
 }
 
 /*
@@ -2387,13 +2378,13 @@ vm_object_page_remove_callback(vm_page_t p, void *data)
        }
 
        /*
-        * limit is our clean_only flag.  If set and the page is dirty, do
-        * not free it.  If set and the page is being held by someone, do
-        * not free it.
+        * limit is our clean_only flag.  If set and the page is dirty or
+        * requires a commit, do not free it.  If set and the page is being
+        * held by someone, do not free it.
         */
        if (info->limit && p->valid) {
                vm_page_test_dirty(p);
-               if (p->valid & p->dirty) {
+               if ((p->valid & p->dirty) || (p->flags & PG_NEED_COMMIT)) {
                        vm_page_wakeup(p);
                        return(0);
                }
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index 350fd26..ad6f177 100644
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -942,7 +942,8 @@ vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
         * Since we are inserting a new and possibly dirty page,
         * update the object's OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY flags.
         */
-       if ((m->valid & m->dirty) || (m->flags & PG_WRITEABLE))
+       if ((m->valid & m->dirty) ||
+           (m->flags & (PG_WRITEABLE | PG_NEED_COMMIT)))
                vm_object_set_writeable_dirty(object);
 
        /*
@@ -2284,7 +2285,7 @@ vm_page_try_to_cache(vm_page_t m)
         * be moved to the cache.
         */
        vm_page_test_dirty(m);
-       if (m->dirty) {
+       if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
                vm_page_wakeup(m);
                return(0);
        }
@@ -2336,12 +2337,12 @@ vm_page_try_to_free(vm_page_t m)
         * dirty bit after cleaning out the pmaps.
         */
        vm_page_test_dirty(m);
-       if (m->dirty) {
+       if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
                vm_page_wakeup(m);
                return(0);
        }
        vm_page_protect(m, VM_PROT_NONE);
-       if (m->dirty) {
+       if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
                vm_page_wakeup(m);
                return(0);
        }
@@ -2523,6 +2524,7 @@ void
 vm_page_need_commit(vm_page_t m)
 {
        vm_page_flag_set(m, PG_NEED_COMMIT);
+       vm_object_set_writeable_dirty(m->object);
 }
 
 void