hammer2 - fix I/O race, flesh out bulkfree
authorMatthew Dillon <dillon@apollo.backplane.com>
Fri, 30 Jan 2015 02:26:34 +0000 (18:26 -0800)
committerMatthew Dillon <dillon@apollo.backplane.com>
Fri, 30 Jan 2015 02:26:34 +0000 (18:26 -0800)
* Fix an I/O race which could cause hammer2 to block on an iocb forever.
  A TAILQ was being tested outside of a spin-lock and could race the
  related atomic op on the flag.

* Add signal checking for the bulkfree scan, allowing it to be interrupted.

* Loop sbase/sstop to cover the whole volume.  The in-memory bitmap must
  be bzero()d on each loop.

sys/vfs/hammer2/hammer2.h
sys/vfs/hammer2/hammer2_bulkscan.c
sys/vfs/hammer2/hammer2_io.c
sys/vfs/hammer2/hammer2_subr.c

index 183004d..8225173 100644 (file)
@@ -773,6 +773,7 @@ extern mtx_t thread_protect;
 #define hammer2_icrc32(buf, size)      iscsi_crc32((buf), (size))
 #define hammer2_icrc32c(buf, size, crc)        iscsi_crc32_ext((buf), (size), (crc))
 
+int hammer2_signal_check(time_t *timep);
 hammer2_cluster_t *hammer2_inode_lock_ex(hammer2_inode_t *ip);
 hammer2_cluster_t *hammer2_inode_lock_sh(hammer2_inode_t *ip);
 void hammer2_inode_unlock_ex(hammer2_inode_t *ip, hammer2_cluster_t *chain);
index cde54cc..aac27bb 100644 (file)
@@ -76,7 +76,7 @@ hammer2_bulk_scan(hammer2_trans_t *trans, hammer2_chain_t *parent,
        save->parent = parent;
        TAILQ_INSERT_TAIL(&list, save, entry);
 
-       while ((save = TAILQ_FIRST(&list)) != NULL) {
+       while ((save = TAILQ_FIRST(&list)) != NULL && doabort == 0) {
                hammer2_chain_t *chain;
                int cache_index;
 
@@ -147,6 +147,16 @@ hammer2_bulk_scan(hammer2_trans_t *trans, hammer2_chain_t *parent,
                if (save)
                        kfree(save, M_HAMMER2);
        }
+
+       /*
+        * Cleanup anything left undone due to an abort
+        */
+       while ((save = TAILQ_FIRST(&list)) != NULL) {
+               TAILQ_REMOVE(&list, save, entry);
+               hammer2_chain_drop(save->parent);
+               kfree(save, M_HAMMER2);
+       }
+
        return doabort;
 }
 
@@ -191,6 +201,7 @@ typedef struct hammer2_bulkfree_info {
        long                    count_l0cleans;
        long                    count_linadjusts;
        hammer2_off_t           adj_free;
+       time_t                  save_time;
 } hammer2_bulkfree_info_t;
 
 static int h2_bulkfree_callback(hammer2_chain_t *chain, void *info);
@@ -203,6 +214,7 @@ hammer2_bulkfree_pass(hammer2_mount_t *hmp, hammer2_ioc_bulkfree_t *bfi)
 {
        hammer2_trans_t trans;
        hammer2_bulkfree_info_t cbinfo;
+       hammer2_off_t incr;
        size_t size;
        int doabort = 0;
 
@@ -214,33 +226,75 @@ hammer2_bulkfree_pass(hammer2_mount_t *hmp, hammer2_ioc_bulkfree_t *bfi)
        cbinfo.trans = &trans;
        cbinfo.hmp = hmp;
        cbinfo.bmap = kmem_alloc_swapbacked(&cbinfo.kp, size);
-       cbinfo.sbase = 0;
-       cbinfo.sstop = cbinfo.sbase +
-                      size / HAMMER2_FREEMAP_LEVELN_PSIZE *    /* per 64KB */
-                      HAMMER2_FREEMAP_LEVEL1_SIZE;             /* 2GB */
-       /* XXX also limit to volume size */
-
-       hammer2_trans_init(&trans, hmp->spmp, 0);
-       doabort |= hammer2_bulk_scan(&trans, &hmp->vchain,
-                                       h2_bulkfree_callback, &cbinfo);
-       h2_bulkfree_sync(&cbinfo);
 
        /*
-        * Adjust bytes free in the volume header before ending the
-        * transaction.
+        * Normalize start point to a 2GB boundary.  We operate on a
+        * 64KB leaf bitmap boundary which represents 2GB of storage.
         */
-       hammer2_voldata_lock(hmp);
-       hammer2_voldata_modify(hmp);
-       hmp->voldata.allocator_free += cbinfo.adj_free;
-       hammer2_voldata_unlock(hmp);
+       cbinfo.sbase = bfi->sbase;
+       if (cbinfo.sbase > hmp->voldata.volu_size)
+               cbinfo.sbase = hmp->voldata.volu_size;
+       cbinfo.sbase &= ~HAMMER2_FREEMAP_LEVEL1_MASK;
 
        /*
-        * Cleanup
+        * Loop on a full meta-data scan as many times as required to
+        * get through all available storage.
         */
-       hammer2_trans_done(&trans);
+       while (cbinfo.sbase < hmp->voldata.volu_size) {
+               /*
+                * We have enough ram to represent (incr) bytes of storage.
+                * Each 64KB of ram represents 2GB of storage.
+                */
+               bzero(cbinfo.bmap, size);
+               incr = size / HAMMER2_FREEMAP_LEVELN_PSIZE *
+                      HAMMER2_FREEMAP_LEVEL1_SIZE;
+               if (hmp->voldata.volu_size - cbinfo.sbase < incr)
+                       cbinfo.sstop = hmp->voldata.volu_size;
+               else
+                       cbinfo.sstop = cbinfo.sbase + incr;
+               kprintf("bulkfree pass %016jx/%jdGB\n",
+                       (intmax_t)cbinfo.sbase,
+                       (intmax_t)incr / HAMMER2_FREEMAP_LEVEL1_SIZE);
+
+               hammer2_trans_init(&trans, hmp->spmp, 0);
+               doabort |= hammer2_bulk_scan(&trans, &hmp->vchain,
+                                           h2_bulkfree_callback, &cbinfo);
+
+               /*
+                * If complete scan succeeded we can synchronize our
+                * in-memory freemap against live storage.  If an abort
+                * did occur we cannot safely synchronize our partially
+                * filled-out in-memory freemap.
+                */
+               if (doabort == 0) {
+                       h2_bulkfree_sync(&cbinfo);
+
+                       hammer2_voldata_lock(hmp);
+                       hammer2_voldata_modify(hmp);
+                       hmp->voldata.allocator_free += cbinfo.adj_free;
+                       hammer2_voldata_unlock(hmp);
+               }
+
+               /*
+                * Cleanup for next loop.
+                */
+               hammer2_trans_done(&trans);
+               if (doabort)
+                       break;
+               cbinfo.sbase = cbinfo.sstop;
+       }
        kmem_free_swapbacked(&cbinfo.kp);
 
-       kprintf("bulkfree pass statistics:\n");
+       bfi->sstop = cbinfo.sbase;
+
+       incr = bfi->sstop / (hmp->voldata.volu_size / 10000);
+       if (incr > 10000)
+               incr = 10000;
+
+       kprintf("bulkfree pass statistics (%d.%02d%% storage processed):\n",
+               (int)incr / 100,
+               (int)incr % 100);
+
        kprintf("    transition->free   %ld\n", cbinfo.count_10_00);
        kprintf("    transition->staged %ld\n", cbinfo.count_11_10);
        kprintf("    raced on           %ld\n", cbinfo.count_10_11);
@@ -261,7 +315,11 @@ h2_bulkfree_callback(hammer2_chain_t *chain, void *info)
        int radix;
        int error;
 
-       error = 0;
+       /*
+        * Check for signal and allow yield to userland during scan
+        */
+       if (hammer2_signal_check(&cbinfo->save_time))
+               return HAMMER2_BULK_ABORT;
 
 #if 0
        kprintf("scan chain %016jx %016jx/%-2d type=%02x\n",
@@ -275,6 +333,7 @@ h2_bulkfree_callback(hammer2_chain_t *chain, void *info)
         * Calculate the data offset and determine if it is within
         * the current freemap range being gathered.
         */
+       error = 0;
        data_off = chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX;
        if (data_off < cbinfo->sbase || data_off > cbinfo->sstop)
                return 0;
index 99d2105..ea1cb24 100644 (file)
@@ -249,12 +249,14 @@ hammer2_io_complete(hammer2_iocb_t *iocb)
         * Now finish up the dio.  If another iocb is pending chain to it
         * leaving DIO_INPROG set.  Otherwise clear DIO_INPROG
         * (and DIO_WAITING).
+        *
+        * NOTE: The TAILQ is not stable until the spin-lock is held.
         */
        for (;;) {
                orefs = dio->refs;
                nrefs = orefs & ~(HAMMER2_DIO_WAITING | HAMMER2_DIO_INPROG);
 
-               if ((orefs & HAMMER2_DIO_WAITING) && TAILQ_FIRST(&dio->iocbq)) {
+               if (orefs & HAMMER2_DIO_WAITING) {
                        spin_lock(&dio->spin);
                        iocb = TAILQ_FIRST(&dio->iocbq);
                        if (iocb) {
@@ -262,6 +264,10 @@ hammer2_io_complete(hammer2_iocb_t *iocb)
                                spin_unlock(&dio->spin);
                                iocb->callback(iocb);   /* chained */
                                break;
+                       } else if (atomic_cmpset_int(&dio->refs,
+                                                    orefs, nrefs)) {
+                               spin_unlock(&dio->spin);
+                               break;
                        }
                        spin_unlock(&dio->spin);
                        /* retry */
index 664a29e..128583e 100644 (file)
@@ -422,3 +422,17 @@ hammer2_adjreadcounter(hammer2_blockref_t *bref, size_t bytes)
        }
        *counterp += bytes;
 }
+
+int
+hammer2_signal_check(time_t *timep)
+{
+       int error = 0;
+
+       lwkt_user_yield();
+       if (*timep != time_second) {
+               *timep = time_second;
+               if (CURSIG(curthread->td_lwp) != 0)
+                       error = EINTR;
+       }
+       return error;
+}