hammer2 - Refactor reserved block selection in freemap code
authorMatthew Dillon <dillon@apollo.backplane.com>
Mon, 9 Dec 2013 22:47:11 +0000 (14:47 -0800)
committerMatthew Dillon <dillon@apollo.backplane.com>
Mon, 9 Dec 2013 22:51:23 +0000 (14:51 -0800)
* Refactor the reserved block selection in the freemap code.  Move from
  4 copies of each freemap block to 15 copies in order to ensure
  that any of the four volume header backups (which are rotated on each
  flush) can be used at mount-time.

* A better algorithm could use as few as 10 copies but for now I am using
  a more trivial algorithm which needs 15.

* No media changes, the 4MB/2GB of space already reserved had sufficient
  room.

sys/vfs/hammer2/hammer2_disk.h
sys/vfs/hammer2/hammer2_freemap.c

index 7167e17..d87d381 100644 (file)
  * dead areas are reserved for future use and MUST NOT BE USED for other
  * purposes.
  *
- * The freemap is arranged into four groups.  Modifications rotate through
- * the groups on a block by block basis (so all the blocks are not necessarily
- * synchronized to the same group).  Because the freemap is flushed
- * independent of the main filesystem, the freemap only really needs two
- * groups to operate efficiently.
- *
- *
- *
+ * The freemap is arranged into 15 groups of 4x64KB each.  The 4 sub-groups
+ * are labeled ZONEFM1..4 and representing HAMMER2_FREEMAP_LEVEL{1-4}_RADIX,
+ * for the up to 4 levels of radix tree representing the freemap.  For
+ * simplicity we are reserving all four radix tree layers even though the
+ * higher layers do not require teh reservation at each 2GB mark.  That
+ * space is reserved for future use.
+ *
+ * Freemap blocks are not allocated dynamically but instead rotate through
+ * one of 15 possible copies.  We require 15 copies for several reasons:
+ *
+ * (1) For distinguishing freemap 'allocations' made by the current flush
+ *     verses the concurrently running front-end (at flush_tid + 1).  This
+ *     theoretically requires two copies but the algorithm is greatly
+ *     simplified if we use three.
+ *
+ * (2) There are up to 4 copies of the volume header (iterated on each flush),
+ *     and if the mount code is forced to use an older copy due to corruption
+ *     we must be sure that the state of the freemap AS-OF the earlier copy
+ *     remains valid.
+ *
+ *     This means 3 copies x 4 flushes = 12 copies to be able to mount any
+ *     of the four volume header backups after on boot or after a crash.
+ *
+ * (3) Freemap recovery on-mount eats a copy.  We don't want freemap recovery
+ *     to blow away the copy used by some other volume header in case H2
+ *     crashes during the recovery.  Total is now 13.
+ *
+ * (4) And I want some breathing room to ensure that complex flushes do not
+ *     cause problems.  Also note that bulk block freeing itself must be
+ *     careful so even on a live system, post-mount, the four volume header
+ *     backups effectively represent short-lived snapshots.  And I only
+ *     have room for 15 copies so it works out.
+ *
+ * Preferably I would like to improve the algorithm to only use 2 copies per
+ * volume header (which would be a total of 2 x 4 = 8 + 1 for freemap recovery
+ * + 1 for breathing room = 10 total instead of 15).  For now we use 15.
  */
 #define HAMMER2_ZONE_VOLHDR            0       /* volume header or backup */
-#define HAMMER2_ZONE_FREEMAP_A         1       /* freemap layer group A */
-#define HAMMER2_ZONE_FREEMAP_B         5       /* freemap layer group B */
-#define HAMMER2_ZONE_FREEMAP_C         9       /* freemap layer group C */
-#define HAMMER2_ZONE_FREEMAP_D         13      /* freemap layer group D */
-
+#define HAMMER2_ZONE_FREEMAP_00                1
+#define HAMMER2_ZONE_FREEMAP_01                5
+#define HAMMER2_ZONE_FREEMAP_02                9
+#define HAMMER2_ZONE_FREEMAP_03                13
+#define HAMMER2_ZONE_FREEMAP_04                17
+#define HAMMER2_ZONE_FREEMAP_05                21
+#define HAMMER2_ZONE_FREEMAP_06                25
+#define HAMMER2_ZONE_FREEMAP_07                29
+#define HAMMER2_ZONE_FREEMAP_08                33
+#define HAMMER2_ZONE_FREEMAP_09                37
+#define HAMMER2_ZONE_FREEMAP_10                41
+#define HAMMER2_ZONE_FREEMAP_11                45
+#define HAMMER2_ZONE_FREEMAP_12                49
+#define HAMMER2_ZONE_FREEMAP_13                53
+#define HAMMER2_ZONE_FREEMAP_14                57
+#define HAMMER2_ZONE_FREEMAP_END       61      /* (non-inclusive) */
+#define HAMMER2_ZONE_UNUSED62          62
+#define HAMMER2_ZONE_UNUSED63          63
+
+#define HAMMER2_ZONE_FREEMAP_COPIES    15
                                                /* relative to FREEMAP_x */
 #define HAMMER2_ZONEFM_LEVEL1          0       /* 2GB leafmap */
 #define HAMMER2_ZONEFM_LEVEL2          1       /* 2TB indmap */
index 07de4c6..9a55506 100644 (file)
@@ -92,6 +92,7 @@ hammer2_freemap_reserve(hammer2_trans_t *trans, hammer2_chain_t *chain,
 {
        hammer2_blockref_t *bref = &chain->bref;
        hammer2_off_t off;
+       int index;
        size_t bytes;
 
        /*
@@ -104,66 +105,62 @@ hammer2_freemap_reserve(hammer2_trans_t *trans, hammer2_chain_t *chain,
        bytes = 1 << radix;
 
        /*
-        * Adjust by HAMMER2_ZONE_FREEMAP_{A,B,C,D} using the existing
-        * offset as a basis.  Start in zone A if previously unallocated.
+        * Calculate block selection index 0..7 of current block.
         */
-#if 0
-       kprintf("trans %04jx/%08x freemap chain %p.%d [%08x] %016jx/%d %016jx",
-               trans->sync_tid, trans->flags,
-               chain, chain->bref.type, chain->flags,
-               chain->bref.key, chain->bref.keybits,
-               bref->data_off);
-#endif
        if ((bref->data_off & ~HAMMER2_OFF_MASK_RADIX) == 0) {
-               off = HAMMER2_ZONE_FREEMAP_A;
+               index = 0;
        } else {
                off = bref->data_off & ~HAMMER2_OFF_MASK_RADIX &
                      (((hammer2_off_t)1 << HAMMER2_FREEMAP_LEVEL1_RADIX) - 1);
                off = off / HAMMER2_PBUFSIZE;
-               KKASSERT(off >= HAMMER2_ZONE_FREEMAP_A);
-               KKASSERT(off < HAMMER2_ZONE_FREEMAP_D + 4);
+               KKASSERT(off >= HAMMER2_ZONE_FREEMAP_00 &&
+                        off < HAMMER2_ZONE_FREEMAP_END);
+               index = (int)(off - HAMMER2_ZONE_FREEMAP_00) / 4;
+               KKASSERT(index >= 0 && index < HAMMER2_ZONE_FREEMAP_COPIES);
        }
 
-       if ((trans->flags &
-            (HAMMER2_TRANS_ISFLUSH | HAMMER2_TRANS_ISALLOCATING)) ==
-           HAMMER2_TRANS_ISFLUSH) {
+       /*
+        * Calculate new index (our 'allocation').  We have to be careful
+        * here as there can be two different transaction ids running
+        * concurrently when a flush is in-progress.
+        *
+        * We also want to make sure, for algorithmic repeatability, that
+        * the index sequences are monotonic with transaction ids.  Some
+        * skipping is allowed as long as we ensure that all four volume
+        * header backups have consistent freemaps.
+        *
+        * FLUSH  NORMAL FLUSH  NORMAL FLUSH  NORMAL FLUSH  NORMAL
+        * N+=1   N+=2
+        * (0->1) (1->3) (3->4) (4->6) (6->7) (7->9) (9->10) (10->12)
+        *
+        * [-concurrent-][-concurrent-][-concurrent-][-concurrent-]
+        *
+        * (alternative first NORMAL might be 0->2 if flush had not yet
+        *  modified the chain, this is the worst case).
+        */
+       if ((trans->flags & HAMMER2_TRANS_ISFLUSH) == 0) {
                /*
-                * Delete-Duplicates while flushing the fchain topology
-                * itself.
+                * Normal transactions always run with the highest TID.
+                * But if a flush is in-progress we want to reserve a slot
+                * for the flush with a lower TID running concurrently to
+                * do a delete-duplicate.
                 */
-#if 0
-               kprintf(" flush ");
-#endif
-               if (off >= HAMMER2_ZONE_FREEMAP_D)
-                       off = HAMMER2_ZONE_FREEMAP_B;
-               else if (off >= HAMMER2_ZONE_FREEMAP_C)
-                       off = HAMMER2_ZONE_FREEMAP_A;
-               else if (off >= HAMMER2_ZONE_FREEMAP_B)
-                       off = HAMMER2_ZONE_FREEMAP_D;
-               else
-                       off = HAMMER2_ZONE_FREEMAP_C;
+               index = (index + 2) % HAMMER2_ZONE_FREEMAP_COPIES;
+       } else if (trans->flags & HAMMER2_TRANS_ISALLOCATING) {
+               /*
+                * Flush transaction, hammer2_freemap.c itself is doing a
+                * delete-duplicate during an allocation within the freemap.
+                */
+               index = (index + 1) % HAMMER2_ZONE_FREEMAP_COPIES;
        } else {
                /*
-                * Allocations from the freemap via a normal transaction
-                * or a flush whos sync_tid has been bumped (so effectively
-                * done as a normal transaction).
+                * Flush transaction, hammer2_flush.c is doing a
+                * delete-duplicate on the freemap while flushing
+                * hmp->fchain.
                 */
-#if 0
-               kprintf(" alloc ");
-#endif
-               if (off >= HAMMER2_ZONE_FREEMAP_D)
-                       off = HAMMER2_ZONE_FREEMAP_A;
-               else if (off >= HAMMER2_ZONE_FREEMAP_C)
-                       off = HAMMER2_ZONE_FREEMAP_D;
-               else if (off >= HAMMER2_ZONE_FREEMAP_B)
-                       off = HAMMER2_ZONE_FREEMAP_C;
-               else
-                       off = HAMMER2_ZONE_FREEMAP_B;
+               index = (index + 1) % HAMMER2_ZONE_FREEMAP_COPIES;
        }
 
-
-       off = off * HAMMER2_PBUFSIZE;
-
        /*
         * Calculate the block offset of the reserved block.  This will
         * point into the 4MB reserved area at the base of the appropriate
@@ -174,30 +171,31 @@ hammer2_freemap_reserve(hammer2_trans_t *trans, hammer2_chain_t *chain,
        case HAMMER2_FREEMAP_LEVEL4_RADIX:      /* 2EB */
                KKASSERT(bref->type == HAMMER2_BREF_TYPE_FREEMAP_NODE);
                KKASSERT(bytes == HAMMER2_FREEMAP_LEVELN_PSIZE);
-               off += H2FMBASE(bref->key, HAMMER2_FREEMAP_LEVEL4_RADIX) +
-                      HAMMER2_ZONEFM_LEVEL4 * HAMMER2_PBUFSIZE;
+               off = H2FMBASE(bref->key, HAMMER2_FREEMAP_LEVEL4_RADIX) +
+                     (index * 4 + HAMMER2_ZONEFM_LEVEL4) * HAMMER2_PBUFSIZE;
                break;
        case HAMMER2_FREEMAP_LEVEL3_RADIX:      /* 2PB */
                KKASSERT(bref->type == HAMMER2_BREF_TYPE_FREEMAP_NODE);
                KKASSERT(bytes == HAMMER2_FREEMAP_LEVELN_PSIZE);
-               off += H2FMBASE(bref->key, HAMMER2_FREEMAP_LEVEL3_RADIX) +
-                      HAMMER2_ZONEFM_LEVEL3 * HAMMER2_PBUFSIZE;
+               off = H2FMBASE(bref->key, HAMMER2_FREEMAP_LEVEL3_RADIX) +
+                     (index * 4 + HAMMER2_ZONEFM_LEVEL3) * HAMMER2_PBUFSIZE;
                break;
        case HAMMER2_FREEMAP_LEVEL2_RADIX:      /* 2TB */
                KKASSERT(bref->type == HAMMER2_BREF_TYPE_FREEMAP_NODE);
                KKASSERT(bytes == HAMMER2_FREEMAP_LEVELN_PSIZE);
-               off += H2FMBASE(bref->key, HAMMER2_FREEMAP_LEVEL2_RADIX) +
-                      HAMMER2_ZONEFM_LEVEL2 * HAMMER2_PBUFSIZE;
+               off = H2FMBASE(bref->key, HAMMER2_FREEMAP_LEVEL2_RADIX) +
+                     (index * 4 + HAMMER2_ZONEFM_LEVEL2) * HAMMER2_PBUFSIZE;
                break;
        case HAMMER2_FREEMAP_LEVEL1_RADIX:      /* 2GB */
                KKASSERT(bref->type == HAMMER2_BREF_TYPE_FREEMAP_LEAF);
                KKASSERT(bytes == HAMMER2_FREEMAP_LEVELN_PSIZE);
-               off += H2FMBASE(bref->key, HAMMER2_FREEMAP_LEVEL1_RADIX) +
-                      HAMMER2_ZONEFM_LEVEL1 * HAMMER2_PBUFSIZE;
+               off = H2FMBASE(bref->key, HAMMER2_FREEMAP_LEVEL1_RADIX) +
+                     (index * 4 + HAMMER2_ZONEFM_LEVEL1) * HAMMER2_PBUFSIZE;
                break;
        default:
                panic("freemap: bad radix(2) %p %d\n", bref, bref->keybits);
                /* NOT REACHED */
+               off = (hammer2_off_t)-1;
                break;
        }
        bref->data_off = off | radix;