hammer2 - freemap part 4, misc fixes
authorMatthew Dillon <dillon@apollo.backplane.com>
Thu, 13 Jun 2013 07:11:04 +0000 (00:11 -0700)
committerMatthew Dillon <dillon@apollo.backplane.com>
Thu, 13 Jun 2013 07:11:04 +0000 (00:11 -0700)
* Revamp the freemap a bit.  Remove Layer 0.  Layer 1 is now the LEAF.
  2GB of media storage is now represented by a single 64KB Layer 1 block.
  Synchronize the FREEMAP document with the current thinking.

  The Layer 1 block contains 1024x64 entries.  Each entry represents 2MBytes
  of media storage.  These entries are no longer blockrefs pointing to
  Layer 0 but are instead terminal structures.

* Each entry represents a 16KB allocation granularity in 2 bits and has
  128 bit pairs (256 bits total), plus additional information to represent
  the 2MBytes of storage.

  Fine-grained allocations are supported via an iterator field, currently
  allowing fine-grained allocations down to 1KB and potentially expandable
  in the future to even smaller allocation sizes.

* Fix a SMP race in voldata handling during flush.  The freemap portion of
  voldata could be updated during crc calculations due to hmp->fchain not
  being held locked, causing random volume header/backups to fail their CRC
  test on remount.

* Add missing BUF_KERNPROC() when chain->bp is replaced.  Fixes a kernel
  lock ownership assertion.

* Add freezone/radix fields to the inode_data structure.  Each inode can
  accomodate four fields.  The fields are not yet utilized.  Current thinking
  is to use them to optimize the bulk free-scan for freeing blocks.

sbin/hammer2/cmd_debug.c
sbin/hammer2/cmd_stat.c
sys/vfs/hammer2/FREEMAP
sys/vfs/hammer2/hammer2_chain.c
sys/vfs/hammer2/hammer2_disk.h
sys/vfs/hammer2/hammer2_flush.c
sys/vfs/hammer2/hammer2_freemap.c
sys/vfs/hammer2/hammer2_subr.c

index 18b9f1f..2a6b5c8 100644 (file)
@@ -534,7 +534,7 @@ show_bref(int fd, int tab, int bi, hammer2_blockref_t *bref, int dofreemap)
                }
                break;
        case HAMMER2_BREF_TYPE_INDIRECT:
-               bscan = &media.npdata.blockref[0];
+               bscan = &media.npdata[0];
                bcount = bytes / sizeof(hammer2_blockref_t);
                didnl = 1;
                printf("{\n");
@@ -558,24 +558,30 @@ show_bref(int fd, int tab, int bi, hammer2_blockref_t *bref, int dofreemap)
                printf("{\n");
                break;
        case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
-               printf("radix=%d {\n",
-                       bref->check.freemap.radix);
-               obrace = 1;
-               for (i = 0; i < (int)(bytes / sizeof(uint64_t)); ++i) {
-                       if ((i & 3) == 0)
-                               tabprintf(tab, "");
-                       else
-                               printf(" ");
-                       printf("%016jx", (intmax_t)media.bmdata.array[i]);
-                       if ((i & 3) == 3)
-                               printf("\n");
+               printf("{\n");
+               for (i = 0; i < HAMMER2_FREEMAP_COUNT; ++i) {
+                       if (media.bmdata[i].radix == 0 &&
+                           media.bmdata[i].avail == 0) {
+                               continue;
+                       }
+                       tabprintf(tab + 4, "%04d.%02d (avail=%5d) "
+                                 "%08x %08x %08x %08x %08x %08x %08x %08x\n",
+                                 i, media.bmdata[i].radix,
+                                 media.bmdata[i].avail,
+                                 media.bmdata[i].bitmap[0],
+                                 media.bmdata[i].bitmap[1],
+                                 media.bmdata[i].bitmap[2],
+                                 media.bmdata[i].bitmap[3],
+                                 media.bmdata[i].bitmap[4],
+                                 media.bmdata[i].bitmap[5],
+                                 media.bmdata[i].bitmap[6],
+                                 media.bmdata[i].bitmap[7]);
                }
-               if (((i - 1) & 3) != 3)
-                       printf("\n");
+               tabprintf(tab, "}\n");
                break;
        case HAMMER2_BREF_TYPE_FREEMAP_NODE:
                printf("{\n");
-               bscan = &media.npdata.blockref[0];
+               bscan = &media.npdata[0];
                bcount = bytes / sizeof(hammer2_blockref_t);
                break;
        default:
index 53e03ce..1a50ed9 100644 (file)
@@ -58,7 +58,7 @@ cmd_stat(int ac, const char **av)
        }
        if (w < 16)
                w = 16;
-       printf("%-*.*s ncp data-use  inode-use kaddr\n", w, w, "PATH");
+       printf("%-*.*s ncp  data-use inode-use kaddr\n", w, w, "PATH");
        for (i = 0; i < ac; ++i) {
                if ((fd = open(av[i], O_RDONLY)) < 0) {
                        fprintf(stderr, "%s: %s\n", av[i], strerror(errno));
index a558815..9fc9854 100644 (file)
 
-                     HAMMER2 Freemap Design Notes
+                       HAMMER2 Freemap Design Notes
 
                                Overview
 
     HAMMER2 Media is broken down into 2 GByte zones.  Each 2 GByte zone
-    contains a 4 MByte header (64 x 64K blocks).
+    contains a 4 MByte header (64 x 64K blocks).  The blocks in this header
+    are reserved for various purposes.  For example, block #0 is used for
+    the volume header or for a volume header backup.
 
-    Block #0 is the volume header.  Block #0 is the next three zones,
-    assuming the media is big enough, contain backup volume headers.
-    Flushes cycle through these four headers and the mount code iterates
-    all four to locate the best candidate to mount with.  The reason for
-    this is to ensure that mounting works even if a crash catches a
-    volume header in a partial update.
-
-    Remaining blocks are used for various purposes, primarily by the
-    freemap.
-
-    * It is very important to remember that the Freemap only uses
-      blocks from these reserved areas.  Freemap blocks are NOT dynamically
+    * It is very important to remember that the Freemap only uses blocks
+      from these reserved areas.  Freemap blocks are NOT dynamically
       allocated.
 
     * On-mount, the synchronization TID for the main H2 filesystem is
       compared against the synchronization TID of the freemap and the
       H2 topology is incrementally iterated using mirror_tid to update
       the freemap with any missing information.  This way the freemap flush
-      does not need to be synchronized with the normal H2 flush.
+      does not need to be synchronized with the normal H2 flush.  This
+      can be done very quickly on-mount.
 
     * The freemap is flushed in a manner similar to the normal H2 filesystem,
       but as mentioned above it does not have to be synchronized.  One freemap
       flush could cover several H2 flushes.  A freemap flush is not necessary
       for e.g. a fsync() or sync() to complete successfully.
 
-    * The minimum allocation radix is 10 (1 Kilobyte).  In Hammer2, the
-      inode is 1KB and contains up to 512 bytes of direct data, so in terms
-      of file storage efficiency H2 is can pack small files and their inodes
-      into a single 1KB block.
+    * The freemap granularity is 64KB (radix of 16) but the minimum
+      allocation radix for code is 1KB (radix of 10).  1KB inodes can
+      hold up to 512 bytes of direct data, so small files eat exactly
+      1KB of media storage.
+
+    * Representation of storage is block-oriented with ~1KB granularity
+      in the filesystem topology.  However, H2 also stores freemap locality
+      hints in the inode at all levels which specifies which freemap zones
+      all storage allocations made by the sub-tree are allocated from.  Up
+      to four zones may be listed in each inode.  The zones are power-of-sized
+      and aligned the same and use a base/radix representation (same as used
+      for blockref->data_off).
+
+      During updates higher level inodes may not have a sufficient number of
+      entries to represent the storage used on a fine-grain.  In this
+      situation the representations back-off to larger radix values.
+
+      Ultimately these representations will be optimized by background scans.
+      That is, ultimately storage localization can be optimized bottom-up
+      such that it winds up being fairly optimal.  This includes the ability
+      to detect when a writable snapshot has differentiated sufficiently to
+      warrant a split.  This optimization should NOT attempt to dup common
+      data blocks.
+
+    * The zone oriented forward storage references in the inode (the four
+      entries) is used by the bulk free-scan to reduce the amount of
+      meta-data which must be duplicatively scanned.  More specifically,
+      when the sysadmin deletes storage and/or files or even whole directory
+      subhierachies, it is possible for a bulk free-scan to incrementally
+      scan the meta-data topology that covers ONLY those areas to determine
+      if it is possible to free up any actual blocks.
+
+    * H2 does not require that a rm -rf or snapshot destruction, truncation,
+      or any other operation actually modify the freemap.  That is, the
+      freemap elements can remain set to ALLOCATED (11).  In fact, it is
+      possible to just delete the directory inode and not even recursively
+      scan sub-directories.  The related storage will eventually be freed
+      by an exhaustive bulk free-scan anyway.
+
+                               Freemap Topology
+
+    The freemap topology contains 4 levels of meta-data (blockref arrays),
+    one of which is embedded in the volume header (so only three real
+    meta-data levels), plus one level of leaf-data.
+
+    Level 1 - (radix 10) 64KB blockmap representing 2GB.  There are 1024
+             entries representing ~2MB worth of media storage per entry.
+
+             Each entry maps 32 x 64KB allocations @ 2 bits per allocation,
+             plus contains additional meta-data which allows H2 to cluster
+             I/O operations.  Each entry locks the allocation granularity
+             (e.g. to 1KB = radix 10 for inodes).
+
+    Level 2 - (radix 10) 64KB blockmap representing 2TB (~2GB per entry)
+    Level 3 - (radix 10) 64KB blockmap representing 2PB (~2TB per entry)
+    Level 4 - (radix 10) 64KB blockmap representing 2EB (~2PB per entry)
+    Level 5 - (radix 3) blockref x 8 in volume header representing 16EB (2^64)
+             (this conveniently eats one 512-byte 'sector' of the 64KB
+             volume header).
 
-      The freemap thus must handle a 1KB granularity, which comes to around
-      256KB per 2GB zone at 1-bit-per-block.  Since we have ~4MB available,
-      there is plenty of space to implement redundancy.
+    Each level is assign reserved blocks in the 4MB header per 2GB zone.
+    Since we use block 0 for the volume header / volume header backup,
+    our level names above can simply also represent the relative block
+    number.  Level 1 uses block 1 through level 4 using block 4.  Level 5
+    is stored in the volume header.
 
-                           Freemap Topology
+    In addition there are FOUR SETS, A, B, C, and D, each containing blocks
+    for level 1-4.  Hammer2 alternates between sets on a block-by-block basis
+    in order to maintain consistency when updating the freemap.
 
-    The freemap topology contains 5 levels of meta-data (blockref arrays).
+                               Leaf Substructure
 
-    Level 0 - (radix 10+19+2) 256KB bitmap representing 2GB
+    * radix  - Clustering radix.  All allocations for any given ~2MB zone
+              are always the same size, allowing the filesystem code to
+              cluster buffer cache I/O.
 
-    Level 1 - (radix 10) 64KB blockmap representing 2GB.  This level
-             shadows level 0 exactly.   There are 1024 blockref entries
-             each representing ~2MB worth of media storage.
+    * bitmap - four 32 bit words representing ~2MB in 64KB allocation chunks
+              at 2 bits per chunk.  The filesystem allocation granularity
+              can be smaller (currently ~1KB minimum), and the live
+              filesystem keeps caches iterations when allocating multiple
+              chunks.  However, on remount any partial allocations out of
+              a 64KB allocation block causes the entire 64KB to be
+              considered allocated.  Fragmented space can potentially be
+              reclaimed and/or relocated by the bulk block free scan.
 
-    Level 2 - (radix 10) 64KB blockmap representing 2TB.
-    Level 3 - (radix 10) 64KB blockmap representing 2PB.
-    Level 4 - (radix 10) 64KB blockmap representing 2EB.
-    Level 5 - (radix 3) blockref x 8 in volume header representing 16EB (2^64)
-             (this conveniently eats one 512-byte 'sector' of the 64KB
-             volume header).
+              The 2-bit bitmap fields are assigned as follows:
 
-    Each level is assign reserved blocks in the 4MB header per 2GB zone.
-    Level 0 requires four blocks (#1-#4), level 1, 2, 3, and 4 each require
-    one block (#5, #6, #7, #8), while level 5 is embedded in the volume
-    header.
+              00       FREE
+              01       ARMED for free stage (future use)
+              10       ARMED for free stage (future use)
+              11       ALLOCATED
 
-    In addition, the reserved blocks 1-8 are not overwritten on each flush.
-    Instead, a different set of reserved blocks is used.  Four sets, A-D,
-    are specified.  A=1-8, B=9-16, C=17-24, D=25-32.  Blocks 33-63 are unused
-    at present and reserved for future use.
+              It should be noted that in some cases, such as snapshot
+              destruction, H2 does not bother to actually ARM the related
+              blocks (which would take a long time).  Instead, the bulk
+              free-scan may have to do a more exhaustive scan.
 
-                           Blockref Substructure
+                             Blockref Substructure
 
     The blockref substructure at each level steals some space from the
     check code area (a 24-byte area).  We only need 4 bytes for the check
     code icrc.  We use some of the remaining space to store information
     that allows the block allocator to do its work more efficiently.
 
-    * biggest - Biggest available linear allocation radix (powers of 2).
-               May be initialized larger but the 2GB zone has a 4MB chunk
-               taken out of it for a header so the maximum linear allocation
-               is going to be 1GB (and not an efficient 1GB at that), which
-               would be radix 30.
+    * bigmask - A mask of radixes available for allocation under this
+               blockref.  Typically initialized to -1.
 
     * avail   - Total available space in bytes.
 
     of work but I hope I've laid down a media format that will not have to be
     changed down the line to accomodate better allocation strategies.
 
-                           Initial Conditions
+                               Initial Conditions
 
     The freemap is a multi-indirect block structure but there is no real
     reason to pre-format it in newfs_hammer2.  Instead, newfs_hammer2 simply
-    leaves the associated blockset empty and uses the (voldata->allocator_beg)
-    field to allocate space linearly, then leaves it to the live filesystem
-    to initialize the freemap as more space gets allocated.
+    leaves the associated top-level indirect blocks empty and uses the
+    (voldata->allocator_beg) field to allocate space linearly, then leaves
+    it to the live filesystem to initialize the freemap as more space gets
+    allocated.
+
+                               How blocks are freed
+
+    The freemap bit patterns for each 64KB block are as follows:
+
+       00      FREE
+       01      ARMED (for free) (future use)
+       10      ARMED (for free) (future use)
+       11      ALLOCATED
+
+    Currently H2 only implements 00 and 11.  When a file, topology, or
+    snapshot is deleted H2 simply leaves the blocks marked allocated but
+    records the related freezone/radix(s) in memory.
+
+    At some point a background bulk free-scan will run.  This code must
+    scan meta-data and has a limited cache to detect duplicative sub-trees
+    (due to snapshots).  It uses the freezone/radix information recorded
+    in memory to reduce the complexity of the scan, find all references to
+    the related blocks in the meta-data, and determines what can actually
+    be freed.  Once this determination is made the bulk free-scan sets
+    the related freemap bits to FREE (00).
 
-    To keep the abstraction simple, this means in the bitmap 0=unallocated,
-    1=allocated.  The allocation blockmap is initialized for the zone's 4MB
-    reserve area as new zones are opened up for allocation.  Initialization
-    of the freemap for the root zone at offset 0 is further adjusted based
-    on (voldata->allocator_beg).  This field is not used once the freemap
-    for the root zone has been setup by the live filesystem.
+    An exhaustive free-scan is not usually required during normal operation
+    but is typically run incrementally by cron every so often to ensure, over
+    time, that all freeable blocks are actually freed.  This is most useful
+    when maintaining multiple snapshots.
 
                        Use of Generic indirect-block API
 
     and indirect blocks via a reserved-block calculation and (obviously)
     not via a recursive call to the allocator.
 
-    The Freemap is defined above as a fixed 6-level scheme (level 0-5),
+    The Freemap is defined above as a fixed 5-level scheme (level 1-5),
     but in actual operation the radix tree can be shortcut just as it
     is with normal files.  However, shorcuts are forced into the radix
-    values of this specification and reserved blocks are calculated based on
-    the radix level and offset, so as the freemap becomes more fleshed
+    values of this specification and reserved blocks are calculated based
+    on the radix level and offset, so as the freemap becomes more fleshed
     out the tree looks more and more like the specification.
 
     One advantage of doing things this way is that smaller filesystems
     to layer 2.  And so forth.
 
     At the moment we have no plans to return any of the unused 4MB zone
-    header space back to the filesystem for general use.  There are lots
-    of things we may want to use the reserved areas for in the future.
+    header space (per 2GB of storage) back to the filesystem for general use.
+    There are lots of things we may want to use the reserved areas for in
+    the future.
index 3686388..eaf55b5 100644 (file)
@@ -1258,6 +1258,7 @@ skipxx: /* XXX */
                                }
                        }
                        chain->bp = nbp;
+                       BUF_KERNPROC(chain->bp);
                }
                chain->data = bdata;
                atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
@@ -1441,7 +1442,7 @@ retry:
                KKASSERT(parent->data != NULL);
                KKASSERT(index >= 0 &&
                         index < parent->bytes / sizeof(hammer2_blockref_t));
-               bref = &parent->data->npdata.blockref[index];
+               bref = &parent->data->npdata[index];
                break;
        case HAMMER2_BREF_TYPE_VOLUME:
                KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
@@ -1676,7 +1677,7 @@ again:
                } else {
                        if (parent->data == NULL)
                                panic("parent->data is NULL");
-                       base = &parent->data->npdata.blockref[0];
+                       base = &parent->data->npdata[0];
                }
                count = parent->bytes / sizeof(hammer2_blockref_t);
                break;
@@ -1884,7 +1885,7 @@ again2:
                        base = NULL;
                } else {
                        KKASSERT(parent->data != NULL);
-                       base = &parent->data->npdata.blockref[0];
+                       base = &parent->data->npdata[0];
                }
                count = parent->bytes / sizeof(hammer2_blockref_t);
                break;
@@ -2148,7 +2149,7 @@ again:
                        base = NULL;
                } else {
                        KKASSERT(parent->data != NULL);
-                       base = &parent->data->npdata.blockref[0];
+                       base = &parent->data->npdata[0];
                }
                count = parent->bytes / sizeof(hammer2_blockref_t);
                break;
@@ -2377,7 +2378,7 @@ hammer2_chain_duplicate(hammer2_trans_t *trans, hammer2_chain_t *parent, int i,
                                base = NULL;
                        } else {
                                KKASSERT(parent->data != NULL);
-                               base = &parent->data->npdata.blockref[0];
+                               base = &parent->data->npdata[0];
                        }
                        count = parent->bytes / sizeof(hammer2_blockref_t);
                        break;
@@ -2640,7 +2641,9 @@ hammer2_chain_dup_fixup(hammer2_chain_t *ochain, hammer2_chain_t *nchain)
                atomic_set_int(&nchain->flags, HAMMER2_CHAIN_EMBEDDED);
                nchain->data = kmalloc(sizeof(nchain->data->bmdata),
                                       ochain->hmp->mchain, M_WAITOK | M_ZERO);
-               nchain->data->bmdata = ochain->data->bmdata;
+               bcopy(ochain->data->bmdata,
+                     nchain->data->bmdata,
+                     sizeof(nchain->data->bmdata));
                break;
        default:
                break;
@@ -2869,7 +2872,7 @@ hammer2_chain_create_indirect(hammer2_trans_t *trans, hammer2_chain_t *parent,
                        break;
                case HAMMER2_BREF_TYPE_INDIRECT:
                case HAMMER2_BREF_TYPE_FREEMAP_NODE:
-                       base = &parent->data->npdata.blockref[0];
+                       base = &parent->data->npdata[0];
                        count = parent->bytes / sizeof(hammer2_blockref_t);
                        break;
                case HAMMER2_BREF_TYPE_VOLUME:
index cbe2ae8..9e3901c 100644 (file)
  */
 #define HAMMER2_ZONE_VOLHDR            0       /* volume header or backup */
 #define HAMMER2_ZONE_FREEMAP_A         1       /* freemap layer group A */
-#define HAMMER2_ZONE_FREEMAP_B         9       /* freemap layer group B */
-#define HAMMER2_ZONE_FREEMAP_C         17      /* freemap layer group C */
-#define HAMMER2_ZONE_FREEMAP_D         25      /* freemap layer group D */
+#define HAMMER2_ZONE_FREEMAP_B         5       /* freemap layer group B */
+#define HAMMER2_ZONE_FREEMAP_C               /* freemap layer group C */
+#define HAMMER2_ZONE_FREEMAP_D         13      /* freemap layer group D */
 
                                                /* relative to FREEMAP_x */
-#define HAMMER2_ZONEFM_LEVEL0          0       /* 256KB bitmap (4 blks) */
-#define HAMMER2_ZONEFM_LEVEL1          4       /* 2GB indmap */
-#define HAMMER2_ZONEFM_LEVEL2          5       /* 2TB indmap */
-#define HAMMER2_ZONEFM_LEVEL3          6       /* 2PB indmap */
-#define HAMMER2_ZONEFM_LEVEL4          7       /* 2EB indmap */
+#define HAMMER2_ZONEFM_LEVEL1          0       /* 2GB leafmap */
+#define HAMMER2_ZONEFM_LEVEL2          1       /* 2TB indmap */
+#define HAMMER2_ZONEFM_LEVEL3          2       /* 2PB indmap */
+#define HAMMER2_ZONEFM_LEVEL4          3       /* 2EB indmap */
 /* LEVEL5 is a set of 8 blockrefs in the volume header 16EB */
 
-#define HAMMER2_ZONE_BLOCK49           49      /* future */
-#define HAMMER2_ZONE_BLOCK50           50      /* future */
-#define HAMMER2_ZONE_BLOCK51           51      /* future */
-#define HAMMER2_ZONE_BLOCK52           52      /* future */
-#define HAMMER2_ZONE_BLOCK53           53      /* future */
-#define HAMMER2_ZONE_BLOCK54           54      /* future */
-#define HAMMER2_ZONE_BLOCK55           55      /* future */
-#define HAMMER2_ZONE_BLOCK56           56      /* future */
-#define HAMMER2_ZONE_BLOCK57           57      /* future */
-#define HAMMER2_ZONE_BLOCK58           58      /* future */
-#define HAMMER2_ZONE_BLOCK59           59      /* future */
-
-#define HAMMER2_ZONE_BLOCK60           60      /* future */
-#define HAMMER2_ZONE_BLOCK61           61      /* future */
-#define HAMMER2_ZONE_BLOCK62           62      /* future */
-#define HAMMER2_ZONE_BLOCK63           63      /* future */
 
 /*
  * Freemap radii.  Please note that LEVEL 1 blockref array entries
 #define HAMMER2_FREEMAP_LEVEL4_RADIX   61      /* 2EB */
 #define HAMMER2_FREEMAP_LEVEL3_RADIX   51      /* 2PB */
 #define HAMMER2_FREEMAP_LEVEL2_RADIX   41      /* 2TB */
-#define HAMMER2_FREEMAP_LEVEL1_RADIX   31      /* 2GB (256KB of bitmap) */
-#define HAMMER2_FREEMAP_LEVEL0_RADIX   21      /* 2MB (256 bytes of bitmap) */
+#define HAMMER2_FREEMAP_LEVEL1_RADIX   31      /* 2GB */
+#define HAMMER2_FREEMAP_LEVEL0_RADIX   21      /* 2MB (entry in l-1 leaf) */
 
 #define HAMMER2_FREEMAP_LEVELN_PSIZE   65536   /* physical bytes */
-#define HAMMER2_FREEMAP_LEVEL0_PSIZE   256     /* physical bytes */
 
+#define HAMMER2_FREEMAP_COUNT          (int)(HAMMER2_FREEMAP_LEVELN_PSIZE / \
+                                        sizeof(hammer2_bmap_data_t))
+#define HAMMER2_FREEMAP_BLOCK_RADIX    14
+#define HAMMER2_FREEMAP_BLOCK_SIZE     (1 << HAMMER2_FREEMAP_BLOCK_RADIX)
+#define HAMMER2_FREEMAP_BLOCK_MASK     (HAMMER2_FREEMAP_BLOCK_SIZE - 1)
 
 /*
  * Two linear areas can be reserved after the initial 2MB segment in the base
@@ -399,26 +386,16 @@ struct hammer2_blockref {         /* MUST BE EXACTLY 64 BYTES */
                /*
                 * Freemap hints are embedded in addition to the icrc32.
                 *
-                * biggest - Largest possible allocation 2^N within sub-tree.
-                *           typically initialized to 64 in freemap_blockref
-                *           and reduced as-needed when a request fails.
+                * bigmask - Radixes available for allocation (0-31).
+                *           Heuristical (may be permissive but not
+                *           restrictive).  Typically only radix values
+                *           10-16 are used (i.e. (1<<10) through (1<<16)).
                 *
-                *           An allocation > 2^N is guaranteed to fail.  An
-                *           allocation <= 2^N MAY fail, and if it does the
-                *           biggest hint will be adjusted downward.
-                *
-                *           Used when allocating space.
-                *
-                * radix   - (Leaf only) once assigned, radix for clustering.
-                *           All device I/O can cluster within the 2MB
-                *           segment.
+                * avail   - Total available space remaining, in bytes
                 */
                struct {
                        uint32_t icrc32;
-                       uint8_t biggest;
-                       uint8_t radix;          /* 0, LBUFRADIX, PBUFRADIX */
-                       uint8_t reserved06;
-                       uint8_t reserved07;
+                       uint32_t bigmask;       /* available radixes */
                        uint64_t avail;         /* total available bytes */
                        uint64_t unused;        /* unused must be 0 */
                } freemap;
@@ -507,16 +484,60 @@ typedef struct hammer2_blockset hammer2_blockset_t;
 #endif
 
 /*
- * The media indirect block structure.
+ * hammer2_bmap_data - A freemap entry in the LEVEL1 block.
+ *
+ * Each 64-byte entry contains the bitmap and meta-data required to manage
+ * a LEVEL0 (2MB) block of storage.  The storage is managed in 128 x 16KB
+ * chunks.  Smaller allocation granularity is supported via a linear iterator
+ * and/or must otherwise be tracked in ram.
+ *
+ * (data structure must be 64 bytes exactly)
+ *
+ * linear  - A BYTE linear allocation offset.  May contain values between
+ *          0 and 2MB.  Any value which is 16KB-aligned is effective neutral
+ *          (forces the bitmap to be checked), whereas intermediate values
+ *          allow iterative allocations from a bitmap that is already marked
+ *          allocated.
+ *
+ *          Please note that file data granularity may be limited by
+ *          other issues such as buffer cache direct-mapping and the
+ *          desire to support sector sizes up to 16KB (so H2 only issues
+ *          I/O's in multiples of 16KB anyway).
+ *
+ * radix   - Once assigned, radix for clustering.  Cleared to 0 only if
+ *          the entire leaf becomes free (related device buffer cache buffers
+ *          must also be destroyed to avoid later overlap assertions).  All
+ *          chain's within this 2MB segment must match the clustering radix.
+ *
+ *          Device I/O size may be further adjusted to the minimum (16KB),
+ *          even if radix is smaller.   This forms the I/O clustering radix.
+ *
+ *          Value will typically be 10-16 (1KB to 64KB).  Smaller values may
+ *          be allowed in the future (probably unnecessary to add that
+ *          complication though).
+ *
+ * bitmap  - Two bits per 16KB allocation block arranged in arrays of
+ *          32-bit elements, 128x2 bits representing ~2MB worth of media
+ *          storage.  Bit patterns are as follows:
+ *
+ *          00 Unallocated
+ *          01 Armed for bulk free scan
+ *          10 Possibly free
+ *           11 Allocated
  */
-struct hammer2_indblock_data {
-       hammer2_blockref_t blockref[HAMMER2_IND_COUNT_MAX];
-};
-
-typedef struct hammer2_indblock_data hammer2_indblock_data_t;
-
 struct hammer2_bmap_data {
-       uint64_t    array[HAMMER2_FREEMAP_LEVEL0_PSIZE / sizeof(uint64_t)];
+       int32_t linear;         /* 00 linear sub-granular allocation offset */
+       uint8_t reserved04;     /* 04 */
+       uint8_t radix;          /* 05 cluster I/O 0, LBUFRADIX, PBUFRADIX */
+       uint8_t reserved06;     /* 06 */
+       uint8_t reserved07;     /* 07 */
+       uint32_t reserved08;    /* 08 */
+       uint32_t reserved0C;    /* 0C */
+       uint32_t reserved10;    /* 10 */
+       uint32_t reserved14;    /* 14 */
+       uint32_t reserved18;    /* 18 */
+       uint32_t avail;         /* 1C */
+       uint32_t bitmap[8];     /* 20-3F 256 bits manages 2MB/16KB/2-bits */
 };
 
 typedef struct hammer2_bmap_data hammer2_bmap_data_t;
@@ -627,16 +648,19 @@ struct hammer2_inode_data {
        /*
         * Quotas and cumulative sub-tree counters.
         */
-       hammer2_off_t   data_quota;     /* 00B0 subtree quota in bytes */
-       hammer2_off_t   data_count;     /* 00B8 subtree byte count */
-       hammer2_off_t   inode_quota;    /* 00C0 subtree quota inode count */
-       hammer2_off_t   inode_count;    /* 00C8 subtree inode count */
+       hammer2_key_t   data_quota;     /* 00B0 subtree quota in bytes */
+       hammer2_key_t   data_count;     /* 00B8 subtree byte count */
+       hammer2_key_t   inode_quota;    /* 00C0 subtree quota inode count */
+       hammer2_key_t   inode_count;    /* 00C8 subtree inode count */
        hammer2_tid_t   attr_tid;       /* 00D0 attributes changed */
        hammer2_tid_t   dirent_tid;     /* 00D8 directory/attr changed */
-       uint64_t        reservedE0;     /* 00E0 */
-       uint64_t        reservedE8;     /* 00E8 */
-       uint64_t        reservedF0;     /* 00F0 */
-       uint64_t        reservedF8;     /* 00F8 */
+
+       /*
+        * Tracks (possibly degenerate) free areas covering all sub-tree
+        * allocations under inode, not counting the inode itself.
+        * 0/0 indicates empty entry.  fully set-associative.
+        */
+       hammer2_off_t   freezones[4];   /* 00E0/E8/F0/F8 base|radix */
 
        unsigned char   filename[HAMMER2_INODE_MAXNAME];
                                        /* 0100-01FF (256 char, unterminated) */
@@ -912,8 +936,8 @@ typedef struct hammer2_volume_data hammer2_volume_data_t;
 union hammer2_media_data {
        hammer2_volume_data_t   voldata;
         hammer2_inode_data_t    ipdata;
-       hammer2_indblock_data_t npdata;
-       hammer2_bmap_data_t     bmdata;
+       hammer2_blockref_t      npdata[HAMMER2_IND_COUNT_MAX];
+       hammer2_bmap_data_t     bmdata[HAMMER2_FREEMAP_COUNT];
        char                    buf[HAMMER2_PBUFSIZE];
 };
 
index 805480e..3f28982 100644 (file)
@@ -67,6 +67,26 @@ static void hammer2_chain_flush_core(hammer2_flush_info_t *info,
 static int hammer2_chain_flush_scan1(hammer2_chain_t *child, void *data);
 static int hammer2_chain_flush_scan2(hammer2_chain_t *child, void *data);
 
+#if 0
+static __inline
+void
+hammer2_updatestats(hammer2_flush_info_t *info, hammer2_blockref_t *bref,
+                   int how)
+{
+       hammer2_key_t bytes;
+
+       if (bref->type != 0) {
+               bytes = 1 << (bref->data_off & HAMMER2_OFF_MASK_RADIX);
+               if (bref->type == HAMMER2_BREF_TYPE_INODE)
+                       info->inode_count += how;
+               if (how < 0)
+                       info->data_count -= bytes;
+               else
+                       info->data_count += bytes;
+       }
+}
+#endif
+
 /*
  * Transaction support functions for writing to the filesystem.
  *
@@ -509,7 +529,8 @@ hammer2_chain_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain)
        /*
         * If we encounter a deleted chain within our flush we can clear
         * the MODIFIED bit and avoid flushing it whether it has been
-        * destroyed or not.
+        * destroyed or not.  We must make sure that the chain is flagged
+        * MOVED in this situation so the parent picks up the deletion.
         */
        if (chain->delete_tid <= info->sync_tid) {
                if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
@@ -517,6 +538,11 @@ hammer2_chain_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain)
                                if (chain->bytes == chain->bp->b_bufsize)
                                        chain->bp->b_flags |= B_INVAL|B_RELBUF;
                        }
+                       if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
+                               hammer2_chain_ref(chain);
+                               atomic_set_int(&chain->flags,
+                                              HAMMER2_CHAIN_MOVED);
+                       }
                        atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
                        hammer2_chain_drop(chain);
                }
@@ -616,6 +642,9 @@ hammer2_chain_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain)
                /*
                 * We should flush the free block table before we calculate
                 * CRCs and copy voldata -> volsync.
+                *
+                * To prevent SMP races, fchain must remain locked until
+                * voldata is copied to volsync.
                 */
                hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS);
                if (hmp->fchain.flags & (HAMMER2_CHAIN_MODIFIED |
@@ -623,7 +652,6 @@ hammer2_chain_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain)
                        /* this will modify vchain as a side effect */
                        hammer2_chain_flush(info->trans, &hmp->fchain);
                }
-               hammer2_chain_unlock(&hmp->fchain);
 
                /*
                 * The volume header is flushed manually by the syncer, not
@@ -651,6 +679,7 @@ hammer2_chain_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain)
                                HAMMER2_VOLUME_ICRCVH_SIZE);
                hmp->volsync = hmp->voldata;
                atomic_set_int(&chain->flags, HAMMER2_CHAIN_VOLUMESYNC);
+               hammer2_chain_unlock(&hmp->fchain);
                break;
        case HAMMER2_BREF_TYPE_DATA:
                /*
@@ -1042,7 +1071,7 @@ hammer2_chain_flush_scan2(hammer2_chain_t *child, void *data)
        case HAMMER2_BREF_TYPE_INDIRECT:
        case HAMMER2_BREF_TYPE_FREEMAP_NODE:
                if (parent->data) {
-                       base = &parent->data->npdata.blockref[0];
+                       base = &parent->data->npdata[0];
                } else {
                        base = NULL;
                        KKASSERT(child->flags & HAMMER2_CHAIN_DELETED);
@@ -1054,7 +1083,7 @@ hammer2_chain_flush_scan2(hammer2_chain_t *child, void *data)
                count = HAMMER2_SET_COUNT;
                break;
        case HAMMER2_BREF_TYPE_FREEMAP:
-               base = &parent->data->npdata.blockref[0];
+               base = &parent->data->npdata[0];
                count = HAMMER2_SET_COUNT;
                break;
        default:
@@ -1080,16 +1109,16 @@ hammer2_chain_flush_scan2(hammer2_chain_t *child, void *data)
                if (base) {
                        KKASSERT(child->index < count);
                        bzero(&base[child->index], sizeof(child->bref));
-                       if (info->mirror_tid < child->delete_tid)
-                               info->mirror_tid = child->delete_tid;
                }
+               if (info->mirror_tid < child->delete_tid)
+                       info->mirror_tid = child->delete_tid;
        } else {
                if (base) {
                        KKASSERT(child->index < count);
                        base[child->index] = child->bref;
-                       if (info->mirror_tid < child->modify_tid)
-                               info->mirror_tid = child->modify_tid;
                }
+               if (info->mirror_tid < child->modify_tid)
+                       info->mirror_tid = child->modify_tid;
        }
 
        if (info->mirror_tid < child->bref.mirror_tid) {
index d14205c..a91ca8c 100644 (file)
 
 #include "hammer2.h"
 
-#define USE_SIMPLE_ALLOC       0
-
-#if !USE_SIMPLE_ALLOC
-
 static int hammer2_freemap_try_alloc(hammer2_trans_t *trans,
                        hammer2_chain_t **parentp, hammer2_blockref_t *bref,
                        int radix, hammer2_off_t bpref, hammer2_off_t *bnext);
+static void hammer2_freemap_init(hammer2_trans_t *trans, hammer2_key_t key,
+                    hammer2_chain_t *chain);
+static int hammer2_bmap_alloc(hammer2_trans_t *trans, hammer2_bmap_data_t *bmap,
+                  int n, int radix, hammer2_key_t *basep);
 static int hammer2_freemap_iterate(hammer2_trans_t *trans,
                        hammer2_chain_t **parentp, hammer2_chain_t **chainp,
                        hammer2_off_t bpref, hammer2_off_t *bnextp);
 
-#endif
-
 static __inline
 int
 hammer2_freemapradix(int radix)
@@ -106,7 +104,7 @@ hammer2_freemap_reserve(hammer2_mount_t *hmp, hammer2_blockref_t *bref,
                      (((hammer2_off_t)1 << HAMMER2_FREEMAP_LEVEL1_RADIX) - 1);
                off = off / HAMMER2_PBUFSIZE;
                KKASSERT(off >= HAMMER2_ZONE_FREEMAP_A);
-               KKASSERT(off < HAMMER2_ZONE_FREEMAP_D + 8);
+               KKASSERT(off < HAMMER2_ZONE_FREEMAP_D + 4);
 
                if (off >= HAMMER2_ZONE_FREEMAP_D)
                        off = HAMMER2_ZONE_FREEMAP_A;
@@ -145,27 +143,11 @@ hammer2_freemap_reserve(hammer2_mount_t *hmp, hammer2_blockref_t *bref,
                       HAMMER2_ZONEFM_LEVEL2 * HAMMER2_PBUFSIZE;
                break;
        case HAMMER2_FREEMAP_LEVEL1_RADIX:      /* 2GB */
-               KKASSERT(bref->type == HAMMER2_BREF_TYPE_FREEMAP_NODE);
+               KKASSERT(bref->type == HAMMER2_BREF_TYPE_FREEMAP_LEAF);
                KKASSERT(bytes == HAMMER2_FREEMAP_LEVELN_PSIZE);
                off += H2FMBASE(bref->key, HAMMER2_FREEMAP_LEVEL1_RADIX) +
                       HAMMER2_ZONEFM_LEVEL1 * HAMMER2_PBUFSIZE;
                break;
-       case HAMMER2_FREEMAP_LEVEL0_RADIX:      /* 2MB (256 byte bitmap) */
-               /*
-                * Terminal bitmap, start with 2GB base, then offset by
-                * 256 bytes of device offset per 2MB of logical space
-                * (8 bits per byte, 1024 byte allocation chunking).
-                */
-               KKASSERT(bref->type == HAMMER2_BREF_TYPE_FREEMAP_LEAF);
-               KKASSERT(bytes == HAMMER2_FREEMAP_LEVEL0_PSIZE);
-               off += H2FMBASE(bref->key, HAMMER2_FREEMAP_LEVEL1_RADIX) +
-                      HAMMER2_ZONEFM_LEVEL0 * HAMMER2_PBUFSIZE;
-
-               off += ((bref->key >> HAMMER2_FREEMAP_LEVEL0_RADIX) &
-                       ((1 << (HAMMER2_FREEMAP_LEVEL1_RADIX -
-                              HAMMER2_FREEMAP_LEVEL0_RADIX)) - 1)) *
-                       HAMMER2_FREEMAP_LEVEL0_PSIZE;
-               break;
        default:
                panic("freemap: bad radix(2) %p %d\n", bref, bref->keybits);
                /* NOT REACHED */
@@ -175,109 +157,6 @@ hammer2_freemap_reserve(hammer2_mount_t *hmp, hammer2_blockref_t *bref,
        return (0);
 }
 
-/*
- * Allocate media space, returning a combined data offset and radix.
- * THIS ROUTINE IS USE FOR DEBUGGING ONLY.
- *
- * This version of the routine is ONLY usable for testing and debug
- * purposes and only if the filesystem never instantiated an actual
- * freemap.  It uses the initial allocation iterator that newfs_hammer2
- * used to build the filesystem to allocate new space and is not capable
- * of dealing with freed space.
- */
-#if USE_SIMPLE_ALLOC
-
-static
-int
-hammer2_freemap_simple_alloc(hammer2_mount_t *hmp, hammer2_blockref_t *bref,
-                            int radix)
-{
-       hammer2_off_t data_off;
-       hammer2_off_t data_next;
-       hammer2_freecache_t *fc;
-       /*struct buf *bp;*/
-       size_t bytes;
-       int fctype;
-
-       bytes = (size_t)(1 << radix);
-       KKASSERT(bytes >= HAMMER2_MIN_ALLOC &&
-                bytes <= HAMMER2_MAX_ALLOC);
-
-       /*
-        * Must not be used if the filesystem is using a real freemap.
-        */
-       KKASSERT(hmp->voldata.freemap_blockset.blockref[0].data_off == 0);
-
-       switch(bref->type) {
-       case HAMMER2_BREF_TYPE_INODE:
-               fctype = HAMMER2_FREECACHE_INODE;
-               break;
-       case HAMMER2_BREF_TYPE_INDIRECT:
-               fctype = HAMMER2_FREECACHE_INODE;
-               break;
-       case HAMMER2_BREF_TYPE_DATA:
-               fctype = HAMMER2_FREECACHE_DATA;
-               break;
-       default:
-               fctype = HAMMER2_FREECACHE_DATA;
-               break;
-       }
-
-       if (radix <= HAMMER2_MAX_RADIX)
-               fc = &hmp->freecache[fctype][radix];
-       else
-               fc = NULL;
-
-       lockmgr(&hmp->alloclk, LK_EXCLUSIVE);
-       if (fc && fc->single) {
-               /*
-                * Allocate from our single-block cache.
-                */
-               data_off = fc->single;
-               fc->single = 0;
-       } else if (fc && fc->bulk) {
-               /*
-                * Allocate from our packing cache.
-                */
-               data_off = fc->bulk;
-               fc->bulk += bytes;
-               if ((fc->bulk & HAMMER2_SEGMASK) == 0)
-                       fc->bulk = 0;
-       } else {
-               /*
-                * Allocate from the allocation iterator using a SEGSIZE
-                * aligned block and reload the packing cache if possible.
-                *
-                * Skip reserved areas at the beginning of each zone.
-                */
-               hammer2_voldata_lock(hmp);
-               data_off = hmp->voldata.allocator_beg;
-               data_off = (data_off + HAMMER2_SEGMASK64) & ~HAMMER2_SEGMASK64;
-               if ((data_off & HAMMER2_ZONE_MASK64) < HAMMER2_ZONE_SEG) {
-                       KKASSERT((data_off & HAMMER2_ZONE_MASK64) == 0);
-                       data_off += HAMMER2_ZONE_SEG64;
-               }
-               data_next = data_off + bytes;
-
-               if ((data_next & HAMMER2_SEGMASK) == 0) {
-                       hmp->voldata.allocator_beg = data_next;
-               } else {
-                       KKASSERT(radix <= HAMMER2_MAX_RADIX);
-                       hmp->voldata.allocator_beg =
-                                       (data_next + HAMMER2_SEGMASK64) &
-                                       ~HAMMER2_SEGMASK64;
-                       fc->bulk = data_next;
-               }
-               hammer2_voldata_unlock(hmp, 1);
-       }
-       lockmgr(&hmp->alloclk, LK_RELEASE);
-
-       bref->data_off = data_off | radix;
-       return (0);
-}
-
-#endif
-
 /*
  * Normal freemap allocator
  *
@@ -297,31 +176,31 @@ hammer2_freemap_alloc(hammer2_trans_t *trans,
        hammer2_chain_t *parent;
        hammer2_off_t bpref;
        hammer2_off_t bnext;
-       int freemap_radix;
        int radix;
        int error;
 
        /*
         * Validate the allocation size.  It must be a power of 2.
+        *
+        * For now require that the caller be aware of the minimum
+        * allocation (1K).
         */
        radix = hammer2_getradix(bytes);
        KKASSERT((size_t)1 << radix == bytes);
 
        /*
-        * Freemap elements are assigned from the reserve area.
-        * Note that FREEMAP_LEVEL0_PSIZE is 256 bytes which is
-        * allowed for this case.
+        * Freemap blocks themselves are simply assigned from the reserve
+        * area, not allocated from the freemap.
         */
        if (bref->type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
            bref->type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
                return(hammer2_freemap_reserve(hmp, bref, radix));
        }
-#if USE_SIMPLE_ALLOC
-       return (hammer2_freemap_simple_alloc(hmp, bref, radix));
-#else
 
-       KKASSERT(bytes >= HAMMER2_MIN_ALLOC &&
-                bytes <= HAMMER2_MAX_ALLOC);
+       /*
+        * Normal allocations
+        */
+       KKASSERT(bytes >= HAMMER2_MIN_ALLOC && bytes <= HAMMER2_MAX_ALLOC);
 
        /*
         * Calculate the starting point for our allocation search.
@@ -337,7 +216,6 @@ hammer2_freemap_alloc(hammer2_trans_t *trans,
         * because that is what allows 'find' and 'ls' and other filesystem
         * topology operations to run fast.
         */
-       freemap_radix = hammer2_freemapradix(radix);
 #if 0
        if (bref->data_off & ~HAMMER2_OFF_MASK_RADIX)
                bpref = bref->data_off & ~HAMMER2_OFF_MASK_RADIX;
@@ -348,7 +226,7 @@ hammer2_freemap_alloc(hammer2_trans_t *trans,
        else
 #endif
        KKASSERT(radix >= 0 && radix <= HAMMER2_MAX_RADIX);
-       bpref = hmp->heur_freemap[freemap_radix];
+       bpref = hmp->heur_freemap[radix];
 
        /*
         * Make sure bpref is in-bounds.  It's ok if bpref covers a zone's
@@ -369,41 +247,10 @@ hammer2_freemap_alloc(hammer2_trans_t *trans,
                error = hammer2_freemap_try_alloc(trans, &parent, bref,
                                                  radix, bpref, &bnext);
        }
-       hmp->heur_freemap[freemap_radix] = bnext;
+       hmp->heur_freemap[radix] = bnext;
        hammer2_chain_unlock(parent);
 
        return (error);
-#endif
-}
-
-#if !USE_SIMPLE_ALLOC
-
-/*
- * Attempt to allocate (1 << radix) bytes from the freemap at bnext.
- * Return 0 on success with the bref appropriately updated, non-zero
- * on failure.  Updates bnextp and returns EAGAIN to continue the
- * iteration.
- *
- * This function will create missing freemap infrastructure as well as
- * properly initialize reserved areas as already having been allocated.
- */
-static __inline
-int
-countbits(uint64_t *data)
-{
-       int i;
-       int r = 0;
-       uint64_t v;
-
-       for (i = 0; i < 32; ++i) {
-               v = data[i];
-               while (v) {
-                       if (v & 1)
-                               ++r;
-                       v >>= 1;
-               }
-       }
-       return(r);
 }
 
 static int
@@ -412,22 +259,14 @@ hammer2_freemap_try_alloc(hammer2_trans_t *trans, hammer2_chain_t **parentp,
                          hammer2_off_t bpref, hammer2_off_t *bnextp)
 {
        hammer2_mount_t *hmp = trans->hmp;
-       hammer2_off_t l0mask;
        hammer2_off_t l0size;
+       hammer2_off_t l1size;
+       hammer2_off_t l1mask;
        hammer2_chain_t *chain;
        hammer2_off_t key;
-       hammer2_off_t tmp;
        size_t bytes;
-       uint64_t mask;
-       uint64_t tmp_mask;
-       uint64_t *data;
        int error = 0;
-       int bits;
-       int index;
-       int count;
-       int subindex;
-       int freemap_radix;
-       int devblk_radix;
+
 
        /*
         * Calculate the number of bytes being allocated, the number
@@ -438,22 +277,18 @@ hammer2_freemap_try_alloc(hammer2_trans_t *trans, hammer2_chain_t **parentp,
         *          mask calculation.
         */
        bytes = (size_t)1 << radix;
-       bits = 1 << (radix - HAMMER2_MIN_RADIX);
-       mask = (bits == 64) ? (uint64_t)-1 : (((uint64_t)1 << bits) - 1);
-
-       devblk_radix = hammer2_devblkradix(radix);
-       freemap_radix = hammer2_freemapradix(radix);
 
        /*
-        * Lookup the level0 freemap chain, creating and initializing one
+        * Lookup the level1 freemap chain, creating and initializing one
         * if necessary.  Intermediate levels will be created automatically
         * when necessary by hammer2_chain_create().
         */
-       key = H2FMBASE(*bnextp, HAMMER2_FREEMAP_LEVEL0_RADIX);
-       l0mask = H2FMSHIFT(HAMMER2_FREEMAP_LEVEL0_RADIX) - 1;
+       key = H2FMBASE(*bnextp, HAMMER2_FREEMAP_LEVEL1_RADIX);
        l0size = H2FMSHIFT(HAMMER2_FREEMAP_LEVEL0_RADIX);
+       l1size = H2FMSHIFT(HAMMER2_FREEMAP_LEVEL1_RADIX);
+       l1mask = l1size - 1;
 
-       chain = hammer2_chain_lookup(parentp, key, key + l0mask,
+       chain = hammer2_chain_lookup(parentp, key, key + l1mask,
                                     HAMMER2_LOOKUP_FREEMAP |
                                     HAMMER2_LOOKUP_ALWAYS |
                                     HAMMER2_LOOKUP_MATCHIND/*XXX*/);
@@ -464,118 +299,85 @@ hammer2_freemap_try_alloc(hammer2_trans_t *trans, hammer2_chain_t **parentp,
                 * the bref.check.freemap structure.
                 */
 #if 0
-               kprintf("freemap create L0 @ %016jx bpref %016jx\n",
+               kprintf("freemap create L1 @ %016jx bpref %016jx\n",
                        key, bpref);
 #endif
                error = hammer2_chain_create(trans, parentp, &chain,
-                                    key, HAMMER2_FREEMAP_LEVEL0_RADIX,
+                                    key, HAMMER2_FREEMAP_LEVEL1_RADIX,
                                     HAMMER2_BREF_TYPE_FREEMAP_LEAF,
-                                    HAMMER2_FREEMAP_LEVEL0_PSIZE);
+                                    HAMMER2_FREEMAP_LEVELN_PSIZE);
                if (error == 0) {
                        hammer2_chain_modify(trans, &chain, 0);
-                       bzero(chain->data->bmdata.array,
-                             HAMMER2_FREEMAP_LEVEL0_PSIZE);
-                       chain->bref.check.freemap.biggest =
-                                       HAMMER2_FREEMAP_LEVEL0_RADIX;
-                       chain->bref.check.freemap.avail = l0size;
-                       chain->bref.check.freemap.radix = freemap_radix;
-
-                       /*
-                        * Preset bitmap for existing static allocations.
-                        * 64-bit-align so we don't have to worry about
-                        * endian for the memset().
-                        */
-                       tmp = (hmp->voldata.allocator_beg +
-                              HAMMER2_MAX_ALLOC - 1) &
-                             ~(hammer2_off_t)(HAMMER2_MAX_ALLOC - 1);
-                       if (key < tmp) {
-                               if (key + l0size <= tmp) {
-                                       memset(chain->data->bmdata.array, -1,
-                                              l0size / HAMMER2_MIN_ALLOC / 8);
-                                       chain->bref.check.freemap.avail = 0;
-                               } else {
-                                       count = (tmp - key) / HAMMER2_MIN_ALLOC;
-                                       kprintf("Init L0 BASE %d\n", count);
-                                       memset(chain->data->bmdata.array, -1,
-                                              count / 8);
-                                       chain->bref.check.freemap.avail -=
-                                               count * HAMMER2_MIN_ALLOC;
-                               }
-                       }
-
-                       /*
-                        * Preset bitmap for reserved area.  Calculate
-                        * 2GB base.
-                        */
-                       tmp = H2FMBASE(key, HAMMER2_FREEMAP_LEVEL1_RADIX);
-                       if (key - tmp < HAMMER2_ZONE_SEG) {
-                               memset(chain->data->bmdata.array, -1,
-                                      l0size / HAMMER2_MIN_ALLOC / 8);
-                               chain->bref.check.freemap.avail = 0;
-                       }
+                       bzero(&chain->data->bmdata[0],
+                             HAMMER2_FREEMAP_LEVELN_PSIZE);
+                       chain->bref.check.freemap.bigmask = (uint32_t)-1;
+                       chain->bref.check.freemap.avail = l1size;
+                       /* bref.methods should already be inherited */
 
-                       /*
-                        * Preset bitmap for end of media
-                        */
-                       if (key >= trans->hmp->voldata.volu_size) {
-                               memset(chain->data->bmdata.array, -1,
-                                      l0size / HAMMER2_MIN_ALLOC / 8);
-                               chain->bref.check.freemap.avail = 0;
-                       }
+                       hammer2_freemap_init(trans, key, chain);
                }
-       } else if (chain->bref.check.freemap.biggest < radix) {
+       } else if ((chain->bref.check.freemap.bigmask & (1 << radix)) == 0) {
                /*
                 * Already flagged as not having enough space
                 */
                error = ENOSPC;
-       } else if (chain->bref.check.freemap.radix != freemap_radix) {
-               /*
-                * Wrong cluster radix, cannot allocate from this leaf.
-                */
-               error = ENOSPC;
        } else {
                /*
                 * Modify existing chain to setup for adjustment.
                 */
                hammer2_chain_modify(trans, &chain, 0);
        }
-       if (error)
-               goto skip;
 
        /*
-        * Calculate mask and count.  Each bit represents 1KB and (currently)
-        * the maximum allocation is 65536 bytes.  Allocations are always
-        * natively aligned.
+        * Scan 2MB entries.
         */
-       count = HAMMER2_FREEMAP_LEVEL0_PSIZE / sizeof(uint64_t); /* 32 */
-       data = &chain->data->bmdata.array[0];
-
-       tmp_mask = 0; /* avoid compiler warnings */
-       subindex = 0; /* avoid compiler warnings */
+       if (error == 0) {
+               hammer2_bmap_data_t *bmap;
+               hammer2_key_t base_key;
+               int count;
+               int start;
+               int n;
+
+               start = (int)((*bnextp - key) >> HAMMER2_FREEMAP_LEVEL0_RADIX);
+               KKASSERT(start >= 0 && start < HAMMER2_FREEMAP_COUNT);
+               hammer2_chain_modify(trans, &chain, 0);
 
-       /*
-        * Allocate data and meta-data from the beginning and inodes
-        * from the end.
-        */
-       for (index = 0; index < count; ++index) {
-               if (data[index] == (uint64_t)-1) /* all allocated */
-                       continue;
-               tmp_mask = mask;                 /* iterate */
-               for (subindex = 0; subindex < 64; subindex += bits) {
-                       if ((data[index] & tmp_mask) == 0)
+               error = ENOSPC;
+               for (count = 0; count < HAMMER2_FREEMAP_COUNT; ++count) {
+                       if (start + count >= HAMMER2_FREEMAP_COUNT &&
+                           start - count < 0) {
                                break;
-                       tmp_mask <<= bits;
-               }
-               if (subindex != 64) {
-                       key += HAMMER2_MIN_ALLOC * 64 * index;
-                       key += HAMMER2_MIN_ALLOC * subindex;
-                       break;
+                       }
+                       n = start + count;
+                       bmap = &chain->data->bmdata[n];
+                       if (n < HAMMER2_FREEMAP_COUNT && bmap->avail &&
+                           (bmap->radix == 0 || bmap->radix == radix)) {
+                               base_key = key + n * l0size;
+                               error = hammer2_bmap_alloc(trans, bmap, n,
+                                                          radix, &base_key);
+                               if (error != ENOSPC) {
+                                       key = base_key;
+                                       break;
+                               }
+                       }
+                       n = start - count;
+                       bmap = &chain->data->bmdata[n];
+                       if (n >= 0 && bmap->avail &&
+                           (bmap->radix == 0 || bmap->radix == radix)) {
+                               base_key = key + n * l0size;
+                               error = hammer2_bmap_alloc(trans, bmap, n,
+                                                          radix, &base_key);
+                               if (error != ENOSPC) {
+                                       key = base_key;
+                                       break;
+                               }
+                       }
                }
+               if (error == ENOSPC)
+                       chain->bref.check.freemap.bigmask &= ~(1 << radix);
+               /* XXX also scan down from original count */
        }
-       if (index == count)
-               error = ENOSPC;
 
-skip:
        if (error == 0) {
                /*
                 * Assert validity.  Must be beyond the static allocator used
@@ -583,81 +385,21 @@ skip:
                 * not go past the volume size, and must not be in the
                 * reserved segment area for a zone.
                 */
-               int prebuf;
-
                KKASSERT(key >= hmp->voldata.allocator_beg &&
                         key + bytes <= hmp->voldata.volu_size);
                KKASSERT((key & HAMMER2_ZONE_MASK64) >= HAMMER2_ZONE_SEG);
-
-               /*
-                * Modify the chain and set the bitmap appropriately.
-                *
-                * For smaller allocations try to avoid a read-before-write
-                * by priming the buffer cache buffer.  The caller handles
-                * read-avoidance for larger allocations (or more properly,
-                * when the chain is locked).
-                */
-               prebuf = 0;
-               hammer2_chain_modify(trans, &chain, 0);
-               data = &chain->data->bmdata.array[0];
-               if (radix != devblk_radix) {
-                       uint64_t iomask;
-                       int iobmradix = HAMMER2_MINIORADIX - HAMMER2_MIN_RADIX;
-                       int ioindex;
-                       int iobmskip = 1 << iobmradix;
-
-                       iomask = ((uint64_t)1 << iobmskip) - 1;
-                       for (ioindex = 0; ioindex < 64; ioindex += iobmskip) {
-                               if (tmp_mask & iomask) {
-                                       if ((data[index] & iomask) == 0)
-                                               prebuf = 1;
-                                       break;
-                               }
-                               iomask <<= iobmskip;
-                       }
-               }
-
-               KKASSERT((data[index] & tmp_mask) == 0);
-               data[index] |= tmp_mask;
-
-               /*
-                * We return the allocated space in bref->data_off.
-                */
-               *bnextp = key;
                bref->data_off = key | radix;
 
-               if (prebuf) {
-                       struct buf *bp;
-                       hammer2_off_t pbase;
-                       hammer2_off_t csize;
-                       hammer2_off_t cmask;
-
-                       csize = (hammer2_off_t)1 << devblk_radix;
-                       cmask = csize - 1;
-                       pbase = key & ~mask;
-
-                       bp = getblk(hmp->devvp, pbase, csize,
-                                   GETBLK_NOWAIT, 0);
-                       if (bp) {
-                               if ((bp->b_flags & B_CACHE) == 0)
-                                       vfs_bio_clrbuf(bp);
-                               bqrelse(bp);
-                       }
-               }
-
 #if 0
-               kprintf("alloc cp=%p %016jx %016jx using %016jx chain->data %d\n",
+               kprintf("alloc cp=%p %016jx %016jx using %016jx\n",
                        chain,
-                       bref->key, bref->data_off, chain->bref.data_off,
-                       countbits(data));
+                       bref->key, bref->data_off, chain->bref.data_off);
 #endif
        } else if (error == ENOSPC) {
                /*
                 * Return EAGAIN with next iteration in *bnextp, or
                 * return ENOSPC if the allocation map has been exhausted.
                 */
-               if (chain->bref.check.freemap.biggest > radix)
-                       chain->bref.check.freemap.biggest = radix;
                error = hammer2_freemap_iterate(trans, parentp, &chain,
                                                bpref, bnextp);
        }
@@ -670,19 +412,224 @@ skip:
        return (error);
 }
 
+/*
+ * Allocate (1<<radix) bytes from the bmap whos base data offset is (*basep).
+ *
+ * If the linear iterator is mid-block we use it directly (the bitmap should
+ * already be marked allocated), otherwise we search for a block in the bitmap
+ * that fits the allocation request.
+ *
+ * A partial bitmap allocation sets the minimum bitmap granularity (16KB)
+ * to fully allocated and adjusts the linear allocator to allow the
+ * remaining space to be allocated.
+ */
+static
+int
+hammer2_bmap_alloc(hammer2_trans_t *trans, hammer2_bmap_data_t *bmap,
+                  int n, int radix, hammer2_key_t *basep)
+{
+       struct buf *bp;
+       size_t size;
+       size_t bmsize;
+       int bmradix;
+       uint32_t bmmask;
+       int offset;
+       int i;
+       int j;
+
+       /*
+        * Take into account 2-bits per block when calculating bmradix.
+        */
+       size = (size_t)1 << radix;
+       if (radix <= HAMMER2_FREEMAP_BLOCK_RADIX) {
+               bmradix = 2;
+               bmsize = HAMMER2_FREEMAP_BLOCK_SIZE;
+               /* (16K) 2 bits per allocation block */
+       } else {
+               bmradix = 2 << (radix - HAMMER2_FREEMAP_BLOCK_RADIX);
+               bmsize = size;
+               /* (32K-256K) 4, 8, 16, 32 bits per allocation block */
+       }
+
+       /*
+        * Use iterator or scan bitmap.  Beware of hardware artifacts
+        * when bmradix == 32 (intermediate result can wind up being '1'
+        * instead of '0' if hardware masks bit-count & 31).
+        *
+        * NOTE: j needs to be even in the j= calculation.  As an artifact
+        *       of the /2 division, our bitmask has to clear bit 0.
+        */
+       if (bmap->linear & HAMMER2_FREEMAP_BLOCK_MASK) {
+               KKASSERT(bmap->linear >= 0 &&
+                        bmap->linear + size <= HAMMER2_SEGSIZE);
+               offset = bmap->linear;
+               i = offset / (HAMMER2_SEGSIZE / 8);
+               j = (offset / (HAMMER2_FREEMAP_BLOCK_SIZE / 2)) & 30;
+               bmmask = (bmradix == 32) ?
+                        0xFFFFFFFFU : (1 << bmradix) - 1;
+               bmmask <<= j;
+#if 0
+               kprintf("alloc1 %016jx/%zd %08x i=%d j=%d bmmask=%08x "
+                       "(linear=%08x)\n",
+                       *basep + offset, size,
+                       offset, i, j, bmmask, bmap->linear);
+#endif
+       } else {
+               for (i = 0; i < 8; ++i) {
+                       bmmask = (bmradix == 32) ?
+                                0xFFFFFFFFU : (1 << bmradix) - 1;
+                       for (j = 0; j < 32; j += bmradix) {
+                               if ((bmap->bitmap[i] & bmmask) == 0)
+                                       goto success;
+                               bmmask <<= bmradix;
+                       }
+               }
+               KKASSERT(bmap->avail == 0);
+               return (ENOSPC);
+success:
+               offset = i * (HAMMER2_SEGSIZE / 8) +
+                        (j * (HAMMER2_FREEMAP_BLOCK_SIZE / 2));
+#if 0
+               kprintf("alloc2 %016jx/%zd %08x i=%d j=%d bmmask=%08x\n",
+                       *basep + offset, size,
+                       offset, i, j, bmmask);
+#endif
+       }
+
+       KKASSERT(i >= 0 && i < 8);      /* 8 x 16 -> 128 x 16K -> 2MB */
+
+       /*
+        * Optimize the buffer cache to avoid unnecessary read-before-write
+        * operations.
+        */
+       if (radix < HAMMER2_MINIORADIX &&
+           (bmap->bitmap[i] & bmmask) == 0) {
+               bp = getblk(trans->hmp->devvp, *basep + offset,
+                           HAMMER2_MINIOSIZE, GETBLK_NOWAIT, 0);
+               if (bp) {
+                       if ((bp->b_flags & B_CACHE) == 0)
+                               vfs_bio_clrbuf(bp);
+                       bqrelse(bp);
+               }
+       }
+
+#if 0
+       /*
+        * When initializing a new inode segment also attempt to initialize
+        * an adjacent segment.  Be careful not to index beyond the array
+        * bounds.
+        *
+        * We do this to try to localize inode accesses to improve
+        * directory scan rates.  XXX doesn't improve scan rates.
+        */
+       if (size == HAMMER2_INODE_BYTES) {
+               if (n & 1) {
+                       if (bmap[-1].radix == 0 && bmap[-1].avail)
+                               bmap[-1].radix = radix;
+               } else {
+                       if (bmap[1].radix == 0 && bmap[1].avail)
+                               bmap[1].radix = radix;
+               }
+       }
+#endif
+
+       /*
+        * Adjust the linear iterator, set the radix if necessary (might as
+        * well just set it unconditionally), adjust *basep to return the
+        * allocated data offset.
+        */
+       bmap->bitmap[i] |= bmmask;
+       bmap->linear = offset + size;
+       bmap->radix = radix;
+       bmap->avail -= size;
+       *basep += offset;
+
+       return(0);
+}
+
+static
+void
+hammer2_freemap_init(hammer2_trans_t *trans, hammer2_key_t key,
+                    hammer2_chain_t *chain)
+{
+       hammer2_off_t l1size;
+       hammer2_off_t lokey;
+       hammer2_off_t hikey;
+       hammer2_bmap_data_t *bmap;
+       int count;
+
+       l1size = H2FMSHIFT(HAMMER2_FREEMAP_LEVEL1_RADIX);
+
+       /*
+        * Calculate the portion of the 2GB map that should be initialized
+        * as free.  Portions below or after will be initialized as allocated.
+        * SEGMASK-align the areas so we don't have to worry about sub-scans
+        * or endianess when using memset.
+        *
+        * (1) Ensure that all statically allocated space from newfs_hammer2
+        *     is marked allocated.
+        *
+        * (2) Ensure that the reserved area is marked allocated (typically
+        *     the first 4MB of the 2GB area being represented).
+        *
+        * (3) Ensure that any trailing space at the end-of-volume is marked
+        *     allocated.
+        *
+        * WARNING! It is possible for lokey to be larger than hikey if the
+        *          entire 2GB segment is within the static allocation.
+        */
+       lokey = (trans->hmp->voldata.allocator_beg + HAMMER2_SEGMASK64) &
+               ~HAMMER2_SEGMASK64;
+
+       if (lokey < H2FMBASE(key, HAMMER2_FREEMAP_LEVEL1_RADIX) +
+                 HAMMER2_ZONE_SEG64) {
+               lokey = H2FMBASE(key, HAMMER2_FREEMAP_LEVEL1_RADIX) +
+                       HAMMER2_ZONE_SEG64;
+       }
+
+       hikey = key + H2FMSHIFT(HAMMER2_FREEMAP_LEVEL1_RADIX);
+       if (hikey > trans->hmp->voldata.volu_size) {
+               hikey = trans->hmp->voldata.volu_size & ~HAMMER2_SEGMASK64;
+       }
+
+       chain->bref.check.freemap.avail =
+               H2FMSHIFT(HAMMER2_FREEMAP_LEVEL1_RADIX);
+       bmap = &chain->data->bmdata[0];
+
+       for (count = 0; count < HAMMER2_FREEMAP_COUNT; ++count) {
+               if (key < lokey || key >= hikey) {
+                       memset(bmap->bitmap, -1,
+                              sizeof(bmap->bitmap));
+                       bmap->avail = 0;
+                       chain->bref.check.freemap.avail -=
+                               H2FMSHIFT(HAMMER2_FREEMAP_LEVEL0_RADIX);
+               } else {
+                       bmap->avail = H2FMSHIFT(HAMMER2_FREEMAP_LEVEL0_RADIX);
+               }
+               key += H2FMSHIFT(HAMMER2_FREEMAP_LEVEL0_RADIX);
+               ++bmap;
+       }
+}
+
+/*
+ * The current Level 1 freemap has been exhausted, iterate to the next
+ * one, return ENOSPC if no freemaps remain.
+ *
+ * XXX this should rotate back to the beginning to handle freed-up space
+ * XXX or use intermediate entries to locate free space. TODO
+ */
 static int
 hammer2_freemap_iterate(hammer2_trans_t *trans, hammer2_chain_t **parentp,
                        hammer2_chain_t **chainp,
                        hammer2_off_t bpref, hammer2_off_t *bnextp)
 {
-       *bnextp += H2FMSHIFT(HAMMER2_FREEMAP_LEVEL0_RADIX);
+       *bnextp &= ~(H2FMSHIFT(HAMMER2_FREEMAP_LEVEL1_RADIX) - 1);
+       *bnextp += H2FMSHIFT(HAMMER2_FREEMAP_LEVEL1_RADIX);
        if (*bnextp >= trans->hmp->voldata.volu_size)
                return (ENOSPC);
        return(EAGAIN);
 }
 
-#endif
-
 #if 0
 
 void
index ac72bd8..39ff93f 100644 (file)
@@ -325,10 +325,10 @@ hammer2_getradix(size_t bytes)
 
        if (bytes == HAMMER2_PBUFSIZE)
                radix = HAMMER2_PBUFRADIX;
-       else if (bytes >= 16384)
-               radix = 14;
-       else if (bytes >= 1024)
-               radix = 10;
+       else if (bytes >= HAMMER2_LBUFSIZE)
+               radix = HAMMER2_LBUFRADIX;
+       else if (bytes >= HAMMER2_MIN_ALLOC)    /* clamp */
+               radix = HAMMER2_MIN_RADIX;
        else
                radix = 0;