hammer2 - update documentation, begin working on callback I/O master
authorMatthew Dillon <dillon@apollo.backplane.com>
Thu, 30 Oct 2014 21:14:58 +0000 (14:14 -0700)
committerMatthew Dillon <dillon@apollo.backplane.com>
Thu, 30 Oct 2014 21:14:58 +0000 (14:14 -0700)
* Update documentation with my current thinking.

* Implement hammer2_iocb - a structure to govern async block I/O requests
  with a callback function.

* Implement iocb API.

* Replace old hammer2_chain_load_async() and related code with the new
  callback mechanism.

* Implemented a somewhat more sophisticated freemap DOMAYFREE setting,
  but note that we still do not iterate file data blocks when removing
  a file to adjust their freemap (and I may decide not to just in general,
  since the meta-data scan and freemap update can add considerable latency
  to a remove() operation).

  This means that the whole concept of partial block free states may wind
  up being thrown out the window, but as of now it's still the plan.

sys/vfs/hammer2/FREEMAP
sys/vfs/hammer2/TODO
sys/vfs/hammer2/hammer2.h
sys/vfs/hammer2/hammer2_chain.c
sys/vfs/hammer2/hammer2_cluster.c
sys/vfs/hammer2/hammer2_freemap.c
sys/vfs/hammer2/hammer2_inode.c
sys/vfs/hammer2/hammer2_io.c
sys/vfs/hammer2/hammer2_ioctl.c
sys/vfs/hammer2/hammer2_vfsops.c
sys/vfs/hammer2/hammer2_vnops.c

index b315305..80be8c2 100644 (file)
@@ -6,51 +6,54 @@
     HAMMER2 Media is broken down into 2 GByte zones.  Each 2 GByte zone
     contains a 4 MByte header (64 x 64K blocks = 0.2% of storage).  The
     blocks in this header are reserved for various purposes.  For example,
-    block #0 is reserved for a volume header or volume header backup.  Most
-    of the 64KB blocks in this header are reserved for use by the freemap.
+    block #0 is reserved for a volume header.  Most of the remaining
+    64KB blocks in this header are reserved for use by the freemap.
 
     The freemap only uses blocks from these reserved areas.  In order to
-    ensure that any of the four volume headers can be used for the mount
+    ensure that any of the four volume headers can be used by the mount code
     (in case some are found to be corrupted), each freemap block in the
-    logical freemap topology iterates through 6 different copies.  The
-    freemap is a 4-layer topology (+1 3-bit layer in the volume header),
-    so 6x4 = 24 of the 64 reserved blocks are dedicated to freemap
-    operations.
-
-    Any given modification of a freemap block that crosses a flush group
-    must cycle to the next copy of the freemap block.  Having 6 copies
-    ensures that:
-
-    - Each of the four backup volume headers points to a consistent
-      freemap topology.  This eats 4 copies.
-
-    - That recovery operations during mount do not modify the state of the
-      freemap topology pointed to by older volume headers that are still
-      valid.  This eats 1 copy.
-
-    - The bulk free scan eats 1 copy to use as spool-off space if the
-      thread hits its ram limits.  This copy is not part of the normal
-      rotation.
-
-    - Total is 6 copies.
-
-    However, there is one major restriction: If an older volume header is
-    selected by the mount code, any newer (presumably corrupt since the
-    mount code didn't select it) volume headers will lose freemap consistency
-    as the freemap code rotates into freemap blocks that might have been used
-    by the topology pointed to by the newer (but not selected) backup
-    volume headers.  For a RW mount, this means that if an older volume
-    backup is selected, the newer ones that were not selected MUST be
-    formally invalidated and cannot be used in a remount attempt.  To
-    mitigate the potential loss of data, any volume headers lost in this
-    manner can be snapshotted and the freemap recovery scan (in a RW mount)
-    can also scan the snapshots to try to ensure that the blocks are marked
-    as allocated.  The system operator can then check the snapshot manually.
-
-    During normal operation, each filesystem flush rotates to a new backup
-    volume header (a filesystem has up to four) and retains full consistency
-    for the older volume headers.  Each logical freemap block in the topology
-    rotates through the 6 possible versions (on-modify only).
+    logical freemap topology will iterate through up to 8 copies whose
+    block numbers are taken from the reserved area.
+
+    - Four copies, one for each of the four volume headers which H2 sequences
+      through on each flush.  This ensures that a mount from any of the four
+      volume headers is handed a consistent freemap topology.
+
+    - One copy to ensure that recovery operations during mount do not modify
+      the state of the freemap topology pointed to by older volume headers
+      which are still valid.  Note that the freemap for volume headers
+      indexed after the mount point being recovered may lose freemap
+      consistency, so if you choose an older mount point for a RW mount,
+      you have to stick with it.
+
+    - One copy for live operations.  This allows HAMMER2 to retire the
+      related buffer (or for the OS to retire the buffer cache buffer)
+      prior to the next flush and also allows the buffers to be flushed
+      asynchronously.
+
+    - The two remaining copies add robustness to the specification.  For
+      example, with appropriate feature code the filesystem can tolerate
+      a limited number of bad blocks in the reserved area.
+
+    For the moment we use a simple calculation for the freemap block.  In
+    a later version I would like to mix the blocks up a bit so the blocks
+    in each set of 8 are not situated near each other.
+
+                               RW Mount Restrictions
+
+    If an older volume header is explicitly selected by the mount code, any
+    newer (presumably corrupt since the mount code didn't select it) volume
+    headers will lose freemap consistency as the freemap code rotates into
+    freemap blocks that might have been used by the topology pointed to by
+    the newer (but not selected) volume headers.  For a RW mount, this means
+    that if an older volume header is selected, the newer ones that were
+    not selected WILL be formally invalidated by the mount code and cannot
+    be used in a remount attempt.
+
+    During normal operation, each filesystem flush rotates to a new volume
+    header.  A filesystem may have up to four volume headers spread at 2GB
+    intervals.  Filesystems smaller than ~9GB or so will have fewer volume
+    headers to rotate through.
 
                                Freemap Topology
 
     simplify the algorithm and to ensure freemap locality to the blocks
     under management.
 
+    Freemap blocks are allocated from the reserved area in each 2GB zone.
+    The leafs represent data in the zone.  Higher levels in the freemap
+    topology will cover more area but the physical freemap meta-data blocks
+    always occur prior to the area being covered.  Thus a HAMMER2 filesystem
+    of almost any size can be formatted and the related freemap blocks
+    will always exist.
+
     Level 1 - (radix 10 + 21) 64KB representing 2GB.  This is represented
              by a hammer2_bmap_data[1024] array.  Each entry represents
              2MB worth of media storage x 1024 entries to represent 2GB.
              volume header).
 
     Each level is assign reserved blocks in the 4MB header per 2GB zone.
-    Since we use block 0 for the volume header / volume header backup,
-    our level names above can simply also represent the relative block
-    number.  Level 1 uses block 1 through level 4 using block 4.  Level 5
-    is stored in the volume header.
+    Since we use block 0 for the volume header, the first freemap reserved
+    block in the zone begins at block 1.
+
+    Freemap copy #0:
+       Level 1 uses block 1 (this is the leaf block)
+       Level 2 uses block 2
+       Level 3 uses block 3
+       Level 4 uses block 4
+
+    Freemap copy #1:
+       Level 1 uses block 5 (this is the leaf block)
+       Level 2 uses block 6
+       Level 3 uses block 7
+       Level 4 uses block 8
+
+    ... and so forth up to Freemap copy #7 using blocks 29, 30, 31, and 32.
 
                                    Flushing
 
     The freemap does not have to be flushed by fsync/sync, but should probably
     be flushed at least once a minute by the normal filesystem sync.  The
-    reason it does not have to be flushed every time is that the freemap
-    recovery (using the last fully flushed freemap TID) will simply do an
-    incremental scan of the main filesystem tree between the freemap TID
-    and the main filesystem tree's TID to ensure that blocks allocated in
-    the interim are properly allocated in the freemap.  Simple as that.
+    reason it does not have to be flushed every time is that freemap recovery
+    is executed on-mount and will use the last fully flushed freemap TID
+    stored in the volume header to do an incremental meta-data scan of the
+    H2 filesystem between that TID and the last flushed TID.  All blocks not
+    found to have been marked allocated will be marked allocated.  Simple as
+    that.
 
                                Freemap Granularity
 
     blocks to appear fully allocated until some later date when the
     bulk scan code defragments it.
 
-                                  Block Selection
+                                Block Selection
 
     Block selection is localized to be near the inode's (or nearby data)
     blockref.  The algorithmic complexity of determining locality is not
     defined here atm.
 
-                               Leaf Substructure
+                            Freemap Leaf Substructure
+
+    * linear - Linear sub-granular allocation offset.  Allows ~1KB granular
+              linear allocations.
+
+    * class  - Allocation clustering class ((type << 8) | radix).
 
-    * radix  - Clustering radix.  All allocations for any given ~2MB zone
-              are always the same size, allowing the filesystem code to
-              cluster buffer cache I/O.
+    * avail  - Available space in bytes, currently only used by layer 1 leaf.
+              Used as an allocation clustering aid.
 
-    * bitmap - four 32 bit words representing ~2MB in 16KB allocation chunks
+    * bitmap - Eight 32 bit words representing ~2MB in 16KB allocation chunks
               at 2 bits per chunk.  The filesystem allocation granularity
               can be smaller (currently ~1KB minimum), and the live
-              filesystem keeps caches iterations when allocating multiple
-              chunks.  However, on remount any partial allocations out of
-              a 64KB allocation block causes the entire 64KB to be
-              considered allocated.  Fragmented space can potentially be
-              reclaimed and/or relocated by the bulk block free scan.
+              filesystem caches iterations when allocating multiple chunks.
+              However, on remount any partial allocations out of a 64KB
+              allocation block MAY cause the entire 64KB to be considered
+              allocated.  Fragmented space can potentially be reclaimed
+              and/or relocated by the bulk block free scan.
 
               The 2-bit bitmap fields are assigned as follows:
 
               00       FREE
-              01       ARMED for free stage (future use)
-              10       ARMED for free stage (future use)
+              01       POSSIBLY FREE (type 1)
+              10       POSSIBLY FREE (type 2)
               11       ALLOCATED
 
-              It should be noted that in some cases, such as snapshot
-              destruction, H2 does not bother to actually ARM the related
-              blocks (which would take a long time).  Instead, the bulk
-              free-scan may have to do a more exhaustive scan.
+                         Freemap Metadata Substructure
+                            (Levels 2, 3, 4, and 5)
 
-                             Blockref Substructure
-
-    The blockref substructure at each level steals some space from the
-    check code area (a 24-byte area).  We only need 4 bytes for the check
-    code icrc.  We use some of the remaining space to store information
-    that allows the block allocator to do its work more efficiently.
+    Freemap layers 2, 3, 4, and 5 operate as arrays of blockrefs but steal
+    some of the check area (a 24-byte area) for freemap-specific meta-data.
+    We reserve a few fields to store information which allows the block
+    allocator to do its work more efficiently.
 
     * bigmask - A mask of radixes available for allocation under this
                blockref.  Typically initialized to -1.
 
     The freemap allocator uses a cylinder-group-like abstraction using
     the localized allocation concept first implemented by UFS.  In HAMMER2
-    there is no such thing as a real cylinder group, but we do the next
-    best thing by implementing our layer 1 blockmap representing 2GB.
-
-                               Level 2, 3, 4, 5
+    there is no such thing as a real cylinder group, nor are there specific
+    reserved areas for inodes vs data, but we do the next best thing by
+    roughly typing leafs (each leaf representing ~2MB) to hopefully allow
+    the drive to employ its zone-cache to make both stat-only and tar-style
+    bulk accesses efficient (in addition to normal file accesses).
 
     Levels 2, 3, and 4 contains an array blockmap[1024] (64KB total),
-    supplying 10 bits of address space each.  Level 5 is a blockmap[8] stored
-    in the volume header supplying 3 bits of address space.  (level 0
-    supplies 10 + 21 bits of address space).
+    supplying 10 bits of address space each.  Level 5 is a blockmap[8]
+    stored in the volume header supplying 3 bits of address space.
+    (level 0 supplies 10 + 21 bits of address space).
 
     The Level1 blockmap is HAMMER2's idea of a 'cylinder group', thus
     effectively fixed at multiples of ~2MB or so.
 
                        How blocks are allocated and freed
 
-    H2 keeps track of sub-16KB allocations in-memory.  On a crash/reboot any
-    partial allocations effectively become full 16KB block allocations until
-    the bulk freeing code comes along and fixes it.  2-bit patterns are as
-    follows:
-
-       00      FREE
-       01      ARMED (for free) (future use)
-       10      ARMED (for free) (future use)
-       11      ALLOCATED
-
-    Currently H2 only implements 00 and 11.  When a file, topology, or
-    snapshot is deleted H2 simply leaves the blocks marked allocated but
-    records the related freezone/radix(s) in memory.
-
-    At some point a background bulk free-scan will run.  This code must
-    scan meta-data and has a limited cache to detect duplicative sub-trees
-    (due to snapshots).  It uses the freezone/radix information recorded
-    in memory to reduce the complexity of the scan, find all references to
-    the related blocks in the meta-data, and determines what can actually
-    be freed.  Once this determination is made the bulk free-scan sets
-    the related freemap bits to FREE (00).
-
-    An exhaustive free-scan is not usually required during normal operation
-    but is typically run incrementally by cron every so often to ensure, over
-    time, that all freeable blocks are actually freed.  This is most useful
-    when maintaining multiple snapshots.
+    The H2 freemap leaf bitmap operates in 16KB chunks, but the leaf also
+    contains a linear allocation offset that can keep track of sub-16KB
+    allocations with certain restrictions.  More random sub-16KB allocations
+    are tracked in-memory, but will be lost (assumed to be a full 16KB) if
+    a crash occurs.
+
+    NOTE!  All operations on the freemap occur on the current live version
+          of the freemap, including bulkfree operations.
+
+    Blocks are allocated by transitioning the 2-bit pattern in the leaf
+    to 11.  That is, (00, 01, 10) -> (11).  This handles races between the
+    live allocator and the asynchronous bulkfree code.  A live allocation
+    which occurs while the asynchronous bulkfree process is running will
+    operate race-free by transitioning the (01) and (10) states back
+    to (11), which prevents bulkfree from later marking those blocks
+    FREE (00).
+
+    Blocks can be freed several ways, but all block freeing operations
+    require at least two passes before the related blocks can actually be
+    reused.
+
+    Method #1 - Removal in the filesystem marks the related freemap bitmap
+               POSSIBLY FREE (either 01 or 10).  The asynchronous bulkfree
+               process later determines that the block is actually free and
+               transitions it to FREE (00), or moves it back to
+               ALLOCATED (11).
+
+               This works whether the blocks can actually be freed or not,
+               so we don't care if the related blocks are part of some other
+               snapshot or not.  bulkfree will figure it out.
+
+    Method #2 - Removal in the filesystem ignores the freemap.  The freemap
+               blocks are left alone (typically ALLOCATED (11)).
+
+               In this situation bulkfree must make extra passes to determine
+               if blocks are possibly free, then transition the leaf bitmap
+               entries to POSSIBLY FREE (01 or 10).  bulkfree cannot directly
+               transition the entries to FREE (00) without another pass.
+
+               However, this method has numerous advantages including making
+               snapshot manipulation (including deletions) instantaneous
+               and allowing whole subtrees and/or large-files to be rm -rf'd
+               with only a single disk write to update the inode in the
+               best case.
+
+    Method #3 - Brute force.  *ALL* freemap bitmap entries are marked
+               POSSIBLY FREE and bulkfree then must do multiple passes
+               (particularly in order to ensure that its memory use remains
+               bounded) to again transition all the freemap bitmap entries
+               to either FREE (00) or ALLOCATED (11).
+
+               This method can be faster than #2 but wastes a considerable
+               amount of write-bandwidth (and SSD wear if the target drive
+               is a SSD).
+
+    In all cases the bulkfree code must make a final pass on the filesystem
+    to do the final transition of POSSIBLY FREE blocks to FREE (00) or
+    ALLOCATED (11).  Again, races for the FREE (00) are handled by observing
+    if the bitmap code was moved to ALLOCATED (11) by the live system while
+    bulkfree ran asynchronously and not transitioning the element to FREE (00)
+    in that situation.
+
+    All bulkfree passes are done on meta-data.  Actual data blocks do not
+    need to be read unless the media is being verified.  H2 uses method #2
+    by default and efficiency depends on how much ram the system has to
+    cache scan information.  That said, the bulkfree process is not only
+    incremental but it is also interruptible and restartable.  It does not
+    interfere with live operations other than using disk bandwidth, so
+    there are several ways to run it including in the background.
+
+    The biggest issue is that *NO* space can be freed up by the live
+    filesystem without the bulkfree process unless we optimize the case
+    where data is created and deleted from within a single snapshot.
+    This is made more difficult by the fact that each flush represents
+    a fine-grained snapshot (up to four, representing the four volume
+    headers the flush iterates through).
+
+                     Snapshots and Replicated Topologies
+
+    The bulkfree code maintains information in-memory to the best of its
+    ability for a multitude of reasons, including attempting to detect
+    snapshot recursions down block chains which have already been scanned
+    via some other snapshot.  Without this, a large number of snapshots
+    can cause a huge multiplication of disk I/O reads (but not writes) during
+    the topology scan.
 
                        Use of Generic indirect-block API
 
 
     The Freemap is defined above as a fixed 5-level scheme (level 1-5),
     but in actual operation the radix tree can be shortcut just as it
-    is with normal files.  However, shorcuts are forced into the radix
-    values of this specification and reserved blocks are calculated based
-    on the radix level and offset, so as the freemap becomes more fleshed
-    out the tree looks more and more like the specification.
+    is with normal files.  However, unlike normal files, shortcuts will
+    be forced to use specific radix values in order to guarantee that
+    reserved block numbers can be trivially calculated.  As the freemap
+    becomes more fleshed out the tree on-media will look more and more like
+    the actual specification.
 
     One advantage of doing things this way is that smaller filesystems
-    won't actually use a 6-level scheme.  A 16GB filesystem can use 8
-    blockrefs at layer 5 (in the volume header) that point directly to
-    layer 1.  A 16TB filesystem can use 8 blockrefs at layer5 that point
-    to layer 2.  And so forth.
+    won't actually use a 5-level scheme.  A 16GB filesystem can use 8
+    blockrefs in the volume header which point directly to layer 1 leaf
+    blocks.  A 16TB filesystem can be managed with only three levels
+    (layer 3, 2, and 1 only where the 8 x layer 3 blockrefs are stored in
+    the volume header).  And so forth.
 
     At the moment we have no plans to return any of the unused 4MB zone
     header space (per 2GB of storage) back to the filesystem for general use.
     There are lots of things we may want to use the reserved areas for in
     the future.
+
+                               Emergency Deletions
+
+    All filesystem modifications including deletions must allocate blocks
+    in order to update the main topology all the way to the root.  H2 will
+    reserve roughly 5% of the available blocks in the filesystem for
+    deletions in order to allow a system operator to recover from a
+    filesystem full condition.
+
+    Despite this, situations may come up, due to having snapshots, where
+    deletions eat up available blocks but fail to create freeable space.
+    When this situation occurs the system operator may be forced to issue
+    emergency in-place deletions which replace existing blocks rather than
+    allocate new blocks.  For the moment the spec for dealing with these
+    situations remains incomplete.
index 3c497d6..e5fbec5 100644 (file)
@@ -21,9 +21,6 @@
 
 * inode always contains target cluster/chain, not hardlink
 
-* cluster_modify_ip -> data returned mod to all chains
-* and hammer2_cluster_data() -> same thing
-
 * chain refs in cluster, cluster refs
 
 * check inode shared lock ... can end up in endless loop if following
index 61d8ab3..7f2051c 100644 (file)
@@ -94,6 +94,8 @@
 #include "hammer2_ioctl.h"
 #include "hammer2_ccms.h"
 
+struct hammer2_io;
+struct hammer2_iocb;
 struct hammer2_chain;
 struct hammer2_cluster;
 struct hammer2_inode;
@@ -177,6 +179,7 @@ typedef uint32_t hammer2_xid_t;
 RB_HEAD(hammer2_chain_tree, hammer2_chain);
 TAILQ_HEAD(h2_flush_list, hammer2_chain);
 TAILQ_HEAD(h2_core_list, hammer2_chain);
+TAILQ_HEAD(h2_iocb_list, hammer2_iocb);
 
 #define CHAIN_CORE_DELETE_BMAP_ENTRIES \
        (HAMMER2_PBUFSIZE / sizeof(hammer2_blockref_t) / sizeof(uint32_t))
@@ -196,36 +199,63 @@ typedef struct hammer2_chain_core hammer2_chain_core_t;
 #define HAMMER2_CORE_UNUSED0001                0x0001
 #define HAMMER2_CORE_COUNTEDBREFS      0x0002
 
+RB_HEAD(hammer2_io_tree, hammer2_io);
+
 /*
- * H2 is a copy-on-write filesystem.  In order to allow chains to allocate
- * smaller blocks (down to 64-bytes), but improve performance and make
- * clustered I/O possible using larger block sizes, the kernel buffer cache
- * is abstracted via the hammer2_io structure.
+ * IOCB - IO callback (into chain, cluster, or manual request)
  */
-RB_HEAD(hammer2_io_tree, hammer2_io);
+struct hammer2_iocb {
+       TAILQ_ENTRY(hammer2_iocb) entry;
+       void (*callback)(struct hammer2_iocb *iocb);
+       struct hammer2_io       *dio;
+       struct hammer2_cluster  *cluster;
+       struct hammer2_chain    *chain;
+       void                    *ptr;
+       off_t                   lbase;
+       int                     lsize;
+       uint32_t                flags;
+       int                     error;
+};
+
+typedef struct hammer2_iocb hammer2_iocb_t;
+
+#define HAMMER2_IOCB_INTERLOCK 0x00000001
+#define HAMMER2_IOCB_ONQ       0x00000002
+#define HAMMER2_IOCB_DONE      0x00000004
+#define HAMMER2_IOCB_INPROG    0x00000008
+#define HAMMER2_IOCB_DIDBP     0x00000010      /* loaded dio->buf */
+#define HAMMER2_IOCB_QUICK     0x00010000
+#define HAMMER2_IOCB_ZERO      0x00020000
+#define HAMMER2_IOCB_READ      0x00040000
+#define HAMMER2_IOCB_WAKEUP    0x00080000
 
+/*
+ * DIO - Management structure wrapping system buffer cache.
+ *
+ *      Used for multiple purposes including concurrent management
+ *      of small requests by chains into larger DIOs.
+ */
 struct hammer2_io {
        RB_ENTRY(hammer2_io) rbnode;    /* indexed by device offset */
+       struct h2_iocb_list iocbq;
        struct spinlock spin;
        struct hammer2_mount *hmp;
        struct buf      *bp;
-       struct bio      *bio;
        off_t           pbase;
        int             psize;
-       void            (*callback)(struct hammer2_io *dio,
-                                   struct hammer2_cluster *cluster,
-                                   struct hammer2_chain *chain,
-                                   void *arg1, off_t arg2);
-       struct hammer2_cluster *arg_l;          /* INPROG I/O only */
-       struct hammer2_chain *arg_c;            /* INPROG I/O only */
-       void            *arg_p;                 /* INPROG I/O only */
-       off_t           arg_o;                  /* INPROG I/O only */
        int             refs;
        int             act;                    /* activity */
 };
 
 typedef struct hammer2_io hammer2_io_t;
 
+#define HAMMER2_DIO_INPROG     0x80000000      /* bio in progress */
+#define HAMMER2_DIO_GOOD       0x40000000      /* buf data is good */
+#define HAMMER2_DIO_WAITING    0x20000000      /* (old) */
+#define HAMMER2_DIO_DIRTY      0x10000000      /* flush on last drop */
+
+#define HAMMER2_DIO_MASK       0x0FFFFFFF
+
 /*
  * Primary chain structure keeps track of the topology in-memory.
  */
@@ -402,6 +432,7 @@ struct hammer2_cluster {
        struct hammer2_pfsmount *pmp;
        uint32_t                flags;
        int                     nchains;
+       hammer2_iocb_t          iocb;
        hammer2_chain_t         *focus;         /* current focus (or mod) */
        hammer2_chain_t         *array[HAMMER2_MAXCLUSTER];
        char                    missed[HAMMER2_MAXCLUSTER];
@@ -421,6 +452,10 @@ RB_HEAD(hammer2_inode_tree, hammer2_inode);
  *
  * NOTE: The inode's attribute CST which is also used to lock the inode
  *      is embedded in the chain (chain.cst) and aliased w/ attr_cst.
+ *
+ * NOTE: The inode-embedded cluster is never used directly for I/O (since
+ *      it may be shared).  Instead it will be replicated-in and synchronized
+ *      back out if changed.
  */
 struct hammer2_inode {
        RB_ENTRY(hammer2_inode) rbnode;         /* inumber lookup (HL) */
@@ -825,12 +860,12 @@ void hammer2_chain_core_alloc(hammer2_trans_t *trans, hammer2_chain_t *chain);
 void hammer2_chain_ref(hammer2_chain_t *chain);
 void hammer2_chain_drop(hammer2_chain_t *chain);
 int hammer2_chain_lock(hammer2_chain_t *chain, int how);
-void hammer2_chain_load_async(hammer2_cluster_t *cluster,
-                               void (*func)(hammer2_io_t *dio,
-                                            hammer2_cluster_t *cluster,
-                                            hammer2_chain_t *chain,
-                                            void *arg_p, off_t arg_o),
-                               void *arg_p);
+const hammer2_media_data_t *hammer2_chain_rdata(hammer2_chain_t *chain);
+hammer2_media_data_t *hammer2_chain_wdata(hammer2_chain_t *chain);
+
+void hammer2_cluster_load_async(hammer2_cluster_t *cluster,
+                               void (*callback)(hammer2_iocb_t *iocb),
+                               void *ptr);
 void hammer2_chain_moved(hammer2_chain_t *chain);
 void hammer2_chain_modify(hammer2_trans_t *trans,
                                hammer2_chain_t *chain, int flags);
@@ -909,11 +944,14 @@ int hammer2_ioctl(hammer2_inode_t *ip, u_long com, void *data,
 /*
  * hammer2_io.c
  */
-hammer2_io_t *hammer2_io_getblk(hammer2_mount_t *hmp, off_t lbase,
-                               int lsize, int *ownerp);
 void hammer2_io_putblk(hammer2_io_t **diop);
 void hammer2_io_cleanup(hammer2_mount_t *hmp, struct hammer2_io_tree *tree);
 char *hammer2_io_data(hammer2_io_t *dio, off_t lbase);
+void hammer2_io_getblk(hammer2_mount_t *hmp, off_t lbase, int lsize,
+                               hammer2_iocb_t *iocb);
+void hammer2_io_complete(hammer2_iocb_t *iocb);
+void hammer2_io_callback(struct bio *bio);
+void hammer2_iocb_wait(hammer2_iocb_t *iocb);
 int hammer2_io_new(hammer2_mount_t *hmp, off_t lbase, int lsize,
                                hammer2_io_t **diop);
 int hammer2_io_newnz(hammer2_mount_t *hmp, off_t lbase, int lsize,
@@ -922,14 +960,6 @@ int hammer2_io_newq(hammer2_mount_t *hmp, off_t lbase, int lsize,
                                hammer2_io_t **diop);
 int hammer2_io_bread(hammer2_mount_t *hmp, off_t lbase, int lsize,
                                hammer2_io_t **diop);
-void hammer2_io_breadcb(hammer2_mount_t *hmp, off_t lbase, int lsize,
-                               void (*callback)(hammer2_io_t *dio,
-                                                hammer2_cluster_t *arg_l,
-                                                hammer2_chain_t *arg_c,
-                                                void *arg_p, off_t arg_o),
-                               hammer2_cluster_t *arg_l,
-                               hammer2_chain_t *arg_c,
-                               void *arg_p, off_t arg_o);
 void hammer2_io_bawrite(hammer2_io_t **diop);
 void hammer2_io_bdwrite(hammer2_io_t **diop);
 int hammer2_io_bwrite(hammer2_io_t **diop);
@@ -971,7 +1001,7 @@ void hammer2_freemap_adjust(hammer2_trans_t *trans, hammer2_mount_t *hmp,
  */
 int hammer2_cluster_need_resize(hammer2_cluster_t *cluster, int bytes);
 uint8_t hammer2_cluster_type(hammer2_cluster_t *cluster);
-const hammer2_media_data_t *hammer2_cluster_data(hammer2_cluster_t *cluster);
+const hammer2_media_data_t *hammer2_cluster_rdata(hammer2_cluster_t *cluster);
 hammer2_media_data_t *hammer2_cluster_wdata(hammer2_cluster_t *cluster);
 hammer2_cluster_t *hammer2_cluster_from_chain(hammer2_chain_t *chain);
 int hammer2_cluster_modified(hammer2_cluster_t *cluster);
index 242b815..3a1e097 100644 (file)
@@ -729,81 +729,6 @@ hammer2_chain_lock(hammer2_chain_t *chain, int how)
 }
 
 /*
- * This basically calls hammer2_io_breadcb() but does some pre-processing
- * of the chain first to handle certain cases.
- */
-void
-hammer2_chain_load_async(hammer2_cluster_t *cluster,
-                        void (*callback)(hammer2_io_t *dio,
-                                         hammer2_cluster_t *cluster,
-                                         hammer2_chain_t *chain,
-                                         void *arg_p, off_t arg_o),
-                        void *arg_p)
-{
-       hammer2_chain_t *chain;
-       hammer2_mount_t *hmp;
-       struct hammer2_io *dio;
-       hammer2_blockref_t *bref;
-       int error;
-       int i;
-
-       /*
-        * If no chain specified see if any chain data is available and use
-        * that, otherwise begin an I/O iteration using the first chain.
-        */
-       chain = NULL;
-       for (i = 0; i < cluster->nchains; ++i) {
-               chain = cluster->array[i];
-               if (chain && chain->data)
-                       break;
-       }
-       if (i == cluster->nchains) {
-               chain = cluster->array[0];
-               i = 0;
-       }
-
-       if (chain->data) {
-               callback(NULL, cluster, chain, arg_p, (off_t)i);
-               return;
-       }
-
-       /*
-        * We must resolve to a device buffer, either by issuing I/O or
-        * by creating a zero-fill element.  We do not mark the buffer
-        * dirty when creating a zero-fill element (the hammer2_chain_modify()
-        * API must still be used to do that).
-        *
-        * The device buffer is variable-sized in powers of 2 down
-        * to HAMMER2_MIN_ALLOC (typically 1K).  A 64K physical storage
-        * chunk always contains buffers of the same size. (XXX)
-        *
-        * The minimum physical IO size may be larger than the variable
-        * block size.
-        */
-       bref = &chain->bref;
-       hmp = chain->hmp;
-
-       /*
-        * The getblk() optimization can only be used on newly created
-        * elements if the physical block size matches the request.
-        */
-       if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
-           chain->bytes == hammer2_devblksize(chain->bytes)) {
-               error = hammer2_io_new(hmp, bref->data_off, chain->bytes, &dio);
-               KKASSERT(error == 0);
-               callback(dio, cluster, chain, arg_p, (off_t)i);
-               return;
-       }
-
-       /*
-        * Otherwise issue a read
-        */
-       hammer2_adjreadcounter(&chain->bref, chain->bytes);
-       hammer2_io_breadcb(hmp, bref->data_off, chain->bytes,
-                          callback, cluster, chain, arg_p, (off_t)i);
-}
-
-/*
  * Unlock and deref a chain element.
  *
  * On the last lock release any non-embedded data (chain->dio) will be
@@ -1111,6 +1036,7 @@ hammer2_chain_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip,
 void
 hammer2_chain_modify(hammer2_trans_t *trans, hammer2_chain_t *chain, int flags)
 {
+       hammer2_blockref_t obref;
        hammer2_mount_t *hmp;
        hammer2_io_t *dio;
        int error;
@@ -1119,9 +1045,10 @@ hammer2_chain_modify(hammer2_trans_t *trans, hammer2_chain_t *chain, int flags)
        char *bdata;
 
        hmp = chain->hmp;
+       obref = chain->bref;
 
        /*
-        * data is not optional for freemap chains (we must always be sure
+        * Data is not optional for freemap chains (we must always be sure
         * to copy the data on COW storage allocations).
         */
        if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
@@ -1294,6 +1221,18 @@ skip2:
         */
        if (chain->parent)
                hammer2_chain_setflush(trans, chain->parent);
+
+       /*
+        * Adjust the freemap bitmap to indicate that the related blocks
+        * MIGHT be freeable.  Bulkfree must still determine that the blocks
+        * are actually freeable.
+        */
+       if (obref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE &&
+           obref.type != HAMMER2_BREF_TYPE_FREEMAP_LEAF &&
+           (obref.data_off & ~HAMMER2_OFF_MASK_RADIX)) {
+               hammer2_freemap_adjust(trans, hmp,
+                                      &obref, HAMMER2_FREEMAP_DOMAYFREE);
+       }
 }
 
 /*
@@ -2679,6 +2618,20 @@ _hammer2_chain_delete_helper(hammer2_trans_t *trans,
                 */
                atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
        }
+
+       /*
+        * If the deletion is permanent (i.e. the chain is not simply being
+        * moved within the topology), adjust the freemap to indicate that
+        * the block *might* be freeable.  bulkfree must still determine
+        * that it is actually freeable.
+        */
+       if ((flags & HAMMER2_DELETE_PERMANENT) &&
+           chain->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE &&
+           chain->bref.type != HAMMER2_BREF_TYPE_FREEMAP_LEAF &&
+           (chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX)) {
+               hammer2_freemap_adjust(trans, hmp, &chain->bref,
+                                      HAMMER2_FREEMAP_DOMAYFREE);
+       }
 }
 
 /*
@@ -3842,6 +3795,20 @@ hammer2_chain_wait(hammer2_chain_t *chain)
        tsleep(chain, 0, "chnflw", 1);
 }
 
+const hammer2_media_data_t *
+hammer2_chain_rdata(hammer2_chain_t *chain)
+{
+       KKASSERT(chain->data != NULL);
+       return (chain->data);
+}
+
+hammer2_media_data_t *
+hammer2_chain_wdata(hammer2_chain_t *chain)
+{
+       KKASSERT(chain->data != NULL);
+       return (chain->data);
+}
+
 /*
  * Set the check data for a chain.  This can be a heavy-weight operation
  * and typically only runs on-flush.  For file data check data is calculated
index 2386053..9d25a24 100644 (file)
@@ -77,23 +77,6 @@ hammer2_cluster_type(hammer2_cluster_t *cluster)
        return(cluster->focus->bref.type);
 }
 
-/*
- * NOTE: When modifying a cluster object via hammer2_cluster_wdata()
- *      and hammer2_cluster_modsync(), remember that block array
- *      entries are not copied to the elements of the cluster.
- */
-const hammer2_media_data_t *
-hammer2_cluster_data(hammer2_cluster_t *cluster)
-{
-       return(cluster->focus->data);
-}
-
-hammer2_media_data_t *
-hammer2_cluster_wdata(hammer2_cluster_t *cluster)
-{
-       return(cluster->focus->data);
-}
-
 int
 hammer2_cluster_modified(hammer2_cluster_t *cluster)
 {
@@ -274,6 +257,10 @@ hammer2_cluster_ref(hammer2_cluster_t *cluster)
 /*
  * Drop the caller's reference to the cluster.  When the ref count drops to
  * zero this function frees the cluster and drops all underlying chains.
+ *
+ * In-progress read I/Os are typically detached from the cluster once the
+ * first one returns (the remaining stay attached to the DIOs but are then
+ * ignored and drop naturally).
  */
 void
 hammer2_cluster_drop(hammer2_cluster_t *cluster)
@@ -529,7 +516,8 @@ hammer2_cluster_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip,
 }
 
 /*
- * Adjust the cluster's chains to allow modification.
+ * Adjust the cluster's chains to allow modification and adjust the
+ * focus.  Data will be accessible on return.
  */
 void
 hammer2_cluster_modify(hammer2_trans_t *trans, hammer2_cluster_t *cluster,
@@ -548,7 +536,9 @@ hammer2_cluster_modify(hammer2_trans_t *trans, hammer2_cluster_t *cluster,
 }
 
 /*
- * Synchronize modifications with other chains in a cluster.
+ * Synchronize modifications from the focus to other chains in a cluster.
+ * Convenient because nominal API users can just modify the contents of the
+ * focus (at least for non-blockref data).
  *
  * Nominal front-end operations only edit non-block-table data in a single
  * chain.  This code copies such modifications to the other chains in the
@@ -977,7 +967,7 @@ hammer2_cluster_snapshot(hammer2_trans_t *trans, hammer2_cluster_t *ocluster,
 {
        hammer2_mount_t *hmp;
        hammer2_cluster_t *ncluster;
-       const hammer2_inode_data_t *ipdata;
+       const hammer2_inode_data_t *ripdata;
        hammer2_inode_data_t *wipdata;
        hammer2_inode_t *nip;
        size_t name_len;
@@ -992,8 +982,11 @@ hammer2_cluster_snapshot(hammer2_trans_t *trans, hammer2_cluster_t *ocluster,
        name_len = strlen(pfs->name);
        lhc = hammer2_dirhash(pfs->name, name_len);
 
-       ipdata = &hammer2_cluster_data(ocluster)->ipdata;
-       opfs_clid = ipdata->pfs_clid;
+       /*
+        * Get the clid
+        */
+       ripdata = &hammer2_cluster_rdata(ocluster)->ipdata;
+       opfs_clid = ripdata->pfs_clid;
        hmp = ocluster->focus->hmp;
 
        /*
@@ -1043,7 +1036,7 @@ hammer2_cluster_snapshot(hammer2_trans_t *trans, hammer2_cluster_t *ocluster,
                /* XXX hack blockset copy */
                /* XXX doesn't work with real cluster */
                KKASSERT(ocluster->nchains == 1);
-               wipdata->u.blockset = ocluster->focus->data->ipdata.u.blockset;
+               wipdata->u.blockset = ripdata->u.blockset;
                hammer2_cluster_modsync(ncluster);
                for (i = 0; i < ncluster->nchains; ++i) {
                        if (ncluster->array[i])
@@ -1090,6 +1083,125 @@ hammer2_cluster_parent(hammer2_cluster_t *cluster)
 }
 
 /************************************************************************
+ *                             CLUSTER I/O                             *
+ ************************************************************************
+ *
+ *
+ * WARNING! blockref[] array data is not universal.  These functions should
+ *         only be used to access universal data.
+ *
+ * NOTE!    The rdata call will wait for at least one of the chain I/Os to
+ *         complete if necessary.  The I/Os should have already been
+ *         initiated by the cluster_lock/chain_lock operation.
+ *
+ *         The cluster must already be in a modified state before wdata
+ *         is called.  The data will already be available for this case.
+ */
+const hammer2_media_data_t *
+hammer2_cluster_rdata(hammer2_cluster_t *cluster)
+{
+       return(cluster->focus->data);
+}
+
+hammer2_media_data_t *
+hammer2_cluster_wdata(hammer2_cluster_t *cluster)
+{
+       KKASSERT(hammer2_cluster_modified(cluster));
+       return(cluster->focus->data);
+}
+
+/*
+ * Load async into independent buffer - used to load logical buffers from
+ * underlying device data.  The callback is made for the first validated
+ * data found, or NULL if no valid data is available.
+ *
+ * NOTE! The cluster structure is either unique or serialized (e.g. embedded
+ *      in the inode with an exclusive lock held), the chain structure may be
+ *      shared.
+ */
+void
+hammer2_cluster_load_async(hammer2_cluster_t *cluster,
+                          void (*callback)(hammer2_iocb_t *iocb), void *ptr)
+{
+       hammer2_chain_t *chain;
+       hammer2_iocb_t *iocb;
+       hammer2_mount_t *hmp;
+       hammer2_blockref_t *bref;
+       int i;
+
+       /*
+        * Try to find a chain whose data is already resolved.  If none can
+        * be found, start with the first chain.
+        */
+       chain = NULL;
+       for (i = 0; i < cluster->nchains; ++i) {
+               chain = cluster->array[i];
+               if (chain && chain->data)
+                       break;
+       }
+       if (i == cluster->nchains) {
+               chain = cluster->array[0];
+               i = 0;
+       }
+
+       iocb = &cluster->iocb;
+       iocb->callback = callback;
+       iocb->dio = NULL;               /* for already-validated case */
+       iocb->cluster = cluster;
+       iocb->chain = chain;
+       iocb->ptr = ptr;
+       iocb->lbase = (off_t)i;
+       iocb->flags = 0;
+       iocb->error = 0;
+
+       /*
+        * Data already validated
+        */
+       if (chain->data) {
+               callback(iocb);
+               return;
+       }
+
+       /*
+        * We must resolve to a device buffer, either by issuing I/O or
+        * by creating a zero-fill element.  We do not mark the buffer
+        * dirty when creating a zero-fill element (the hammer2_chain_modify()
+        * API must still be used to do that).
+        *
+        * The device buffer is variable-sized in powers of 2 down
+        * to HAMMER2_MIN_ALLOC (typically 1K).  A 64K physical storage
+        * chunk always contains buffers of the same size. (XXX)
+        *
+        * The minimum physical IO size may be larger than the variable
+        * block size.
+        */
+       bref = &chain->bref;
+       hmp = chain->hmp;
+
+#if 0
+       /* handled by callback? <- TODO XXX even needed for loads? */
+       /*
+        * The getblk() optimization for a 100% overwrite can only be used
+        * if the physical block size matches the request.
+        */
+       if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
+           chain->bytes == hammer2_devblksize(chain->bytes)) {
+               error = hammer2_io_new(hmp, bref->data_off, chain->bytes, &dio);
+               KKASSERT(error == 0);
+               iocb->dio = dio;
+               callback(iocb);
+               return;
+       }
+#endif
+
+       /*
+        * Otherwise issue a read
+        */
+       hammer2_adjreadcounter(&chain->bref, chain->bytes);
+       hammer2_io_getblk(hmp, bref->data_off, chain->bytes, iocb);
+}
+
+/************************************************************************
  *                         NODE FAILURES                               *
  ************************************************************************
  *
index 52426b1..94287a7 100644 (file)
@@ -998,3 +998,40 @@ again:
 done:
        hammer2_chain_unlock(parent);
 }
+
+/*
+ * Validate the freemap, in three stages.
+ *
+ * stage-1     ALLOCATED     -> POSSIBLY FREE
+ *             POSSIBLY FREE -> POSSIBLY FREE (type corrected)
+ *
+ *     This transitions bitmap entries from ALLOCATED to POSSIBLY FREE.
+ *     The POSSIBLY FREE state does not mean that a block is actually free
+ *     and may be transitioned back to ALLOCATED in stage-2.
+ *
+ *     This is typically done during normal filesystem operations when
+ *     something is deleted or a block is replaced.
+ *
+ *     This is also done by bulkfree in-bulk after a memory-bounded meta-data
+ *     scan to try to determine what might be freeable.
+ *
+ *     This can be done unconditionally through a freemap scan when the
+ *     intention is to brute-force recover the proper state of the freemap.
+ *
+ * stage-2     POSSIBLY FREE -> ALLOCATED      (scan metadata topology)
+ *
+ *     This is done by bulkfree during a meta-data scan to ensure that
+ *     all blocks still actually allocated by the filesystem are marked
+ *     as such.
+ *
+ *     NOTE! Live filesystem transitions to POSSIBLY FREE can occur while
+ *           the bulkfree stage-2 and stage-3 are running.  The live filesystem
+ *           will use the alternative POSSIBLY FREE type (2) to prevent
+ *           stage-3 from improperly transitioning unvetted possibly-free
+ *           blocks to FREE.
+ *
+ * stage-3     POSSIBLY FREE (type 1) -> FREE  (scan freemap)
+ *
+ *     This is done by bulkfree to finalize POSSIBLY FREE states.
+ *
+ */
index 5f79d3c..34e1894 100644 (file)
@@ -82,7 +82,7 @@ hammer2_inode_cmp(hammer2_inode_t *ip1, hammer2_inode_t *ip2)
 hammer2_cluster_t *
 hammer2_inode_lock_ex(hammer2_inode_t *ip)
 {
-       const hammer2_inode_data_t *ipdata;
+       const hammer2_inode_data_t *ripdata;
        hammer2_cluster_t *cluster;
        hammer2_chain_t *chain;
        int i;
@@ -113,10 +113,10 @@ hammer2_inode_lock_ex(hammer2_inode_t *ip)
        /*
         * Returned cluster must resolve hardlink pointers
         */
-       ipdata = &hammer2_cluster_data(cluster)->ipdata;
-       KKASSERT(ipdata->type != HAMMER2_OBJTYPE_HARDLINK);
+       ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
+       KKASSERT(ripdata->type != HAMMER2_OBJTYPE_HARDLINK);
        /*
-       if (ipdata->type == HAMMER2_OBJTYPE_HARDLINK &&
+       if (ripdata->type == HAMMER2_OBJTYPE_HARDLINK &&
            (cluster->focus->flags & HAMMER2_CHAIN_DELETED) == 0) {
                error = hammer2_hardlink_find(ip->pip, NULL, cluster);
                KKASSERT(error == 0);
@@ -147,7 +147,7 @@ hammer2_inode_unlock_ex(hammer2_inode_t *ip, hammer2_cluster_t *cluster)
 hammer2_cluster_t *
 hammer2_inode_lock_sh(hammer2_inode_t *ip)
 {
-       const hammer2_inode_data_t *ipdata;
+       const hammer2_inode_data_t *ripdata;
        hammer2_cluster_t *cluster;
        hammer2_chain_t *chain;
        int i;
@@ -177,10 +177,10 @@ hammer2_inode_lock_sh(hammer2_inode_t *ip)
        /*
         * Returned cluster must resolve hardlink pointers
         */
-       ipdata = &hammer2_cluster_data(cluster)->ipdata;
-       KKASSERT(ipdata->type != HAMMER2_OBJTYPE_HARDLINK);
+       ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
+       KKASSERT(ripdata->type != HAMMER2_OBJTYPE_HARDLINK);
        /*
-       if (ipdata->type == HAMMER2_OBJTYPE_HARDLINK &&
+       if (ripdata->type == HAMMER2_OBJTYPE_HARDLINK &&
            (cluster->focus->flags & HAMMER2_CHAIN_DELETED) == 0) {
                error = hammer2_hardlink_find(ip->pip, NULL, cluster);
                KKASSERT(error == 0);
@@ -333,7 +333,7 @@ hammer2_inode_drop(hammer2_inode_t *ip)
 struct vnode *
 hammer2_igetv(hammer2_inode_t *ip, hammer2_cluster_t *cparent, int *errorp)
 {
-       const hammer2_inode_data_t *ipdata;
+       const hammer2_inode_data_t *ripdata;
        hammer2_pfsmount_t *pmp;
        struct vnode *vp;
        ccms_state_t ostate;
@@ -342,7 +342,7 @@ hammer2_igetv(hammer2_inode_t *ip, hammer2_cluster_t *cparent, int *errorp)
        KKASSERT(pmp != NULL);
        *errorp = 0;
 
-       ipdata = &hammer2_cluster_data(cparent)->ipdata;
+       ripdata = &hammer2_cluster_rdata(cparent)->ipdata;
 
        for (;;) {
                /*
@@ -405,15 +405,15 @@ hammer2_igetv(hammer2_inode_t *ip, hammer2_cluster_t *cparent, int *errorp)
                        continue;
                }
 
-               switch (ipdata->type) {
+               switch (ripdata->type) {
                case HAMMER2_OBJTYPE_DIRECTORY:
                        vp->v_type = VDIR;
                        break;
                case HAMMER2_OBJTYPE_REGFILE:
                        vp->v_type = VREG;
-                       vinitvmio(vp, ipdata->size,
+                       vinitvmio(vp, ripdata->size,
                                  HAMMER2_LBUFSIZE,
-                                 (int)ipdata->size & HAMMER2_LBUFMASK);
+                                 (int)ripdata->size & HAMMER2_LBUFMASK);
                        break;
                case HAMMER2_OBJTYPE_SOFTLINK:
                        /*
@@ -422,25 +422,25 @@ hammer2_igetv(hammer2_inode_t *ip, hammer2_cluster_t *cparent, int *errorp)
                         * association.
                         */
                        vp->v_type = VLNK;
-                       vinitvmio(vp, ipdata->size,
+                       vinitvmio(vp, ripdata->size,
                                  HAMMER2_LBUFSIZE,
-                                 (int)ipdata->size & HAMMER2_LBUFMASK);
+                                 (int)ripdata->size & HAMMER2_LBUFMASK);
                        break;
                case HAMMER2_OBJTYPE_CDEV:
                        vp->v_type = VCHR;
                        /* fall through */
                case HAMMER2_OBJTYPE_BDEV:
                        vp->v_ops = &pmp->mp->mnt_vn_spec_ops;
-                       if (ipdata->type != HAMMER2_OBJTYPE_CDEV)
+                       if (ripdata->type != HAMMER2_OBJTYPE_CDEV)
                                vp->v_type = VBLK;
-                       addaliasu(vp, ipdata->rmajor, ipdata->rminor);
+                       addaliasu(vp, ripdata->rmajor, ripdata->rminor);
                        break;
                case HAMMER2_OBJTYPE_FIFO:
                        vp->v_type = VFIFO;
                        vp->v_ops = &pmp->mp->mnt_vn_fifo_ops;
                        break;
                default:
-                       panic("hammer2: unhandled objtype %d", ipdata->type);
+                       panic("hammer2: unhandled objtype %d", ripdata->type);
                        break;
                }
 
@@ -494,7 +494,7 @@ hammer2_inode_get(hammer2_pfsmount_t *pmp, hammer2_inode_t *dip,
         */
 again:
        for (;;) {
-               iptmp = &hammer2_cluster_data(cluster)->ipdata;
+               iptmp = &hammer2_cluster_rdata(cluster)->ipdata;
                nip = hammer2_inode_lookup(pmp, iptmp->inum);
                if (nip == NULL)
                        break;
@@ -533,7 +533,7 @@ again:
        nip->cluster.flags |= HAMMER2_CLUSTER_INODE;
        hammer2_cluster_replace(&nip->cluster, cluster);
 
-       nipdata = &hammer2_cluster_data(cluster)->ipdata;
+       nipdata = &hammer2_cluster_rdata(cluster)->ipdata;
        nip->inum = nipdata->inum;
        nip->size = nipdata->size;
        nip->mtime = nipdata->mtime;
@@ -622,7 +622,7 @@ hammer2_inode_create(hammer2_trans_t *trans, hammer2_inode_t *dip,
         */
 retry:
        cparent = hammer2_inode_lock_ex(dip);
-       dipdata = &hammer2_cluster_data(cparent)->ipdata;
+       dipdata = &hammer2_cluster_rdata(cparent)->ipdata;
        dip_uid = dipdata->uid;
        dip_gid = dipdata->gid;
        dip_mode = dipdata->mode;
@@ -793,7 +793,7 @@ hammer2_hardlink_shiftup(hammer2_trans_t *trans, hammer2_cluster_t *cluster,
        hammer2_blockref_t bref;
        int ddflag;
 
-       iptmp = &hammer2_cluster_data(cluster)->ipdata;
+       iptmp = &hammer2_cluster_rdata(cluster)->ipdata;
        lhc = iptmp->inum;
        KKASSERT((lhc & HAMMER2_DIRHASH_VISIBLE) == 0);
 
@@ -1013,9 +1013,9 @@ hammer2_inode_connect(hammer2_trans_t *trans,
                wipdata->name_key = lhc;
                wipdata->name_len = name_len;
                wipdata->target_type =
-                               hammer2_cluster_data(ocluster)->ipdata.type;
+                               hammer2_cluster_rdata(ocluster)->ipdata.type;
                wipdata->type = HAMMER2_OBJTYPE_HARDLINK;
-               wipdata->inum = hammer2_cluster_data(ocluster)->ipdata.inum;
+               wipdata->inum = hammer2_cluster_rdata(ocluster)->ipdata.inum;
                wipdata->version = HAMMER2_INODE_VERSION_ONE;
                wipdata->nlinks = 1;
                wipdata->op_flags = HAMMER2_OPFLAG_DIRECTDATA;
@@ -1186,7 +1186,7 @@ again:
                                     0, &ddflag);
        while (cluster) {
                if (hammer2_cluster_type(cluster) == HAMMER2_BREF_TYPE_INODE) {
-                       ripdata = &hammer2_cluster_data(cluster)->ipdata;
+                       ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
                        if (ripdata->name_len == name_len &&
                            bcmp(ripdata->filename, name, name_len) == 0) {
                                break;
@@ -1207,7 +1207,7 @@ again:
                error = ENOENT;
                goto done;
        }
-       ripdata = &hammer2_cluster_data(cluster)->ipdata;
+       ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
        type = ripdata->type;
        if (type == HAMMER2_OBJTYPE_HARDLINK) {
                hlink = 1;
@@ -1429,7 +1429,7 @@ hammer2_inode_install_hidden(hammer2_pfsmount_t *pmp)
         * fields.
         */
        cparent = hammer2_inode_lock_ex(pmp->iroot);
-       ripdata = &hammer2_cluster_data(cparent)->ipdata;
+       ripdata = &hammer2_cluster_rdata(cparent)->ipdata;
        dip_check_algo = ripdata->check_algo;
        dip_comp_algo = ripdata->comp_algo;
        ripdata = NULL;
@@ -1557,7 +1557,7 @@ hammer2_hardlink_consolidate(hammer2_trans_t *trans,
        int error;
 
        cluster = *clusterp;
-       ripdata = &hammer2_cluster_data(cluster)->ipdata;
+       ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
        if (nlinks == 0 &&                      /* no hardlink needed */
            (ripdata->name_key & HAMMER2_DIRHASH_VISIBLE)) {
                return (0);
@@ -1576,7 +1576,7 @@ hammer2_hardlink_consolidate(hammer2_trans_t *trans,
         * this is already a hardlink target, all we need to do is adjust
         * the link count.
         */
-       ripdata = &hammer2_cluster_data(cluster)->ipdata;
+       ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
        if (cdip == ip->pip &&
            (ripdata->name_key & HAMMER2_DIRHASH_VISIBLE) == 0) {
                if (nlinks) {
@@ -1611,7 +1611,7 @@ hammer2_hardlink_consolidate(hammer2_trans_t *trans,
 
        hammer2_cluster_delete(trans, cparent, cluster, 0);
 
-       ripdata = &hammer2_cluster_data(cluster)->ipdata;
+       ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
        KKASSERT(ripdata->type != HAMMER2_OBJTYPE_HARDLINK);
        if (ripdata->name_key & HAMMER2_DIRHASH_VISIBLE) {
                hammer2_cluster_t *ncluster;
@@ -1747,7 +1747,7 @@ hammer2_hardlink_find(hammer2_inode_t *dip,
        /*
         * Locate the hardlink.  pip is referenced and not locked.
         */
-       ipdata = &hammer2_cluster_data(cluster)->ipdata;
+       ipdata = &hammer2_cluster_rdata(cluster)->ipdata;
        lhc = ipdata->inum;
 
        /*
@@ -1873,7 +1873,7 @@ hammer2_inode_fsync(hammer2_trans_t *trans, hammer2_inode_t *ip,
        int dosync = 0;
        int ddflag;
 
-       ripdata = &hammer2_cluster_data(cparent)->ipdata;    /* target file */
+       ripdata = &hammer2_cluster_rdata(cparent)->ipdata;    /* target file */
 
        if (ip->flags & HAMMER2_INODE_MTIME) {
                wipdata = hammer2_cluster_modify_ip(trans, ip, cparent, 0);
index 0ce2ee3..2fe5be1 100644 (file)
@@ -41,7 +41,6 @@
  * using smaller allocations, without causing deadlocks.
  *
  */
-static void hammer2_io_callback(struct bio *bio);
 static int hammer2_io_cleanup_callback(hammer2_io_t *dio, void *arg);
 
 static int
@@ -63,21 +62,23 @@ struct hammer2_cleanupcb_info {
        int     count;
 };
 
-
 #define HAMMER2_DIO_INPROG     0x80000000
-#define HAMMER2_DIO_GOOD       0x40000000
-#define HAMMER2_DIO_WAITING    0x20000000
-#define HAMMER2_DIO_DIRTY      0x10000000
+#define HAMMER2_DIO_GOOD       0x40000000      /* buf/bio is good */
+#define HAMMER2_DIO_WAITING    0x20000000      /* iocb's queued */
+#define HAMMER2_DIO_DIRTY      0x10000000      /* flush on last drop */
 
 #define HAMMER2_DIO_MASK       0x0FFFFFFF
 
+#define HAMMER2_GETBLK_GOOD    0
+#define HAMMER2_GETBLK_QUEUED  1
+#define HAMMER2_GETBLK_OWNED   2
+
 /*
- * Acquire the requested dio, set *ownerp based on state.  If state is good
- * *ownerp is set to 0, otherwise *ownerp is set to DIO_INPROG and the
- * caller must resolve the buffer.
+ * Allocate/Locate the requested dio, reference it, issue or queue iocb.
  */
-hammer2_io_t *
-hammer2_io_getblk(hammer2_mount_t *hmp, off_t lbase, int lsize, int *ownerp)
+void
+hammer2_io_getblk(hammer2_mount_t *hmp, off_t lbase, int lsize,
+                 hammer2_iocb_t *iocb)
 {
        hammer2_io_t *dio;
        hammer2_io_t *xio;
@@ -111,6 +112,8 @@ hammer2_io_getblk(hammer2_mount_t *hmp, off_t lbase, int lsize, int *ownerp)
                dio->pbase = pbase;
                dio->psize = psize;
                dio->refs = 1;
+               spin_init(&dio->spin, "h2dio");
+               TAILQ_INIT(&dio->iocbq);
                spin_lock(&hmp->io_spin);
                xio = RB_INSERT(hammer2_io_tree, &hmp->iotree, dio);
                if (xio == NULL) {
@@ -130,87 +133,165 @@ hammer2_io_getblk(hammer2_mount_t *hmp, off_t lbase, int lsize, int *ownerp)
        /*
         * Obtain/Validate the buffer.
         */
+       iocb->dio = dio;
+
        for (;;) {
                refs = dio->refs;
                cpu_ccfence();
 
                /*
-                * Stop if the buffer is good.  Once set GOOD the flag cannot
-                * be cleared until refs drops to 0.
+                * Issue the iocb immediately if the buffer is already good.
+                * Once set, GOOD cannot be cleared until refs drops to 0.
                 */
                if (refs & HAMMER2_DIO_GOOD) {
-                       *ownerp = 0;
-                       goto done;
+                       iocb->callback(iocb);
+                       break;
                }
 
                /*
-                * We need to acquire the in-progress lock on the buffer
+                * Try to own the buffer.  If we cannot we queue the iocb.
                 */
                if (refs & HAMMER2_DIO_INPROG) {
-                       tsleep_interlock(dio, 0);
+                       spin_lock(&dio->spin);
                        if (atomic_cmpset_int(&dio->refs, refs,
                                              refs | HAMMER2_DIO_WAITING)) {
-                               tsleep(dio, PINTERLOCKED, "h2dio", 0);
+                               iocb->flags |= HAMMER2_IOCB_ONQ |
+                                              HAMMER2_IOCB_INPROG;
+                               TAILQ_INSERT_TAIL(&dio->iocbq, iocb, entry);
+                               spin_unlock(&dio->spin);
+                               break;
                        }
+                       spin_unlock(&dio->spin);
                        /* retry */
                } else {
                        if (atomic_cmpset_int(&dio->refs, refs,
                                              refs | HAMMER2_DIO_INPROG)) {
+                               iocb->flags |= HAMMER2_IOCB_INPROG;
+                               iocb->callback(iocb);
                                break;
                        }
+                       /* retry */
                }
                /* retry */
        }
+       if (dio->act < 5)
+               ++dio->act;
+}
+
+/*
+ * The iocb is done.
+ */
+void
+hammer2_io_complete(hammer2_iocb_t *iocb)
+{
+       hammer2_io_t *dio = iocb->dio;
+       uint32_t orefs;
+       uint32_t nrefs;
+       uint32_t oflags;
+       uint32_t nflags;
+
+       /*
+        * If IOCB_INPROG is not set then the completion was synchronous.
+        * We can set IOCB_DONE safely without having to worry about waiters.
+        * XXX
+        */
+       if ((iocb->flags & HAMMER2_IOCB_INPROG) == 0) {
+               iocb->flags |= HAMMER2_IOCB_DONE;
+               return;
+       }
 
        /*
-        * We need to do more work before the buffer is usable
+        * bp is held for all comers, make sure the lock is not owned by
+        * a particular thread.
         */
-       *ownerp = HAMMER2_DIO_INPROG;
-done:
-       if (dio->act < 5)
-               ++dio->act;
-       return(dio);
+       if (iocb->flags & HAMMER2_IOCB_DIDBP)
+               BUF_KERNPROC(dio->bp);
+
+       /*
+        * Set the GOOD bit on completion with no error if dio->bp is
+        * not NULL.  Only applicable if INPROG was set.
+        */
+       if (dio->bp && iocb->error == 0)
+               atomic_set_int(&dio->refs, HAMMER2_DIO_GOOD);
+
+       for (;;) {
+               oflags = iocb->flags;
+               cpu_ccfence();
+               nflags = oflags;
+               nflags &= ~(HAMMER2_IOCB_DIDBP |
+                           HAMMER2_IOCB_WAKEUP |
+                           HAMMER2_IOCB_INPROG);
+               nflags |= HAMMER2_IOCB_DONE;
+
+               if (atomic_cmpset_int(&iocb->flags, oflags, nflags)) {
+                       if (oflags & HAMMER2_IOCB_WAKEUP)
+                               wakeup(iocb);
+                       /* SMP: iocb is now stale */
+                       break;
+               }
+       }
+       iocb = NULL;
+
+       /*
+        * Now finish up the dio.  If another iocb is pending chain to it,
+        * otherwise clear INPROG (and WAITING).
+        */
+       for (;;) {
+               orefs = dio->refs;
+               nrefs = orefs & ~(HAMMER2_DIO_WAITING | HAMMER2_DIO_INPROG);
+
+               if ((orefs & HAMMER2_DIO_WAITING) && TAILQ_FIRST(&dio->iocbq)) {
+                       spin_lock(&dio->spin);
+                       iocb = TAILQ_FIRST(&dio->iocbq);
+                       if (iocb) {
+                               TAILQ_REMOVE(&dio->iocbq, iocb, entry);
+                               spin_unlock(&dio->spin);
+                               iocb->callback(iocb);   /* chained */
+                               break;
+                       }
+                       spin_unlock(&dio->spin);
+                       /* retry */
+               } else if (atomic_cmpset_int(&dio->refs, orefs, nrefs)) {
+                       break;
+               } /* else retry */
+               /* retry */
+       }
+       /* SMP: dio is stale now */
 }
 
 /*
- * If part of an asynchronous I/O the asynchronous I/O is biodone()'d.
  *
- * If the caller owned INPROG then the dio will be set GOOD or not
- * depending on whether the caller disposed of dio->bp or not.
  */
-static
 void
-hammer2_io_complete(hammer2_io_t *dio, int owner)
+hammer2_iocb_wait(hammer2_iocb_t *iocb)
 {
-       int refs;
-       int good;
+       uint32_t oflags;
+       uint32_t nflags;
 
-       while (owner & HAMMER2_DIO_INPROG) {
-               refs = dio->refs;
+       for (;;) {
+               oflags = iocb->flags;
                cpu_ccfence();
-               good = dio->bp ? HAMMER2_DIO_GOOD : 0;
-               if (atomic_cmpset_int(&dio->refs, refs,
-                                     (refs & ~(HAMMER2_DIO_WAITING |
-                                               HAMMER2_DIO_INPROG)) |
-                                     good)) {
-                       if (refs & HAMMER2_DIO_WAITING)
-                               wakeup(dio);
-                       if (good)
-                               BUF_KERNPROC(dio->bp);
+               nflags = oflags | HAMMER2_IOCB_WAKEUP;
+               if (oflags & HAMMER2_IOCB_DONE)
                        break;
+               tsleep_interlock(iocb, 0);
+               if (atomic_cmpset_int(&iocb->flags, oflags, nflags)) {
+                       tsleep(iocb, PINTERLOCKED, "h2iocb", hz);
                }
-               /* retry */
        }
+
 }
 
 /*
- * Release our ref on *diop, dispose of the underlying buffer.
+ * Release our ref on *diop, dispose of the underlying buffer, and flush
+ * on last drop if it was dirty.
  */
 void
 hammer2_io_putblk(hammer2_io_t **diop)
 {
        hammer2_mount_t *hmp;
        hammer2_io_t *dio;
+       hammer2_iocb_t iocb;
        struct buf *bp;
        off_t peof;
        off_t pbase;
@@ -220,6 +301,9 @@ hammer2_io_putblk(hammer2_io_t **diop)
        dio = *diop;
        *diop = NULL;
 
+       /*
+        * Drop refs, on 1->0 transition clear flags, set INPROG.
+        */
        for (;;) {
                refs = dio->refs;
 
@@ -242,22 +326,18 @@ hammer2_io_putblk(hammer2_io_t **diop)
        }
 
        /*
-        * Locked INPROG on 1->0 transition and we cleared DIO_GOOD (which is
-        * legal only on the last ref).  This allows us to dispose of the
-        * buffer.  refs is now 0.
+        * We have set DIO_INPROG to gain control of the buffer and we have
+        * cleared DIO_GOOD to prevent other accessors from thinking it is
+        * still good.
         *
-        * The instant we call io_complete dio is a free agent again and
-        * can be ripped out from under us.  Acquisition of the dio after
-        * this point will require a shared or exclusive spinlock.
+        * We can now dispose of the buffer, and should do it before calling
+        * io_complete() in case there's a race against a new reference
+        * which causes io_complete() to chain and instantiate the bp again.
         */
-       hmp = dio->hmp;
-       bp = dio->bp;
-       dio->bp = NULL;
        pbase = dio->pbase;
        psize = dio->psize;
-       atomic_add_int(&hmp->iofree_count, 1);
-       hammer2_io_complete(dio, HAMMER2_DIO_INPROG);   /* clears INPROG */
-       dio = NULL;     /* dio stale */
+       bp = dio->bp;
+       dio->bp = NULL;
 
        if (refs & HAMMER2_DIO_GOOD) {
                KKASSERT(bp != NULL);
@@ -275,9 +355,31 @@ hammer2_io_putblk(hammer2_io_t **diop)
                } else {
                        bqrelse(bp);
                }
+       } else if (bp) {
+               if (refs & HAMMER2_DIO_DIRTY) {
+                       bdwrite(bp);
+               } else {
+                       brelse(bp);
+               }
        }
 
        /*
+        * The instant we call io_complete dio is a free agent again and
+        * can be ripped out from under us.
+        *
+        * We can clean up our final DIO_INPROG by simulating an iocb
+        * completion.
+        */
+       hmp = dio->hmp;                         /* extract fields */
+       atomic_add_int(&hmp->iofree_count, 1);
+       cpu_ccfence();
+
+       iocb.dio = dio;
+       iocb.flags = HAMMER2_IOCB_INPROG;
+       hammer2_io_complete(&iocb);
+       dio = NULL;                             /* dio stale */
+
+       /*
         * We cache free buffers so re-use cases can use a shared lock, but
         * if too many build up we have to clean them out.
         */
@@ -297,7 +399,7 @@ hammer2_io_putblk(hammer2_io_t **diop)
 }
 
 /*
- * Cleanup any dio's with no references which are not in-progress.
+ * Clean up any dio's with (INPROG | refs) == 0.
  */
 static
 int
@@ -336,6 +438,9 @@ hammer2_io_cleanup(hammer2_mount_t *hmp, struct hammer2_io_tree *tree)
        }
 }
 
+/*
+ * Returns a pointer to the requested data.
+ */
 char *
 hammer2_io_data(hammer2_io_t *dio, off_t lbase)
 {
@@ -349,157 +454,188 @@ hammer2_io_data(hammer2_io_t *dio, off_t lbase)
        return(bp->b_data + off);
 }
 
+/*
+ * Helpers for hammer2_io_new*() functions
+ */
 static
-int
-_hammer2_io_new(hammer2_mount_t *hmp, off_t lbase, int lsize,
-               hammer2_io_t **diop, int dozero, int quick)
+void
+hammer2_iocb_new_callback(hammer2_iocb_t *iocb)
 {
-       hammer2_io_t *dio;
-       int owner;
-       int error;
+       hammer2_io_t *dio = iocb->dio;
+       int gbctl = (iocb->flags & HAMMER2_IOCB_QUICK) ? GETBLK_NOWAIT : 0;
 
-       dio = *diop = hammer2_io_getblk(hmp, lbase, lsize, &owner);
-       if (owner) {
-               if (lsize == dio->psize) {
-                       dio->bp = getblk(hmp->devvp,
-                                            dio->pbase, dio->psize,
-                                            (quick ? GETBLK_NOWAIT : 0),
-                                            0);
-                       if (dio->bp) {
-                               vfs_bio_clrbuf(dio->bp);
-                               if (quick) {
-                                       dio->bp->b_flags |= B_CACHE;
-                                       bqrelse(dio->bp);
+       /*
+        * If INPROG is not set the dio already has a good buffer and we
+        * can't mess with it other than to zero the requested range.
+        *
+        * If INPROG is set we may have to getblk or read the buffer first.
+        */
+       if (iocb->flags & HAMMER2_IOCB_INPROG) {
+               if ((iocb->flags & HAMMER2_IOCB_READ) == 0) {
+                       if (iocb->lsize == dio->psize) {
+                               /*
+                                * Fully covered buffer, try to optimize to
+                                * avoid any I/O.
+                                */
+                               if (dio->bp == NULL) {
+                                       dio->bp = getblk(dio->hmp->devvp,
+                                                        dio->pbase, dio->psize,
+                                                        gbctl, 0);
+                               }
+                               if (dio->bp) {
+                                       vfs_bio_clrbuf(dio->bp);
+                                       if (iocb->flags & HAMMER2_IOCB_QUICK) {
+                                               dio->bp->b_flags |= B_CACHE;
+                                               bqrelse(dio->bp);
+                                               dio->bp = NULL;
+                                       }
+                               }
+                       } else if (iocb->flags & HAMMER2_IOCB_QUICK) {
+                               /*
+                                * Partial buffer, quick mode.  Do nothing.
+                                */
+                       } else if (dio->bp == NULL ||
+                                  (dio->bp->b_flags & B_CACHE) == 0) {
+                               /*
+                                * Partial buffer, normal mode, requires
+                                * read-before-write.  Chain the read.
+                                */
+                               if (dio->bp) {
+                                       if (dio->refs & HAMMER2_DIO_DIRTY)
+                                               bdwrite(dio->bp);
+                                       else
+                                               bqrelse(dio->bp);
                                        dio->bp = NULL;
                                }
-                       }
-                       error = 0;
-               } else if (quick) {
-                       /* do nothing */
-                       error = 0;
-               } else {
-                       error = bread(hmp->devvp, dio->pbase,
-                                     dio->psize, &dio->bp);
-               }
-               if (error) {
-                       brelse(dio->bp);
-                       dio->bp = NULL;
+                               iocb->flags |= HAMMER2_IOCB_READ;
+                               breadcb(dio->hmp->devvp,
+                                       dio->pbase, dio->psize,
+                                       hammer2_io_callback, iocb);
+                               return;
+                       } /* else buffer is good */
                }
-               hammer2_io_complete(dio, owner);
-       } else {
-               error = 0;
        }
        if (dio->bp) {
-               if (dozero)
-                       bzero(hammer2_io_data(dio, lbase), lsize);
+               if (iocb->flags & HAMMER2_IOCB_ZERO)
+                       bzero(hammer2_io_data(dio, iocb->lbase), iocb->lsize);
                atomic_set_int(&dio->refs, HAMMER2_DIO_DIRTY);
        }
-       return error;
+       hammer2_io_complete(iocb);
+}
+
+static
+int
+_hammer2_io_new(hammer2_mount_t *hmp, off_t lbase, int lsize,
+               hammer2_io_t **diop, int flags)
+{
+       hammer2_iocb_t iocb;
+       hammer2_io_t *dio;
+
+       iocb.callback = hammer2_iocb_new_callback;
+       iocb.cluster = NULL;
+       iocb.chain = NULL;
+       iocb.ptr = NULL;
+       iocb.lbase = lbase;
+       iocb.lsize = lsize;
+       iocb.flags = flags;
+       iocb.error = 0;
+       hammer2_io_getblk(hmp, lbase, lsize, &iocb);
+       if ((iocb.flags & HAMMER2_IOCB_DONE) == 0)
+               hammer2_iocb_wait(&iocb);
+       dio = *diop = iocb.dio;
+
+       return (iocb.error);
 }
 
 int
 hammer2_io_new(hammer2_mount_t *hmp, off_t lbase, int lsize,
               hammer2_io_t **diop)
 {
-       return(_hammer2_io_new(hmp, lbase, lsize, diop, 1, 0));
+       return(_hammer2_io_new(hmp, lbase, lsize, diop, HAMMER2_IOCB_ZERO));
 }
 
 int
 hammer2_io_newnz(hammer2_mount_t *hmp, off_t lbase, int lsize,
               hammer2_io_t **diop)
 {
-       return(_hammer2_io_new(hmp, lbase, lsize, diop, 0, 0));
+       return(_hammer2_io_new(hmp, lbase, lsize, diop, 0));
 }
 
 int
 hammer2_io_newq(hammer2_mount_t *hmp, off_t lbase, int lsize,
               hammer2_io_t **diop)
 {
-       return(_hammer2_io_new(hmp, lbase, lsize, diop, 0, 1));
+       return(_hammer2_io_new(hmp, lbase, lsize, diop, HAMMER2_IOCB_QUICK));
 }
 
-int
-hammer2_io_bread(hammer2_mount_t *hmp, off_t lbase, int lsize,
-               hammer2_io_t **diop)
+static
+void
+hammer2_iocb_bread_callback(hammer2_iocb_t *iocb)
 {
-       hammer2_io_t *dio;
+       hammer2_io_t *dio = iocb->dio;
        off_t peof;
-       int owner;
        int error;
 
-       dio = *diop = hammer2_io_getblk(hmp, lbase, lsize, &owner);
-       if (owner) {
+       if (iocb->flags & HAMMER2_IOCB_INPROG) {
                if (hammer2_cluster_enable) {
                        peof = (dio->pbase + HAMMER2_SEGMASK64) &
                               ~HAMMER2_SEGMASK64;
-                       error = cluster_read(hmp->devvp, peof, dio->pbase,
+                       error = cluster_read(dio->hmp->devvp, peof, dio->pbase,
                                             dio->psize,
                                             dio->psize, HAMMER2_PBUFSIZE*4,
                                             &dio->bp);
                } else {
-                       error = bread(hmp->devvp, dio->pbase,
+                       error = bread(dio->hmp->devvp, dio->pbase,
                                      dio->psize, &dio->bp);
                }
                if (error) {
                        brelse(dio->bp);
                        dio->bp = NULL;
                }
-               hammer2_io_complete(dio, owner);
-       } else {
-               error = 0;
        }
-       return error;
+       hammer2_io_complete(iocb);
 }
 
-void
-hammer2_io_breadcb(hammer2_mount_t *hmp, off_t lbase, int lsize,
-                 void (*callback)(hammer2_io_t *dio,
-                                  hammer2_cluster_t *arg_l,
-                                  hammer2_chain_t *arg_c,
-                                  void *arg_p, off_t arg_o),
-                 hammer2_cluster_t *arg_l, hammer2_chain_t *arg_c,
-                 void *arg_p, off_t arg_o)
+int
+hammer2_io_bread(hammer2_mount_t *hmp, off_t lbase, int lsize,
+               hammer2_io_t **diop)
 {
+       hammer2_iocb_t iocb;
        hammer2_io_t *dio;
-       int owner;
-       int error;
 
-       dio = hammer2_io_getblk(hmp, lbase, lsize, &owner);
-       if (owner) {
-               dio->callback = callback;
-               dio->arg_l = arg_l;
-               dio->arg_c = arg_c;
-               dio->arg_p = arg_p;
-               dio->arg_o = arg_o;
-               breadcb(hmp->devvp, dio->pbase, dio->psize,
-                       hammer2_io_callback, dio);
-       } else {
-               error = 0;
-               callback(dio, arg_l, arg_c, arg_p, arg_o);
-               hammer2_io_bqrelse(&dio);
-       }
+       iocb.callback = hammer2_iocb_bread_callback;
+       iocb.cluster = NULL;
+       iocb.chain = NULL;
+       iocb.ptr = NULL;
+       iocb.lbase = lbase;
+       iocb.lsize = lsize;
+       iocb.flags = 0;
+       iocb.error = 0;
+       hammer2_io_getblk(hmp, lbase, lsize, &iocb);
+       if ((iocb.flags & HAMMER2_IOCB_DONE) == 0)
+               hammer2_iocb_wait(&iocb);
+       dio = *diop = iocb.dio;
+
+       return (iocb.error);
 }
 
-static void
+/*
+ * System buf/bio async callback extracts the iocb and chains
+ * to the iocb callback.
+ */
+void
 hammer2_io_callback(struct bio *bio)
 {
        struct buf *dbp = bio->bio_buf;
-       hammer2_io_t *dio = bio->bio_caller_info1.ptr;
+       hammer2_iocb_t *iocb = bio->bio_caller_info1.ptr;
+       hammer2_io_t *dio;
 
+       dio = iocb->dio;
        if ((bio->bio_flags & BIO_DONE) == 0)
                bpdone(dbp, 0);
        bio->bio_flags &= ~(BIO_DONE | BIO_SYNC);
        dio->bp = bio->bio_buf;
-       KKASSERT((dio->bp->b_flags & B_ERROR) == 0); /* XXX */
-       hammer2_io_complete(dio, HAMMER2_DIO_INPROG);
-
-       /*
-        * We still have the ref and DIO_GOOD is now set so nothing else
-        * should mess with the callback fields until we release the dio.
-        */
-       dio->callback(dio, dio->arg_l, dio->arg_c, dio->arg_p, dio->arg_o);
-       hammer2_io_bqrelse(&dio);
-       /* TODO: async load meta-data and assign chain->dio */
+       iocb->callback(iocb);
 }
 
 void
index ad79d54..06372d3 100644 (file)
@@ -364,7 +364,7 @@ hammer2_ioctl_socket_set(hammer2_inode_t *ip, void *data)
 static int
 hammer2_ioctl_pfs_get(hammer2_inode_t *ip, void *data)
 {
-       const hammer2_inode_data_t *ipdata;
+       const hammer2_inode_data_t *ripdata;
        hammer2_mount_t *hmp;
        hammer2_ioc_pfs_t *pfs;
        hammer2_cluster_t *cparent;
@@ -389,12 +389,12 @@ hammer2_ioctl_pfs_get(hammer2_inode_t *ip, void *data)
                                                 0, (hammer2_key_t)-1,
                                                 0, &ddflag);
        } else if (pfs->name_key == (hammer2_key_t)-1) {
-               ipdata = &hammer2_cluster_data(rcluster)->ipdata;
+               ripdata = &hammer2_cluster_rdata(rcluster)->ipdata;
                cluster = hammer2_cluster_lookup(cparent, &key_next,
-                                                ipdata->name_key,
-                                                ipdata->name_key,
+                                                ripdata->name_key,
+                                                ripdata->name_key,
                                                 0, &ddflag);
-               ipdata = NULL;  /* safety */
+               ripdata = NULL; /* safety */
        } else {
                cluster = hammer2_cluster_lookup(cparent, &key_next,
                                                 pfs->name_key, pfs->name_key,
@@ -412,15 +412,15 @@ hammer2_ioctl_pfs_get(hammer2_inode_t *ip, void *data)
                /*
                 * Load the data being returned by the ioctl.
                 */
-               ipdata = &hammer2_cluster_data(cluster)->ipdata;
-               pfs->name_key = ipdata->name_key;
-               pfs->pfs_type = ipdata->pfs_type;
-               pfs->pfs_clid = ipdata->pfs_clid;
-               pfs->pfs_fsid = ipdata->pfs_fsid;
-               KKASSERT(ipdata->name_len < sizeof(pfs->name));
-               bcopy(ipdata->filename, pfs->name, ipdata->name_len);
-               pfs->name[ipdata->name_len] = 0;
-               ipdata = NULL;  /* safety */
+               ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
+               pfs->name_key = ripdata->name_key;
+               pfs->pfs_type = ripdata->pfs_type;
+               pfs->pfs_clid = ripdata->pfs_clid;
+               pfs->pfs_fsid = ripdata->pfs_fsid;
+               KKASSERT(ripdata->name_len < sizeof(pfs->name));
+               bcopy(ripdata->filename, pfs->name, ripdata->name_len);
+               pfs->name[ripdata->name_len] = 0;
+               ripdata = NULL; /* safety */
 
                /*
                 * Calculate the next field
@@ -434,8 +434,8 @@ hammer2_ioctl_pfs_get(hammer2_inode_t *ip, void *data)
                         hammer2_cluster_type(cluster) !=
                          HAMMER2_BREF_TYPE_INODE);
                if (cluster) {
-                       ipdata = &hammer2_cluster_data(cluster)->ipdata;
-                       pfs->name_next = ipdata->name_key;
+                       ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
+                       pfs->name_next = ripdata->name_key;
                        hammer2_cluster_unlock(cluster);
                } else {
                        pfs->name_next = (hammer2_key_t)-1;
@@ -455,7 +455,7 @@ hammer2_ioctl_pfs_get(hammer2_inode_t *ip, void *data)
 static int
 hammer2_ioctl_pfs_lookup(hammer2_inode_t *ip, void *data)
 {
-       const hammer2_inode_data_t *ipdata;
+       const hammer2_inode_data_t *ripdata;
        hammer2_mount_t *hmp;
        hammer2_ioc_pfs_t *pfs;
        hammer2_cluster_t *cparent;
@@ -480,12 +480,12 @@ hammer2_ioctl_pfs_lookup(hammer2_inode_t *ip, void *data)
                                         HAMMER2_LOOKUP_SHARED, &ddflag);
        while (cluster) {
                if (hammer2_cluster_type(cluster) == HAMMER2_BREF_TYPE_INODE) {
-                       ipdata = &hammer2_cluster_data(cluster)->ipdata;
-                       if (ipdata->name_len == len &&
-                           bcmp(ipdata->filename, pfs->name, len) == 0) {
+                       ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
+                       if (ripdata->name_len == len &&
+                           bcmp(ripdata->filename, pfs->name, len) == 0) {
                                break;
                        }
-                       ipdata = NULL;  /* safety */
+                       ripdata = NULL; /* safety */
                }
                cluster = hammer2_cluster_next(cparent, cluster, &key_next,
                                           key_next,
@@ -497,12 +497,12 @@ hammer2_ioctl_pfs_lookup(hammer2_inode_t *ip, void *data)
         * Load the data being returned by the ioctl.
         */
        if (cluster) {
-               ipdata = &hammer2_cluster_data(cluster)->ipdata;
-               pfs->name_key = ipdata->name_key;
-               pfs->pfs_type = ipdata->pfs_type;
-               pfs->pfs_clid = ipdata->pfs_clid;
-               pfs->pfs_fsid = ipdata->pfs_fsid;
-               ipdata = NULL;
+               ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
+               pfs->name_key = ripdata->name_key;
+               pfs->pfs_type = ripdata->pfs_type;
+               pfs->pfs_clid = ripdata->pfs_clid;
+               pfs->pfs_fsid = ripdata->pfs_fsid;
+               ripdata = NULL;
 
                hammer2_cluster_unlock(cluster);
        } else {
@@ -612,15 +612,15 @@ hammer2_ioctl_pfs_snapshot(hammer2_inode_t *ip, void *data)
 static int
 hammer2_ioctl_inode_get(hammer2_inode_t *ip, void *data)
 {
-       const hammer2_inode_data_t *ipdata;
+       const hammer2_inode_data_t *ripdata;
        hammer2_ioc_inode_t *ino;
        hammer2_cluster_t *cparent;
 
        ino = data;
 
        cparent = hammer2_inode_lock_sh(ip);
-       ipdata = &hammer2_cluster_data(cparent)->ipdata;
-       ino->ip_data = *ipdata;
+       ripdata = &hammer2_cluster_rdata(cparent)->ipdata;
+       ino->ip_data = *ripdata;
        ino->kdata = ip;
        hammer2_inode_unlock_sh(ip, cparent);
 
@@ -644,7 +644,7 @@ hammer2_ioctl_inode_set(hammer2_inode_t *ip, void *data)
 
        hammer2_trans_init(&trans, ip->pmp, 0);
        cparent = hammer2_inode_lock_ex(ip);
-       ripdata = &hammer2_cluster_data(cparent)->ipdata;
+       ripdata = &hammer2_cluster_rdata(cparent)->ipdata;
 
        if (ino->ip_data.check_algo != ripdata->check_algo) {
                wipdata = hammer2_cluster_modify_ip(&trans, ip, cparent, 0);
index ec998f4..0887205 100644 (file)
@@ -210,20 +210,20 @@ static void hammer2_vfs_unmount_hmp2(struct mount *mp, hammer2_mount_t *hmp);
  */
 static void hammer2_write_file_core(struct buf *bp, hammer2_trans_t *trans,
                                hammer2_inode_t *ip,
-                               hammer2_inode_data_t *ipdata,
+                               const hammer2_inode_data_t *ripdata,
                                hammer2_cluster_t *cparent,
                                hammer2_key_t lbase, int ioflag, int pblksize,
                                int *errorp);
 static void hammer2_compress_and_write(struct buf *bp, hammer2_trans_t *trans,
                                hammer2_inode_t *ip,
-                               const hammer2_inode_data_t *ipdata,
+                               const hammer2_inode_data_t *ripdata,
                                hammer2_cluster_t *cparent,
                                hammer2_key_t lbase, int ioflag,
                                int pblksize, int *errorp,
                                int comp_algo, int check_algo);
 static void hammer2_zero_check_and_write(struct buf *bp,
                                hammer2_trans_t *trans, hammer2_inode_t *ip,
-                               const hammer2_inode_data_t *ipdata,
+                               const hammer2_inode_data_t *ripdata,
                                hammer2_cluster_t *cparent,
                                hammer2_key_t lbase,
                                int ioflag, int pblksize, int *errorp,
@@ -231,7 +231,7 @@ static void hammer2_zero_check_and_write(struct buf *bp,
 static int test_block_zeros(const char *buf, size_t bytes);
 static void zero_write(struct buf *bp, hammer2_trans_t *trans,
                                hammer2_inode_t *ip,
-                               const hammer2_inode_data_t *ipdata,
+                               const hammer2_inode_data_t *ripdata,
                                hammer2_cluster_t *cparent,
                                hammer2_key_t lbase,
                                int *errorp);
@@ -326,7 +326,7 @@ hammer2_vfs_uninit(struct vfsconf *vfsp __unused)
  * mounts and the spmp structure for media (hmp) structures.
  */
 static hammer2_pfsmount_t *
-hammer2_pfsalloc(const hammer2_inode_data_t *ipdata, hammer2_tid_t alloc_tid)
+hammer2_pfsalloc(const hammer2_inode_data_t *ripdata, hammer2_tid_t alloc_tid)
 {
        hammer2_pfsmount_t *pmp;
 
@@ -341,9 +341,9 @@ hammer2_pfsalloc(const hammer2_inode_data_t *ipdata, hammer2_tid_t alloc_tid)
 
        pmp->alloc_tid = alloc_tid + 1;   /* our first media transaction id */
        pmp->flush_tid = pmp->alloc_tid;
-       if (ipdata) {
-               pmp->inode_tid = ipdata->pfs_inum + 1;
-               pmp->pfs_clid = ipdata->pfs_clid;
+       if (ripdata) {
+               pmp->inode_tid = ripdata->pfs_inum + 1;
+               pmp->pfs_clid = ripdata->pfs_clid;
        }
        mtx_init(&pmp->wthread_mtx);
        bioq_init(&pmp->wthread_bioq);
@@ -389,7 +389,7 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
        hammer2_chain_t *rchain;
        hammer2_cluster_t *cluster;
        hammer2_cluster_t *cparent;
-       const hammer2_inode_data_t *ipdata;
+       const hammer2_inode_data_t *ripdata;
        hammer2_blockref_t bref;
        struct file *fp;
        char devstr[MNAMELEN];
@@ -630,8 +630,9 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
                /*
                 * Sanity-check schain's pmp, finish initializing spmp.
                 */
+               ripdata = &hammer2_chain_rdata(schain)->ipdata;
                KKASSERT(schain->pmp == spmp);
-               spmp->pfs_clid = schain->data->ipdata.pfs_clid;
+               spmp->pfs_clid = ripdata->pfs_clid;
 
                /*
                 * NOTE: inode_get sucks up schain's lock.
@@ -693,7 +694,7 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
        while (cluster) {
                if (hammer2_cluster_type(cluster) == HAMMER2_BREF_TYPE_INODE &&
                    strcmp(label,
-                      hammer2_cluster_data(cluster)->ipdata.filename) == 0) {
+                      hammer2_cluster_rdata(cluster)->ipdata.filename) == 0) {
                        break;
                }
                cluster = hammer2_cluster_next(cparent, cluster, &key_next,
@@ -740,11 +741,11 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
         * Check to see if the cluster id is already mounted at the mount
         * point.  If it is, add us to the cluster.
         */
-       ipdata = &hammer2_cluster_data(cluster)->ipdata;
+       ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
        hammer2_cluster_bref(cluster, &bref);
        TAILQ_FOREACH(pmp, &hammer2_pfslist, mntentry) {
                if (pmp->spmp_hmp == NULL &&
-                   bcmp(&pmp->pfs_clid, &ipdata->pfs_clid,
+                   bcmp(&pmp->pfs_clid, &ripdata->pfs_clid,
                         sizeof(pmp->pfs_clid)) == 0) {
                        break;
                }
@@ -799,7 +800,7 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
         *
         * From this point on we have to call hammer2_unmount() on failure.
         */
-       pmp = hammer2_pfsalloc(ipdata, bref.mirror_tid);
+       pmp = hammer2_pfsalloc(ripdata, bref.mirror_tid);
        kprintf("PMP mirror_tid is %016jx\n", bref.mirror_tid);
        for (i = 0; i < cluster->nchains; ++i) {
                rchain = cluster->array[i];
@@ -1104,14 +1105,15 @@ retry:
 static
 void
 hammer2_write_file_core(struct buf *bp, hammer2_trans_t *trans,
-                       hammer2_inode_t *ip, hammer2_inode_data_t *ipdata,
+                       hammer2_inode_t *ip,
+                       const hammer2_inode_data_t *ripdata,
                        hammer2_cluster_t *cparent,
                        hammer2_key_t lbase, int ioflag, int pblksize,
                        int *errorp)
 {
        hammer2_cluster_t *cluster;
 
-       switch(HAMMER2_DEC_ALGO(ipdata->comp_algo)) {
+       switch(HAMMER2_DEC_ALGO(ripdata->comp_algo)) {
        case HAMMER2_COMP_NONE:
                /*
                 * We have to assign physical storage to the buffer
@@ -1125,7 +1127,7 @@ hammer2_write_file_core(struct buf *bp, hammer2_trans_t *trans,
                                                lbase, pblksize,
                                                errorp);
                hammer2_write_bp(cluster, bp, ioflag, pblksize, errorp,
-                                ipdata->check_algo);
+                                ripdata->check_algo);
                if (cluster)
                        hammer2_cluster_unlock(cluster);
                break;
@@ -1134,9 +1136,9 @@ hammer2_write_file_core(struct buf *bp, hammer2_trans_t *trans,
                 * Check for zero-fill only
                 */
                hammer2_zero_check_and_write(bp, trans, ip,
-                                   ipdata, cparent, lbase,
+                                   ripdata, cparent, lbase,
                                    ioflag, pblksize, errorp,
-                                   ipdata->check_algo);
+                                   ripdata->check_algo);
                break;
        case HAMMER2_COMP_LZ4:
        case HAMMER2_COMP_ZLIB:
@@ -1145,11 +1147,11 @@ hammer2_write_file_core(struct buf *bp, hammer2_trans_t *trans,
                 * Check for zero-fill and attempt compression.
                 */
                hammer2_compress_and_write(bp, trans, ip,
-                                          ipdata, cparent,
+                                          ripdata, cparent,
                                           lbase, ioflag,
                                           pblksize, errorp,
-                                          ipdata->comp_algo,
-                                          ipdata->check_algo);
+                                          ripdata->comp_algo,
+                                          ripdata->check_algo);
                break;
        }
 }
@@ -1162,7 +1164,7 @@ hammer2_write_file_core(struct buf *bp, hammer2_trans_t *trans,
 static
 void
 hammer2_compress_and_write(struct buf *bp, hammer2_trans_t *trans,
-       hammer2_inode_t *ip, const hammer2_inode_data_t *ipdata,
+       hammer2_inode_t *ip, const hammer2_inode_data_t *ripdata,
        hammer2_cluster_t *cparent,
        hammer2_key_t lbase, int ioflag, int pblksize,
        int *errorp, int comp_algo, int check_algo)
@@ -1175,7 +1177,7 @@ hammer2_compress_and_write(struct buf *bp, hammer2_trans_t *trans,
        char *comp_buffer;
 
        if (test_block_zeros(bp->b_data, pblksize)) {
-               zero_write(bp, trans, ip, ipdata, cparent, lbase, errorp);
+               zero_write(bp, trans, ip, ripdata, cparent, lbase, errorp);
                return;
        }
 
@@ -1278,7 +1280,7 @@ hammer2_compress_and_write(struct buf *bp, hammer2_trans_t *trans,
        cluster = hammer2_assign_physical(trans, ip, cparent,
                                          lbase, comp_block_size,
                                          errorp);
-       ipdata = &hammer2_cluster_data(cparent)->ipdata;
+       ripdata = NULL;
 
        if (*errorp) {
                kprintf("WRITE PATH: An error occurred while "
@@ -1288,18 +1290,19 @@ hammer2_compress_and_write(struct buf *bp, hammer2_trans_t *trans,
        }
 
        for (i = 0; i < cluster->nchains; ++i) {
+               hammer2_inode_data_t *wipdata;
                hammer2_io_t *dio;
                char *bdata;
 
-               chain = cluster->array[i];
+               chain = cluster->array[i];      /* XXX */
                KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
 
                switch(chain->bref.type) {
                case HAMMER2_BREF_TYPE_INODE:
-                       KKASSERT(chain->data->ipdata.op_flags &
-                                HAMMER2_OPFLAG_DIRECTDATA);
+                       wipdata = &hammer2_chain_wdata(chain)->ipdata;
+                       KKASSERT(wipdata->op_flags & HAMMER2_OPFLAG_DIRECTDATA);
                        KKASSERT(bp->b_loffset == 0);
-                       bcopy(bp->b_data, chain->data->ipdata.u.data,
+                       bcopy(bp->b_data, wipdata->u.data,
                              HAMMER2_EMBEDDED_BYTES);
                        break;
                case HAMMER2_BREF_TYPE_DATA:
@@ -1393,7 +1396,7 @@ done:
 static
 void
 hammer2_zero_check_and_write(struct buf *bp, hammer2_trans_t *trans,
-       hammer2_inode_t *ip, const hammer2_inode_data_t *ipdata,
+       hammer2_inode_t *ip, const hammer2_inode_data_t *ripdata,
        hammer2_cluster_t *cparent,
        hammer2_key_t lbase, int ioflag, int pblksize, int *errorp,
        int check_algo)
@@ -1401,7 +1404,7 @@ hammer2_zero_check_and_write(struct buf *bp, hammer2_trans_t *trans,
        hammer2_cluster_t *cluster;
 
        if (test_block_zeros(bp->b_data, pblksize)) {
-               zero_write(bp, trans, ip, ipdata, cparent, lbase, errorp);
+               zero_write(bp, trans, ip, ripdata, cparent, lbase, errorp);
        } else {
                cluster = hammer2_assign_physical(trans, ip, cparent,
                                                  lbase, pblksize, errorp);
@@ -1435,7 +1438,7 @@ test_block_zeros(const char *buf, size_t bytes)
 static
 void
 zero_write(struct buf *bp, hammer2_trans_t *trans,
-          hammer2_inode_t *ip, const hammer2_inode_data_t *ipdata,
+          hammer2_inode_t *ip, const hammer2_inode_data_t *ripdata,
           hammer2_cluster_t *cparent,
           hammer2_key_t lbase, int *errorp __unused)
 {
@@ -1475,6 +1478,7 @@ hammer2_write_bp(hammer2_cluster_t *cluster, struct buf *bp, int ioflag,
                                int pblksize, int *errorp, int check_algo)
 {
        hammer2_chain_t *chain;
+       hammer2_inode_data_t *wipdata;
        hammer2_io_t *dio;
        char *bdata;
        int error;
@@ -1483,16 +1487,15 @@ hammer2_write_bp(hammer2_cluster_t *cluster, struct buf *bp, int ioflag,
        error = 0;      /* XXX TODO below */
 
        for (i = 0; i < cluster->nchains; ++i) {
-               chain = cluster->array[i];
-
+               chain = cluster->array[i];      /* XXX */
                KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
 
                switch(chain->bref.type) {
                case HAMMER2_BREF_TYPE_INODE:
-                       KKASSERT(chain->data->ipdata.op_flags &
-                                HAMMER2_OPFLAG_DIRECTDATA);
+                       wipdata = &hammer2_chain_wdata(chain)->ipdata;
+                       KKASSERT(wipdata->op_flags & HAMMER2_OPFLAG_DIRECTDATA);
                        KKASSERT(bp->b_loffset == 0);
-                       bcopy(bp->b_data, chain->data->ipdata.u.data,
+                       bcopy(bp->b_data, wipdata->u.data,
                              HAMMER2_EMBEDDED_BYTES);
                        error = 0;
                        break;
@@ -1991,6 +1994,7 @@ hammer2_recovery_scan(hammer2_trans_t *trans, hammer2_mount_t *hmp,
                      struct hammer2_recovery_info *info,
                      hammer2_tid_t sync_tid)
 {
+       const hammer2_inode_data_t *ripdata;
        hammer2_chain_t *chain;
        int cache_index;
        int cumulative_error = 0;
@@ -2018,12 +2022,13 @@ hammer2_recovery_scan(hammer2_trans_t *trans, hammer2_mount_t *hmp,
                 * for recursion.
                 */
                hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
-               if (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
+               ripdata = &hammer2_chain_rdata(parent)->ipdata;
+               if (ripdata->op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
                        /* not applicable to recovery scan */
                        hammer2_chain_unlock(parent);
                        return 0;
                }
-               if ((parent->data->ipdata.op_flags & HAMMER2_OPFLAG_PFSROOT) &&
+               if ((ripdata->op_flags & HAMMER2_OPFLAG_PFSROOT) &&
                    info->depth != 0) {
                        pfs_boundary = 1;
                        sync_tid = parent->bref.mirror_tid - 1;
@@ -2692,7 +2697,7 @@ hammer2_autodmsg(kdmsg_msg_t *msg)
 static void
 hammer2_update_spans(hammer2_mount_t *hmp, kdmsg_state_t *state)
 {
-       const hammer2_inode_data_t *ipdata;
+       const hammer2_inode_data_t *ripdata;
        hammer2_cluster_t *cparent;
        hammer2_cluster_t *cluster;
        hammer2_pfsmount_t *spmp;
@@ -2716,20 +2721,20 @@ hammer2_update_spans(hammer2_mount_t *hmp, kdmsg_state_t *state)
        while (cluster) {
                if (hammer2_cluster_type(cluster) != HAMMER2_BREF_TYPE_INODE)
                        continue;
-               ipdata = &hammer2_cluster_data(cluster)->ipdata;
-               kprintf("UPDATE SPANS: %s\n", ipdata->filename);
+               ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
+               kprintf("UPDATE SPANS: %s\n", ripdata->filename);
 
                rmsg = kdmsg_msg_alloc(state, DMSG_LNK_SPAN | DMSGF_CREATE,
                                       hammer2_lnk_span_reply, NULL);
-               rmsg->any.lnk_span.pfs_clid = ipdata->pfs_clid;
-               rmsg->any.lnk_span.pfs_fsid = ipdata->pfs_fsid;
-               rmsg->any.lnk_span.pfs_type = ipdata->pfs_type;
+               rmsg->any.lnk_span.pfs_clid = ripdata->pfs_clid;
+               rmsg->any.lnk_span.pfs_fsid = ripdata->pfs_fsid;
+               rmsg->any.lnk_span.pfs_type = ripdata->pfs_type;
                rmsg->any.lnk_span.peer_type = DMSG_PEER_HAMMER2;
                rmsg->any.lnk_span.proto_version = DMSG_SPAN_PROTO_1;
-               name_len = ipdata->name_len;
+               name_len = ripdata->name_len;
                if (name_len >= sizeof(rmsg->any.lnk_span.fs_label))
                        name_len = sizeof(rmsg->any.lnk_span.fs_label) - 1;
-               bcopy(ipdata->filename, rmsg->any.lnk_span.fs_label, name_len);
+               bcopy(ripdata->filename, rmsg->any.lnk_span.fs_label, name_len);
 
                kdmsg_msg_write(rmsg);
 
index 2691e2c..b74dce9 100644 (file)
@@ -209,7 +209,7 @@ hammer2_vop_inactive(struct vop_inactive_args *ap)
         */
        cluster = hammer2_inode_lock_ex(ip);
        KKASSERT(cluster);
-       ripdata = &hammer2_cluster_data(cluster)->ipdata;
+       ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
 
        /*
         * Check for deleted inodes and recycle immediately.
@@ -259,7 +259,7 @@ hammer2_vop_reclaim(struct vop_reclaim_args *ap)
         */
        pmp = ip->pmp;
        cluster = hammer2_inode_lock_ex(ip);
-       ripdata = &hammer2_cluster_data(cluster)->ipdata;
+       ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
 
        /*
         * The final close of a deleted file or directory marks it for
@@ -368,7 +368,7 @@ int
 hammer2_vop_access(struct vop_access_args *ap)
 {
        hammer2_inode_t *ip = VTOI(ap->a_vp);
-       const hammer2_inode_data_t *ipdata;
+       const hammer2_inode_data_t *ripdata;
        hammer2_cluster_t *cluster;
        uid_t uid;
        gid_t gid;
@@ -376,10 +376,10 @@ hammer2_vop_access(struct vop_access_args *ap)
 
        LOCKSTART;
        cluster = hammer2_inode_lock_sh(ip);
-       ipdata = &hammer2_cluster_data(cluster)->ipdata;
-       uid = hammer2_to_unix_xid(&ipdata->uid);
-       gid = hammer2_to_unix_xid(&ipdata->gid);
-       error = vop_helper_access(ap, uid, gid, ipdata->mode, ipdata->uflags);
+       ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
+       uid = hammer2_to_unix_xid(&ripdata->uid);
+       gid = hammer2_to_unix_xid(&ripdata->gid);
+       error = vop_helper_access(ap, uid, gid, ripdata->mode, ripdata->uflags);
        hammer2_inode_unlock_sh(ip, cluster);
 
        LOCKSTOP;
@@ -390,7 +390,7 @@ static
 int
 hammer2_vop_getattr(struct vop_getattr_args *ap)
 {
-       const hammer2_inode_data_t *ipdata;
+       const hammer2_inode_data_t *ripdata;
        hammer2_cluster_t *cluster;
        hammer2_pfsmount_t *pmp;
        hammer2_inode_t *ip;
@@ -405,29 +405,29 @@ hammer2_vop_getattr(struct vop_getattr_args *ap)
        pmp = ip->pmp;
 
        cluster = hammer2_inode_lock_sh(ip);
-       ipdata = &hammer2_cluster_data(cluster)->ipdata;
+       ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
        KKASSERT(hammer2_cluster_type(cluster) == HAMMER2_BREF_TYPE_INODE);
 
        vap->va_fsid = pmp->mp->mnt_stat.f_fsid.val[0];
-       vap->va_fileid = ipdata->inum;
-       vap->va_mode = ipdata->mode;
-       vap->va_nlink = ipdata->nlinks;
-       vap->va_uid = hammer2_to_unix_xid(&ipdata->uid);
-       vap->va_gid = hammer2_to_unix_xid(&ipdata->gid);
+       vap->va_fileid = ripdata->inum;
+       vap->va_mode = ripdata->mode;
+       vap->va_nlink = ripdata->nlinks;
+       vap->va_uid = hammer2_to_unix_xid(&ripdata->uid);
+       vap->va_gid = hammer2_to_unix_xid(&ripdata->gid);
        vap->va_rmajor = 0;
        vap->va_rminor = 0;
        vap->va_size = ip->size;        /* protected by shared lock */
        vap->va_blocksize = HAMMER2_PBUFSIZE;
-       vap->va_flags = ipdata->uflags;
-       hammer2_time_to_timespec(ipdata->ctime, &vap->va_ctime);
-       hammer2_time_to_timespec(ipdata->mtime, &vap->va_mtime);
-       hammer2_time_to_timespec(ipdata->mtime, &vap->va_atime);
+       vap->va_flags = ripdata->uflags;
+       hammer2_time_to_timespec(ripdata->ctime, &vap->va_ctime);
+       hammer2_time_to_timespec(ripdata->mtime, &vap->va_mtime);
+       hammer2_time_to_timespec(ripdata->mtime, &vap->va_atime);
        vap->va_gen = 1;
        vap->va_bytes = vap->va_size;   /* XXX */
-       vap->va_type = hammer2_get_vtype(ipdata);
+       vap->va_type = hammer2_get_vtype(ripdata);
        vap->va_filerev = 0;
-       vap->va_uid_uuid = ipdata->uid;
-       vap->va_gid_uuid = ipdata->gid;
+       vap->va_uid_uuid = ripdata->uid;
+       vap->va_gid_uuid = ripdata->gid;
        vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
                          VA_FSID_UUID_VALID;
 
@@ -469,7 +469,7 @@ hammer2_vop_setattr(struct vop_setattr_args *ap)
        hammer2_pfs_memory_wait(ip->pmp);
        hammer2_trans_init(&trans, ip->pmp, 0);
        cluster = hammer2_inode_lock_ex(ip);
-       ripdata = &hammer2_cluster_data(cluster)->ipdata;
+       ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
        error = 0;
 
        if (vap->va_flags != VNOVAL) {
@@ -546,7 +546,7 @@ hammer2_vop_setattr(struct vop_setattr_args *ap)
                        }
                        cluster = hammer2_inode_lock_ex(ip);
                        /* RELOAD */
-                       ripdata = &hammer2_cluster_data(cluster)->ipdata;
+                       ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
                        domtime = 1;
                        break;
                default:
@@ -626,7 +626,7 @@ static
 int
 hammer2_vop_readdir(struct vop_readdir_args *ap)
 {
-       const hammer2_inode_data_t *ipdata;
+       const hammer2_inode_data_t *ripdata;
        hammer2_inode_t *ip;
        hammer2_inode_t *xip;
        hammer2_cluster_t *cparent;
@@ -666,7 +666,7 @@ hammer2_vop_readdir(struct vop_readdir_args *ap)
        cookie_index = 0;
 
        cparent = hammer2_inode_lock_sh(ip);
-       ipdata = &hammer2_cluster_data(cparent)->ipdata;
+       ripdata = &hammer2_cluster_rdata(cparent)->ipdata;
 
        /*
         * Handle artificial entries.  To ensure that only positive 64 bit
@@ -681,7 +681,7 @@ hammer2_vop_readdir(struct vop_readdir_args *ap)
        cluster = (void *)(intptr_t)-1; /* non-NULL for early goto done case */
 
        if (saveoff == 0) {
-               inum = ipdata->inum & HAMMER2_DIRHASH_USERMSK;
+               inum = ripdata->inum & HAMMER2_DIRHASH_USERMSK;
                r = vop_write_dirent(&error, uio, inum, DT_DIR, 1, ".");
                if (r)
                        goto done;
@@ -699,7 +699,7 @@ hammer2_vop_readdir(struct vop_readdir_args *ap)
                 *
                 * (ip is the current dir. xip is the parent dir).
                 */
-               inum = ipdata->inum & HAMMER2_DIRHASH_USERMSK;
+               inum = ripdata->inum & HAMMER2_DIRHASH_USERMSK;
                while (ip->pip != NULL && ip != ip->pmp->iroot) {
                        xip = ip->pip;
                        hammer2_inode_ref(xip);
@@ -707,9 +707,9 @@ hammer2_vop_readdir(struct vop_readdir_args *ap)
                        xcluster = hammer2_inode_lock_sh(xip);
                        cparent = hammer2_inode_lock_sh(ip);
                        hammer2_inode_drop(xip);
-                       ipdata = &hammer2_cluster_data(cparent)->ipdata;
+                       ripdata = &hammer2_cluster_rdata(cparent)->ipdata;
                        if (xip == ip->pip) {
-                               inum = hammer2_cluster_data(xcluster)->
+                               inum = hammer2_cluster_rdata(xcluster)->
                                        ipdata.inum & HAMMER2_DIRHASH_USERMSK;
                                hammer2_inode_unlock_sh(xip, xcluster);
                                break;
@@ -754,15 +754,15 @@ hammer2_vop_readdir(struct vop_readdir_args *ap)
                                bref.key, key_next);
 
                if (bref.type == HAMMER2_BREF_TYPE_INODE) {
-                       ipdata = &hammer2_cluster_data(cluster)->ipdata;
-                       dtype = hammer2_get_dtype(ipdata);
+                       ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
+                       dtype = hammer2_get_dtype(ripdata);
                        saveoff = bref.key & HAMMER2_DIRHASH_USERMSK;
                        r = vop_write_dirent(&error, uio,
-                                            ipdata->inum &
+                                            ripdata->inum &
                                              HAMMER2_DIRHASH_USERMSK,
                                             dtype,
-                                            ipdata->name_len,
-                                            ipdata->filename);
+                                            ripdata->name_len,
+                                            ripdata->filename);
                        if (r)
                                break;
                        if (cookies)
@@ -1228,7 +1228,7 @@ hammer2_vop_nresolve(struct vop_nresolve_args *ap)
        hammer2_inode_t *dip;
        hammer2_cluster_t *cparent;
        hammer2_cluster_t *cluster;
-       const hammer2_inode_data_t *ipdata;
+       const hammer2_inode_data_t *ripdata;
        hammer2_key_t key_next;
        hammer2_key_t lhc;
        struct namecache *ncp;
@@ -1254,9 +1254,9 @@ hammer2_vop_nresolve(struct vop_nresolve_args *ap)
                                         HAMMER2_LOOKUP_SHARED, &ddflag);
        while (cluster) {
                if (hammer2_cluster_type(cluster) == HAMMER2_BREF_TYPE_INODE) {
-                       ipdata = &hammer2_cluster_data(cluster)->ipdata;
-                       if (ipdata->name_len == name_len &&
-                           bcmp(ipdata->filename, name, name_len) == 0) {
+                       ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
+                       if (ripdata->name_len == name_len &&
+                           bcmp(ripdata->filename, name, name_len) == 0) {
                                break;
                        }
                }
@@ -1271,9 +1271,9 @@ hammer2_vop_nresolve(struct vop_nresolve_args *ap)
         * Resolve hardlink entries before acquiring the inode.
         */
        if (cluster) {
-               ipdata = &hammer2_cluster_data(cluster)->ipdata;
-               if (ipdata->type == HAMMER2_OBJTYPE_HARDLINK) {
-                       hammer2_tid_t inum = ipdata->inum;
+               ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
+               if (ripdata->type == HAMMER2_OBJTYPE_HARDLINK) {
+                       hammer2_tid_t inum = ripdata->inum;
                        error = hammer2_hardlink_find(dip, NULL, cluster);
                        if (error) {
                                kprintf("hammer2: unable to find hardlink "
@@ -1291,16 +1291,17 @@ hammer2_vop_nresolve(struct vop_nresolve_args *ap)
         */
        if (cluster) {
                ip = hammer2_inode_get(dip->pmp, dip, cluster);
-               ipdata = &hammer2_cluster_data(cluster)->ipdata;
-               if (ipdata->type == HAMMER2_OBJTYPE_HARDLINK) {
+               ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
+               if (ripdata->type == HAMMER2_OBJTYPE_HARDLINK) {
                        kprintf("nresolve: fixup hardlink\n");
                        hammer2_inode_ref(ip);
                        hammer2_inode_unlock_ex(ip, NULL);
                        hammer2_cluster_unlock(cluster);
                        cluster = hammer2_inode_lock_ex(ip);
-                       ipdata = &hammer2_cluster_data(cluster)->ipdata;
+                       ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
                        hammer2_inode_drop(ip);
-                       kprintf("nresolve: fixup to type %02x\n", ipdata->type);
+                       kprintf("nresolve: fixup to type %02x\n",
+                               ripdata->type);
                }
        } else {
                ip = NULL;
@@ -1475,13 +1476,13 @@ int
 hammer2_vop_advlock(struct vop_advlock_args *ap)
 {
        hammer2_inode_t *ip = VTOI(ap->a_vp);
-       const hammer2_inode_data_t *ipdata;
+       const hammer2_inode_data_t *ripdata;
        hammer2_cluster_t *cparent;
        hammer2_off_t size;
 
        cparent = hammer2_inode_lock_sh(ip);
-       ipdata = &hammer2_cluster_data(cparent)->ipdata;
-       size = ipdata->size;
+       ripdata = &hammer2_cluster_rdata(cparent)->ipdata;
+       size = ripdata->size;
        hammer2_inode_unlock_sh(ip, cparent);
        return (lf_advlock(ap, &ip->advlock, size));
 }
@@ -2041,18 +2042,19 @@ done:
 }
 
 /*
- * Strategy code
+ * Strategy code (async logical file buffer I/O from system)
  *
  * WARNING: The strategy code cannot safely use hammer2 transactions
  *         as this can deadlock against vfs_sync's vfsync() call
- *         if multiple flushes are queued.
+ *         if multiple flushes are queued.  All H2 structures must
+ *         already be present and ready for the DIO.
+ *
+ *         Reads can be initiated asynchronously, writes have to be
+ *         spooled to a separate thread for action to avoid deadlocks.
  */
 static int hammer2_strategy_read(struct vop_strategy_args *ap);
 static int hammer2_strategy_write(struct vop_strategy_args *ap);
-static void hammer2_strategy_read_callback(hammer2_io_t *dio,
-                               hammer2_cluster_t *cluster,
-                               hammer2_chain_t *chain,
-                               void *arg_p, off_t arg_o);
+static void hammer2_strategy_read_callback(hammer2_iocb_t *iocb);
 
 static
 int
@@ -2083,6 +2085,9 @@ hammer2_vop_strategy(struct vop_strategy_args *ap)
        return (error);
 }
 
+/*
+ * Logical buffer I/O, async read.
+ */
 static
 int
 hammer2_strategy_read(struct vop_strategy_args *ap)
@@ -2106,6 +2111,9 @@ hammer2_strategy_read(struct vop_strategy_args *ap)
        lbase = bio->bio_offset;
        KKASSERT(((int)lbase & HAMMER2_PBUFMASK) == 0);
 
+       /*
+        * Lookup the file offset.
+        */
        cparent = hammer2_inode_lock_sh(ip);
        cluster = hammer2_cluster_lookup(cparent, &key_dummy,
                                       lbase, lbase,
@@ -2130,59 +2138,91 @@ hammer2_strategy_read(struct vop_strategy_args *ap)
         * Cluster elements must be type INODE or type DATA, but the
         * compression mode (or not) for DATA chains can be different for
         * each chain.  This will be handled by the callback.
+        *
+        * If the cluster already has valid data the callback will be made
+        * immediately/synchronously.
         */
        btype = hammer2_cluster_type(cluster);
        if (btype != HAMMER2_BREF_TYPE_INODE &&
            btype != HAMMER2_BREF_TYPE_DATA) {
                panic("READ PATH: hammer2_strategy_read: unknown bref type");
        }
-       hammer2_chain_load_async(cluster, hammer2_strategy_read_callback, nbio);
+       hammer2_cluster_load_async(cluster, hammer2_strategy_read_callback,
+                                  nbio);
        return(0);
 }
 
 /*
- * Read callback for block that is not compressed.
+ * Read callback for hammer2_cluster_load_async().  The load function may
+ * start several actual I/Os but will only make one callback, typically with
+ * the first valid I/O XXX
  */
 static
 void
-hammer2_strategy_read_callback(hammer2_io_t *dio,
-                              hammer2_cluster_t *cluster,
-                              hammer2_chain_t *chain,
-                              void *arg_p, off_t arg_o)
+hammer2_strategy_read_callback(hammer2_iocb_t *iocb)
 {
-       struct bio *bio = arg_p;
-       struct buf *bp = bio->bio_buf;
+       struct bio *bio = iocb->ptr;    /* original logical buffer */
+       struct buf *bp = bio->bio_buf;  /* original logical buffer */
+       hammer2_chain_t *chain;
+       hammer2_cluster_t *cluster;
+       hammer2_io_t *dio;
        char *data;
        int i;
 
        /*
-        * Extract data and handle iteration on I/O failure.  arg_o is the
-        * cluster index for iteration.
+        * Extract data and handle iteration on I/O failure.  iocb->lbase
+        * is the cluster index for iteration.
+        */
+       cluster = iocb->cluster;
+       dio = iocb->dio;        /* can be NULL */
+
+       /*
+        * Work to do if INPROG set, else data already available.
         */
-       if (dio) {
+       if (iocb->flags & HAMMER2_IOCB_INPROG) {
+               /*
+                * read not issued yet, chain the iocb to execute the
+                * read operation.
+                */
+               if ((iocb->flags & HAMMER2_IOCB_READ) == 0) {
+                       iocb->flags |= HAMMER2_IOCB_READ;
+                       breadcb(dio->hmp->devvp, dio->pbase, dio->psize,
+                               hammer2_io_callback, iocb);
+                       return;
+               }
+
+               /*
+                * check results.
+                */
                if (dio->bp->b_flags & B_ERROR) {
-                       i = (int)arg_o + 1;
+                       i = (int)iocb->lbase + 1;
                        if (i >= cluster->nchains) {
                                bp->b_flags |= B_ERROR;
                                bp->b_error = dio->bp->b_error;
+                               hammer2_io_complete(iocb);
                                biodone(bio);
                                hammer2_cluster_unlock(cluster);
                        } else {
+                               hammer2_io_complete(iocb);
                                chain = cluster->array[i];
                                kprintf("hammer2: IO CHAIN-%d %p\n", i, chain);
                                hammer2_adjreadcounter(&chain->bref,
                                                       chain->bytes);
-                               hammer2_io_breadcb(chain->hmp,
-                                                  chain->bref.data_off,
-                                                  chain->bytes,
-                                              hammer2_strategy_read_callback,
-                                                  cluster, chain,
-                                                  arg_p, (off_t)i);
+                               iocb->chain = chain;
+                               iocb->lbase = (off_t)i;
+                               iocb->flags = 0;
+                               iocb->error = 0;
+                               hammer2_io_getblk(chain->hmp,
+                                                 chain->bref.data_off,
+                                                 chain->bytes,
+                                                 iocb);
                        }
                        return;
                }
+               chain = iocb->chain;
                data = hammer2_io_data(dio, chain->bref.data_off);
        } else {
+               chain = iocb->chain;
                data = (void *)chain->data;
        }
 
@@ -2232,6 +2272,7 @@ hammer2_strategy_read_callback(hammer2_io_t *dio,
                        hammer2_io_bqrelse(&dio);
                panic("hammer2_strategy_read: unknown bref type");
        }
+       hammer2_io_complete(iocb);
        hammer2_cluster_unlock(cluster);
        biodone(bio);
 }
@@ -2339,11 +2380,10 @@ hammer2_run_unlinkq(hammer2_trans_t *trans, hammer2_pfsmount_t *pmp)
                kfree(ipul, pmp->minode);
 
                cluster = hammer2_inode_lock_ex(ip);
-               ripdata = &hammer2_cluster_data(cluster)->ipdata;
+               ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
                if (hammer2_debug & 0x400) {
                        kprintf("hammer2: unlink on reclaim: %s refs=%d\n",
-                               cluster->focus->data->ipdata.filename,
-                               ip->refs);
+                               ripdata->filename, ip->refs);
                }
                KKASSERT(ripdata->nlinks == 0);