hammer2 - more indirect block work, add advlock
authorMatthew Dillon <dillon@apollo.backplane.com>
Tue, 28 Feb 2012 05:22:53 +0000 (21:22 -0800)
committerMatthew Dillon <dillon@apollo.backplane.com>
Tue, 28 Feb 2012 05:22:53 +0000 (21:22 -0800)
* Fix several bugs in the indirect block code.  Directories and files should
  both now be able to expand properly.  Except for an excessive dirty buffer
  buildup (see 'bawrite' below), we can now create regular files linearly
  and create sub-directory trees.  e.g. cpdup is starting to show results.

* Fix an issue with readdir related to how ranged searches work (results
  can be returned in any order within the range).  Do a direct lookup of
  the key before resorting to a range.  Until we sort the keys this is not
  going to work correctly in the face of concurrent deletions.

* Try conditionalizing bawrite() vs bdwrite() in the flush code, but there
  are still major issues when NFS is used to back a VN device for testing
  which can deadlock the system.

* Remove or conditionalize a bunch of debugging kprintfs.

sys/vfs/hammer2/hammer2.h
sys/vfs/hammer2/hammer2_chain.c
sys/vfs/hammer2/hammer2_freemap.c
sys/vfs/hammer2/hammer2_inode.c
sys/vfs/hammer2/hammer2_vfsops.c
sys/vfs/hammer2/hammer2_vnops.c

index a50cc06..d1174a9 100644 (file)
@@ -130,6 +130,7 @@ SPLAY_PROTOTYPE(hammer2_chain_splay, hammer2_chain, snode, hammer2_chain_cmp);
 #define HAMMER2_CHAIN_INITIAL          0x00000020      /* initial write */
 #define HAMMER2_CHAIN_FLUSHED          0x00000040      /* flush on unlock */
 #define HAMMER2_CHAIN_MOVED            0x00000080      /* moved */
+#define HAMMER2_CHAIN_IOFLUSH          0x00000100      /* bawrite on put */
 
 /*
  * Flags passed to hammer2_chain_lookup() and hammer2_chain_next()
@@ -166,6 +167,7 @@ struct hammer2_inode {
        struct vnode            *vp;
        hammer2_chain_t         chain;
        struct hammer2_inode_data ip_data;
+       struct lockf            advlock;
 };
 
 typedef struct hammer2_inode hammer2_inode_t;
@@ -244,6 +246,8 @@ extern struct vop_ops hammer2_vnode_vops;
 extern struct vop_ops hammer2_spec_vops;
 extern struct vop_ops hammer2_fifo_vops;
 
+extern int hammer2_debug;
+
 /*
  * hammer2_subr.c
  */
index 1f3334b..e9fca3c 100644 (file)
@@ -433,10 +433,14 @@ hammer2_chain_unlock(hammer2_mount_t *hmp, hammer2_chain_t *chain)
        if (chain->bp) {
                chain->data = NULL;
                if (chain->flags & (HAMMER2_CHAIN_MODIFIED |
-                                   HAMMER2_CHAIN_FLUSHED))
-                       bdwrite(chain->bp);
-               else
+                                   HAMMER2_CHAIN_FLUSHED)) {
+                       if (chain->flags & HAMMER2_CHAIN_IOFLUSH)
+                               bawrite(chain->bp);
+                       else
+                               bdwrite(chain->bp);
+               } else {
                        bqrelse(chain->bp);
+               }
                chain->bp = NULL;
        }
        lockmgr(&chain->lk, LK_RELEASE);
@@ -563,6 +567,10 @@ hammer2_chain_put(hammer2_mount_t *hmp, hammer2_chain_t *chain)
  * typically points to an inode but can also point to a related indirect
  * block and this function will recurse upwards and find the inode again.
  *
+ * WARNING!  THIS DOES NOT RETURN KEYS IN LOGICAL KEY ORDER!  ANY KEY
+ *          WITHIN THE RANGE CAN BE RETURNED.  HOWEVER, AN ITERATION
+ *          WHICH PICKS UP WHERE WE LEFT OFF WILL CONTINUE THE SCAN.
+ *
  * (*parentp) must be exclusively locked and referenced and can be an inode
  * or an existing indirect block within the inode.
  *
@@ -733,12 +741,20 @@ again:
                return (NULL);
        } else {
                /*
-                * Continue iteration with next parent
+                * Continue iteration with next parent unless the current
+                * parent covers the range.
                 */
                hammer2_chain_t *nparent;
 
                if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT)
                        return (NULL);
+
+               scan_beg = parent->bref.key;
+               scan_end = scan_beg +
+                           ((hammer2_key_t)1 << parent->bref.keybits) - 1;
+               if (key_beg >= scan_beg && key_end <= scan_end)
+                       return (NULL);
+
                i = parent->index + 1;
                nparent = parent->parent;
                hammer2_chain_ref(hmp, nparent);        /* ref new parent */
@@ -788,11 +804,6 @@ again2:
                        ++i;
                        continue;
                }
-#if 0
-               kprintf("nextxx(%016jx,%d) %d: %016jx/%d\n",
-                       parent->bref.data_off, i,
-                       bref->type,bref->key, bref->keybits);
-#endif
                scan_beg = bref->key;
                scan_end = scan_beg + ((hammer2_key_t)1 << bref->keybits) - 1;
                if (key_beg <= scan_end && key_end >= scan_beg)
@@ -834,9 +845,12 @@ again2:
 
 /*
  * Create and return a new hammer2 system memory structure of the specified
- * key, type and size and insert it under (parent).  (parent) is typically
- * acquired as a side effect of issuing a prior lookup.  parent must be locked
- * and held.
+ * key, type and size and insert it RELATIVE TO (PARENT).
+ *
+ * (parent) is typically either an inode or an indirect  block, acquired
+ * acquired as a side effect of issuing a prior failed lookup.  parent
+ * must be locked and held.  Do not pass the inode chain to this function
+ * unless that is the chain returned by the failed lookup.
  *
  * Non-indirect types will automatically allocate indirect blocks as required
  * if the new item does not fit in the current (parent).
@@ -1110,8 +1124,9 @@ hammer2_chain_create_indirect(hammer2_mount_t *hmp, hammer2_chain_t *parent,
                }
 
                /*
-                * Expand are calculated key range (key, keybits) to fit
-                * the scanned key.
+                * Expand our calculated key range (key, keybits) to fit
+                * the scanned key.  nkeybits represents the full range
+                * that we will later cut in half (two halves @ nkeybits - 1).
                 */
                nkeybits = keybits;
                if (nkeybits < bref->keybits)
@@ -1149,25 +1164,38 @@ hammer2_chain_create_indirect(hammer2_mount_t *hmp, hammer2_chain_t *parent,
        }
 
        /*
-        * The key for the indirect block will be the lower half or
-        * the upper half of the above calculated keyspace.
+        * Adjust keybits to represent half of the full range calculated
+        * above.
+        */
+       --keybits;
+
+       /*
+        * Select whichever half contains the most elements.  Theoretically
+        * we can select either side as long as it contains at least one
+        * element (in order to ensure that a free slot is present to hold
+        * the indirect block).
         */
        key &= ~(((hammer2_key_t)1 << keybits) - 1);
        if (hammer2_indirect_optimize) {
                /*
-                * Insert node for least number of keys, best for linear
-                * files (?)  XXX won't work if least number is 0.
+                * Insert node for least number of keys, this will arrange
+                * the first few blocks of a large file or the first few
+                * inodes in a directory with fewer indirect blocks when
+                * created linearly.
                 */
-               panic("hammer2_indirect_optimize not working yet");
-               if (hicount < locount)
-                       key |= (hammer2_key_t)1 << (keybits - 1);
+               if (hicount < locount && hicount != 0)
+                       key |= (hammer2_key_t)1 << keybits;
+               else
+                       key &= ~(hammer2_key_t)1 << keybits;
        } else {
                /*
                 * Insert node for most number of keys, best for heavily
                 * fragmented files.
                 */
                if (hicount > locount)
-                       key |= (hammer2_key_t)1 << (keybits - 1);
+                       key |= (hammer2_key_t)1 << keybits;
+               else
+                       key &= ~(hammer2_key_t)1 << keybits;
        }
 
        /*
@@ -1175,19 +1203,23 @@ hammer2_chain_create_indirect(hammer2_mount_t *hmp, hammer2_chain_t *parent,
         */
        dummy.bref.type = HAMMER2_BREF_TYPE_INDIRECT;
        dummy.bref.key = key;
-       dummy.bref.keybits = keybits - 1;
+       dummy.bref.keybits = keybits;
        dummy.bref.data_off = (hammer2_off_t)
                            hammer2_freemap_bytes_to_radix(HAMMER2_PBUFSIZE);
-       dummy.index = -1;       /* not yet assigned */
        ichain = hammer2_chain_alloc(hmp, &dummy.bref);
-       kprintf("create_indirect2: allocate %016jx/%d\n", key, keybits);
 
        /*
         * Iterate the original parent and move the matching brefs into
-        * the new indirect block.  All the keys are inclusive of keybits
-        * so we only have to check bit (keybits - 1).
+        * the new indirect block.
         */
        for (i = 0; i < count; ++i) {
+               /*
+                * For keying purposes access the bref from the media or
+                * from our in-memory cache.  In cases where the in-memory
+                * cache overrides the media the keyrefs will be the same
+                * anyway so we can avoid checking the cache when the media
+                * has a key.
+                */
                bref = &base[i];
                if (bref->type == 0) {
                        dummy.index = i;
@@ -1209,7 +1241,7 @@ hammer2_chain_create_indirect(hammer2_mount_t *hmp, hammer2_chain_t *parent,
                 * (keybits - 1) needs to be compared but for safety we
                 * will compare all msb bits plus that bit again.
                 */
-               if ((~(((hammer2_key_t)1 << (keybits - 1)) - 1) &
+               if ((~(((hammer2_key_t)1 << keybits) - 1) &
                    (key ^ bref->key)) != 0) {
                        continue;
                }
@@ -1220,7 +1252,6 @@ hammer2_chain_create_indirect(hammer2_mount_t *hmp, hammer2_chain_t *parent,
                 */
                if (ichain->index < 0)
                        ichain->index = i;
-               bzero(&base[i], sizeof(base[i]));
 
                /*
                 * Load the new indirect block by acquiring or allocating
@@ -1244,6 +1275,7 @@ hammer2_chain_create_indirect(hammer2_mount_t *hmp, hammer2_chain_t *parent,
                if (SPLAY_INSERT(hammer2_chain_splay, &ichain->shead, chain))
                        panic("hammer2_chain_create_indirect: collision");
                chain->parent = ichain;
+               bzero(&base[i], sizeof(base[i]));
                atomic_add_int(&parent->refs, -1);
                atomic_add_int(&ichain->refs, 1);
                if (chain->flags & HAMMER2_CHAIN_MOVED) {
@@ -1261,7 +1293,6 @@ hammer2_chain_create_indirect(hammer2_mount_t *hmp, hammer2_chain_t *parent,
         * insertion index in the loop above (ichain->index).
         */
        KKASSERT(ichain->index >= 0);
-       kprintf("insert ichain at %d\n", ichain->index);
        if (SPLAY_INSERT(hammer2_chain_splay, &parent->shead, ichain))
                panic("hammer2_chain_create_indirect: ichain insertion");
        ichain->parent = parent;
@@ -1288,7 +1319,7 @@ hammer2_chain_create_indirect(hammer2_mount_t *hmp, hammer2_chain_t *parent,
                 * return the original parent.
                 */
                hammer2_chain_put(hmp, ichain);
-       } else if (~(((hammer2_key_t)1 << (keybits - 1)) - 1) &
+       } else if (~(((hammer2_key_t)1 << keybits) - 1) &
                   (create_key ^ key)) {
                /*
                 * Key being created is outside the key range,
@@ -1302,8 +1333,6 @@ hammer2_chain_create_indirect(hammer2_mount_t *hmp, hammer2_chain_t *parent,
                parent = ichain;
        }
 
-       kprintf("create_indirect9\n");
-
        return(parent);
 }
 
index 03fecc4..b8bbd4c 100644 (file)
@@ -112,7 +112,10 @@ hammer2_freemap_alloc(hammer2_mount_t *hmp, size_t bytes)
                        hmp->freecache[radix] = data_next;
                }
        }
-       kprintf("hammer2: allocate %016jx: %zd\n", (intmax_t)data_off, bytes);
+       if (hammer2_debug & 0x0001) {
+               kprintf("hammer2: allocate %016jx: %zd\n",
+                       (intmax_t)data_off, bytes);
+       }
        return (data_off | radix);
 }
 
index 55ba0b9..960cb87 100644 (file)
@@ -142,7 +142,6 @@ hammer2_igetv(hammer2_inode_t *ip, int *errorp)
                        continue;
                }
 
-               kprintf("igetv new\n");
                switch (ip->ip_data.type) {
                case HAMMER2_OBJTYPE_DIRECTORY:
                        vp->v_type = VDIR;
@@ -172,8 +171,10 @@ hammer2_igetv(hammer2_inode_t *ip, int *errorp)
        /*
         * Return non-NULL vp and *errorp == 0, or NULL vp and *errorp != 0.
         */
-       kprintf("igetv vp %p refs %d aux %d\n",
-               vp, vp->v_sysref.refcnt, vp->v_auxrefs);
+       if (hammer2_debug & 0x0002) {
+               kprintf("igetv vp %p refs %d aux %d\n",
+                       vp, vp->v_sysref.refcnt, vp->v_auxrefs);
+       }
        return (vp);
 }
 
@@ -220,7 +221,7 @@ hammer2_create_inode(hammer2_mount_t *hmp,
                ++lhc;
        }
        if (error == 0) {
-               chain = hammer2_chain_create(hmp, &dip->chain, lhc, 0,
+               chain = hammer2_chain_create(hmp, parent, lhc, 0,
                                             HAMMER2_BREF_TYPE_INODE,
                                             HAMMER2_INODE_BYTES);
                if (chain == NULL)
index bb526a5..f25ef8a 100644 (file)
@@ -41,6 +41,7 @@
 #include <sys/buf.h>
 #include <sys/uuid.h>
 #include <sys/vfsops.h>
+#include <sys/sysctl.h>
 
 #include "hammer2.h"
 #include "hammer2_disk.h"
@@ -51,6 +52,13 @@ struct hammer2_sync_info {
        int waitfor;
 };
 
+int hammer2_debug;
+
+SYSCTL_NODE(_vfs, OID_AUTO, hammer2, CTLFLAG_RW, 0, "HAMMER2 filesystem");
+
+SYSCTL_INT(_vfs_hammer2, OID_AUTO, debug, CTLFLAG_RW,
+          &hammer2_debug, 0, "");
+
 static int hammer2_vfs_init(struct vfsconf *conf);
 static int hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
                                struct ucred *cred);
index ed2076f..06b5457 100644 (file)
@@ -238,7 +238,7 @@ hammer2_vop_readdir(struct vop_readdir_args *ap)
         * allow '..' to cross the mount point into (e.g.) the super-root.
         */
        error = 0;
-       chain = (void *)(intptr_t)-1;   /* non-NULL early done means not eof */
+       chain = (void *)(intptr_t)-1;   /* non-NULL for early goto done case */
 
        if (saveoff == 0) {
                r = vop_write_dirent(&error, uio,
@@ -283,7 +283,11 @@ hammer2_vop_readdir(struct vop_readdir_args *ap)
                hammer2_chain_put(hmp, parent);
                goto done;
        }
-       chain = hammer2_chain_lookup(hmp, &parent, lkey, (hammer2_key_t)-1, 0);
+       chain = hammer2_chain_lookup(hmp, &parent, lkey, lkey, 0);
+       if (chain == NULL) {
+               chain = hammer2_chain_lookup(hmp, &parent,
+                                            lkey, (hammer2_key_t)-1, 0);
+       }
        while (chain) {
                if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
                        dtype = hammer2_get_dtype(chain->u.ip);
@@ -326,7 +330,7 @@ hammer2_vop_readdir(struct vop_readdir_args *ap)
 done:
        if (ap->a_eofflag)
                *ap->a_eofflag = (chain == NULL);
-       uio->uio_offset = saveoff;
+       uio->uio_offset = saveoff & ~HAMMER2_DIRHASH_VISIBLE;
        if (error && cookie_index == 0) {
                if (cookies) {
                        kfree(cookies, M_TEMP);
@@ -765,6 +769,21 @@ hammer2_vop_open(struct vop_open_args *ap)
        return vop_stdopen(ap);
 }
 
+/*
+ * hammer_vop_advlock { vp, id, op, fl, flags }
+ *
+ * MPSAFE - does not require fs_token
+ */
+static
+int
+hammer2_vop_advlock(struct vop_advlock_args *ap)
+{
+       hammer2_inode_t *ip = VTOI(ap->a_vp);
+
+       return (lf_advlock(ap, &ip->advlock, ip->ip_data.size));
+}
+
+
 static
 int
 hammer2_vop_close(struct vop_close_args *ap)
@@ -941,15 +960,16 @@ hammer2_strategy_write(struct vop_strategy_args *ap)
        if (chain) {
                hammer2_chain_modify(hmp, chain);
                bcopy(bp->b_data, chain->data->buf + off_lo, bp->b_bcount);
-               hammer2_chain_put(hmp, chain);
        } else {
                chain = hammer2_chain_create(hmp, parent,
                                             off_hi, HAMMER2_PBUFRADIX,
                                             HAMMER2_BREF_TYPE_DATA,
                                             HAMMER2_PBUFSIZE);
                bcopy(bp->b_data, chain->data->buf + off_lo, bp->b_bcount);
-               hammer2_chain_put(hmp, chain);
        }
+       if (off_lo + bp->b_bcount == HAMMER2_PBUFSIZE)
+               atomic_set_int(&chain->flags, HAMMER2_CHAIN_IOFLUSH);
+       hammer2_chain_put(hmp, chain);
        hammer2_chain_put(hmp, parent);
 
        bp->b_resid = 0;
@@ -991,6 +1011,7 @@ struct vop_ops hammer2_vnode_vops = {
        .vop_getpages   = vop_stdgetpages,
        .vop_putpages   = vop_stdputpages,
        .vop_access     = hammer2_vop_access,
+       .vop_advlock    = hammer2_vop_advlock,
        .vop_close      = hammer2_vop_close,
        .vop_ncreate    = hammer2_vop_ncreate,
        .vop_getattr    = hammer2_vop_getattr,