hammer2 - Add feature to allow sector overwrite, fix meta-data check code
author Matthew Dillon <dillon@apollo.backplane.com>
Sat, 9 Jul 2016 23:17:19 +0000 (16:17 -0700)
committer Matthew Dillon <dillon@apollo.backplane.com>
Sat, 9 Jul 2016 23:21:50 +0000 (16:21 -0700)
* If a file is set to use no check code (hammer2 setcheck none <file>),
  data overwrites will reuse the same sector as long as it does not violate
  the most recent snapshot.

  This allows the program to relax copy-on-write requirements for certain
  files, for example files which might be mmap()'d SHARED+RW and then
  modified constantly where the programmer has determined that the
  possibility of corruption is ok.

* Implement pfs_lsnap_tid in the PFS root inode meta-data.  This records the
  last snapshot TID so the chain code can determine if an overwrite is
  allowed.

* Remove attr_tid and dirent_tid from the inode meta-data for now.

* Only BREF_TYPE_DATA brefs inherit the inode check mode.  Meta-data brefs
  such as indirect blocks, or directory entries, will only use the check
  code type specified in the parent inode if it is not NONE.  Otherwise
  they will use the default check code.

  This fixes a bug where meta-data brefs could wind up being unchecked.  We
  want all meta-data to always be checked (at least for now).

sys/vfs/hammer2/DESIGN
sys/vfs/hammer2/hammer2.h
sys/vfs/hammer2/hammer2_chain.c
sys/vfs/hammer2/hammer2_disk.h
sys/vfs/hammer2/hammer2_flush.c
sys/vfs/hammer2/hammer2_freemap.c
sys/vfs/hammer2/hammer2_inode.c
sys/vfs/hammer2/hammer2_ioctl.c
sys/vfs/hammer2/hammer2_strategy.c
sys/vfs/hammer2/hammer2_synchro.c
sys/vfs/hammer2/hammer2_xops.c

index 4583406..56d7467 100644 (file)
@@ -4,6 +4,7 @@
                                Matthew Dillon
                             dillon@backplane.com
 
+                              09-Jul-2016 (v4)
                               03-Apr-2015 (v3)
                               14-May-2013 (v2)
                               08-Feb-2012 (v1)
@@ -14,8 +15,8 @@
   - bulkfree           - operational
   - Compression                - operational
   - Snapshots          - operational
-  - Deduper            - specced
-  - Subhierarchy quotas - specced
+  - Deduper            - live operational, batch specced
+  - Subhierarchy quotas - (may have to be discarded)
   - Logical Encryption - not specced yet
   - Copies             - not specced yet
   - fsync bypass       - not specced yet
   buffer to be scanned, is fully supported.  This allows the writing of 0's
   to create holes.
 
+* Allow sector overwrite (avoid copy-on-write) under certain circumstances.
+  This is allowed on file data blocks if the file check mode is set to NONE,
+  as long as the data block's modify_tid does not violate the last snapshot
+  taken (if it does, a copy is made and overwrites are allowed on the copy
+  until the next snapshot).
+
 * Copies support for redundancy within a single physical filesystem.
   Up to 256 physical disks and/or partitions can be ganged to form a
   single physical filesystem.  If you use a disk or RAID aggregation 
@@ -313,9 +320,9 @@ not propagate up, instead serving as a seed for update_tid.
 
 There are several other stored transaction ids in HAMMER2.  There is a
 separate freemap_tid in the volume header that is used to allow freemap
-flushes to be deferred, and inodes have an attr_tid and a dirent_tid which
-tracks attribute changes and (for directories) create/rename/delete changes.
-The inode TIDs are used as an aid for the cache coherency subsystem.
+flushes to be deferred, and inodes have a pfs_lsnap_tid which is used in
+conjunction with CHECK_NONE to allow blocks without a check code which do
+not violate the most recent snapshot to be overwritten in-place.
 
 Remember that since this is a copy-on-write filesystem, we can propagate
 a considerable amount of information up the tree to the volume header
index 6ea8c9d..071efa2 100644 (file)
@@ -1425,8 +1425,8 @@ hammer2_blockref_t *hammer2_chain_scan(hammer2_chain_t *parent,
                                hammer2_blockref_t *bref,
                                int *firstp, int *cache_indexp, int flags);
 
-int hammer2_chain_create(hammer2_chain_t **parentp,
-                               hammer2_chain_t **chainp, hammer2_pfs_t *pmp,
+int hammer2_chain_create(hammer2_chain_t **parentp, hammer2_chain_t **chainp,
+                               hammer2_pfs_t *pmp, int methods,
                                hammer2_key_t key, int keybits,
                                int type, size_t bytes, hammer2_tid_t mtid,
                                hammer2_off_t dedup_off, int flags);
index 2f540fb..ae9d3cc 100644 (file)
@@ -1507,8 +1507,36 @@ hammer2_chain_modify(hammer2_chain_t *chain, hammer2_tid_t mtid,
                 */
                atomic_add_long(&hammer2_count_modified_chains, 1);
                atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
-               hammer2_pfs_memory_inc(chain->pmp);     /* can be NULL */
-               newmod = 1;
+               hammer2_pfs_memory_inc(chain->pmp);  /* can be NULL */
+
+               /*
+                * We may be able to avoid a copy-on-write if the chain's
+                * check mode is set to NONE and the chain's current
+                * modify_tid is beyond the last explicit snapshot tid.
+                *
+                * This implements HAMMER2's overwrite-in-place feature.
+                *
+                * NOTE! This data-block cannot be used as a de-duplication
+                *       source when the check mode is set to NONE.
+                */
+               if (chain->bref.type == HAMMER2_BREF_TYPE_DATA &&
+                   (chain->flags & HAMMER2_CHAIN_INITIAL) == 0 &&
+                   HAMMER2_DEC_CHECK(chain->bref.methods) ==
+                    HAMMER2_CHECK_NONE &&
+                   chain->pmp &&
+                   chain->bref.modify_tid >
+                    chain->pmp->iroot->meta.pfs_lsnap_tid &&
+                   modified_needs_new_allocation(chain) == 0) {
+                       /*
+                        * Sector overwrite allowed.
+                        */
+                       newmod = 0;
+               } else {
+                       /*
+                        * Sector overwrite not allowed, must copy-on-write.
+                        */
+                       newmod = 1;
+               }
        } else {
                /*
                 * Already flagged modified, no new allocation is needed.
@@ -2752,8 +2780,8 @@ done:
  * and will be reassigned.
  */
 int
-hammer2_chain_create(hammer2_chain_t **parentp,
-                    hammer2_chain_t **chainp, hammer2_pfs_t *pmp,
+hammer2_chain_create(hammer2_chain_t **parentp, hammer2_chain_t **chainp,
+                    hammer2_pfs_t *pmp, int methods,
                     hammer2_key_t key, int keybits, int type, size_t bytes,
                     hammer2_tid_t mtid, hammer2_off_t dedup_off, int flags)
 {
@@ -2792,12 +2820,21 @@ hammer2_chain_create(hammer2_chain_t **parentp,
                dummy.key = key;
                dummy.keybits = keybits;
                dummy.data_off = hammer2_getradix(bytes);
-               dummy.methods = parent->bref.methods;
-               if (parent->bref.type == HAMMER2_BREF_TYPE_INODE &&
-                   parent->data) {
-                       dummy.methods &= ~HAMMER2_ENC_CHECK(-1);
-                       dummy.methods |= HAMMER2_ENC_CHECK(
-                                         parent->data->ipdata.meta.check_algo);
+
+               /*
+                * Inherit methods from parent by default.  Primarily used
+                * for BREF_TYPE_DATA.  Non-data types *must* be set to
+                * a non-NONE check algorithm.
+                */
+               if (methods == -1)
+                       dummy.methods = parent->bref.methods;
+               else
+                       dummy.methods = (uint8_t)methods;
+
+               if (type != HAMMER2_BREF_TYPE_DATA &&
+                   HAMMER2_DEC_CHECK(dummy.methods) == HAMMER2_CHECK_NONE) {
+                       dummy.methods |=
+                               HAMMER2_ENC_CHECK(HAMMER2_CHECK_DEFAULT);
                }
 
                chain = hammer2_chain_alloc(hmp, pmp, &dummy);
@@ -3106,7 +3143,8 @@ hammer2_chain_rename(hammer2_blockref_t *bref,
                KKASSERT(parent->refs > 0);
                KKASSERT(parent->error == 0);
 
-               hammer2_chain_create(parentp, &chain, chain->pmp,
+               hammer2_chain_create(parentp, &chain,
+                                    chain->pmp, HAMMER2_METH_DEFAULT,
                                     bref->key, bref->keybits, bref->type,
                                     chain->bytes, mtid, 0, flags);
                KKASSERT(chain->flags & HAMMER2_CHAIN_UPDATE);
@@ -3435,7 +3473,9 @@ hammer2_chain_create_indirect(hammer2_chain_t *parent,
        dummy.bref.key = key;
        dummy.bref.keybits = keybits;
        dummy.bref.data_off = hammer2_getradix(nbytes);
-       dummy.bref.methods = parent->bref.methods;
+       dummy.bref.methods =
+               HAMMER2_ENC_CHECK(HAMMER2_DEC_CHECK(parent->bref.methods)) |
+               HAMMER2_ENC_COMP(HAMMER2_COMP_NONE);
 
        ichain = hammer2_chain_alloc(hmp, parent->pmp, &dummy.bref);
        atomic_set_int(&ichain->flags, HAMMER2_CHAIN_INITIAL);
index e41761a..2a9b482 100644 (file)
@@ -673,6 +673,8 @@ typedef struct hammer2_blockref hammer2_blockref_t;
 #define HAMMER2_CHECK_SHA192           4
 #define HAMMER2_CHECK_FREEMAP          5
 
+#define HAMMER2_CHECK_DEFAULT          HAMMER2_CHECK_XXHASH64
+
 /* user-specifiable check modes only */
 #define HAMMER2_CHECK_STRINGS          { "none", "disabled", "crc32", \
                                          "xxhash64", "sha192" }
@@ -696,6 +698,11 @@ typedef struct hammer2_blockref hammer2_blockref_t;
 #define HAMMER2_COMP_STRINGS           { "none", "autozero", "lz4", "zlib" }
 #define HAMMER2_COMP_STRINGS_COUNT     4
 
+/*
+ * Passed to hammer2_chain_create(), causes methods to be inherited from
+ * parent.
+ */
+#define HAMMER2_METH_DEFAULT           -1
 
 /*
  * HAMMER2 block references are collected into sets of 4 blockrefs.  These
@@ -920,8 +927,15 @@ struct hammer2_inode_meta {
        hammer2_key_t   unusedB8;       /* 00B8 subtree byte count */
        hammer2_key_t   inode_quota;    /* 00C0 subtree quota inode count */
        hammer2_key_t   unusedC8;       /* 00C8 subtree inode count */
-       hammer2_tid_t   attr_tid;       /* 00D0 attributes changed */
-       hammer2_tid_t   dirent_tid;     /* 00D8 directory/attr changed */
+
+       /*
+        * The last snapshot tid is tested against modify_tid to determine
+        * when a copy must be made of a data block whose check mode has been
+        * disabled (a disabled check mode allows data blocks to be updated
+        * in place instead of copy-on-write).
+        */
+       hammer2_tid_t   pfs_lsnap_tid;  /* 00D0 last snapshot tid */
+       hammer2_tid_t   reservedD8;     /* 00D8 (avail) */
 
        /*
         * Tracks (possibly degenerate) free areas covering all sub-tree
index a2f55f5..781856f 100644 (file)
@@ -634,7 +634,7 @@ again:
                 *       embedded data don't need this.
                 */
                if (hammer2_debug & 0x1000) {
-                       kprintf("Flush %p.%d %016jx/%d data=%016jx",
+                       kprintf("Flush %p.%d %016jx/%d data=%016jx\n",
                                chain, chain->bref.type,
                                (uintmax_t)chain->bref.key,
                                chain->bref.keybits,
index 093e4fa..facdb72 100644 (file)
@@ -350,7 +350,8 @@ hammer2_freemap_try_alloc(hammer2_chain_t **parentp,
                kprintf("freemap create L1 @ %016jx bpref %016jx\n",
                        key, iter->bpref);
 #endif
-               error = hammer2_chain_create(parentp, &chain, hmp->spmp,
+               error = hammer2_chain_create(parentp, &chain,
+                                    hmp->spmp, HAMMER2_METH_DEFAULT,
                                     key, HAMMER2_FREEMAP_LEVEL1_RADIX,
                                     HAMMER2_BREF_TYPE_FREEMAP_LEAF,
                                     HAMMER2_FREEMAP_LEVELN_PSIZE,
@@ -905,7 +906,8 @@ hammer2_freemap_adjust(hammer2_dev_t *hmp, hammer2_blockref_t *bref,
         * bref.check.freemap structure.
         */
        if (chain == NULL && how == HAMMER2_FREEMAP_DORECOVER) {
-               error = hammer2_chain_create(&parent, &chain, hmp->spmp,
+               error = hammer2_chain_create(&parent, &chain,
+                                    hmp->spmp, HAMMER2_METH_DEFAULT,
                                     key, HAMMER2_FREEMAP_LEVEL1_RADIX,
                                     HAMMER2_BREF_TYPE_FREEMAP_LEAF,
                                     HAMMER2_FREEMAP_LEVELN_PSIZE,
index 8087c8c..a476719 100644 (file)
@@ -1274,7 +1274,7 @@ hammer2_inode_xop_create(hammer2_xop_t *arg, int clindex)
        }
 
        error = hammer2_chain_create(&parent, &chain,
-                                    xop->head.ip1->pmp,
+                                    xop->head.ip1->pmp, HAMMER2_METH_DEFAULT,
                                     xop->lhc, 0,
                                     HAMMER2_BREF_TYPE_INODE,
                                     HAMMER2_INODE_BYTES,
@@ -1464,7 +1464,8 @@ hammer2_inode_xop_connect(hammer2_xop_t *arg, int clindex)
        /*
         * Reconnect the chain to the new parent directory
         */
-       error = hammer2_chain_create(&parent, &chain, pmp,
+       error = hammer2_chain_create(&parent, &chain,
+                                    pmp, HAMMER2_METH_DEFAULT,
                                     xop->lhc, 0,
                                     HAMMER2_BREF_TYPE_INODE,
                                     HAMMER2_INODE_BYTES,
index 3f29fae..1fd8601 100644 (file)
@@ -715,6 +715,7 @@ hammer2_ioctl_pfs_snapshot(hammer2_inode_t *ip, void *data)
 {
        hammer2_ioc_pfs_t *pfs = data;
        hammer2_dev_t   *hmp;
+       hammer2_pfs_t   *pmp;
        hammer2_chain_t *chain;
        hammer2_tid_t   mtid;
        int error;
@@ -724,23 +725,29 @@ hammer2_ioctl_pfs_snapshot(hammer2_inode_t *ip, void *data)
        if (pfs->name[sizeof(pfs->name)-1] != 0)
                return(EINVAL);
 
-       hmp = ip->pmp->pfs_hmps[0];
+       pmp = ip->pmp;
+       ip = pmp->iroot;
+
+       hmp = pmp->pfs_hmps[0];
        if (hmp == NULL)
                return (EINVAL);
 
-       hammer2_vfs_sync(ip->pmp->mp, MNT_WAIT);
+       hammer2_vfs_sync(pmp->mp, MNT_WAIT);
 
-       hammer2_trans_init(ip->pmp, HAMMER2_TRANS_ISFLUSH);
-       mtid = hammer2_trans_sub(ip->pmp);
+       hammer2_trans_init(pmp, HAMMER2_TRANS_ISFLUSH);
+       mtid = hammer2_trans_sub(pmp);
        hammer2_inode_lock(ip, 0);
+       hammer2_inode_modify(ip);
+       ip->meta.pfs_lsnap_tid = mtid;
 
+       /* XXX cluster it! */
        chain = hammer2_inode_chain(ip, 0, HAMMER2_RESOLVE_ALWAYS);
        error = hammer2_chain_snapshot(chain, pfs, mtid);
        hammer2_chain_unlock(chain);
        hammer2_chain_drop(chain);
 
        hammer2_inode_unlock(ip);
-       hammer2_trans_done(ip->pmp);
+       hammer2_trans_done(pmp);
 
        return (error);
 }
index cef81c5..f481bc5 100644 (file)
@@ -706,7 +706,10 @@ retry:
                 */
                dedup_off = hammer2_dedup_lookup((*parentp)->hmp, datap,
                                                 pblksize);
-               *errorp = hammer2_chain_create(parentp, &chain, ip->pmp,
+               *errorp = hammer2_chain_create(parentp, &chain,
+                                              ip->pmp,
+                                      HAMMER2_ENC_CHECK(ip->meta.check_algo) |
+                                      HAMMER2_ENC_COMP(HAMMER2_COMP_NONE),
                                               lbase, HAMMER2_PBUFRADIX,
                                               HAMMER2_BREF_TYPE_DATA,
                                               pblksize, mtid,
@@ -1361,6 +1364,11 @@ hammer2_dedup_record(hammer2_chain_t *chain, char *data)
        default:
                /*
                 * Cannot dedup without a check code
+                *
+                * NOTE: In particular, CHECK_NONE allows a sector to be
+                *       overwritten without copy-on-write, recording
+                *       a dedup block for a CHECK_NONE object would be
+                *       a disaster!
                 */
                return;
        }
index 5c610ab..5d5811d 100644 (file)
@@ -757,7 +757,8 @@ hammer2_sync_insert(hammer2_thread_t *thr,
        KKASSERT(chain == NULL);
 
        chain = NULL;
-       hammer2_chain_create(parentp, &chain, thr->pmp,
+       hammer2_chain_create(parentp, &chain,
+                            thr->pmp, focus->bref.methods,
                             focus->bref.key, focus->bref.keybits,
                             focus->bref.type, focus->bytes,
                             mtid, 0, 0);
@@ -958,10 +959,16 @@ hammer2_sync_replace(hammer2_thread_t *thr,
                                focus->data->ipdata.meta.data_quota;
                        chain->data->ipdata.meta.inode_quota =
                                focus->data->ipdata.meta.inode_quota;
-                       chain->data->ipdata.meta.attr_tid =
-                               focus->data->ipdata.meta.attr_tid;
-                       chain->data->ipdata.meta.dirent_tid =
-                               focus->data->ipdata.meta.dirent_tid;
+
+                       /*
+                        * last snapshot tid controls overwrite
+                        */
+                       if (chain->data->ipdata.meta.pfs_lsnap_tid <
+                           focus->data->ipdata.meta.pfs_lsnap_tid) {
+                               chain->data->ipdata.meta.pfs_lsnap_tid =
+                                       focus->data->ipdata.meta.pfs_lsnap_tid;
+                       }
+
                        hammer2_chain_setcheck(chain, chain->data);
                        break;
                }
index 192ebbf..4300085 100644 (file)
@@ -495,7 +495,8 @@ hammer2_xop_nlink(hammer2_xop_t *arg, int clindex)
                did_delete = 1;
 
                tmp = NULL;
-               error = hammer2_chain_create(&parent, &tmp, pmp,
+               error = hammer2_chain_create(&parent, &tmp,
+                                            pmp, HAMMER2_METH_DEFAULT,
                                             chain->bref.key, 0,
                                             HAMMER2_BREF_TYPE_INODE,
                                             HAMMER2_INODE_BYTES,
@@ -570,7 +571,8 @@ hammer2_xop_nlink(hammer2_xop_t *arg, int clindex)
                        error = EEXIST;
                        goto done;
                }
-               error = hammer2_chain_create(&parent, &chain, pmp,
+               error = hammer2_chain_create(&parent, &chain,
+                                            pmp, HAMMER2_METH_DEFAULT,
                                             wipdata->meta.name_key, 0,
                                             HAMMER2_BREF_TYPE_INODE,
                                             HAMMER2_INODE_BYTES,
@@ -768,7 +770,8 @@ hammer2_xop_nrename(hammer2_xop_t *arg, int clindex)
                goto done;
        }
 
-       error = hammer2_chain_create(&parent, &chain, pmp,
+       error = hammer2_chain_create(&parent, &chain,
+                                    pmp, HAMMER2_METH_DEFAULT,
                                     xop->lhc, 0,
                                     HAMMER2_BREF_TYPE_INODE,
                                     HAMMER2_INODE_BYTES,