HAMMER - Add live dedup sysctl and support
[dragonfly.git] / sys / vfs / hammer / hammer_dedup.c
index 0bb74b5..1bc125f 100644 (file)
@@ -189,3 +189,342 @@ validate_zone(hammer_off_t data_offset)
                return (1);
        }
 }
+
+/************************************************************************
+ *                             LIVE DEDUP                              *
+ ************************************************************************
+ *
+ * HAMMER Live Dedup (a.k.a. an efficient cp(1) implementation)
+ *
+ * The dedup cache is operated in a LRU fashion and destroyed on
+ * unmount, so essentially this is a live dedup on a cached dataset and
+ * not a full-fledged fs-wide one - we have a batched dedup for that.
+ * We allow duplicate entries in the buffer cache, data blocks are
+ * deduplicated only on their way to media.  By default the cache is
+ * populated on reads only, but it can be populated on writes too.
+ *
+ * The main implementation gotcha is on-media requirement - in order for
+ * a data block to be added to a dedup cache it has to be present on
+ * disk.  This simplifies cache logic a lot - once data is laid out on
+ * media it remains valid on media all the way up to the point where the
+ * related big block the data was stored in is freed - so there is only
+ * one place we need to bother with invalidation code.
+ */
+
+/*
+ * RB-Tree support for dedup cache structures
+ *
+ * Each cache entry is indexed twice: by data CRC (content lookups on
+ * the write path) and by media offset (invalidation when the backing
+ * big block is freed).
+ */
+RB_GENERATE2(hammer_dedup_crc_rb_tree, hammer_dedup_cache, crc_entry,
+               hammer_dedup_crc_rb_compare, hammer_crc_t, crc);
+RB_GENERATE2(hammer_dedup_off_rb_tree, hammer_dedup_cache, off_entry,
+               hammer_dedup_off_rb_compare, hammer_off_t, data_offset);
+
+/*
+ * Argument bundle threaded through the RB_SCAN compare/callback pair
+ * when invalidating all cache entries backed by one big block.
+ */
+struct hammer_dedup_inval {
+       struct hammer_mount *hmp;       /* mount owning the dedup trees */
+       hammer_off_t base_offset;       /* base offset of the freed big block */
+};
+
+/* Forward declarations: invalidation scan helpers and data validators */
+static int hammer_dedup_scancmp(hammer_dedup_cache_t dc, void *data);
+static int hammer_dedup_cache_inval_callback(hammer_dedup_cache_t dc,
+               void *data);
+static __inline int _vnode_validate(hammer_dedup_cache_t dcp, void *data,
+               int *errorp);
+static __inline int _dev_validate(hammer_dedup_cache_t dcp, void *data,
+               int *errorp);
+
+/*
+ * Three-way comparator keying the CRC tree on the entry's data CRC.
+ */
+int
+hammer_dedup_crc_rb_compare(hammer_dedup_cache_t dc1,
+                               hammer_dedup_cache_t dc2)
+{
+       if (dc1->crc > dc2->crc)
+               return (1);
+       else if (dc1->crc < dc2->crc)
+               return (-1);
+       else
+               return (0);
+}
+
+/*
+ * Three-way comparator keying the offset tree on the entry's media
+ * data offset.
+ */
+int
+hammer_dedup_off_rb_compare(hammer_dedup_cache_t dc1,
+                               hammer_dedup_cache_t dc2)
+{
+       if (dc1->data_offset > dc2->data_offset)
+               return (1);
+       else if (dc1->data_offset < dc2->data_offset)
+               return (-1);
+       else
+               return (0);
+}
+
+/*
+ * Range comparator for RB_SCAN: an entry matches when its media
+ * offset falls inside the big block beginning at the scan's
+ * base_offset.
+ */
+static int
+hammer_dedup_scancmp(hammer_dedup_cache_t dc, void *data)
+{
+       struct hammer_dedup_inval *di = data;
+
+       if (dc->data_offset >= di->base_offset + HAMMER_LARGEBLOCK_SIZE)
+               return (1);
+       else if (dc->data_offset < di->base_offset)
+               return (-1);
+       return (0);
+}
+
+/*
+ * RB_SCAN callback: unlink one stale cache entry from both RB indexes
+ * and the LRU, then free it and drop the cache count.
+ */
+static int
+hammer_dedup_cache_inval_callback(hammer_dedup_cache_t dc, void *data)
+{
+       struct hammer_dedup_inval *di = data;
+       hammer_mount_t hmp = di->hmp;
+
+       TAILQ_REMOVE(&hmp->dedup_lru_list, dc, lru_entry);
+       RB_REMOVE(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dc);
+       RB_REMOVE(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dc);
+       kfree(dc, hmp->m_misc);
+       --hmp->dedup_cache_count;
+
+       return (0);
+}
+
+/*
+ * Add (or refresh) a dedup cache entry describing the on-media data
+ * block referenced by a B-Tree leaf.  Returns the cache entry now
+ * describing the data block.
+ *
+ * Entries are keyed by data CRC: an existing entry with the same CRC
+ * is repopulated in place rather than duplicated.
+ */
+hammer_dedup_cache_t
+hammer_dedup_cache_add(hammer_inode_t ip, hammer_btree_leaf_elm_t leaf)
+{
+       hammer_dedup_cache_t dcp, tmp;
+       hammer_mount_t hmp = ip->hmp;
+
+       /*
+        * Stage a preallocated entry.  kmalloc(M_WAITOK) may block, so
+        * the staging slot is re-checked afterwards and the fresh
+        * allocation discarded if it was filled in the meantime.
+        */
+       if (hmp->dedup_free_cache == NULL) {
+               tmp = kmalloc(sizeof(*tmp), hmp->m_misc, M_WAITOK | M_ZERO);
+               if (hmp->dedup_free_cache == NULL)
+                       hmp->dedup_free_cache = tmp;
+               else
+                       kfree(tmp, hmp->m_misc);
+       }
+
+       KKASSERT(leaf != NULL);
+
+       /*
+        * CRC hit: unlink from the offset index and the LRU (the CRC
+        * key itself is unchanged) and repopulate below.
+        */
+       dcp = RB_LOOKUP(hammer_dedup_crc_rb_tree,
+                               &hmp->rb_dedup_crc_root, leaf->data_crc);
+       if (dcp != NULL) {
+               RB_REMOVE(hammer_dedup_off_rb_tree,
+                               &hmp->rb_dedup_off_root, dcp);
+               TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry);
+               goto populate;
+       }
+
+       /*
+        * New CRC: consume the staged entry while the cache has room,
+        * otherwise recycle the least-recently-used entry.
+        */
+       if (hmp->dedup_cache_count < DEDUP_CACHE_SIZE) {
+               dcp = hmp->dedup_free_cache;
+               hmp->dedup_free_cache = NULL;
+               ++hmp->dedup_cache_count;
+       } else {
+               dcp = TAILQ_FIRST(&hmp->dedup_lru_list);
+               RB_REMOVE(hammer_dedup_crc_rb_tree,
+                               &hmp->rb_dedup_crc_root, dcp);
+               RB_REMOVE(hammer_dedup_off_rb_tree,
+                               &hmp->rb_dedup_off_root, dcp);
+               TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry);
+       }
+
+       /* Key by CRC; the insert must not collide (checked above) */
+       dcp->crc = leaf->data_crc;
+       tmp = RB_INSERT(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dcp);
+       KKASSERT(tmp == NULL);
+
+populate:
+       /*
+        * Fill in the data reference, re-index by media offset, and
+        * move the entry to the LRU tail (most recently used).
+        */
+       dcp->hmp = ip->hmp;
+       dcp->obj_id = ip->obj_id;
+       dcp->localization = ip->obj_localization;
+       dcp->file_offset = leaf->base.key - leaf->data_len;
+       dcp->bytes = leaf->data_len;
+       dcp->data_offset = leaf->data_offset;
+
+       tmp = RB_INSERT(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dcp);
+       KKASSERT(tmp == NULL);
+       TAILQ_INSERT_TAIL(&hmp->dedup_lru_list, dcp, lru_entry);
+
+       return (dcp);
+}
+
+/*
+ * Look up a dedup cache entry by data CRC; returns NULL on a miss.
+ */
+__inline hammer_dedup_cache_t
+hammer_dedup_cache_lookup(hammer_mount_t hmp, hammer_crc_t crc)
+{
+       return (RB_LOOKUP(hammer_dedup_crc_rb_tree,
+                         &hmp->rb_dedup_crc_root, crc));
+}
+
+/*
+ * Invalidate every dedup cache entry whose on-media data lives inside
+ * the big block starting at base_offset.  Called when that big block
+ * is freed -- the single point where cached data references go stale
+ * (see the LIVE DEDUP comment above).
+ *
+ * NOTE: definition style fixed to match the rest of the file (return
+ * type on its own line); no behavior change.
+ */
+void
+hammer_dedup_cache_inval(hammer_mount_t hmp, hammer_off_t base_offset)
+{
+       struct hammer_dedup_inval di;
+
+       di.hmp = hmp;
+       di.base_offset = base_offset;
+
+       RB_SCAN(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root,
+               hammer_dedup_scancmp, hammer_dedup_cache_inval_callback, &di);
+}
+
+/*
+ * Tear down the whole dedup cache (on unmount).  Drain the LRU,
+ * unlinking each entry from both RB indexes and freeing it, then
+ * assert every structure is empty.
+ */
+void
+hammer_destroy_dedup_cache(hammer_mount_t hmp)
+{
+       hammer_dedup_cache_t dcp;
+
+       for (;;) {
+               dcp = TAILQ_FIRST(&hmp->dedup_lru_list);
+               if (dcp == NULL)
+                       break;
+               TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry);
+               RB_REMOVE(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dcp);
+               RB_REMOVE(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dcp);
+               --hmp->dedup_cache_count;
+               kfree(dcp, hmp->m_misc);
+       }
+
+       KKASSERT(hmp->dedup_cache_count == 0);
+       KKASSERT(RB_EMPTY(&hmp->rb_dedup_crc_root));
+       KKASSERT(RB_EMPTY(&hmp->rb_dedup_off_root));
+       KKASSERT(TAILQ_EMPTY(&hmp->dedup_lru_list));
+}
+
+/*
+ * Decide whether the candidate buffer 'data' (length 'bytes', target
+ * zone 'zone') is a true duplicate of the cached data block described
+ * by dcp.  Returns 1 on a verified byte-for-byte match, 0 otherwise.
+ */
+int
+hammer_dedup_validate(hammer_dedup_cache_t dcp, int zone, int bytes,
+                     void *data)
+{
+       int error;
+
+       /*
+        * Zone validation
+        */
+       if (HAMMER_ZONE_DECODE(dcp->data_offset) != zone)
+               return (0);
+
+       /*
+        * Block length validation
+        */
+       if (dcp->bytes != bytes)
+               return (0);
+
+       /*
+        * Byte-by-byte data comparison
+        *
+        * The data we need for validation may already be present in the
+        * buffer cache in two flavours: vnode-based buffer or
+        * block-device-based buffer.  In case vnode-based buffer wasn't
+        * there or if a non-blocking attempt to acquire it failed use
+        * device-based buffer (for large-zone data blocks it will
+        * generate a separate read).
+        */
+       if (_vnode_validate(dcp, data, &error)) {
+               hammer_live_dedup_vnode_bcmps++;
+               return (1);
+       } else {
+               /* error == 3 means findblk() found no vnode buffer */
+               if (error == 3)
+                       hammer_live_dedup_findblk_failures++;
+       }
+
+       /*
+        * A non-zero error says the vnode path could not complete the
+        * comparison (as opposed to completing it with a mismatch), so
+        * fall back to reading through the block device.
+        */
+       if (error) { /* yes, if there was an error previously */
+               if (_dev_validate(dcp, data, &error)) {
+                       hammer_live_dedup_device_bcmps++;
+                       return (1);
+               }
+       }
+
+       return (0);
+}
+
+/*
+ * Try to validate the dedup candidate against the vnode-based buffer
+ * cache without blocking on the buffer lock.  Returns 1 if the cached
+ * buffer byte-compares equal to 'data', 0 otherwise.
+ *
+ * *errorp reports why validation could not complete:
+ *   1 - inode lookup failed        2 - could not acquire the vnode
+ *   3 - no buffer (findblk failed) 5 - buffer not B_CACHE or dirty
+ *   6 - VOP_BMAP failed            7 - buffer maps a different offset
+ * *errorp == 0 means the bcmp itself decided the outcome.
+ */
+static __inline int
+_vnode_validate(hammer_dedup_cache_t dcp, void *data, int *errorp)
+{
+       struct hammer_transaction trans;
+       hammer_inode_t ip;
+       struct vnode *vp;
+       struct buf *bp;
+       off_t dooffset;
+       int result, error;
+
+       result = error = 0;
+       *errorp = 0;
+
+       hammer_simple_transaction(&trans, dcp->hmp);
+
+       /* Re-resolve the cached inode reference; it may have gone away */
+       ip = hammer_get_inode(&trans, NULL, dcp->obj_id, HAMMER_MAX_TID,
+           dcp->localization, 0, &error);
+       if (ip == NULL) {
+               kprintf("dedup: unable to find objid %016jx:%08x\n",
+                   (intmax_t)dcp->obj_id, dcp->localization);
+               *errorp = 1;
+               goto failed2;
+       }
+
+       error = hammer_get_vnode(ip, &vp);
+       if (error) {
+               kprintf("dedup: unable to acquire vnode for %016jx:%08x\n",
+                   (intmax_t)dcp->obj_id, dcp->localization);
+               *errorp = 2;
+               goto failed;
+       }
+
+       /* Non-blocking buffer lookup; never stall the write path here */
+       if ((bp = findblk(ip->vp, dcp->file_offset, FINDBLK_NBLOCK)) != NULL) {
+               bremfree(bp);
+
+               /* XXX if (mapped to userspace) goto done, *errorp = 4 */
+
+               /* Only a clean, fully-cached buffer can stand in for media */
+               if ((bp->b_flags & B_CACHE) == 0 || bp->b_flags & B_DIRTY) {
+                       *errorp = 5;
+                       goto done;
+               }
+
+               /*
+                * The buffer's translated offset disagrees with the
+                * cached media offset -- double-check via VOP_BMAP
+                * before trusting the buffer contents.
+                */
+               if (bp->b_bio2.bio_offset != dcp->data_offset) {
+                       error = VOP_BMAP(ip->vp, dcp->file_offset, &dooffset,
+                           NULL, NULL, BUF_CMD_READ);
+                       if (error) {
+                               *errorp = 6;
+                               goto done;
+                       }
+
+                       if (dooffset != dcp->data_offset) {
+                               *errorp = 7;
+                               goto done;
+                       }
+                       hammer_live_dedup_bmap_saves++;
+               }
+
+               if (bcmp(data, bp->b_data, dcp->bytes) == 0)
+                       result = 1;
+
+done:
+               bqrelse(bp);
+       } else {
+               *errorp = 3;
+       }
+       vput(vp);
+
+failed:
+       hammer_rel_inode(ip, 0);
+failed2:
+       hammer_done_transaction(&trans);
+       return (result);
+}
+
+/*
+ * Validate the dedup candidate directly against the media through a
+ * device-backed HAMMER buffer read.  Returns 1 on a byte-for-byte
+ * match, 0 otherwise; *errorp is set to 1 when the read itself fails.
+ */
+static __inline int
+_dev_validate(hammer_dedup_cache_t dcp, void *data, int *errorp)
+{
+       hammer_buffer_t buffer = NULL;
+       void *ondisk_data;
+       int result = 0;
+       int error = 0;
+
+       *errorp = 0;
+
+       ondisk_data = hammer_bread_ext(dcp->hmp, dcp->data_offset,
+           dcp->bytes, &error, &buffer);
+       if (error) {
+               *errorp = 1;
+       } else if (bcmp(data, ondisk_data, dcp->bytes) == 0) {
+               result = 1;
+       }
+
+       if (buffer)
+               hammer_rel_buffer(buffer, 0);
+
+       return (result);
+}