HAMMER - Add live dedup sysctl and support
author Ilya Dryomov <idryomov@gmail.com>
Tue, 4 Jan 2011 01:07:02 +0000 (03:07 +0200)
committer Ilya Dryomov <idryomov@gmail.com>
Tue, 4 Jan 2011 01:07:02 +0000 (03:07 +0200)
* Adds *experimental* live dedup (aka efficient cp) support

* sysctl vfs.hammer.live_dedup
    0 - disabled (default)
    1 - enabled, populate dedup cache on reads
    2 - enabled, populate dedup cache on reads and writes
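
For illustration only (not part of this commit), a minimal userland sketch
of flipping the knob with sysctlbyname(3); the shell equivalent is simply
'sysctl vfs.hammer.live_dedup=1':

    /*
     * Hypothetical example - enable dedup cache population on reads
     * and report the previous setting.
     */
    #include <sys/types.h>
    #include <sys/sysctl.h>
    #include <stdio.h>

    int
    main(void)
    {
            int oldval = 0, newval = 1;     /* 1 - populate cache on reads */
            size_t oldlen = sizeof(oldval);

            /* Fetch the old value and install the new one in a single call. */
            if (sysctlbyname("vfs.hammer.live_dedup", &oldval, &oldlen,
                &newval, sizeof(newval)) == -1) {
                    perror("sysctlbyname");
                    return (1);
            }
            printf("vfs.hammer.live_dedup: %d -> %d\n", oldval, newval);
            return (0);
    }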

sys/vfs/hammer/hammer.h
sys/vfs/hammer/hammer_blockmap.c
sys/vfs/hammer/hammer_dedup.c
sys/vfs/hammer/hammer_object.c
sys/vfs/hammer/hammer_vfsops.c
sys/vfs/hammer/hammer_vnops.c

diff --git a/sys/vfs/hammer/hammer.h b/sys/vfs/hammer/hammer.h
index d06bcf9..dbcade5 100644
--- a/sys/vfs/hammer/hammer.h
+++ b/sys/vfs/hammer/hammer.h
@@ -271,6 +271,34 @@ typedef struct hammer_node_cache {
 TAILQ_HEAD(hammer_node_cache_list, hammer_node_cache);
 
 /*
+ * Live dedup cache
+ */
+struct hammer_dedup_crc_rb_tree;
+RB_HEAD(hammer_dedup_crc_rb_tree, hammer_dedup_cache);
+RB_PROTOTYPE2(hammer_dedup_crc_rb_tree, hammer_dedup_cache, crc_entry,
+               hammer_dedup_crc_rb_compare, hammer_crc_t);
+
+struct hammer_dedup_off_rb_tree;
+RB_HEAD(hammer_dedup_off_rb_tree, hammer_dedup_cache);
+RB_PROTOTYPE2(hammer_dedup_off_rb_tree, hammer_dedup_cache, off_entry,
+               hammer_dedup_off_rb_compare, hammer_off_t);
+
+#define DEDUP_CACHE_SIZE       4096 /* XXX make it a dynamic tunable */
+
+typedef struct hammer_dedup_cache {
+       RB_ENTRY(hammer_dedup_cache) crc_entry;
+       RB_ENTRY(hammer_dedup_cache) off_entry;
+       TAILQ_ENTRY(hammer_dedup_cache) lru_entry;
+       struct hammer_mount *hmp;
+       int64_t obj_id;
+       u_int32_t localization;
+       off_t file_offset;
+       int bytes;
+       hammer_off_t data_offset; 
+       hammer_crc_t crc;
+} *hammer_dedup_cache_t;
+
+/*
  * Structure used to organize flush groups.  Flush groups must be
  * organized into chunks in order to avoid blowing out the UNDO FIFO.
  * Without this a 'sync' could end up flushing 50,000 inodes in a single
@@ -525,6 +553,7 @@ typedef struct hammer_record *hammer_record_t;
 #define HAMMER_RECF_COMMITTED          0x0010  /* committed to the B-Tree */
 #define HAMMER_RECF_INTERLOCK_BE       0x0020  /* backend interlock */
 #define HAMMER_RECF_WANTED             0x0040  /* wanted by the frontend */
+#define HAMMER_RECF_DEDUPED            0x0080  /* will be live-dedup'ed */
 #define HAMMER_RECF_CONVERT_DELETE     0x0100  /* special case */
 #define HAMMER_RECF_REDO               0x1000  /* REDO was laid down */
 
@@ -775,6 +804,7 @@ struct hammer_reserve {
        int             refs;
        int             zone;
        int             append_off;
+       int32_t         bytes_free;
        hammer_off_t    zone_offset;
 };
 
@@ -839,6 +869,10 @@ struct hammer_mount {
        struct hammer_res_rb_tree rb_resv_root;
        struct hammer_buf_rb_tree rb_bufs_root;
        struct hammer_pfs_rb_tree rb_pfsm_root;
+
+       struct hammer_dedup_crc_rb_tree rb_dedup_crc_root;
+       struct hammer_dedup_off_rb_tree rb_dedup_off_root;
+
        struct hammer_volume *rootvol;
        struct hammer_base_elm root_btree_beg;
        struct hammer_base_elm root_btree_end;
@@ -882,6 +916,7 @@ struct hammer_mount {
        int     io_running_space;               /* io_token */
        int     io_running_wakeup;              /* io_token */
        int     objid_cache_count;
+       int     dedup_cache_count;
        int     error;                          /* critical I/O error */
        struct krate    krate;                  /* rate limited kprintf */
        hammer_tid_t    asof;                   /* snapshot mount */
@@ -908,6 +943,8 @@ struct hammer_mount {
        struct hammer_flush_group_list  flush_group_list;
        hammer_flush_group_t    next_flush_group;
        TAILQ_HEAD(, hammer_objid_cache) objid_cache_list;
+       TAILQ_HEAD(, hammer_dedup_cache) dedup_lru_list;
+       hammer_dedup_cache_t    dedup_free_cache;
        TAILQ_HEAD(, hammer_reclaim) reclaim_list;
        TAILQ_HEAD(, hammer_io) iorun_list;
 
@@ -970,6 +1007,7 @@ extern int hammer_debug_recover;
 extern int hammer_debug_recover_faults;
 extern int hammer_debug_critical;
 extern int hammer_cluster_enable;
+extern int hammer_live_dedup;
 extern int hammer_count_fsyncs;
 extern int hammer_count_inodes;
 extern int hammer_count_iqueued;
@@ -1020,6 +1058,11 @@ extern int hammer_fsync_mode;
 extern int hammer_autoflush;
 extern int64_t hammer_contention_count;
 
+extern int64_t hammer_live_dedup_vnode_bcmps;
+extern int64_t hammer_live_dedup_device_bcmps;
+extern int64_t hammer_live_dedup_findblk_failures;
+extern int64_t hammer_live_dedup_bmap_saves;
+
 void   hammer_critical_error(hammer_mount_t hmp, hammer_inode_t ip,
                        int error, const char *msg);
 int    hammer_vop_inactive(struct vop_inactive_args *);
@@ -1121,6 +1164,20 @@ hammer_tid_t hammer_alloc_objid(hammer_mount_t hmp, hammer_inode_t dip,
 void hammer_clear_objid(hammer_inode_t dip);
 void hammer_destroy_objid_cache(hammer_mount_t hmp);
 
+int hammer_dedup_crc_rb_compare(hammer_dedup_cache_t dc1,
+                       hammer_dedup_cache_t dc2);
+int hammer_dedup_off_rb_compare(hammer_dedup_cache_t dc1,
+                       hammer_dedup_cache_t dc2);
+hammer_dedup_cache_t hammer_dedup_cache_add(hammer_inode_t ip,
+                       hammer_btree_leaf_elm_t leaf);
+hammer_dedup_cache_t hammer_dedup_cache_lookup(hammer_mount_t hmp,
+                       hammer_crc_t crc);
+void hammer_dedup_cache_inval(hammer_mount_t hmp, hammer_off_t base_offset);
+void hammer_destroy_dedup_cache(hammer_mount_t hmp);
+void hammer_dump_dedup_cache(hammer_mount_t hmp);
+int hammer_dedup_validate(hammer_dedup_cache_t dcp, int zone, int bytes,
+                       void *data);
+
 int hammer_enter_undo_history(hammer_mount_t hmp, hammer_off_t offset,
                        int bytes);
 void hammer_clear_undo_history(hammer_mount_t hmp);
@@ -1273,6 +1330,8 @@ hammer_off_t hammer_blockmap_alloc(hammer_transaction_t trans, int zone,
                        int bytes, hammer_off_t hint, int *errorp);
 hammer_reserve_t hammer_blockmap_reserve(hammer_mount_t hmp, int zone,
                        int bytes, hammer_off_t *zone_offp, int *errorp);
+hammer_reserve_t hammer_blockmap_reserve_dedup(hammer_mount_t hmp, int zone,
+                       int bytes, hammer_off_t zone_offset, int *errorp);
 void hammer_blockmap_reserve_complete(hammer_mount_t hmp,
                        hammer_reserve_t resv);
 void hammer_reserve_clrdelay(hammer_mount_t hmp, hammer_reserve_t resv);
diff --git a/sys/vfs/hammer/hammer_blockmap.c b/sys/vfs/hammer/hammer_blockmap.c
index 644e620..16160da 100644
--- a/sys/vfs/hammer/hammer_blockmap.c
+++ b/sys/vfs/hammer/hammer_blockmap.c
@@ -44,6 +44,7 @@ static void hammer_reserve_setdelay_offset(hammer_mount_t hmp,
                                    hammer_off_t base_offset, int zone,
                                    struct hammer_blockmap_layer2 *layer2);
 static void hammer_reserve_setdelay(hammer_mount_t hmp, hammer_reserve_t resv);
+static int update_bytes_free(hammer_reserve_t resv, int bytes);
 
 /*
  * Reserved big-blocks red-black tree support
@@ -625,6 +626,174 @@ failed:
 }
 
 /*
+ * Frontend function - Dedup bytes in a zone.
+ *
+ * Dedup reservations work exactly the same as normal write reservations
+ * except we only adjust the bytes_free field and don't touch the append
+ * offset.  The finalization mechanism for dedup reservations is also the
+ * same as for normal write ones - the backend finalizes the reservation
+ * with hammer_blockmap_finalize().
+ */
+hammer_reserve_t
+hammer_blockmap_reserve_dedup(hammer_mount_t hmp, int zone, int bytes,
+                             hammer_off_t zone_offset, int *errorp)
+{
+       hammer_volume_t root_volume;
+       hammer_blockmap_t freemap;
+       struct hammer_blockmap_layer1 *layer1;
+       struct hammer_blockmap_layer2 *layer2;
+       hammer_buffer_t buffer1 = NULL;
+       hammer_buffer_t buffer2 = NULL;
+       hammer_off_t layer1_offset;
+       hammer_off_t layer2_offset;
+       hammer_off_t base_off;
+       hammer_reserve_t resv = NULL;
+       hammer_reserve_t resx = NULL;
+
+       /*
+        * Setup
+        */
+       KKASSERT(zone >= HAMMER_ZONE_BTREE_INDEX && zone < HAMMER_MAX_ZONES);
+       root_volume = hammer_get_root_volume(hmp, errorp);
+       if (*errorp)
+               return (NULL);
+       freemap = &hmp->blockmap[HAMMER_ZONE_FREEMAP_INDEX];
+       KKASSERT(freemap->phys_offset != 0);
+
+       bytes = (bytes + 15) & ~15;
+       KKASSERT(bytes > 0 && bytes <= HAMMER_XBUFSIZE);
+
+       /*
+        * Dive layer 1.
+        */
+       layer1_offset = freemap->phys_offset +
+                       HAMMER_BLOCKMAP_LAYER1_OFFSET(zone_offset);
+       layer1 = hammer_bread(hmp, layer1_offset, errorp, &buffer1);
+       if (*errorp)
+               goto failed;
+
+       /*
+        * Check CRC.
+        */
+       if (layer1->layer1_crc != crc32(layer1, HAMMER_LAYER1_CRCSIZE)) {
+               hammer_lock_ex(&hmp->blkmap_lock);
+               if (layer1->layer1_crc != crc32(layer1, HAMMER_LAYER1_CRCSIZE))
+                       panic("CRC FAILED: LAYER1");
+               hammer_unlock(&hmp->blkmap_lock);
+       }
+       KKASSERT(layer1->phys_offset != HAMMER_BLOCKMAP_UNAVAIL);
+
+       /*
+        * Dive layer 2, each entry represents a large-block.
+        */
+       layer2_offset = layer1->phys_offset +
+                       HAMMER_BLOCKMAP_LAYER2_OFFSET(zone_offset);
+       layer2 = hammer_bread(hmp, layer2_offset, errorp, &buffer2);
+       if (*errorp)
+               goto failed;
+
+       /*
+        * Check CRC.
+        */
+       if (layer2->entry_crc != crc32(layer2, HAMMER_LAYER2_CRCSIZE)) {
+               hammer_lock_ex(&hmp->blkmap_lock);
+               if (layer2->entry_crc != crc32(layer2, HAMMER_LAYER2_CRCSIZE))
+                       panic("CRC FAILED: LAYER2");
+               hammer_unlock(&hmp->blkmap_lock);
+       }
+
+       /*
+        * Fail if the zone is owned by someone other than us.
+        */
+       if (layer2->zone && layer2->zone != zone)
+               goto failed;
+
+       /*
+        * We need the lock from this point on.  We have to re-check zone
+        * ownership after acquiring the lock and also check for reservations.
+        */
+       hammer_lock_ex(&hmp->blkmap_lock);
+
+       if (layer2->zone && layer2->zone != zone) {
+               hammer_unlock(&hmp->blkmap_lock);
+               goto failed;
+       }
+
+       base_off = (zone_offset &
+                   (~HAMMER_LARGEBLOCK_MASK64 & ~HAMMER_OFF_ZONE_MASK)) |
+                   HAMMER_ZONE_RAW_BUFFER;
+       resv = RB_LOOKUP(hammer_res_rb_tree, &hmp->rb_resv_root, base_off);
+       if (resv) {
+               if (resv->zone != zone) {
+                       hammer_unlock(&hmp->blkmap_lock);
+                       resv = NULL;
+                       goto failed;
+               }
+               /*
+                * Due to possible big block underflow we can't simply
+                * subtract bytes from bytes_free.
+                */
+               if (update_bytes_free(resv, bytes) == 0) {
+                       hammer_unlock(&hmp->blkmap_lock);
+                       resv = NULL;
+                       goto failed;
+               }
+               ++resv->refs;
+               resx = NULL;
+       } else {
+               resx = kmalloc(sizeof(*resv), hmp->m_misc,
+                              M_WAITOK | M_ZERO | M_USE_RESERVE);
+               resx->refs = 1;
+               resx->zone = zone;
+               resx->bytes_free = layer2->bytes_free;
+               /*
+                * Due to possible big block underflow we can't simply
+                * subtract bytes from bytes_free.
+                */
+               if (update_bytes_free(resx, bytes) == 0) {
+                       hammer_unlock(&hmp->blkmap_lock);
+                       kfree(resx, hmp->m_misc);
+                       goto failed;
+               }
+               resx->zone_offset = base_off;
+               resv = RB_INSERT(hammer_res_rb_tree, &hmp->rb_resv_root, resx);
+               KKASSERT(resv == NULL);
+               resv = resx;
+               ++hammer_count_reservations;
+       }
+
+       hammer_unlock(&hmp->blkmap_lock);
+
+failed:
+       if (buffer1)
+               hammer_rel_buffer(buffer1, 0);
+       if (buffer2)
+               hammer_rel_buffer(buffer2, 0);
+       hammer_rel_volume(root_volume, 0);
+
+       return(resv);
+}
+
+static int
+update_bytes_free(hammer_reserve_t resv, int bytes)
+{
+       int32_t temp;
+
+       /*
+        * Big-block underflow check
+        */
+       temp = resv->bytes_free - HAMMER_LARGEBLOCK_SIZE * 2;
+       cpu_ccfence(); /* XXX do we really need it ? */
+       if (temp > resv->bytes_free) {
+               kprintf("BIGBLOCK UNDERFLOW\n");
+               return (0);
+       }
+
+       resv->bytes_free -= bytes;
+       return (1);
+}
+
+/*
  * Dereference a reservation structure.  Upon the final release the
  * underlying big-block is checked and if it is entirely free we delete
  * any related HAMMER buffers to avoid potential conflicts with future
@@ -652,6 +821,8 @@ hammer_blockmap_reserve_complete(hammer_mount_t hmp, hammer_reserve_t resv)
                resv->append_off = HAMMER_LARGEBLOCK_SIZE;
                base_offset = resv->zone_offset & ~HAMMER_OFF_ZONE_MASK;
                base_offset = HAMMER_ZONE_ENCODE(resv->zone, base_offset);
+               if (!TAILQ_EMPTY(&hmp->dedup_lru_list))
+                       hammer_dedup_cache_inval(hmp, base_offset);
                error = hammer_del_buffers(hmp, base_offset,
                                           resv->zone_offset,
                                           HAMMER_LARGEBLOCK_SIZE,
@@ -1108,8 +1279,10 @@ hammer_blockmap_finalize(hammer_transaction_t trans,
        KKASSERT(layer2->zone == zone);
        KKASSERT(bytes != 0);
        layer2->bytes_free -= bytes;
-       if (resv)
+
+       if (resv) {
                resv->flags &= ~HAMMER_RESF_LAYER2FREE;
+       }
 
        /*
         * Finalizations can occur out of order, or combined with allocations.
diff --git a/sys/vfs/hammer/hammer_dedup.c b/sys/vfs/hammer/hammer_dedup.c
index 0bb74b5..1bc125f 100644
--- a/sys/vfs/hammer/hammer_dedup.c
+++ b/sys/vfs/hammer/hammer_dedup.c
@@ -189,3 +189,342 @@ validate_zone(hammer_off_t data_offset)
                return (1);
        }
 }
+
+/************************************************************************
+ *                             LIVE DEDUP                              *
+ ************************************************************************
+ *
+ * HAMMER Live Dedup (aka an efficient cp(1) implementation)
+ *
+ * The dedup cache is operated in an LRU fashion and destroyed on
+ * unmount, so essentially this is a live dedup on a cached dataset and
+ * not a full-fledged fs-wide one - we have a batched dedup for that.
+ * We allow duplicate entries in the buffer cache, data blocks are
+ * deduplicated only on their way to media.  By default the cache is
+ * populated on reads only, but it can be populated on writes too.
+ *
+ * The main implementation gotcha is the on-media requirement - in order
+ * for a data block to be added to the dedup cache it has to be present on
+ * disk.  This simplifies cache logic a lot - once data is laid out on
+ * media it remains valid on media all the way up to the point where the
+ * related big block the data was stored in is freed - so there is only
+ * one place we need to bother with invalidation code.
+ */
+
+/*
+ * RB-Tree support for dedup cache structures
+ */
+RB_GENERATE2(hammer_dedup_crc_rb_tree, hammer_dedup_cache, crc_entry,
+               hammer_dedup_crc_rb_compare, hammer_crc_t, crc);
+RB_GENERATE2(hammer_dedup_off_rb_tree, hammer_dedup_cache, off_entry,
+               hammer_dedup_off_rb_compare, hammer_off_t, data_offset);
+
+struct hammer_dedup_inval {
+       struct hammer_mount *hmp;
+       hammer_off_t base_offset;
+};
+
+static int hammer_dedup_scancmp(hammer_dedup_cache_t dc, void *data);
+static int hammer_dedup_cache_inval_callback(hammer_dedup_cache_t dc,
+               void *data);
+static __inline int _vnode_validate(hammer_dedup_cache_t dcp, void *data,
+               int *errorp);
+static __inline int _dev_validate(hammer_dedup_cache_t dcp, void *data,
+               int *errorp);
+
+int
+hammer_dedup_crc_rb_compare(hammer_dedup_cache_t dc1,
+                               hammer_dedup_cache_t dc2)
+{
+       if (dc1->crc < dc2->crc)
+               return (-1);
+       if (dc1->crc > dc2->crc)
+               return (1);
+
+       return (0);
+}
+
+int
+hammer_dedup_off_rb_compare(hammer_dedup_cache_t dc1,
+                               hammer_dedup_cache_t dc2)
+{
+       if (dc1->data_offset < dc2->data_offset)
+               return (-1);
+       if (dc1->data_offset > dc2->data_offset)
+               return (1);
+
+       return (0);
+}
+
+static int
+hammer_dedup_scancmp(hammer_dedup_cache_t dc, void *data)
+{
+       hammer_off_t off = ((struct hammer_dedup_inval *)data)->base_offset;
+
+       if (dc->data_offset < off)
+               return (-1);
+       if (dc->data_offset >= off + HAMMER_LARGEBLOCK_SIZE)
+               return (1);
+
+       return (0);
+}
+
+static int
+hammer_dedup_cache_inval_callback(hammer_dedup_cache_t dc, void *data)
+{
+       hammer_mount_t hmp = ((struct hammer_dedup_inval *)data)->hmp;
+
+       RB_REMOVE(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dc);
+       RB_REMOVE(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dc);
+       TAILQ_REMOVE(&hmp->dedup_lru_list, dc, lru_entry);
+
+       --hmp->dedup_cache_count;
+       kfree(dc, hmp->m_misc);
+
+       return (0);
+}
+
+hammer_dedup_cache_t
+hammer_dedup_cache_add(hammer_inode_t ip, hammer_btree_leaf_elm_t leaf)
+{
+       hammer_dedup_cache_t dcp, tmp;
+       hammer_mount_t hmp = ip->hmp;
+
+       if (hmp->dedup_free_cache == NULL) {
+               tmp = kmalloc(sizeof(*tmp), hmp->m_misc, M_WAITOK | M_ZERO);
+               if (hmp->dedup_free_cache == NULL)
+                       hmp->dedup_free_cache = tmp;
+               else
+                       kfree(tmp, hmp->m_misc);
+       }
+
+       KKASSERT(leaf != NULL);
+
+       dcp = RB_LOOKUP(hammer_dedup_crc_rb_tree,
+                               &hmp->rb_dedup_crc_root, leaf->data_crc);
+       if (dcp != NULL) {
+               RB_REMOVE(hammer_dedup_off_rb_tree,
+                               &hmp->rb_dedup_off_root, dcp);
+               TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry);
+               goto populate;
+       }
+
+       if (hmp->dedup_cache_count < DEDUP_CACHE_SIZE) {
+               dcp = hmp->dedup_free_cache;
+               hmp->dedup_free_cache = NULL;
+               ++hmp->dedup_cache_count;
+       } else {
+               dcp = TAILQ_FIRST(&hmp->dedup_lru_list);
+               RB_REMOVE(hammer_dedup_crc_rb_tree,
+                               &hmp->rb_dedup_crc_root, dcp);
+               RB_REMOVE(hammer_dedup_off_rb_tree,
+                               &hmp->rb_dedup_off_root, dcp);
+               TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry);
+       }
+
+       dcp->crc = leaf->data_crc;
+       tmp = RB_INSERT(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dcp);
+       KKASSERT(tmp == NULL);
+
+populate:
+       dcp->hmp = ip->hmp;
+       dcp->obj_id = ip->obj_id;
+       dcp->localization = ip->obj_localization;
+       dcp->file_offset = leaf->base.key - leaf->data_len;
+       dcp->bytes = leaf->data_len;
+       dcp->data_offset = leaf->data_offset;
+
+       tmp = RB_INSERT(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dcp);
+       KKASSERT(tmp == NULL);
+       TAILQ_INSERT_TAIL(&hmp->dedup_lru_list, dcp, lru_entry);
+
+       return (dcp);
+}
+
+__inline hammer_dedup_cache_t
+hammer_dedup_cache_lookup(hammer_mount_t hmp, hammer_crc_t crc)
+{
+       hammer_dedup_cache_t dcp;
+
+       dcp = RB_LOOKUP(hammer_dedup_crc_rb_tree,
+                               &hmp->rb_dedup_crc_root, crc);
+       return dcp;
+}
+
+void hammer_dedup_cache_inval(hammer_mount_t hmp, hammer_off_t base_offset)
+{
+       struct hammer_dedup_inval di;
+
+       di.hmp = hmp;
+       di.base_offset = base_offset;
+
+       RB_SCAN(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root,
+               hammer_dedup_scancmp, hammer_dedup_cache_inval_callback, &di);
+}
+
+void
+hammer_destroy_dedup_cache(hammer_mount_t hmp)
+{
+       hammer_dedup_cache_t dcp;
+
+       while ((dcp = TAILQ_FIRST(&hmp->dedup_lru_list)) != NULL) {
+               RB_REMOVE(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dcp);
+               RB_REMOVE(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dcp);
+               TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry);
+               --hmp->dedup_cache_count;
+               kfree(dcp, hmp->m_misc);
+       }
+
+       KKASSERT(RB_EMPTY(&hmp->rb_dedup_crc_root));
+       KKASSERT(RB_EMPTY(&hmp->rb_dedup_off_root));
+       KKASSERT(TAILQ_EMPTY(&hmp->dedup_lru_list));
+
+       KKASSERT(hmp->dedup_cache_count == 0);
+}
+
+int
+hammer_dedup_validate(hammer_dedup_cache_t dcp, int zone, int bytes,
+                     void *data)
+{
+       int error;
+
+       /*
+        * Zone validation
+        */
+       if (HAMMER_ZONE_DECODE(dcp->data_offset) != zone)
+               return (0);
+
+       /*
+        * Block length validation
+        */
+       if (dcp->bytes != bytes)
+               return (0);
+
+       /*
+        * Byte-by-byte data comparison
+        *
+        * The data we need for validation may already be present in the
+        * buffer cache in two flavours: vnode-based buffer or
+        * block-device-based buffer.  If the vnode-based buffer isn't
+        * there, or if a non-blocking attempt to acquire it fails, fall
+        * back to the device-based buffer (for large-zone data blocks
+        * this will generate a separate read).
+        */
+       if (_vnode_validate(dcp, data, &error)) {
+               hammer_live_dedup_vnode_bcmps++;
+               return (1);
+       } else {
+               if (error == 3)
+                       hammer_live_dedup_findblk_failures++;
+       }
+
+       if (error) { /* yes, if there was an error previously */
+               if (_dev_validate(dcp, data, &error)) {
+                       hammer_live_dedup_device_bcmps++;
+                       return (1);
+               }
+       }
+
+       return (0);
+}
+
+static __inline int
+_vnode_validate(hammer_dedup_cache_t dcp, void *data, int *errorp)
+{
+       struct hammer_transaction trans;
+       hammer_inode_t ip;
+       struct vnode *vp;
+       struct buf *bp;
+       off_t dooffset;
+       int result, error;
+
+       result = error = 0;
+       *errorp = 0;
+
+       hammer_simple_transaction(&trans, dcp->hmp);
+
+       ip = hammer_get_inode(&trans, NULL, dcp->obj_id, HAMMER_MAX_TID,
+           dcp->localization, 0, &error);
+       if (ip == NULL) {
+               kprintf("dedup: unable to find objid %016jx:%08x\n",
+                   (intmax_t)dcp->obj_id, dcp->localization);
+               *errorp = 1;
+               goto failed2;
+       }
+
+       error = hammer_get_vnode(ip, &vp);
+       if (error) {
+               kprintf("dedup: unable to acquire vnode for %016jx:%08x\n",
+                   (intmax_t)dcp->obj_id, dcp->localization);
+               *errorp = 2;
+               goto failed;
+       }
+
+       if ((bp = findblk(ip->vp, dcp->file_offset, FINDBLK_NBLOCK)) != NULL) {
+               bremfree(bp);
+
+               /* XXX if (mapped to userspace) goto done, *errorp = 4 */
+
+               if ((bp->b_flags & B_CACHE) == 0 || bp->b_flags & B_DIRTY) {
+                       *errorp = 5;
+                       goto done;
+               }
+
+               if (bp->b_bio2.bio_offset != dcp->data_offset) {
+                       error = VOP_BMAP(ip->vp, dcp->file_offset, &dooffset,
+                           NULL, NULL, BUF_CMD_READ);
+                       if (error) {
+                               *errorp = 6;
+                               goto done;
+                       }
+
+                       if (dooffset != dcp->data_offset) {
+                               *errorp = 7;
+                               goto done;
+                       }
+                       hammer_live_dedup_bmap_saves++;
+               }
+
+               if (bcmp(data, bp->b_data, dcp->bytes) == 0)
+                       result = 1;
+
+done:
+               bqrelse(bp);
+       } else {
+               *errorp = 3;
+       }
+       vput(vp);
+
+failed:
+       hammer_rel_inode(ip, 0);
+failed2:
+       hammer_done_transaction(&trans);
+       return (result);
+}
+
+static __inline int
+_dev_validate(hammer_dedup_cache_t dcp, void *data, int *errorp)
+{
+       hammer_buffer_t buffer = NULL;
+       void *ondisk_data;
+       int result, error;
+
+       result = error = 0;
+       *errorp = 0;
+
+       ondisk_data = hammer_bread_ext(dcp->hmp, dcp->data_offset,
+           dcp->bytes, &error, &buffer);
+       if (error) {
+               *errorp = 1;
+               goto failed;
+       }
+
+       if (bcmp(data, ondisk_data, dcp->bytes) == 0)
+               result = 1;
+
+failed:
+       if (buffer)
+               hammer_rel_buffer(buffer, 0);
+
+       return (result);
+}
diff --git a/sys/vfs/hammer/hammer_object.c b/sys/vfs/hammer/hammer_object.c
index 368acf0..8786cc8 100644
--- a/sys/vfs/hammer/hammer_object.c
+++ b/sys/vfs/hammer/hammer_object.c
@@ -950,6 +950,8 @@ hammer_ip_add_bulk(hammer_inode_t ip, off_t file_offset, void *data, int bytes,
                   int *errorp)
 {
        hammer_record_t record;
+       hammer_dedup_cache_t dcp;
+       hammer_crc_t crc;
        int zone;
 
        /*
@@ -966,14 +968,39 @@ hammer_ip_add_bulk(hammer_inode_t ip, off_t file_offset, void *data, int bytes,
        record = hammer_alloc_mem_record(ip, 0);
        zone = (bytes >= HAMMER_BUFSIZE) ? HAMMER_ZONE_LARGE_DATA_INDEX :
                                           HAMMER_ZONE_SMALL_DATA_INDEX;
-       record->resv = hammer_blockmap_reserve(ip->hmp, zone, bytes,
-                                              &record->leaf.data_offset,
-                                              errorp);
-       if (record->resv == NULL) {
-               kprintf("hammer_ip_add_bulk: reservation failed\n");
-               hammer_rel_mem_record(record);
-               return(NULL);
+       if (bytes == 0)
+               crc = 0;
+       else
+               crc = crc32(data, bytes);
+
+       if (hammer_live_dedup == 0)
+               goto nodedup;
+       if ((dcp = hammer_dedup_cache_lookup(ip->hmp, crc)) != NULL) {
+               struct hammer_dedup_cache tmp = *dcp;
+
+               record->resv = hammer_blockmap_reserve_dedup(ip->hmp, zone,
+                       bytes, tmp.data_offset, errorp);
+               if (record->resv == NULL)
+                       goto nodedup;
+
+               if (!hammer_dedup_validate(&tmp, zone, bytes, data)) {
+                       hammer_blockmap_reserve_complete(ip->hmp, record->resv);
+                       goto nodedup;
+               }
+
+               record->leaf.data_offset = tmp.data_offset;
+               record->flags |= HAMMER_RECF_DEDUPED;
+       } else {
+nodedup:
+               record->resv = hammer_blockmap_reserve(ip->hmp, zone, bytes,
+                      &record->leaf.data_offset, errorp);
+               if (record->resv == NULL) {
+                       kprintf("hammer_ip_add_bulk: reservation failed\n");
+                       hammer_rel_mem_record(record);
+                       return(NULL);
+               }
        }
+
        record->type = HAMMER_MEM_RECORD_DATA;
        record->leaf.base.rec_type = HAMMER_RECTYPE_DATA;
        record->leaf.base.obj_type = ip->ino_leaf.base.obj_type;
@@ -982,7 +1009,7 @@ hammer_ip_add_bulk(hammer_inode_t ip, off_t file_offset, void *data, int bytes,
        record->leaf.base.localization = ip->obj_localization +
                                         HAMMER_LOCALIZE_MISC;
        record->leaf.data_len = bytes;
-       hammer_crc_set_leaf(data, &record->leaf);
+       record->leaf.data_crc = crc;
        KKASSERT(*errorp == 0);
 
        return(record);
@@ -1262,6 +1289,11 @@ hammer_ip_sync_record_cursor(hammer_cursor_t cursor, hammer_record_t record)
                                                 record->resv,
                                                 record->leaf.data_offset,
                                                 record->leaf.data_len);
+
+               if (hammer_live_dedup == 2 &&
+                   (record->flags & HAMMER_RECF_DEDUPED) == 0) {
+                       hammer_dedup_cache_add(record->ip, &record->leaf);
+               }
        } else if (record->data && record->leaf.data_len) {
                /*
                 * Wholely cached record, with data.  Allocate the data.
diff --git a/sys/vfs/hammer/hammer_vfsops.c b/sys/vfs/hammer/hammer_vfsops.c
index c9b01fc..5ebb1dd 100644
--- a/sys/vfs/hammer/hammer_vfsops.c
+++ b/sys/vfs/hammer/hammer_vfsops.c
@@ -63,6 +63,7 @@ int hammer_debug_recover;             /* -1 will disable, +1 will force */
 int hammer_debug_recover_faults;
 int hammer_debug_critical;             /* non-zero enter debugger on error */
 int hammer_cluster_enable = 1;         /* enable read clustering by default */
+int hammer_live_dedup = 0;
 int hammer_count_fsyncs;
 int hammer_count_inodes;
 int hammer_count_iqueued;
@@ -116,7 +117,18 @@ int hammer_fsync_mode = 3;
 int64_t hammer_contention_count;
 int64_t hammer_zone_limit;
 
+/*
+ * Live dedup debug counters (sysctls are writable so that counters
+ * can be reset from userspace).
+ */
+int64_t hammer_live_dedup_vnode_bcmps = 0;
+int64_t hammer_live_dedup_device_bcmps = 0;
+int64_t hammer_live_dedup_findblk_failures = 0;
+int64_t hammer_live_dedup_bmap_saves = 0;
+
+
 SYSCTL_NODE(_vfs, OID_AUTO, hammer, CTLFLAG_RW, 0, "HAMMER filesystem");
+
 SYSCTL_INT(_vfs_hammer, OID_AUTO, supported_version, CTLFLAG_RD,
           &hammer_supported_version, 0, "");
 SYSCTL_INT(_vfs_hammer, OID_AUTO, debug_general, CTLFLAG_RW,
@@ -141,6 +153,13 @@ SYSCTL_INT(_vfs_hammer, OID_AUTO, debug_critical, CTLFLAG_RW,
           &hammer_debug_critical, 0, "");
 SYSCTL_INT(_vfs_hammer, OID_AUTO, cluster_enable, CTLFLAG_RW,
           &hammer_cluster_enable, 0, "");
+/*
+ * 0 - live dedup is disabled
+ * 1 - dedup cache is populated on reads only
+ * 2 - dedup cache is populated on both reads and writes
+ */
+SYSCTL_INT(_vfs_hammer, OID_AUTO, live_dedup, CTLFLAG_RW,
+          &hammer_live_dedup, 0, "");
 
 SYSCTL_INT(_vfs_hammer, OID_AUTO, limit_dirtybufspace, CTLFLAG_RW,
           &hammer_limit_dirtybufspace, 0, "");
@@ -216,6 +235,15 @@ SYSCTL_QUAD(_vfs_hammer, OID_AUTO, stats_undo, CTLFLAG_RD,
 SYSCTL_QUAD(_vfs_hammer, OID_AUTO, stats_redo, CTLFLAG_RD,
           &hammer_stats_redo, 0, "");
 
+SYSCTL_QUAD(_vfs_hammer, OID_AUTO, live_dedup_vnode_bcmps, CTLFLAG_RW,
+           &hammer_live_dedup_vnode_bcmps, 0, "");
+SYSCTL_QUAD(_vfs_hammer, OID_AUTO, live_dedup_device_bcmps, CTLFLAG_RW,
+           &hammer_live_dedup_device_bcmps, 0, "");
+SYSCTL_QUAD(_vfs_hammer, OID_AUTO, live_dedup_findblk_failures, CTLFLAG_RW,
+           &hammer_live_dedup_findblk_failures, 0, "");
+SYSCTL_QUAD(_vfs_hammer, OID_AUTO, live_dedup_bmap_saves, CTLFLAG_RW,
+           &hammer_live_dedup_bmap_saves, 0, "");
+
 SYSCTL_INT(_vfs_hammer, OID_AUTO, count_dirtybufspace, CTLFLAG_RD,
           &hammer_count_dirtybufspace, 0, "");
 SYSCTL_INT(_vfs_hammer, OID_AUTO, count_refedbufs, CTLFLAG_RD,
@@ -445,6 +473,10 @@ hammer_vfs_mount(struct mount *mp, char *mntpt, caddr_t data,
                TAILQ_INIT(&hmp->objid_cache_list);
                TAILQ_INIT(&hmp->undo_lru_list);
                TAILQ_INIT(&hmp->reclaim_list);
+
+               RB_INIT(&hmp->rb_dedup_crc_root);
+               RB_INIT(&hmp->rb_dedup_off_root);       
+               TAILQ_INIT(&hmp->dedup_lru_list);
        }
        hmp->hflags &= ~HMNT_USERFLAGS;
        hmp->hflags |= info.hflags & HMNT_USERFLAGS;
@@ -872,6 +904,11 @@ hammer_free_hmp(struct mount *mp)
        mp->mnt_flag &= ~MNT_LOCAL;
        hmp->mp = NULL;
        hammer_destroy_objid_cache(hmp);
+       hammer_destroy_dedup_cache(hmp);
+       if (hmp->dedup_free_cache != NULL) {
+               kfree(hmp->dedup_free_cache, hmp->m_misc);
+               hmp->dedup_free_cache = NULL;
+       }
        kmalloc_destroy(&hmp->m_misc);
        kmalloc_destroy(&hmp->m_inodes);
        lwkt_reltoken(&hmp->fs_token);
diff --git a/sys/vfs/hammer/hammer_vnops.c b/sys/vfs/hammer/hammer_vnops.c
index 4d8cdae..8ddb6a2 100644
--- a/sys/vfs/hammer/hammer_vnops.c
+++ b/sys/vfs/hammer/hammer_vnops.c
@@ -2622,6 +2622,9 @@ hammer_vop_strategy(struct vop_strategy_args *ap)
                biodone(ap->a_bio);
                break;
        }
+
+       /* hammer_dump_dedup_cache(((hammer_inode_t)ap->a_vp->v_data)->hmp); */
+
        return (error);
 }
 
@@ -2818,6 +2821,8 @@ hammer_vop_strategy_read(struct vop_strategy_args *ap)
                                 HAMMER_ZONE_LARGE_DATA);
                        nbio->bio_offset = disk_offset;
                        error = hammer_io_direct_read(hmp, nbio, cursor.leaf);
+                       if (hammer_live_dedup)
+                               hammer_dedup_cache_add(ip, cursor.leaf);
                        goto done;
                } else if (n) {
                        error = hammer_ip_resolve_data(&cursor);
@@ -2830,6 +2835,13 @@ hammer_vop_strategy_read(struct vop_strategy_args *ap)
                        break;
 
                /*
+                * We have to be sure that the only elements added to the
+                * dedup cache are those which are already on-media.
+                */
+               if (hammer_live_dedup && hammer_cursor_ondisk(&cursor))
+                       hammer_dedup_cache_add(ip, cursor.leaf);
+
+               /*
                 * Iterate until we have filled the request.
                 */
                boff += n;
@@ -3043,7 +3055,11 @@ hammer_vop_bmap(struct vop_bmap_args *ap)
                        }
                        last_offset = rec_offset + rec_len;
                        last_disk_offset = disk_offset + rec_len;
+
+                       if (hammer_live_dedup)
+                               hammer_dedup_cache_add(ip, cursor.leaf);
                }
+               
                error = hammer_ip_next(&cursor);
        }
 
@@ -3216,7 +3232,13 @@ hammer_vop_strategy_write(struct vop_strategy_args *ap)
                        record->flags |= HAMMER_RECF_REDO;
                        bp->b_flags &= ~B_VFSFLAG1;
                }
-               hammer_io_direct_write(hmp, bio, record);
+               if (record->flags & HAMMER_RECF_DEDUPED) {
+                       bp->b_resid = 0;
+                       hammer_ip_replace_bulk(hmp, record);
+                       biodone(ap->a_bio);
+               } else {
+                       hammer_io_direct_write(hmp, bio, record);
+               }
                if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs)
                        hammer_flush_inode(ip, 0);
        } else {