TAILQ_HEAD(hammer_node_cache_list, hammer_node_cache);
/*
+ * Live dedup cache
+ */
+struct hammer_dedup_crc_rb_tree;
+RB_HEAD(hammer_dedup_crc_rb_tree, hammer_dedup_cache);
+RB_PROTOTYPE2(hammer_dedup_crc_rb_tree, hammer_dedup_cache, crc_entry,
+ hammer_dedup_crc_rb_compare, hammer_crc_t);
+
+struct hammer_dedup_off_rb_tree;
+RB_HEAD(hammer_dedup_off_rb_tree, hammer_dedup_cache);
+RB_PROTOTYPE2(hammer_dedup_off_rb_tree, hammer_dedup_cache, off_entry,
+ hammer_dedup_off_rb_compare, hammer_off_t);
+
+#define DEDUP_CACHE_SIZE 4096 /* XXX make it a dynamic tunable */
+
+typedef struct hammer_dedup_cache {
+ RB_ENTRY(hammer_dedup_cache) crc_entry;
+ RB_ENTRY(hammer_dedup_cache) off_entry;
+ TAILQ_ENTRY(hammer_dedup_cache) lru_entry;
+ struct hammer_mount *hmp;
+ int64_t obj_id;
+ u_int32_t localization;
+ off_t file_offset;
+ int bytes;
+ hammer_off_t data_offset;
+ hammer_crc_t crc;
+} *hammer_dedup_cache_t;
+
+/*
* Structure used to organize flush groups. Flush groups must be
* organized into chunks in order to avoid blowing out the UNDO FIFO.
* Without this a 'sync' could end up flushing 50,000 inodes in a single
#define HAMMER_RECF_COMMITTED 0x0010 /* committed to the B-Tree */
#define HAMMER_RECF_INTERLOCK_BE 0x0020 /* backend interlock */
#define HAMMER_RECF_WANTED 0x0040 /* wanted by the frontend */
+#define HAMMER_RECF_DEDUPED 0x0080 /* will be live-dedup'ed */
#define HAMMER_RECF_CONVERT_DELETE 0x0100 /* special case */
#define HAMMER_RECF_REDO 0x1000 /* REDO was laid down */
int refs;
int zone;
int append_off;
+ int32_t bytes_free;
hammer_off_t zone_offset;
};
struct hammer_res_rb_tree rb_resv_root;
struct hammer_buf_rb_tree rb_bufs_root;
struct hammer_pfs_rb_tree rb_pfsm_root;
+
+ struct hammer_dedup_crc_rb_tree rb_dedup_crc_root;
+ struct hammer_dedup_off_rb_tree rb_dedup_off_root;
+
struct hammer_volume *rootvol;
struct hammer_base_elm root_btree_beg;
struct hammer_base_elm root_btree_end;
int io_running_space; /* io_token */
int io_running_wakeup; /* io_token */
int objid_cache_count;
+ int dedup_cache_count;
int error; /* critical I/O error */
struct krate krate; /* rate limited kprintf */
hammer_tid_t asof; /* snapshot mount */
struct hammer_flush_group_list flush_group_list;
hammer_flush_group_t next_flush_group;
TAILQ_HEAD(, hammer_objid_cache) objid_cache_list;
+ TAILQ_HEAD(, hammer_dedup_cache) dedup_lru_list;
+ hammer_dedup_cache_t dedup_free_cache;
TAILQ_HEAD(, hammer_reclaim) reclaim_list;
TAILQ_HEAD(, hammer_io) iorun_list;
extern int hammer_debug_recover_faults;
extern int hammer_debug_critical;
extern int hammer_cluster_enable;
+extern int hammer_live_dedup;
extern int hammer_count_fsyncs;
extern int hammer_count_inodes;
extern int hammer_count_iqueued;
extern int hammer_autoflush;
extern int64_t hammer_contention_count;
+extern int64_t hammer_live_dedup_vnode_bcmps;
+extern int64_t hammer_live_dedup_device_bcmps;
+extern int64_t hammer_live_dedup_findblk_failures;
+extern int64_t hammer_live_dedup_bmap_saves;
+
void hammer_critical_error(hammer_mount_t hmp, hammer_inode_t ip,
int error, const char *msg);
int hammer_vop_inactive(struct vop_inactive_args *);
void hammer_clear_objid(hammer_inode_t dip);
void hammer_destroy_objid_cache(hammer_mount_t hmp);
+int hammer_dedup_crc_rb_compare(hammer_dedup_cache_t dc1,
+ hammer_dedup_cache_t dc2);
+int hammer_dedup_off_rb_compare(hammer_dedup_cache_t dc1,
+ hammer_dedup_cache_t dc2);
+hammer_dedup_cache_t hammer_dedup_cache_add(hammer_inode_t ip,
+ hammer_btree_leaf_elm_t leaf);
+hammer_dedup_cache_t hammer_dedup_cache_lookup(hammer_mount_t hmp,
+ hammer_crc_t crc);
+void hammer_dedup_cache_inval(hammer_mount_t hmp, hammer_off_t base_offset);
+void hammer_destroy_dedup_cache(hammer_mount_t hmp);
+void hammer_dump_dedup_cache(hammer_mount_t hmp);
+int hammer_dedup_validate(hammer_dedup_cache_t dcp, int zone, int bytes,
+ void *data);
+
int hammer_enter_undo_history(hammer_mount_t hmp, hammer_off_t offset,
int bytes);
void hammer_clear_undo_history(hammer_mount_t hmp);
int bytes, hammer_off_t hint, int *errorp);
hammer_reserve_t hammer_blockmap_reserve(hammer_mount_t hmp, int zone,
int bytes, hammer_off_t *zone_offp, int *errorp);
+hammer_reserve_t hammer_blockmap_reserve_dedup(hammer_mount_t hmp, int zone,
+ int bytes, hammer_off_t zone_offset, int *errorp);
void hammer_blockmap_reserve_complete(hammer_mount_t hmp,
hammer_reserve_t resv);
void hammer_reserve_clrdelay(hammer_mount_t hmp, hammer_reserve_t resv);
hammer_off_t base_offset, int zone,
struct hammer_blockmap_layer2 *layer2);
static void hammer_reserve_setdelay(hammer_mount_t hmp, hammer_reserve_t resv);
+static int update_bytes_free(hammer_reserve_t resv, int bytes);
/*
* Reserved big-blocks red-black tree support
}
/*
+ * Frontend function - Dedup bytes in a zone.
+ *
+ * Dedup reservations work exactly the same as normal write reservations
+ * except we only adjust bytes_free field and don't touch append offset.
+ * Finalization mechanic for dedup reservations is also the same as for
+ * normal write ones - the backend finalizes the reservation with
+ * hammer_blockmap_finalize().
+ */
+hammer_reserve_t
+hammer_blockmap_reserve_dedup(hammer_mount_t hmp, int zone, int bytes,
+ hammer_off_t zone_offset, int *errorp)
+{
+ hammer_volume_t root_volume;
+ hammer_blockmap_t freemap;
+ struct hammer_blockmap_layer1 *layer1;
+ struct hammer_blockmap_layer2 *layer2;
+ hammer_buffer_t buffer1 = NULL;
+ hammer_buffer_t buffer2 = NULL;
+ hammer_off_t layer1_offset;
+ hammer_off_t layer2_offset;
+ hammer_off_t base_off;
+ hammer_reserve_t resv = NULL;
+ hammer_reserve_t resx = NULL;
+
+ /*
+ * Setup
+ */
+ KKASSERT(zone >= HAMMER_ZONE_BTREE_INDEX && zone < HAMMER_MAX_ZONES);
+ root_volume = hammer_get_root_volume(hmp, errorp);
+ if (*errorp)
+ return (NULL);
+ freemap = &hmp->blockmap[HAMMER_ZONE_FREEMAP_INDEX];
+ KKASSERT(freemap->phys_offset != 0);
+
+ bytes = (bytes + 15) & ~15;
+ KKASSERT(bytes > 0 && bytes <= HAMMER_XBUFSIZE);
+
+ /*
+ * Dive layer 1.
+ */
+ layer1_offset = freemap->phys_offset +
+ HAMMER_BLOCKMAP_LAYER1_OFFSET(zone_offset);
+ layer1 = hammer_bread(hmp, layer1_offset, errorp, &buffer1);
+ if (*errorp)
+ goto failed;
+
+ /*
+ * Check CRC.
+ */
+ if (layer1->layer1_crc != crc32(layer1, HAMMER_LAYER1_CRCSIZE)) {
+ hammer_lock_ex(&hmp->blkmap_lock);
+ if (layer1->layer1_crc != crc32(layer1, HAMMER_LAYER1_CRCSIZE))
+ panic("CRC FAILED: LAYER1");
+ hammer_unlock(&hmp->blkmap_lock);
+ }
+ KKASSERT(layer1->phys_offset != HAMMER_BLOCKMAP_UNAVAIL);
+
+ /*
+ * Dive layer 2, each entry represents a large-block.
+ */
+ layer2_offset = layer1->phys_offset +
+ HAMMER_BLOCKMAP_LAYER2_OFFSET(zone_offset);
+ layer2 = hammer_bread(hmp, layer2_offset, errorp, &buffer2);
+ if (*errorp)
+ goto failed;
+
+ /*
+ * Check CRC.
+ */
+ if (layer2->entry_crc != crc32(layer2, HAMMER_LAYER2_CRCSIZE)) {
+ hammer_lock_ex(&hmp->blkmap_lock);
+ if (layer2->entry_crc != crc32(layer2, HAMMER_LAYER2_CRCSIZE))
+ panic("CRC FAILED: LAYER2");
+ hammer_unlock(&hmp->blkmap_lock);
+ }
+
+ /*
+ * Fail if the zone is owned by someone other than us.
+ */
+ if (layer2->zone && layer2->zone != zone)
+ goto failed;
+
+ /*
+ * We need the lock from this point on. We have to re-check zone
+ * ownership after acquiring the lock and also check for reservations.
+ */
+ hammer_lock_ex(&hmp->blkmap_lock);
+
+ if (layer2->zone && layer2->zone != zone) {
+ hammer_unlock(&hmp->blkmap_lock);
+ goto failed;
+ }
+
+ base_off = (zone_offset &
+ (~HAMMER_LARGEBLOCK_MASK64 & ~HAMMER_OFF_ZONE_MASK)) |
+ HAMMER_ZONE_RAW_BUFFER;
+ resv = RB_LOOKUP(hammer_res_rb_tree, &hmp->rb_resv_root, base_off);
+ if (resv) {
+ if (resv->zone != zone) {
+ hammer_unlock(&hmp->blkmap_lock);
+ resv = NULL;
+ goto failed;
+ }
+ /*
+ * Due to possible big block underflow we can't simply
+ * subtract bytes from bytes_free.
+ */
+ if (update_bytes_free(resv, bytes) == 0) {
+ hammer_unlock(&hmp->blkmap_lock);
+ resv = NULL;
+ goto failed;
+ }
+ ++resv->refs;
+ resx = NULL;
+ } else {
+ resx = kmalloc(sizeof(*resv), hmp->m_misc,
+ M_WAITOK | M_ZERO | M_USE_RESERVE);
+ resx->refs = 1;
+ resx->zone = zone;
+ resx->bytes_free = layer2->bytes_free;
+ /*
+ * Due to possible big block underflow we can't simply
+ * subtract bytes from bytes_free.
+ */
+ if (update_bytes_free(resx, bytes) == 0) {
+ hammer_unlock(&hmp->blkmap_lock);
+ kfree(resx, hmp->m_misc);
+ goto failed;
+ }
+ resx->zone_offset = base_off;
+ resv = RB_INSERT(hammer_res_rb_tree, &hmp->rb_resv_root, resx);
+ KKASSERT(resv == NULL);
+ resv = resx;
+ ++hammer_count_reservations;
+ }
+
+ hammer_unlock(&hmp->blkmap_lock);
+
+failed:
+ if (buffer1)
+ hammer_rel_buffer(buffer1, 0);
+ if (buffer2)
+ hammer_rel_buffer(buffer2, 0);
+ hammer_rel_volume(root_volume, 0);
+
+ return(resv);
+}
+
+static int
+update_bytes_free(hammer_reserve_t resv, int bytes)
+{
+ int32_t temp;
+
+ /*
+ * Big-block underflow check
+ */
+ temp = resv->bytes_free - HAMMER_LARGEBLOCK_SIZE * 2;
+ cpu_ccfence(); /* XXX do we really need it ? */
+ if (temp > resv->bytes_free) {
+ kprintf("BIGBLOCK UNDERFLOW\n");
+ return (0);
+ }
+
+ resv->bytes_free -= bytes;
+ return (1);
+}
+
+/*
* Dereference a reservation structure. Upon the final release the
* underlying big-block is checked and if it is entirely free we delete
* any related HAMMER buffers to avoid potential conflicts with future
resv->append_off = HAMMER_LARGEBLOCK_SIZE;
base_offset = resv->zone_offset & ~HAMMER_OFF_ZONE_MASK;
base_offset = HAMMER_ZONE_ENCODE(resv->zone, base_offset);
+ if (!TAILQ_EMPTY(&hmp->dedup_lru_list))
+ hammer_dedup_cache_inval(hmp, base_offset);
error = hammer_del_buffers(hmp, base_offset,
resv->zone_offset,
HAMMER_LARGEBLOCK_SIZE,
KKASSERT(layer2->zone == zone);
KKASSERT(bytes != 0);
layer2->bytes_free -= bytes;
- if (resv)
+
+ if (resv) {
resv->flags &= ~HAMMER_RESF_LAYER2FREE;
+ }
/*
* Finalizations can occur out of order, or combined with allocations.
return (1);
}
}
+
+/************************************************************************
+ * LIVE DEDUP *
+ ************************************************************************
+ *
+ * HAMMER Live Dedup (aka as efficient cp(1) implementation)
+ *
+ * The dedup cache is operated in a LRU fashion and destroyed on
+ * unmount, so essentially this is a live dedup on a cached dataset and
+ * not a full-fledged fs-wide one - we have a batched dedup for that.
+ * We allow duplicate entries in the buffer cache, data blocks are
+ * deduplicated only on their way to media. By default the cache is
+ * populated on reads only, but it can be populated on writes too.
+ *
+ * The main implementation gotcha is on-media requirement - in order for
+ * a data block to be added to a dedup cache it has to be present on
+ * disk. This simplifies cache logic a lot - once data is laid out on
+ * media it remains valid on media all the way up to the point where the
+ * related big block the data was stored in is freed - so there is only
+ * one place we need to bother with invalidation code.
+ */
+
+/*
+ * RB-Tree support for dedup cache structures
+ */
+RB_GENERATE2(hammer_dedup_crc_rb_tree, hammer_dedup_cache, crc_entry,
+ hammer_dedup_crc_rb_compare, hammer_crc_t, crc);
+RB_GENERATE2(hammer_dedup_off_rb_tree, hammer_dedup_cache, off_entry,
+ hammer_dedup_off_rb_compare, hammer_off_t, data_offset);
+
+struct hammer_dedup_inval {
+ struct hammer_mount *hmp;
+ hammer_off_t base_offset;
+};
+
+static int hammer_dedup_scancmp(hammer_dedup_cache_t dc, void *data);
+static int hammer_dedup_cache_inval_callback(hammer_dedup_cache_t dc,
+ void *data);
+static __inline int _vnode_validate(hammer_dedup_cache_t dcp, void *data,
+ int *errorp);
+static __inline int _dev_validate(hammer_dedup_cache_t dcp, void *data,
+ int *errorp);
+
+int
+hammer_dedup_crc_rb_compare(hammer_dedup_cache_t dc1,
+ hammer_dedup_cache_t dc2)
+{
+ if (dc1->crc < dc2->crc)
+ return (-1);
+ if (dc1->crc > dc2->crc)
+ return (1);
+
+ return (0);
+}
+
+int
+hammer_dedup_off_rb_compare(hammer_dedup_cache_t dc1,
+ hammer_dedup_cache_t dc2)
+{
+ if (dc1->data_offset < dc2->data_offset)
+ return (-1);
+ if (dc1->data_offset > dc2->data_offset)
+ return (1);
+
+ return (0);
+}
+
+static int
+hammer_dedup_scancmp(hammer_dedup_cache_t dc, void *data)
+{
+ hammer_off_t off = ((struct hammer_dedup_inval *)data)->base_offset;
+
+ if (dc->data_offset < off)
+ return (-1);
+ if (dc->data_offset >= off + HAMMER_LARGEBLOCK_SIZE)
+ return (1);
+
+ return (0);
+}
+
+static int
+hammer_dedup_cache_inval_callback(hammer_dedup_cache_t dc, void *data)
+{
+ hammer_mount_t hmp = ((struct hammer_dedup_inval *)data)->hmp;
+
+ RB_REMOVE(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dc);
+ RB_REMOVE(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dc);
+ TAILQ_REMOVE(&hmp->dedup_lru_list, dc, lru_entry);
+
+ --hmp->dedup_cache_count;
+ kfree(dc, hmp->m_misc);
+
+ return (0);
+}
+
+hammer_dedup_cache_t
+hammer_dedup_cache_add(hammer_inode_t ip, hammer_btree_leaf_elm_t leaf)
+{
+ hammer_dedup_cache_t dcp, tmp;
+ hammer_mount_t hmp = ip->hmp;
+
+ if (hmp->dedup_free_cache == NULL) {
+ tmp = kmalloc(sizeof(*tmp), hmp->m_misc, M_WAITOK | M_ZERO);
+ if (hmp->dedup_free_cache == NULL)
+ hmp->dedup_free_cache = tmp;
+ else
+ kfree(tmp, hmp->m_misc);
+ }
+
+ KKASSERT(leaf != NULL);
+
+ dcp = RB_LOOKUP(hammer_dedup_crc_rb_tree,
+ &hmp->rb_dedup_crc_root, leaf->data_crc);
+ if (dcp != NULL) {
+ RB_REMOVE(hammer_dedup_off_rb_tree,
+ &hmp->rb_dedup_off_root, dcp);
+ TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry);
+ goto populate;
+ }
+
+ if (hmp->dedup_cache_count < DEDUP_CACHE_SIZE) {
+ dcp = hmp->dedup_free_cache;
+ hmp->dedup_free_cache = NULL;
+ ++hmp->dedup_cache_count;
+ } else {
+ dcp = TAILQ_FIRST(&hmp->dedup_lru_list);
+ RB_REMOVE(hammer_dedup_crc_rb_tree,
+ &hmp->rb_dedup_crc_root, dcp);
+ RB_REMOVE(hammer_dedup_off_rb_tree,
+ &hmp->rb_dedup_off_root, dcp);
+ TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry);
+ }
+
+ dcp->crc = leaf->data_crc;
+ tmp = RB_INSERT(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dcp);
+ KKASSERT(tmp == NULL);
+
+populate:
+ dcp->hmp = ip->hmp;
+ dcp->obj_id = ip->obj_id;
+ dcp->localization = ip->obj_localization;
+ dcp->file_offset = leaf->base.key - leaf->data_len;
+ dcp->bytes = leaf->data_len;
+ dcp->data_offset = leaf->data_offset;
+
+ tmp = RB_INSERT(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dcp);
+ KKASSERT(tmp == NULL);
+ TAILQ_INSERT_TAIL(&hmp->dedup_lru_list, dcp, lru_entry);
+
+ return (dcp);
+}
+
+__inline hammer_dedup_cache_t
+hammer_dedup_cache_lookup(hammer_mount_t hmp, hammer_crc_t crc)
+{
+ hammer_dedup_cache_t dcp;
+
+ dcp = RB_LOOKUP(hammer_dedup_crc_rb_tree,
+ &hmp->rb_dedup_crc_root, crc);
+ return dcp;
+}
+
+void hammer_dedup_cache_inval(hammer_mount_t hmp, hammer_off_t base_offset)
+{
+ struct hammer_dedup_inval di;
+
+ di.hmp = hmp;
+ di.base_offset = base_offset;
+
+ RB_SCAN(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root,
+ hammer_dedup_scancmp, hammer_dedup_cache_inval_callback, &di);
+}
+
+void
+hammer_destroy_dedup_cache(hammer_mount_t hmp)
+{
+ hammer_dedup_cache_t dcp;
+
+ while ((dcp = TAILQ_FIRST(&hmp->dedup_lru_list)) != NULL) {
+ RB_REMOVE(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dcp);
+ RB_REMOVE(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dcp);
+ TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry);
+ --hmp->dedup_cache_count;
+ kfree(dcp, hmp->m_misc);
+ }
+
+ KKASSERT(RB_EMPTY(&hmp->rb_dedup_crc_root));
+ KKASSERT(RB_EMPTY(&hmp->rb_dedup_off_root));
+ KKASSERT(TAILQ_EMPTY(&hmp->dedup_lru_list));
+
+ KKASSERT(hmp->dedup_cache_count == 0);
+}
+
+int
+hammer_dedup_validate(hammer_dedup_cache_t dcp, int zone, int bytes,
+ void *data)
+{
+ int error;
+
+ /*
+ * Zone validation
+ */
+ if (HAMMER_ZONE_DECODE(dcp->data_offset) != zone)
+ return (0);
+
+ /*
+ * Block length validation
+ */
+ if (dcp->bytes != bytes)
+ return (0);
+
+ /*
+ * Byte-by-byte data comparison
+ *
+ * The data we need for validation may already be present in the
+ * buffer cache in two flavours: vnode-based buffer or
+ * block-device-based buffer. In case vnode-based buffer wasn't
+ * there or if a non-blocking attempt to acquire it failed use
+ * device-based buffer (for large-zone data blocks it will
+ * generate a separate read).
+ */
+ if (_vnode_validate(dcp, data, &error)) {
+ hammer_live_dedup_vnode_bcmps++;
+ return (1);
+ } else {
+ if (error == 3)
+ hammer_live_dedup_findblk_failures++;
+ }
+
+ if (error) { /* yes, if there was an error previously */
+ if (_dev_validate(dcp, data, &error)) {
+ hammer_live_dedup_device_bcmps++;
+ return (1);
+ }
+ }
+
+ return (0);
+}
+
+static __inline int
+_vnode_validate(hammer_dedup_cache_t dcp, void *data, int *errorp)
+{
+ struct hammer_transaction trans;
+ hammer_inode_t ip;
+ struct vnode *vp;
+ struct buf *bp;
+ off_t dooffset;
+ int result, error;
+
+ result = error = 0;
+ *errorp = 0;
+
+ hammer_simple_transaction(&trans, dcp->hmp);
+
+ ip = hammer_get_inode(&trans, NULL, dcp->obj_id, HAMMER_MAX_TID,
+ dcp->localization, 0, &error);
+ if (ip == NULL) {
+ kprintf("dedup: unable to find objid %016jx:%08x\n",
+ (intmax_t)dcp->obj_id, dcp->localization);
+ *errorp = 1;
+ goto failed2;
+ }
+
+ error = hammer_get_vnode(ip, &vp);
+ if (error) {
+ kprintf("dedup: unable to acquire vnode for %016jx:%08x\n",
+ (intmax_t)dcp->obj_id, dcp->localization);
+ *errorp = 2;
+ goto failed;
+ }
+
+ if ((bp = findblk(ip->vp, dcp->file_offset, FINDBLK_NBLOCK)) != NULL) {
+ bremfree(bp);
+
+ /* XXX if (mapped to userspace) goto done, *errorp = 4 */
+
+ if ((bp->b_flags & B_CACHE) == 0 || bp->b_flags & B_DIRTY) {
+ *errorp = 5;
+ goto done;
+ }
+
+ if (bp->b_bio2.bio_offset != dcp->data_offset) {
+ error = VOP_BMAP(ip->vp, dcp->file_offset, &dooffset,
+ NULL, NULL, BUF_CMD_READ);
+ if (error) {
+ *errorp = 6;
+ goto done;
+ }
+
+ if (dooffset != dcp->data_offset) {
+ *errorp = 7;
+ goto done;
+ }
+ hammer_live_dedup_bmap_saves++;
+ }
+
+ if (bcmp(data, bp->b_data, dcp->bytes) == 0)
+ result = 1;
+
+done:
+ bqrelse(bp);
+ } else {
+ *errorp = 3;
+ }
+ vput(vp);
+
+failed:
+ hammer_rel_inode(ip, 0);
+failed2:
+ hammer_done_transaction(&trans);
+ return (result);
+}
+
+static __inline int
+_dev_validate(hammer_dedup_cache_t dcp, void *data, int *errorp)
+{
+ hammer_buffer_t buffer = NULL;
+ void *ondisk_data;
+ int result, error;
+
+ result = error = 0;
+ *errorp = 0;
+
+ ondisk_data = hammer_bread_ext(dcp->hmp, dcp->data_offset,
+ dcp->bytes, &error, &buffer);
+ if (error) {
+ *errorp = 1;
+ goto failed;
+ }
+
+ if (bcmp(data, ondisk_data, dcp->bytes) == 0)
+ result = 1;
+
+failed:
+ if (buffer)
+ hammer_rel_buffer(buffer, 0);
+
+ return (result);
+}
int *errorp)
{
hammer_record_t record;
+ hammer_dedup_cache_t dcp;
+ hammer_crc_t crc;
int zone;
/*
record = hammer_alloc_mem_record(ip, 0);
zone = (bytes >= HAMMER_BUFSIZE) ? HAMMER_ZONE_LARGE_DATA_INDEX :
HAMMER_ZONE_SMALL_DATA_INDEX;
- record->resv = hammer_blockmap_reserve(ip->hmp, zone, bytes,
- &record->leaf.data_offset,
- errorp);
- if (record->resv == NULL) {
- kprintf("hammer_ip_add_bulk: reservation failed\n");
- hammer_rel_mem_record(record);
- return(NULL);
+ if (bytes == 0)
+ crc = 0;
+ else
+ crc = crc32(data, bytes);
+
+ if (hammer_live_dedup == 0)
+ goto nodedup;
+ if ((dcp = hammer_dedup_cache_lookup(ip->hmp, crc)) != NULL) {
+ struct hammer_dedup_cache tmp = *dcp;
+
+ record->resv = hammer_blockmap_reserve_dedup(ip->hmp, zone,
+ bytes, tmp.data_offset, errorp);
+ if (record->resv == NULL)
+ goto nodedup;
+
+ if (!hammer_dedup_validate(&tmp, zone, bytes, data)) {
+ hammer_blockmap_reserve_complete(ip->hmp, record->resv);
+ goto nodedup;
+ }
+
+ record->leaf.data_offset = tmp.data_offset;
+ record->flags |= HAMMER_RECF_DEDUPED;
+ } else {
+nodedup:
+ record->resv = hammer_blockmap_reserve(ip->hmp, zone, bytes,
+ &record->leaf.data_offset, errorp);
+ if (record->resv == NULL) {
+ kprintf("hammer_ip_add_bulk: reservation failed\n");
+ hammer_rel_mem_record(record);
+ return(NULL);
+ }
}
+
record->type = HAMMER_MEM_RECORD_DATA;
record->leaf.base.rec_type = HAMMER_RECTYPE_DATA;
record->leaf.base.obj_type = ip->ino_leaf.base.obj_type;
record->leaf.base.localization = ip->obj_localization +
HAMMER_LOCALIZE_MISC;
record->leaf.data_len = bytes;
- hammer_crc_set_leaf(data, &record->leaf);
+ record->leaf.data_crc = crc;
KKASSERT(*errorp == 0);
return(record);
record->resv,
record->leaf.data_offset,
record->leaf.data_len);
+
+ if (hammer_live_dedup == 2 &&
+ (record->flags & HAMMER_RECF_DEDUPED) == 0) {
+ hammer_dedup_cache_add(record->ip, &record->leaf);
+ }
} else if (record->data && record->leaf.data_len) {
/*
* Wholely cached record, with data. Allocate the data.
int hammer_debug_recover_faults;
int hammer_debug_critical; /* non-zero enter debugger on error */
int hammer_cluster_enable = 1; /* enable read clustering by default */
+int hammer_live_dedup = 0;
int hammer_count_fsyncs;
int hammer_count_inodes;
int hammer_count_iqueued;
int64_t hammer_contention_count;
int64_t hammer_zone_limit;
+/*
+ * Live dedup debug counters (sysctls are writable so that counters
+ * can be reset from userspace).
+ */
+int64_t hammer_live_dedup_vnode_bcmps = 0;
+int64_t hammer_live_dedup_device_bcmps = 0;
+int64_t hammer_live_dedup_findblk_failures = 0;
+int64_t hammer_live_dedup_bmap_saves = 0;
+
+
SYSCTL_NODE(_vfs, OID_AUTO, hammer, CTLFLAG_RW, 0, "HAMMER filesystem");
+
SYSCTL_INT(_vfs_hammer, OID_AUTO, supported_version, CTLFLAG_RD,
&hammer_supported_version, 0, "");
SYSCTL_INT(_vfs_hammer, OID_AUTO, debug_general, CTLFLAG_RW,
&hammer_debug_critical, 0, "");
SYSCTL_INT(_vfs_hammer, OID_AUTO, cluster_enable, CTLFLAG_RW,
&hammer_cluster_enable, 0, "");
+/*
+ * 0 - live dedup is disabled
+ * 1 - dedup cache is populated on reads only
+ * 2 - dedup cache is populated on both reads and writes
+ */
+SYSCTL_INT(_vfs_hammer, OID_AUTO, live_dedup, CTLFLAG_RW,
+ &hammer_live_dedup, 0, "");
SYSCTL_INT(_vfs_hammer, OID_AUTO, limit_dirtybufspace, CTLFLAG_RW,
&hammer_limit_dirtybufspace, 0, "");
SYSCTL_QUAD(_vfs_hammer, OID_AUTO, stats_redo, CTLFLAG_RD,
&hammer_stats_redo, 0, "");
+SYSCTL_QUAD(_vfs_hammer, OID_AUTO, live_dedup_vnode_bcmps, CTLFLAG_RW,
+ &hammer_live_dedup_vnode_bcmps, 0, "");
+SYSCTL_QUAD(_vfs_hammer, OID_AUTO, live_dedup_device_bcmps, CTLFLAG_RW,
+ &hammer_live_dedup_device_bcmps, 0, "");
+SYSCTL_QUAD(_vfs_hammer, OID_AUTO, live_dedup_findblk_failures, CTLFLAG_RW,
+ &hammer_live_dedup_findblk_failures, 0, "");
+SYSCTL_QUAD(_vfs_hammer, OID_AUTO, live_dedup_bmap_saves, CTLFLAG_RW,
+ &hammer_live_dedup_bmap_saves, 0, "");
+
SYSCTL_INT(_vfs_hammer, OID_AUTO, count_dirtybufspace, CTLFLAG_RD,
&hammer_count_dirtybufspace, 0, "");
SYSCTL_INT(_vfs_hammer, OID_AUTO, count_refedbufs, CTLFLAG_RD,
TAILQ_INIT(&hmp->objid_cache_list);
TAILQ_INIT(&hmp->undo_lru_list);
TAILQ_INIT(&hmp->reclaim_list);
+
+ RB_INIT(&hmp->rb_dedup_crc_root);
+ RB_INIT(&hmp->rb_dedup_off_root);
+ TAILQ_INIT(&hmp->dedup_lru_list);
}
hmp->hflags &= ~HMNT_USERFLAGS;
hmp->hflags |= info.hflags & HMNT_USERFLAGS;
mp->mnt_flag &= ~MNT_LOCAL;
hmp->mp = NULL;
hammer_destroy_objid_cache(hmp);
+ hammer_destroy_dedup_cache(hmp);
+ if (hmp->dedup_free_cache != NULL) {
+ kfree(hmp->dedup_free_cache, hmp->m_misc);
+ hmp->dedup_free_cache = NULL;
+ }
kmalloc_destroy(&hmp->m_misc);
kmalloc_destroy(&hmp->m_inodes);
lwkt_reltoken(&hmp->fs_token);
biodone(ap->a_bio);
break;
}
+
+ /* hammer_dump_dedup_cache(((hammer_inode_t)ap->a_vp->v_data)->hmp); */
+
return (error);
}
HAMMER_ZONE_LARGE_DATA);
nbio->bio_offset = disk_offset;
error = hammer_io_direct_read(hmp, nbio, cursor.leaf);
+ if (hammer_live_dedup)
+ hammer_dedup_cache_add(ip, cursor.leaf);
goto done;
} else if (n) {
error = hammer_ip_resolve_data(&cursor);
break;
/*
+ * We have to be sure that the only elements added to the
+ * dedup cache are those which are already on-media.
+ */
+ if (hammer_live_dedup && hammer_cursor_ondisk(&cursor))
+ hammer_dedup_cache_add(ip, cursor.leaf);
+
+ /*
* Iterate until we have filled the request.
*/
boff += n;
}
last_offset = rec_offset + rec_len;
last_disk_offset = disk_offset + rec_len;
+
+ if (hammer_live_dedup)
+ hammer_dedup_cache_add(ip, cursor.leaf);
}
+
error = hammer_ip_next(&cursor);
}
record->flags |= HAMMER_RECF_REDO;
bp->b_flags &= ~B_VFSFLAG1;
}
- hammer_io_direct_write(hmp, bio, record);
+ if (record->flags & HAMMER_RECF_DEDUPED) {
+ bp->b_resid = 0;
+ hammer_ip_replace_bulk(hmp, record);
+ biodone(ap->a_bio);
+ } else {
+ hammer_io_direct_write(hmp, bio, record);
+ }
if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs)
hammer_flush_inode(ip, 0);
} else {