From 507df98a152612f739140d9f1ac5b30cd022eea2 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 4 Jan 2011 03:07:02 +0200 Subject: [PATCH 1/1] HAMMER - Add live dedup sysctl and support * Adds *experimental* live dedup (aka efficient cp) support * sysctl vfs.hammer.live_dedup 0 - disabled (default) 1 - enabled, populate dedup cache on reads 2 - enabled, populate dedup cache on reads and writes --- sys/vfs/hammer/hammer.h | 59 ++++++ sys/vfs/hammer/hammer_blockmap.c | 175 +++++++++++++++- sys/vfs/hammer/hammer_dedup.c | 339 +++++++++++++++++++++++++++++++ sys/vfs/hammer/hammer_object.c | 48 ++++- sys/vfs/hammer/hammer_vfsops.c | 37 ++++ sys/vfs/hammer/hammer_vnops.c | 24 ++- 6 files changed, 672 insertions(+), 10 deletions(-) diff --git a/sys/vfs/hammer/hammer.h b/sys/vfs/hammer/hammer.h index d06bcf952a..dbcade5ef3 100644 --- a/sys/vfs/hammer/hammer.h +++ b/sys/vfs/hammer/hammer.h @@ -270,6 +270,34 @@ typedef struct hammer_node_cache { TAILQ_HEAD(hammer_node_cache_list, hammer_node_cache); +/* + * Live dedup cache + */ +struct hammer_dedup_crc_rb_tree; +RB_HEAD(hammer_dedup_crc_rb_tree, hammer_dedup_cache); +RB_PROTOTYPE2(hammer_dedup_crc_rb_tree, hammer_dedup_cache, crc_entry, + hammer_dedup_crc_rb_compare, hammer_crc_t); + +struct hammer_dedup_off_rb_tree; +RB_HEAD(hammer_dedup_off_rb_tree, hammer_dedup_cache); +RB_PROTOTYPE2(hammer_dedup_off_rb_tree, hammer_dedup_cache, off_entry, + hammer_dedup_off_rb_compare, hammer_off_t); + +#define DEDUP_CACHE_SIZE 4096 /* XXX make it a dynamic tunable */ + +typedef struct hammer_dedup_cache { + RB_ENTRY(hammer_dedup_cache) crc_entry; + RB_ENTRY(hammer_dedup_cache) off_entry; + TAILQ_ENTRY(hammer_dedup_cache) lru_entry; + struct hammer_mount *hmp; + int64_t obj_id; + u_int32_t localization; + off_t file_offset; + int bytes; + hammer_off_t data_offset; + hammer_crc_t crc; +} *hammer_dedup_cache_t; + /* * Structure used to organize flush groups. Flush groups must be * organized into chunks in order to avoid blowing out the UNDO FIFO. @@ -525,6 +553,7 @@ typedef struct hammer_record *hammer_record_t; #define HAMMER_RECF_COMMITTED 0x0010 /* committed to the B-Tree */ #define HAMMER_RECF_INTERLOCK_BE 0x0020 /* backend interlock */ #define HAMMER_RECF_WANTED 0x0040 /* wanted by the frontend */ +#define HAMMER_RECF_DEDUPED 0x0080 /* will be live-dedup'ed */ #define HAMMER_RECF_CONVERT_DELETE 0x0100 /* special case */ #define HAMMER_RECF_REDO 0x1000 /* REDO was laid down */ @@ -775,6 +804,7 @@ struct hammer_reserve { int refs; int zone; int append_off; + int32_t bytes_free; hammer_off_t zone_offset; }; @@ -839,6 +869,10 @@ struct hammer_mount { struct hammer_res_rb_tree rb_resv_root; struct hammer_buf_rb_tree rb_bufs_root; struct hammer_pfs_rb_tree rb_pfsm_root; + + struct hammer_dedup_crc_rb_tree rb_dedup_crc_root; + struct hammer_dedup_off_rb_tree rb_dedup_off_root; + struct hammer_volume *rootvol; struct hammer_base_elm root_btree_beg; struct hammer_base_elm root_btree_end; @@ -882,6 +916,7 @@ struct hammer_mount { int io_running_space; /* io_token */ int io_running_wakeup; /* io_token */ int objid_cache_count; + int dedup_cache_count; int error; /* critical I/O error */ struct krate krate; /* rate limited kprintf */ hammer_tid_t asof; /* snapshot mount */ @@ -908,6 +943,8 @@ struct hammer_mount { struct hammer_flush_group_list flush_group_list; hammer_flush_group_t next_flush_group; TAILQ_HEAD(, hammer_objid_cache) objid_cache_list; + TAILQ_HEAD(, hammer_dedup_cache) dedup_lru_list; + hammer_dedup_cache_t dedup_free_cache; TAILQ_HEAD(, hammer_reclaim) reclaim_list; TAILQ_HEAD(, hammer_io) iorun_list; @@ -970,6 +1007,7 @@ extern int hammer_debug_recover; extern int hammer_debug_recover_faults; extern int hammer_debug_critical; extern int hammer_cluster_enable; +extern int hammer_live_dedup; extern int hammer_count_fsyncs; extern int hammer_count_inodes; extern int hammer_count_iqueued; @@ -1020,6 +1058,11 @@ extern int hammer_fsync_mode; extern int hammer_autoflush; extern int64_t hammer_contention_count; +extern int64_t hammer_live_dedup_vnode_bcmps; +extern int64_t hammer_live_dedup_device_bcmps; +extern int64_t hammer_live_dedup_findblk_failures; +extern int64_t hammer_live_dedup_bmap_saves; + void hammer_critical_error(hammer_mount_t hmp, hammer_inode_t ip, int error, const char *msg); int hammer_vop_inactive(struct vop_inactive_args *); @@ -1121,6 +1164,20 @@ hammer_tid_t hammer_alloc_objid(hammer_mount_t hmp, hammer_inode_t dip, void hammer_clear_objid(hammer_inode_t dip); void hammer_destroy_objid_cache(hammer_mount_t hmp); +int hammer_dedup_crc_rb_compare(hammer_dedup_cache_t dc1, + hammer_dedup_cache_t dc2); +int hammer_dedup_off_rb_compare(hammer_dedup_cache_t dc1, + hammer_dedup_cache_t dc2); +hammer_dedup_cache_t hammer_dedup_cache_add(hammer_inode_t ip, + hammer_btree_leaf_elm_t leaf); +hammer_dedup_cache_t hammer_dedup_cache_lookup(hammer_mount_t hmp, + hammer_crc_t crc); +void hammer_dedup_cache_inval(hammer_mount_t hmp, hammer_off_t base_offset); +void hammer_destroy_dedup_cache(hammer_mount_t hmp); +void hammer_dump_dedup_cache(hammer_mount_t hmp); +int hammer_dedup_validate(hammer_dedup_cache_t dcp, int zone, int bytes, + void *data); + int hammer_enter_undo_history(hammer_mount_t hmp, hammer_off_t offset, int bytes); void hammer_clear_undo_history(hammer_mount_t hmp); @@ -1273,6 +1330,8 @@ hammer_off_t hammer_blockmap_alloc(hammer_transaction_t trans, int zone, int bytes, hammer_off_t hint, int *errorp); hammer_reserve_t hammer_blockmap_reserve(hammer_mount_t hmp, int zone, int bytes, hammer_off_t *zone_offp, int *errorp); +hammer_reserve_t hammer_blockmap_reserve_dedup(hammer_mount_t hmp, int zone, + int bytes, hammer_off_t zone_offset, int *errorp); void hammer_blockmap_reserve_complete(hammer_mount_t hmp, hammer_reserve_t resv); void hammer_reserve_clrdelay(hammer_mount_t hmp, hammer_reserve_t resv); diff --git a/sys/vfs/hammer/hammer_blockmap.c b/sys/vfs/hammer/hammer_blockmap.c index 644e620f79..16160da311 100644 --- a/sys/vfs/hammer/hammer_blockmap.c +++ b/sys/vfs/hammer/hammer_blockmap.c @@ -44,6 +44,7 @@ static void hammer_reserve_setdelay_offset(hammer_mount_t hmp, hammer_off_t base_offset, int zone, struct hammer_blockmap_layer2 *layer2); static void hammer_reserve_setdelay(hammer_mount_t hmp, hammer_reserve_t resv); +static int update_bytes_free(hammer_reserve_t resv, int bytes); /* * Reserved big-blocks red-black tree support @@ -624,6 +625,174 @@ failed: return(resv); } +/* + * Frontend function - Dedup bytes in a zone. + * + * Dedup reservations work exactly the same as normal write reservations + * except we only adjust bytes_free field and don't touch append offset. + * Finalization mechanic for dedup reservations is also the same as for + * normal write ones - the backend finalizes the reservation with + * hammer_blockmap_finalize(). + */ +hammer_reserve_t +hammer_blockmap_reserve_dedup(hammer_mount_t hmp, int zone, int bytes, + hammer_off_t zone_offset, int *errorp) +{ + hammer_volume_t root_volume; + hammer_blockmap_t freemap; + struct hammer_blockmap_layer1 *layer1; + struct hammer_blockmap_layer2 *layer2; + hammer_buffer_t buffer1 = NULL; + hammer_buffer_t buffer2 = NULL; + hammer_off_t layer1_offset; + hammer_off_t layer2_offset; + hammer_off_t base_off; + hammer_reserve_t resv = NULL; + hammer_reserve_t resx = NULL; + + /* + * Setup + */ + KKASSERT(zone >= HAMMER_ZONE_BTREE_INDEX && zone < HAMMER_MAX_ZONES); + root_volume = hammer_get_root_volume(hmp, errorp); + if (*errorp) + return (NULL); + freemap = &hmp->blockmap[HAMMER_ZONE_FREEMAP_INDEX]; + KKASSERT(freemap->phys_offset != 0); + + bytes = (bytes + 15) & ~15; + KKASSERT(bytes > 0 && bytes <= HAMMER_XBUFSIZE); + + /* + * Dive layer 1. + */ + layer1_offset = freemap->phys_offset + + HAMMER_BLOCKMAP_LAYER1_OFFSET(zone_offset); + layer1 = hammer_bread(hmp, layer1_offset, errorp, &buffer1); + if (*errorp) + goto failed; + + /* + * Check CRC. + */ + if (layer1->layer1_crc != crc32(layer1, HAMMER_LAYER1_CRCSIZE)) { + hammer_lock_ex(&hmp->blkmap_lock); + if (layer1->layer1_crc != crc32(layer1, HAMMER_LAYER1_CRCSIZE)) + panic("CRC FAILED: LAYER1"); + hammer_unlock(&hmp->blkmap_lock); + } + KKASSERT(layer1->phys_offset != HAMMER_BLOCKMAP_UNAVAIL); + + /* + * Dive layer 2, each entry represents a large-block. + */ + layer2_offset = layer1->phys_offset + + HAMMER_BLOCKMAP_LAYER2_OFFSET(zone_offset); + layer2 = hammer_bread(hmp, layer2_offset, errorp, &buffer2); + if (*errorp) + goto failed; + + /* + * Check CRC. + */ + if (layer2->entry_crc != crc32(layer2, HAMMER_LAYER2_CRCSIZE)) { + hammer_lock_ex(&hmp->blkmap_lock); + if (layer2->entry_crc != crc32(layer2, HAMMER_LAYER2_CRCSIZE)) + panic("CRC FAILED: LAYER2"); + hammer_unlock(&hmp->blkmap_lock); + } + + /* + * Fail if the zone is owned by someone other than us. + */ + if (layer2->zone && layer2->zone != zone) + goto failed; + + /* + * We need the lock from this point on. We have to re-check zone + * ownership after acquiring the lock and also check for reservations. + */ + hammer_lock_ex(&hmp->blkmap_lock); + + if (layer2->zone && layer2->zone != zone) { + hammer_unlock(&hmp->blkmap_lock); + goto failed; + } + + base_off = (zone_offset & + (~HAMMER_LARGEBLOCK_MASK64 & ~HAMMER_OFF_ZONE_MASK)) | + HAMMER_ZONE_RAW_BUFFER; + resv = RB_LOOKUP(hammer_res_rb_tree, &hmp->rb_resv_root, base_off); + if (resv) { + if (resv->zone != zone) { + hammer_unlock(&hmp->blkmap_lock); + resv = NULL; + goto failed; + } + /* + * Due to possible big block underflow we can't simply + * subtract bytes from bytes_free. + */ + if (update_bytes_free(resv, bytes) == 0) { + hammer_unlock(&hmp->blkmap_lock); + resv = NULL; + goto failed; + } + ++resv->refs; + resx = NULL; + } else { + resx = kmalloc(sizeof(*resv), hmp->m_misc, + M_WAITOK | M_ZERO | M_USE_RESERVE); + resx->refs = 1; + resx->zone = zone; + resx->bytes_free = layer2->bytes_free; + /* + * Due to possible big block underflow we can't simply + * subtract bytes from bytes_free. + */ + if (update_bytes_free(resx, bytes) == 0) { + hammer_unlock(&hmp->blkmap_lock); + kfree(resx, hmp->m_misc); + goto failed; + } + resx->zone_offset = base_off; + resv = RB_INSERT(hammer_res_rb_tree, &hmp->rb_resv_root, resx); + KKASSERT(resv == NULL); + resv = resx; + ++hammer_count_reservations; + } + + hammer_unlock(&hmp->blkmap_lock); + +failed: + if (buffer1) + hammer_rel_buffer(buffer1, 0); + if (buffer2) + hammer_rel_buffer(buffer2, 0); + hammer_rel_volume(root_volume, 0); + + return(resv); +} + +static int +update_bytes_free(hammer_reserve_t resv, int bytes) +{ + int32_t temp; + + /* + * Big-block underflow check + */ + temp = resv->bytes_free - HAMMER_LARGEBLOCK_SIZE * 2; + cpu_ccfence(); /* XXX do we really need it ? */ + if (temp > resv->bytes_free) { + kprintf("BIGBLOCK UNDERFLOW\n"); + return (0); + } + + resv->bytes_free -= bytes; + return (1); +} + /* * Dereference a reservation structure. Upon the final release the * underlying big-block is checked and if it is entirely free we delete @@ -652,6 +821,8 @@ hammer_blockmap_reserve_complete(hammer_mount_t hmp, hammer_reserve_t resv) resv->append_off = HAMMER_LARGEBLOCK_SIZE; base_offset = resv->zone_offset & ~HAMMER_OFF_ZONE_MASK; base_offset = HAMMER_ZONE_ENCODE(resv->zone, base_offset); + if (!TAILQ_EMPTY(&hmp->dedup_lru_list)) + hammer_dedup_cache_inval(hmp, base_offset); error = hammer_del_buffers(hmp, base_offset, resv->zone_offset, HAMMER_LARGEBLOCK_SIZE, @@ -1108,8 +1279,10 @@ hammer_blockmap_finalize(hammer_transaction_t trans, KKASSERT(layer2->zone == zone); KKASSERT(bytes != 0); layer2->bytes_free -= bytes; - if (resv) + + if (resv) { resv->flags &= ~HAMMER_RESF_LAYER2FREE; + } /* * Finalizations can occur out of order, or combined with allocations. diff --git a/sys/vfs/hammer/hammer_dedup.c b/sys/vfs/hammer/hammer_dedup.c index 0bb74b51be..1bc125fa0b 100644 --- a/sys/vfs/hammer/hammer_dedup.c +++ b/sys/vfs/hammer/hammer_dedup.c @@ -189,3 +189,342 @@ validate_zone(hammer_off_t data_offset) return (1); } } + +/************************************************************************ + * LIVE DEDUP * + ************************************************************************ + * + * HAMMER Live Dedup (aka as efficient cp(1) implementation) + * + * The dedup cache is operated in a LRU fashion and destroyed on + * unmount, so essentially this is a live dedup on a cached dataset and + * not a full-fledged fs-wide one - we have a batched dedup for that. + * We allow duplicate entries in the buffer cache, data blocks are + * deduplicated only on their way to media. By default the cache is + * populated on reads only, but it can be populated on writes too. + * + * The main implementation gotcha is on-media requirement - in order for + * a data block to be added to a dedup cache it has to be present on + * disk. This simplifies cache logic a lot - once data is laid out on + * media it remains valid on media all the way up to the point where the + * related big block the data was stored in is freed - so there is only + * one place we need to bother with invalidation code. + */ + +/* + * RB-Tree support for dedup cache structures + */ +RB_GENERATE2(hammer_dedup_crc_rb_tree, hammer_dedup_cache, crc_entry, + hammer_dedup_crc_rb_compare, hammer_crc_t, crc); +RB_GENERATE2(hammer_dedup_off_rb_tree, hammer_dedup_cache, off_entry, + hammer_dedup_off_rb_compare, hammer_off_t, data_offset); + +struct hammer_dedup_inval { + struct hammer_mount *hmp; + hammer_off_t base_offset; +}; + +static int hammer_dedup_scancmp(hammer_dedup_cache_t dc, void *data); +static int hammer_dedup_cache_inval_callback(hammer_dedup_cache_t dc, + void *data); +static __inline int _vnode_validate(hammer_dedup_cache_t dcp, void *data, + int *errorp); +static __inline int _dev_validate(hammer_dedup_cache_t dcp, void *data, + int *errorp); + +int +hammer_dedup_crc_rb_compare(hammer_dedup_cache_t dc1, + hammer_dedup_cache_t dc2) +{ + if (dc1->crc < dc2->crc) + return (-1); + if (dc1->crc > dc2->crc) + return (1); + + return (0); +} + +int +hammer_dedup_off_rb_compare(hammer_dedup_cache_t dc1, + hammer_dedup_cache_t dc2) +{ + if (dc1->data_offset < dc2->data_offset) + return (-1); + if (dc1->data_offset > dc2->data_offset) + return (1); + + return (0); +} + +static int +hammer_dedup_scancmp(hammer_dedup_cache_t dc, void *data) +{ + hammer_off_t off = ((struct hammer_dedup_inval *)data)->base_offset; + + if (dc->data_offset < off) + return (-1); + if (dc->data_offset >= off + HAMMER_LARGEBLOCK_SIZE) + return (1); + + return (0); +} + +static int +hammer_dedup_cache_inval_callback(hammer_dedup_cache_t dc, void *data) +{ + hammer_mount_t hmp = ((struct hammer_dedup_inval *)data)->hmp; + + RB_REMOVE(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dc); + RB_REMOVE(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dc); + TAILQ_REMOVE(&hmp->dedup_lru_list, dc, lru_entry); + + --hmp->dedup_cache_count; + kfree(dc, hmp->m_misc); + + return (0); +} + +hammer_dedup_cache_t +hammer_dedup_cache_add(hammer_inode_t ip, hammer_btree_leaf_elm_t leaf) +{ + hammer_dedup_cache_t dcp, tmp; + hammer_mount_t hmp = ip->hmp; + + if (hmp->dedup_free_cache == NULL) { + tmp = kmalloc(sizeof(*tmp), hmp->m_misc, M_WAITOK | M_ZERO); + if (hmp->dedup_free_cache == NULL) + hmp->dedup_free_cache = tmp; + else + kfree(tmp, hmp->m_misc); + } + + KKASSERT(leaf != NULL); + + dcp = RB_LOOKUP(hammer_dedup_crc_rb_tree, + &hmp->rb_dedup_crc_root, leaf->data_crc); + if (dcp != NULL) { + RB_REMOVE(hammer_dedup_off_rb_tree, + &hmp->rb_dedup_off_root, dcp); + TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry); + goto populate; + } + + if (hmp->dedup_cache_count < DEDUP_CACHE_SIZE) { + dcp = hmp->dedup_free_cache; + hmp->dedup_free_cache = NULL; + ++hmp->dedup_cache_count; + } else { + dcp = TAILQ_FIRST(&hmp->dedup_lru_list); + RB_REMOVE(hammer_dedup_crc_rb_tree, + &hmp->rb_dedup_crc_root, dcp); + RB_REMOVE(hammer_dedup_off_rb_tree, + &hmp->rb_dedup_off_root, dcp); + TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry); + } + + dcp->crc = leaf->data_crc; + tmp = RB_INSERT(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dcp); + KKASSERT(tmp == NULL); + +populate: + dcp->hmp = ip->hmp; + dcp->obj_id = ip->obj_id; + dcp->localization = ip->obj_localization; + dcp->file_offset = leaf->base.key - leaf->data_len; + dcp->bytes = leaf->data_len; + dcp->data_offset = leaf->data_offset; + + tmp = RB_INSERT(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dcp); + KKASSERT(tmp == NULL); + TAILQ_INSERT_TAIL(&hmp->dedup_lru_list, dcp, lru_entry); + + return (dcp); +} + +__inline hammer_dedup_cache_t +hammer_dedup_cache_lookup(hammer_mount_t hmp, hammer_crc_t crc) +{ + hammer_dedup_cache_t dcp; + + dcp = RB_LOOKUP(hammer_dedup_crc_rb_tree, + &hmp->rb_dedup_crc_root, crc); + return dcp; +} + +void hammer_dedup_cache_inval(hammer_mount_t hmp, hammer_off_t base_offset) +{ + struct hammer_dedup_inval di; + + di.hmp = hmp; + di.base_offset = base_offset; + + RB_SCAN(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, + hammer_dedup_scancmp, hammer_dedup_cache_inval_callback, &di); +} + +void +hammer_destroy_dedup_cache(hammer_mount_t hmp) +{ + hammer_dedup_cache_t dcp; + + while ((dcp = TAILQ_FIRST(&hmp->dedup_lru_list)) != NULL) { + RB_REMOVE(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dcp); + RB_REMOVE(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dcp); + TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry); + --hmp->dedup_cache_count; + kfree(dcp, hmp->m_misc); + } + + KKASSERT(RB_EMPTY(&hmp->rb_dedup_crc_root)); + KKASSERT(RB_EMPTY(&hmp->rb_dedup_off_root)); + KKASSERT(TAILQ_EMPTY(&hmp->dedup_lru_list)); + + KKASSERT(hmp->dedup_cache_count == 0); +} + +int +hammer_dedup_validate(hammer_dedup_cache_t dcp, int zone, int bytes, + void *data) +{ + int error; + + /* + * Zone validation + */ + if (HAMMER_ZONE_DECODE(dcp->data_offset) != zone) + return (0); + + /* + * Block length validation + */ + if (dcp->bytes != bytes) + return (0); + + /* + * Byte-by-byte data comparison + * + * The data we need for validation may already be present in the + * buffer cache in two flavours: vnode-based buffer or + * block-device-based buffer. In case vnode-based buffer wasn't + * there or if a non-blocking attempt to acquire it failed use + * device-based buffer (for large-zone data blocks it will + * generate a separate read). + */ + if (_vnode_validate(dcp, data, &error)) { + hammer_live_dedup_vnode_bcmps++; + return (1); + } else { + if (error == 3) + hammer_live_dedup_findblk_failures++; + } + + if (error) { /* yes, if there was an error previously */ + if (_dev_validate(dcp, data, &error)) { + hammer_live_dedup_device_bcmps++; + return (1); + } + } + + return (0); +} + +static __inline int +_vnode_validate(hammer_dedup_cache_t dcp, void *data, int *errorp) +{ + struct hammer_transaction trans; + hammer_inode_t ip; + struct vnode *vp; + struct buf *bp; + off_t dooffset; + int result, error; + + result = error = 0; + *errorp = 0; + + hammer_simple_transaction(&trans, dcp->hmp); + + ip = hammer_get_inode(&trans, NULL, dcp->obj_id, HAMMER_MAX_TID, + dcp->localization, 0, &error); + if (ip == NULL) { + kprintf("dedup: unable to find objid %016jx:%08x\n", + (intmax_t)dcp->obj_id, dcp->localization); + *errorp = 1; + goto failed2; + } + + error = hammer_get_vnode(ip, &vp); + if (error) { + kprintf("dedup: unable to acquire vnode for %016jx:%08x\n", + (intmax_t)dcp->obj_id, dcp->localization); + *errorp = 2; + goto failed; + } + + if ((bp = findblk(ip->vp, dcp->file_offset, FINDBLK_NBLOCK)) != NULL) { + bremfree(bp); + + /* XXX if (mapped to userspace) goto done, *errorp = 4 */ + + if ((bp->b_flags & B_CACHE) == 0 || bp->b_flags & B_DIRTY) { + *errorp = 5; + goto done; + } + + if (bp->b_bio2.bio_offset != dcp->data_offset) { + error = VOP_BMAP(ip->vp, dcp->file_offset, &dooffset, + NULL, NULL, BUF_CMD_READ); + if (error) { + *errorp = 6; + goto done; + } + + if (dooffset != dcp->data_offset) { + *errorp = 7; + goto done; + } + hammer_live_dedup_bmap_saves++; + } + + if (bcmp(data, bp->b_data, dcp->bytes) == 0) + result = 1; + +done: + bqrelse(bp); + } else { + *errorp = 3; + } + vput(vp); + +failed: + hammer_rel_inode(ip, 0); +failed2: + hammer_done_transaction(&trans); + return (result); +} + +static __inline int +_dev_validate(hammer_dedup_cache_t dcp, void *data, int *errorp) +{ + hammer_buffer_t buffer = NULL; + void *ondisk_data; + int result, error; + + result = error = 0; + *errorp = 0; + + ondisk_data = hammer_bread_ext(dcp->hmp, dcp->data_offset, + dcp->bytes, &error, &buffer); + if (error) { + *errorp = 1; + goto failed; + } + + if (bcmp(data, ondisk_data, dcp->bytes) == 0) + result = 1; + +failed: + if (buffer) + hammer_rel_buffer(buffer, 0); + + return (result); +} diff --git a/sys/vfs/hammer/hammer_object.c b/sys/vfs/hammer/hammer_object.c index 368acf0a9e..8786cc8d6f 100644 --- a/sys/vfs/hammer/hammer_object.c +++ b/sys/vfs/hammer/hammer_object.c @@ -950,6 +950,8 @@ hammer_ip_add_bulk(hammer_inode_t ip, off_t file_offset, void *data, int bytes, int *errorp) { hammer_record_t record; + hammer_dedup_cache_t dcp; + hammer_crc_t crc; int zone; /* @@ -966,14 +968,39 @@ hammer_ip_add_bulk(hammer_inode_t ip, off_t file_offset, void *data, int bytes, record = hammer_alloc_mem_record(ip, 0); zone = (bytes >= HAMMER_BUFSIZE) ? HAMMER_ZONE_LARGE_DATA_INDEX : HAMMER_ZONE_SMALL_DATA_INDEX; - record->resv = hammer_blockmap_reserve(ip->hmp, zone, bytes, - &record->leaf.data_offset, - errorp); - if (record->resv == NULL) { - kprintf("hammer_ip_add_bulk: reservation failed\n"); - hammer_rel_mem_record(record); - return(NULL); + if (bytes == 0) + crc = 0; + else + crc = crc32(data, bytes); + + if (hammer_live_dedup == 0) + goto nodedup; + if ((dcp = hammer_dedup_cache_lookup(ip->hmp, crc)) != NULL) { + struct hammer_dedup_cache tmp = *dcp; + + record->resv = hammer_blockmap_reserve_dedup(ip->hmp, zone, + bytes, tmp.data_offset, errorp); + if (record->resv == NULL) + goto nodedup; + + if (!hammer_dedup_validate(&tmp, zone, bytes, data)) { + hammer_blockmap_reserve_complete(ip->hmp, record->resv); + goto nodedup; + } + + record->leaf.data_offset = tmp.data_offset; + record->flags |= HAMMER_RECF_DEDUPED; + } else { +nodedup: + record->resv = hammer_blockmap_reserve(ip->hmp, zone, bytes, + &record->leaf.data_offset, errorp); + if (record->resv == NULL) { + kprintf("hammer_ip_add_bulk: reservation failed\n"); + hammer_rel_mem_record(record); + return(NULL); + } } + record->type = HAMMER_MEM_RECORD_DATA; record->leaf.base.rec_type = HAMMER_RECTYPE_DATA; record->leaf.base.obj_type = ip->ino_leaf.base.obj_type; @@ -982,7 +1009,7 @@ hammer_ip_add_bulk(hammer_inode_t ip, off_t file_offset, void *data, int bytes, record->leaf.base.localization = ip->obj_localization + HAMMER_LOCALIZE_MISC; record->leaf.data_len = bytes; - hammer_crc_set_leaf(data, &record->leaf); + record->leaf.data_crc = crc; KKASSERT(*errorp == 0); return(record); @@ -1262,6 +1289,11 @@ hammer_ip_sync_record_cursor(hammer_cursor_t cursor, hammer_record_t record) record->resv, record->leaf.data_offset, record->leaf.data_len); + + if (hammer_live_dedup == 2 && + (record->flags & HAMMER_RECF_DEDUPED) == 0) { + hammer_dedup_cache_add(record->ip, &record->leaf); + } } else if (record->data && record->leaf.data_len) { /* * Wholely cached record, with data. Allocate the data. diff --git a/sys/vfs/hammer/hammer_vfsops.c b/sys/vfs/hammer/hammer_vfsops.c index c9b01fcd85..5ebb1ddab6 100644 --- a/sys/vfs/hammer/hammer_vfsops.c +++ b/sys/vfs/hammer/hammer_vfsops.c @@ -63,6 +63,7 @@ int hammer_debug_recover; /* -1 will disable, +1 will force */ int hammer_debug_recover_faults; int hammer_debug_critical; /* non-zero enter debugger on error */ int hammer_cluster_enable = 1; /* enable read clustering by default */ +int hammer_live_dedup = 0; int hammer_count_fsyncs; int hammer_count_inodes; int hammer_count_iqueued; @@ -116,7 +117,18 @@ int hammer_fsync_mode = 3; int64_t hammer_contention_count; int64_t hammer_zone_limit; +/* + * Live dedup debug counters (sysctls are writable so that counters + * can be reset from userspace). + */ +int64_t hammer_live_dedup_vnode_bcmps = 0; +int64_t hammer_live_dedup_device_bcmps = 0; +int64_t hammer_live_dedup_findblk_failures = 0; +int64_t hammer_live_dedup_bmap_saves = 0; + + SYSCTL_NODE(_vfs, OID_AUTO, hammer, CTLFLAG_RW, 0, "HAMMER filesystem"); + SYSCTL_INT(_vfs_hammer, OID_AUTO, supported_version, CTLFLAG_RD, &hammer_supported_version, 0, ""); SYSCTL_INT(_vfs_hammer, OID_AUTO, debug_general, CTLFLAG_RW, @@ -141,6 +153,13 @@ SYSCTL_INT(_vfs_hammer, OID_AUTO, debug_critical, CTLFLAG_RW, &hammer_debug_critical, 0, ""); SYSCTL_INT(_vfs_hammer, OID_AUTO, cluster_enable, CTLFLAG_RW, &hammer_cluster_enable, 0, ""); +/* + * 0 - live dedup is disabled + * 1 - dedup cache is populated on reads only + * 2 - dedup cache is populated on both reads and writes + */ +SYSCTL_INT(_vfs_hammer, OID_AUTO, live_dedup, CTLFLAG_RW, + &hammer_live_dedup, 0, ""); SYSCTL_INT(_vfs_hammer, OID_AUTO, limit_dirtybufspace, CTLFLAG_RW, &hammer_limit_dirtybufspace, 0, ""); @@ -216,6 +235,15 @@ SYSCTL_QUAD(_vfs_hammer, OID_AUTO, stats_undo, CTLFLAG_RD, SYSCTL_QUAD(_vfs_hammer, OID_AUTO, stats_redo, CTLFLAG_RD, &hammer_stats_redo, 0, ""); +SYSCTL_QUAD(_vfs_hammer, OID_AUTO, live_dedup_vnode_bcmps, CTLFLAG_RW, + &hammer_live_dedup_vnode_bcmps, 0, ""); +SYSCTL_QUAD(_vfs_hammer, OID_AUTO, live_dedup_device_bcmps, CTLFLAG_RW, + &hammer_live_dedup_device_bcmps, 0, ""); +SYSCTL_QUAD(_vfs_hammer, OID_AUTO, live_dedup_findblk_failures, CTLFLAG_RW, + &hammer_live_dedup_findblk_failures, 0, ""); +SYSCTL_QUAD(_vfs_hammer, OID_AUTO, live_dedup_bmap_saves, CTLFLAG_RW, + &hammer_live_dedup_bmap_saves, 0, ""); + SYSCTL_INT(_vfs_hammer, OID_AUTO, count_dirtybufspace, CTLFLAG_RD, &hammer_count_dirtybufspace, 0, ""); SYSCTL_INT(_vfs_hammer, OID_AUTO, count_refedbufs, CTLFLAG_RD, @@ -445,6 +473,10 @@ hammer_vfs_mount(struct mount *mp, char *mntpt, caddr_t data, TAILQ_INIT(&hmp->objid_cache_list); TAILQ_INIT(&hmp->undo_lru_list); TAILQ_INIT(&hmp->reclaim_list); + + RB_INIT(&hmp->rb_dedup_crc_root); + RB_INIT(&hmp->rb_dedup_off_root); + TAILQ_INIT(&hmp->dedup_lru_list); } hmp->hflags &= ~HMNT_USERFLAGS; hmp->hflags |= info.hflags & HMNT_USERFLAGS; @@ -872,6 +904,11 @@ hammer_free_hmp(struct mount *mp) mp->mnt_flag &= ~MNT_LOCAL; hmp->mp = NULL; hammer_destroy_objid_cache(hmp); + hammer_destroy_dedup_cache(hmp); + if (hmp->dedup_free_cache != NULL) { + kfree(hmp->dedup_free_cache, hmp->m_misc); + hmp->dedup_free_cache = NULL; + } kmalloc_destroy(&hmp->m_misc); kmalloc_destroy(&hmp->m_inodes); lwkt_reltoken(&hmp->fs_token); diff --git a/sys/vfs/hammer/hammer_vnops.c b/sys/vfs/hammer/hammer_vnops.c index 4d8cdaee19..8ddb6a218d 100644 --- a/sys/vfs/hammer/hammer_vnops.c +++ b/sys/vfs/hammer/hammer_vnops.c @@ -2622,6 +2622,9 @@ hammer_vop_strategy(struct vop_strategy_args *ap) biodone(ap->a_bio); break; } + + /* hammer_dump_dedup_cache(((hammer_inode_t)ap->a_vp->v_data)->hmp); */ + return (error); } @@ -2818,6 +2821,8 @@ hammer_vop_strategy_read(struct vop_strategy_args *ap) HAMMER_ZONE_LARGE_DATA); nbio->bio_offset = disk_offset; error = hammer_io_direct_read(hmp, nbio, cursor.leaf); + if (hammer_live_dedup) + hammer_dedup_cache_add(ip, cursor.leaf); goto done; } else if (n) { error = hammer_ip_resolve_data(&cursor); @@ -2829,6 +2834,13 @@ hammer_vop_strategy_read(struct vop_strategy_args *ap) if (error) break; + /* + * We have to be sure that the only elements added to the + * dedup cache are those which are already on-media. + */ + if (hammer_live_dedup && hammer_cursor_ondisk(&cursor)) + hammer_dedup_cache_add(ip, cursor.leaf); + /* * Iterate until we have filled the request. */ @@ -3043,7 +3055,11 @@ hammer_vop_bmap(struct vop_bmap_args *ap) } last_offset = rec_offset + rec_len; last_disk_offset = disk_offset + rec_len; + + if (hammer_live_dedup) + hammer_dedup_cache_add(ip, cursor.leaf); } + error = hammer_ip_next(&cursor); } @@ -3216,7 +3232,13 @@ hammer_vop_strategy_write(struct vop_strategy_args *ap) record->flags |= HAMMER_RECF_REDO; bp->b_flags &= ~B_VFSFLAG1; } - hammer_io_direct_write(hmp, bio, record); + if (record->flags & HAMMER_RECF_DEDUPED) { + bp->b_resid = 0; + hammer_ip_replace_bulk(hmp, record); + biodone(ap->a_bio); + } else { + hammer_io_direct_write(hmp, bio, record); + } if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs) hammer_flush_inode(ip, 0); } else { -- 2.41.0