2 * Copyright (c) 2010 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Ilya Dryomov <idryomov@gmail.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37 static __inline int validate_zone(hammer_off_t data_offset);
/*
 * hammer_ioc_dedup - backend for the batched (offline) dedup ioctl.
 *
 * Looks up the two candidate B-Tree elements (dedup->elm1 / dedup->elm2)
 * with separate cursors, validates both point at dedup-able data zones,
 * compares the referenced data byte-by-byte, and on a match rewires elm2's
 * leaf to share elm1's data block: the blockmap reference on elm1's block
 * is bumped (hammer_blockmap_dedup) and elm2's old block is freed.
 *
 * Soft failures are reported with error = 0 plus a flag in
 * dedup->head.flags (INVALID_ZONE, CMP_FAILURE, UNDERFLOW); hard failures
 * return an error so the caller re-queues the candidate on its pass2 list.
 *
 * NOTE(review): several lines of this function (error-check branches,
 * returns, closing braces) are elided in this view of the file.
 */
40 hammer_ioc_dedup(hammer_transaction_t trans, hammer_inode_t ip,
41 struct hammer_ioc_dedup *dedup)
43 struct hammer_cursor cursor1, cursor2;
/* Dedup rewrites blockmap metadata; requires the v5 media format. */
48 * Enforce hammer filesystem version requirements
50 if (trans->hmp->version < HAMMER_VOL_VERSION_FIVE) {
51 kprintf("hammer: Filesystem must be upgraded to v5 "
52 "before you can run dedup\n");
/* Position cursor1 on elm1 and pull its leaf + data. */
57 * Cursor1, return an error -> candidate goes to pass2 list
59 error = hammer_init_cursor(trans, &cursor1, NULL, NULL);
62 cursor1.key_beg = dedup->elm1;
63 cursor1.flags |= HAMMER_CURSOR_BACKEND;
65 error = hammer_btree_lookup(&cursor1);
68 error = hammer_btree_extract(&cursor1, HAMMER_CURSOR_GET_LEAF |
69 HAMMER_CURSOR_GET_DATA);
/* Position cursor2 on elm2 the same way. */
74 * Cursor2, return an error -> candidate goes to pass2 list
76 error = hammer_init_cursor(trans, &cursor2, NULL, NULL);
79 cursor2.key_beg = dedup->elm2;
80 cursor2.flags |= HAMMER_CURSOR_BACKEND;
82 error = hammer_btree_lookup(&cursor2);
85 error = hammer_btree_extract(&cursor2, HAMMER_CURSOR_GET_LEAF |
86 HAMMER_CURSOR_GET_DATA);
91 * Zone validation. We can't de-dup any of the other zones
92 * (BTREE or META) or bad things will happen.
94 * Return with error = 0, but set an INVALID_ZONE flag.
/* Both results must be 0; any nonzero sum means an invalid zone. */
96 error = validate_zone(cursor1.leaf->data_offset) +
97 validate_zone(cursor2.leaf->data_offset);
99 dedup->head.flags |= HAMMER_IOC_DEDUP_INVALID_ZONE;
107 * If zones don't match or data_len fields aren't the same
108 * we consider it to be a comparison failure.
110 * Return with error = 0, but set a CMP_FAILURE flag.
112 if ((cursor1.leaf->data_offset & HAMMER_OFF_ZONE_MASK) !=
113 (cursor2.leaf->data_offset & HAMMER_OFF_ZONE_MASK)) {
114 dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE;
117 if (cursor1.leaf->data_len != cursor2.leaf->data_len) {
118 dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE;
/* CRC candidates may still collide; the data itself must match. */
122 /* byte-by-byte comparison to be sure */
123 if (bcmp(cursor1.data, cursor2.data, cursor1.leaf->data_len)) {
124 dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE;
129 * Upgrade both cursors together to an exclusive lock
131 * Return an error -> candidate goes to pass2 list
133 hammer_sync_lock_sh(trans);
134 error = hammer_cursor_upgrade2(&cursor1, &cursor2);
136 hammer_sync_unlock(trans);
/* Add a blockmap reference to elm1's data block. */
140 error = hammer_blockmap_dedup(cursor1.trans,
141 cursor1.leaf->data_offset, cursor1.leaf->data_len);
143 if (error == ERANGE) {
/* Reference counter would underflow/overflow; soft failure. */
145 * Return with error = 0, but set an UNDERFLOW flag
147 dedup->head.flags |= HAMMER_IOC_DEDUP_UNDERFLOW;
149 goto downgrade_cursors;
152 * Return an error -> block goes to pass2 list
154 goto downgrade_cursors;
159 * The cursor2's cache must be invalidated before calling
160 * hammer_blockmap_free(), otherwise it will not be able to
161 * invalidate the underlying data buffer.
163 hammer_cursor_invalidate_cache(&cursor2);
164 hammer_blockmap_free(cursor2.trans,
165 cursor2.leaf->data_offset, cursor2.leaf->data_len);
/* Point elm2's leaf at elm1's (now shared) data block. */
167 hammer_modify_node(cursor2.trans, cursor2.node,
168 &cursor2.leaf->data_offset, sizeof(hammer_off_t));
169 cursor2.leaf->data_offset = cursor1.leaf->data_offset;
170 hammer_modify_node_done(cursor2.node);
173 hammer_cursor_downgrade2(&cursor1, &cursor2);
174 hammer_sync_unlock(trans);
176 hammer_done_cursor(&cursor2);
178 hammer_done_cursor(&cursor1);
/*
 * Throttle: let the flusher catch up if meta-data buffers or the
 * UNDO FIFO are getting full before returning to the caller.
 */
181 * Avoid deadlocking the buffer cache
183 seq = trans->hmp->flusher.done;
184 while (hammer_flusher_meta_halflimit(trans->hmp) ||
185 hammer_flusher_undo_exhausted(trans, 2)) {
186 hammer_flusher_wait(trans->hmp, seq);
187 seq = hammer_flusher_async_one(trans->hmp);
/*
 * validate_zone - check that data_offset lives in a zone that may be
 * deduplicated (LARGE_DATA or SMALL_DATA).  The caller treats a nonzero
 * result as an invalid zone (sets HAMMER_IOC_DEDUP_INVALID_ZONE).
 * NOTE(review): the return statements are elided in this view; presumably
 * the listed cases return 0 and the default returns nonzero — confirm.
 */
193 validate_zone(hammer_off_t data_offset)
195 switch(data_offset & HAMMER_OFF_ZONE_MASK) {
196 case HAMMER_ZONE_LARGE_DATA:
197 case HAMMER_ZONE_SMALL_DATA:
204 /************************************************************************
206 ************************************************************************
208 * HAMMER Live Dedup (aka an efficient cp(1) implementation)
210 * The dedup cache is operated in a LRU fashion and destroyed on
211 * unmount, so essentially this is a live dedup on a cached dataset and
212 * not a full-fledged fs-wide one - we have a batched dedup for that.
213 * We allow duplicate entries in the buffer cache, data blocks are
214 * deduplicated only on their way to media. By default the cache is
215 * populated on reads only, but it can be populated on writes too.
217 * The main implementation gotcha is on-media requirement - in order for
218 * a data block to be added to a dedup cache it has to be present on
219 * disk. This simplifies cache logic a lot - once data is laid out on
220 * media it remains valid on media all the way up to the point where the
221 * related big block the data was stored in is freed - so there is only
222 * one place we need to bother with invalidation code.
226 * RB-Tree support for dedup cache structures
/*
 * Two red-black indexes over the same hammer_dedup_cache entries:
 *  - by data CRC (crc_entry), used for dedup candidate lookups, and
 *  - by media data_offset (off_entry), used for big-block invalidation.
 */
228 RB_GENERATE2(hammer_dedup_crc_rb_tree, hammer_dedup_cache, crc_entry,
229 hammer_dedup_crc_rb_compare, hammer_crc_t, crc);
230 RB_GENERATE2(hammer_dedup_off_rb_tree, hammer_dedup_cache, off_entry,
231 hammer_dedup_off_rb_compare, hammer_off_t, data_offset);
/*
 * Context handed to the RB_SCAN callback during dedup cache invalidation:
 * the mount and the base offset of the big block being freed (see
 * hammer_dedup_cache_inval() / hammer_dedup_scancmp()).
 */
233 struct hammer_dedup_inval {
234 struct hammer_mount *hmp;
235 hammer_off_t base_offset;
238 static int hammer_dedup_scancmp(hammer_dedup_cache_t dc, void *data);
239 static int hammer_dedup_cache_inval_callback(hammer_dedup_cache_t dc,
241 static __inline int _vnode_validate(hammer_dedup_cache_t dcp, void *data,
243 static __inline int _dev_validate(hammer_dedup_cache_t dcp, void *data,
/*
 * Three-way comparator on ->crc for the CRC-indexed RB tree
 * (return statements for the <, >, and equal cases are elided in this view).
 */
247 hammer_dedup_crc_rb_compare(hammer_dedup_cache_t dc1,
248 hammer_dedup_cache_t dc2)
250 if (dc1->crc < dc2->crc)
252 if (dc1->crc > dc2->crc)
/*
 * Three-way comparator on ->data_offset for the offset-indexed RB tree
 * (return statements for the <, >, and equal cases are elided in this view).
 */
259 hammer_dedup_off_rb_compare(hammer_dedup_cache_t dc1,
260 hammer_dedup_cache_t dc2)
262 if (dc1->data_offset < dc2->data_offset)
264 if (dc1->data_offset > dc2->data_offset)
/*
 * RB_SCAN range comparator: selects cache entries whose data_offset falls
 * inside the big block [base_offset, base_offset + HAMMER_LARGEBLOCK_SIZE)
 * described by the hammer_dedup_inval context.
 */
271 hammer_dedup_scancmp(hammer_dedup_cache_t dc, void *data)
273 hammer_off_t off = ((struct hammer_dedup_inval *)data)->base_offset;
275 if (dc->data_offset < off)
277 if (dc->data_offset >= off + HAMMER_LARGEBLOCK_SIZE)
/*
 * RB_SCAN callback: fully retire one dedup cache entry — unlink it from
 * both RB indexes and the LRU list, drop the count and free it.
 */
284 hammer_dedup_cache_inval_callback(hammer_dedup_cache_t dc, void *data)
286 hammer_mount_t hmp = ((struct hammer_dedup_inval *)data)->hmp;
288 RB_REMOVE(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dc);
289 RB_REMOVE(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dc);
290 TAILQ_REMOVE(&hmp->dedup_lru_list, dc, lru_entry);
292 --hmp->dedup_cache_count;
293 kfree(dc, hmp->m_misc);
/*
 * hammer_dedup_cache_add - insert (or refresh) a live-dedup cache entry
 * describing the on-media data block referenced by @leaf of inode @ip.
 *
 * An entry is recycled from an existing CRC match, taken from the
 * one-element free cache, or stolen from the LRU head when the cache is
 * at hammer_live_dedup_cache_size.  The entry ends up in both RB indexes
 * and at the tail (most-recently-used end) of the LRU list.
 *
 * NOTE(review): several lines (closing braces, else branches) are elided
 * in this view of the file.
 */
299 hammer_dedup_cache_add(hammer_inode_t ip, hammer_btree_leaf_elm_t leaf)
301 hammer_dedup_cache_t dcp, tmp;
302 hammer_mount_t hmp = ip->hmp;
/*
 * Pre-allocate a spare entry into hmp->dedup_free_cache; the re-check
 * after kmalloc handles a racing populator (free the loser's copy).
 */
304 if (hmp->dedup_free_cache == NULL) {
305 tmp = kmalloc(sizeof(*tmp), hmp->m_misc, M_WAITOK | M_ZERO);
306 if (hmp->dedup_free_cache == NULL)
307 hmp->dedup_free_cache = tmp;
309 kfree(tmp, hmp->m_misc);
312 KKASSERT(leaf != NULL);
/* Existing entry with the same CRC: unhook it so it can be refreshed. */
314 dcp = RB_LOOKUP(hammer_dedup_crc_rb_tree,
315 &hmp->rb_dedup_crc_root, leaf->data_crc);
317 RB_REMOVE(hammer_dedup_off_rb_tree,
318 &hmp->rb_dedup_off_root, dcp);
319 TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry);
/* Under the size limit: consume the spare; otherwise evict LRU head. */
323 if (hmp->dedup_cache_count < hammer_live_dedup_cache_size) {
324 dcp = hmp->dedup_free_cache;
325 hmp->dedup_free_cache = NULL;
326 ++hmp->dedup_cache_count;
328 dcp = TAILQ_FIRST(&hmp->dedup_lru_list);
329 RB_REMOVE(hammer_dedup_crc_rb_tree,
330 &hmp->rb_dedup_crc_root, dcp);
331 RB_REMOVE(hammer_dedup_off_rb_tree,
332 &hmp->rb_dedup_off_root, dcp);
333 TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry);
/* Fill in the entry and re-insert into both indexes + LRU tail. */
336 dcp->crc = leaf->data_crc;
337 tmp = RB_INSERT(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dcp);
338 KKASSERT(tmp == NULL);
342 dcp->obj_id = ip->obj_id;
343 dcp->localization = ip->obj_localization;
/* leaf->base.key is the end offset of the record's data range. */
344 dcp->file_offset = leaf->base.key - leaf->data_len;
345 dcp->bytes = leaf->data_len;
346 dcp->data_offset = leaf->data_offset;
348 tmp = RB_INSERT(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dcp);
349 KKASSERT(tmp == NULL);
350 TAILQ_INSERT_TAIL(&hmp->dedup_lru_list, dcp, lru_entry);
/*
 * hammer_dedup_cache_lookup - find the cache entry for @crc, or NULL
 * (the return statement is elided in this view).
 */
355 __inline hammer_dedup_cache_t
356 hammer_dedup_cache_lookup(hammer_mount_t hmp, hammer_crc_t crc)
358 hammer_dedup_cache_t dcp;
360 dcp = RB_LOOKUP(hammer_dedup_crc_rb_tree,
361 &hmp->rb_dedup_crc_root, crc);
/*
 * hammer_dedup_cache_inval - drop every cache entry whose data lives in
 * the big block starting at @base_offset (called when that block is
 * freed — the single invalidation point noted in the header comment).
 */
365 void hammer_dedup_cache_inval(hammer_mount_t hmp, hammer_off_t base_offset)
367 struct hammer_dedup_inval di;
370 di.base_offset = base_offset;
372 RB_SCAN(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root,
373 hammer_dedup_scancmp, hammer_dedup_cache_inval_callback, &di);
/*
 * hammer_destroy_dedup_cache - tear down the whole live dedup cache
 * (unmount path): drain the LRU list, freeing each entry, then assert
 * that both RB indexes, the list, and the counter are empty.
 */
377 hammer_destroy_dedup_cache(hammer_mount_t hmp)
379 hammer_dedup_cache_t dcp;
381 while ((dcp = TAILQ_FIRST(&hmp->dedup_lru_list)) != NULL) {
382 RB_REMOVE(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dcp);
383 RB_REMOVE(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dcp);
384 TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry);
385 --hmp->dedup_cache_count;
386 kfree(dcp, hmp->m_misc);
389 KKASSERT(RB_EMPTY(&hmp->rb_dedup_crc_root));
390 KKASSERT(RB_EMPTY(&hmp->rb_dedup_off_root));
391 KKASSERT(TAILQ_EMPTY(&hmp->dedup_lru_list));
393 KKASSERT(hmp->dedup_cache_count == 0);
/*
 * hammer_dedup_validate - decide whether cache entry @dcp really matches
 * the candidate data: same zone, same length, then a byte-by-byte compare
 * against the on-media copy via either the vnode buffer cache
 * (_vnode_validate) or the block device buffer (_dev_validate).
 *
 * NOTE(review): the early-return/branch lines between the checks are
 * elided in this view of the file.
 */
397 hammer_dedup_validate(hammer_dedup_cache_t dcp, int zone, int bytes,
/* Zone check: cached block must live in the same zone as the candidate. */
405 if (HAMMER_ZONE_DECODE(dcp->data_offset) != zone)
409 * Block length validation
411 if (dcp->bytes != bytes)
415 * Byte-by-byte data comparison
417 * The data we need for validation may already be present in the
418 * buffer cache in two flavours: vnode-based buffer or
419 * block-device-based buffer. In case vnode-based buffer wasn't
420 * there or if a non-blocking attempt to acquire it failed use
421 * device-based buffer (for large-zone data blocks it will
422 * generate a separate read).
424 * XXX vnode-based checking is not MP safe so when live-dedup
425 * is turned on we must always use the device buffer.
/* With double buffering the vnode path is skipped entirely. */
428 if (hammer_double_buffer) {
430 } else if (_vnode_validate(dcp, data, &error)) {
431 hammer_live_dedup_vnode_bcmps++;
435 hammer_live_dedup_findblk_failures++;
439 * If there was an error previously or if double buffering is
/* Fallback/primary device-buffer comparison paths (stats counters). */
443 if (_dev_validate(dcp, data, &error)) {
444 hammer_live_dedup_device_bcmps++;
449 if (_dev_validate(dcp, data, &error)) {
450 hammer_live_dedup_device_bcmps++;
/*
 * _vnode_validate - compare @data against the cached block via the
 * regular (vnode) buffer cache.
 *
 * Resolves the inode described by dcp (obj_id/localization), acquires its
 * vnode, and does a non-blocking findblk() at dcp->file_offset.  The
 * buffer is only usable if it is B_CACHE, not B_DIRTY, and its bio2
 * offset (or a VOP_BMAP translation) matches dcp->data_offset — i.e. the
 * buffer really is backed by the cached media block.  On a usable buffer
 * the result is bcmp(data, bp->b_data, dcp->bytes) == 0.
 *
 * *errorp reports lookup/translation failures; the missing return lines
 * are elided in this view of the file.
 */
458 _vnode_validate(hammer_dedup_cache_t dcp, void *data, int *errorp)
460 struct hammer_transaction trans;
470 hammer_simple_transaction(&trans, dcp->hmp);
472 ip = hammer_get_inode(&trans, NULL, dcp->obj_id, HAMMER_MAX_TID,
473 dcp->localization, 0, &error);
475 kprintf("dedup: unable to find objid %016jx:%08x\n",
476 (intmax_t)dcp->obj_id, dcp->localization);
481 error = hammer_get_vnode(ip, &vp);
483 kprintf("dedup: unable to acquire vnode for %016jx:%08x\n",
484 (intmax_t)dcp->obj_id, dcp->localization);
/* Non-blocking buffer lookup — do not stall live dedup on a busy buf. */
489 if ((bp = findblk(ip->vp, dcp->file_offset, FINDBLK_NBLOCK)) != NULL) {
492 /* XXX if (mapped to userspace) goto done, *errorp = 4 */
/* Buffer must hold valid, clean data to be trusted for comparison. */
494 if ((bp->b_flags & B_CACHE) == 0 || bp->b_flags & B_DIRTY) {
/* bio2 offset mismatch: translate via VOP_BMAP and re-verify. */
499 if (bp->b_bio2.bio_offset != dcp->data_offset) {
500 error = VOP_BMAP(ip->vp, dcp->file_offset, &dooffset,
501 NULL, NULL, BUF_CMD_READ);
507 if (dooffset != dcp->data_offset) {
511 hammer_live_dedup_bmap_saves++;
514 if (bcmp(data, bp->b_data, dcp->bytes) == 0)
525 hammer_rel_inode(ip, 0);
527 hammer_done_transaction(&trans);
532 _dev_validate(hammer_dedup_cache_t dcp, void *data, int *errorp)
534 hammer_buffer_t buffer = NULL;
541 ondisk_data = hammer_bread_ext(dcp->hmp, dcp->data_offset,
542 dcp->bytes, &error, &buffer);
548 if (bcmp(data, ondisk_data, dcp->bytes) == 0)
553 hammer_rel_buffer(buffer, 0);