HAMMER - Add live dedup sysctl and support
[dragonfly.git] / sys / vfs / hammer / hammer_dedup.c
CommitLineData
bb29b5d8
MD
1/*
2 * Copyright (c) 2010 The DragonFly Project. All rights reserved.
3 *
4 * This code is derived from software contributed to The DragonFly Project
5 * by Ilya Dryomov <idryomov@gmail.com>
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 */
34
35#include "hammer.h"
36
37static __inline int validate_zone(hammer_off_t data_offset);
38
39int
40hammer_ioc_dedup(hammer_transaction_t trans, hammer_inode_t ip,
41 struct hammer_ioc_dedup *dedup)
42{
43 struct hammer_cursor cursor1, cursor2;
44 int error;
45
46 /*
3045a179
MD
47 * Enforce hammer filesystem version requirements
48 */
49 if (trans->hmp->version < HAMMER_VOL_VERSION_FIVE) {
50 kprintf("hammer: Filesystem must be upgraded to v5 "
51 "before you can run dedup\n");
52 return (EOPNOTSUPP);
53 }
54
55 /*
bb29b5d8
MD
56 * Cursor1, return an error -> candidate goes to pass2 list
57 */
58 error = hammer_init_cursor(trans, &cursor1, NULL, NULL);
59 if (error)
60 goto done_cursor;
61 cursor1.key_beg = dedup->elm1;
62 cursor1.flags |= HAMMER_CURSOR_BACKEND;
63
64 error = hammer_btree_lookup(&cursor1);
65 if (error)
66 goto done_cursor;
67 error = hammer_btree_extract(&cursor1, HAMMER_CURSOR_GET_LEAF |
68 HAMMER_CURSOR_GET_DATA);
69 if (error)
70 goto done_cursor;
71
72 /*
73 * Cursor2, return an error -> candidate goes to pass2 list
74 */
75 error = hammer_init_cursor(trans, &cursor2, NULL, NULL);
76 if (error)
77 goto done_cursors;
78 cursor2.key_beg = dedup->elm2;
79 cursor2.flags |= HAMMER_CURSOR_BACKEND;
80
81 error = hammer_btree_lookup(&cursor2);
82 if (error)
83 goto done_cursors;
84 error = hammer_btree_extract(&cursor2, HAMMER_CURSOR_GET_LEAF |
85 HAMMER_CURSOR_GET_DATA);
86 if (error)
87 goto done_cursors;
88
89 /*
90 * Zone validation. We can't de-dup any of the other zones
91 * (BTREE or META) or bad things will happen.
92 *
93 * Return with error = 0, but set an INVALID_ZONE flag.
94 */
95 error = validate_zone(cursor1.leaf->data_offset) +
96 validate_zone(cursor2.leaf->data_offset);
97 if (error) {
98 dedup->head.flags |= HAMMER_IOC_DEDUP_INVALID_ZONE;
99 error = 0;
100 goto done_cursors;
101 }
102
103 /*
104 * Comparison checks
105 *
106 * If zones don't match or data_len fields aren't the same
107 * we consider it to be a comparison failure.
108 *
109 * Return with error = 0, but set a CMP_FAILURE flag.
110 */
111 if ((cursor1.leaf->data_offset & HAMMER_OFF_ZONE_MASK) !=
112 (cursor2.leaf->data_offset & HAMMER_OFF_ZONE_MASK)) {
113 dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE;
114 goto done_cursors;
115 }
116 if (cursor1.leaf->data_len != cursor2.leaf->data_len) {
117 dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE;
118 goto done_cursors;
119 }
120
121 /* byte-by-byte comparison to be sure */
122 if (bcmp(cursor1.data, cursor2.data, cursor1.leaf->data_len)) {
123 dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE;
124 goto done_cursors;
125 }
126
127 /*
128 * Upgrade both cursors together to an exclusive lock
129 *
130 * Return an error -> candidate goes to pass2 list
131 */
132 hammer_sync_lock_sh(trans);
133 error = hammer_cursor_upgrade2(&cursor1, &cursor2);
134 if (error) {
135 hammer_sync_unlock(trans);
136 goto done_cursors;
137 }
138
139 error = hammer_blockmap_dedup(cursor1.trans,
140 cursor1.leaf->data_offset, cursor1.leaf->data_len);
141 if (error) {
142 if (error == ERANGE) {
143 /*
144 * Return with error = 0, but set an UNDERFLOW flag
145 */
146 dedup->head.flags |= HAMMER_IOC_DEDUP_UNDERFLOW;
147 error = 0;
148 goto downgrade_cursors;
149 } else {
150 /*
151 * Return an error -> block goes to pass2 list
152 */
153 goto downgrade_cursors;
154 }
155 }
156
157 /*
158 * The cursor2's cache must be invalidated before calling
159 * hammer_blockmap_free(), otherwise it will not be able to
160 * invalidate the underlying data buffer.
161 */
162 hammer_cursor_invalidate_cache(&cursor2);
163 hammer_blockmap_free(cursor2.trans,
164 cursor2.leaf->data_offset, cursor2.leaf->data_len);
165
166 hammer_modify_node(cursor2.trans, cursor2.node,
167 &cursor2.leaf->data_offset, sizeof(hammer_off_t));
168 cursor2.leaf->data_offset = cursor1.leaf->data_offset;
169 hammer_modify_node_done(cursor2.node);
170
171downgrade_cursors:
172 hammer_cursor_downgrade2(&cursor1, &cursor2);
173 hammer_sync_unlock(trans);
174done_cursors:
175 hammer_done_cursor(&cursor2);
176done_cursor:
177 hammer_done_cursor(&cursor1);
178 return (error);
179}
180
181static __inline int
182validate_zone(hammer_off_t data_offset)
183{
184 switch(data_offset & HAMMER_OFF_ZONE_MASK) {
185 case HAMMER_ZONE_LARGE_DATA:
186 case HAMMER_ZONE_SMALL_DATA:
187 return (0);
188 default:
189 return (1);
190 }
191}
507df98a
ID
192
193/************************************************************************
194 * LIVE DEDUP *
195 ************************************************************************
196 *
197 * HAMMER Live Dedup (aka as efficient cp(1) implementation)
198 *
199 * The dedup cache is operated in a LRU fashion and destroyed on
200 * unmount, so essentially this is a live dedup on a cached dataset and
201 * not a full-fledged fs-wide one - we have a batched dedup for that.
202 * We allow duplicate entries in the buffer cache, data blocks are
203 * deduplicated only on their way to media. By default the cache is
204 * populated on reads only, but it can be populated on writes too.
205 *
206 * The main implementation gotcha is on-media requirement - in order for
207 * a data block to be added to a dedup cache it has to be present on
208 * disk. This simplifies cache logic a lot - once data is laid out on
209 * media it remains valid on media all the way up to the point where the
210 * related big block the data was stored in is freed - so there is only
211 * one place we need to bother with invalidation code.
212 */
213
214/*
215 * RB-Tree support for dedup cache structures
216 */
217RB_GENERATE2(hammer_dedup_crc_rb_tree, hammer_dedup_cache, crc_entry,
218 hammer_dedup_crc_rb_compare, hammer_crc_t, crc);
219RB_GENERATE2(hammer_dedup_off_rb_tree, hammer_dedup_cache, off_entry,
220 hammer_dedup_off_rb_compare, hammer_off_t, data_offset);
221
222struct hammer_dedup_inval {
223 struct hammer_mount *hmp;
224 hammer_off_t base_offset;
225};
226
227static int hammer_dedup_scancmp(hammer_dedup_cache_t dc, void *data);
228static int hammer_dedup_cache_inval_callback(hammer_dedup_cache_t dc,
229 void *data);
230static __inline int _vnode_validate(hammer_dedup_cache_t dcp, void *data,
231 int *errorp);
232static __inline int _dev_validate(hammer_dedup_cache_t dcp, void *data,
233 int *errorp);
234
235int
236hammer_dedup_crc_rb_compare(hammer_dedup_cache_t dc1,
237 hammer_dedup_cache_t dc2)
238{
239 if (dc1->crc < dc2->crc)
240 return (-1);
241 if (dc1->crc > dc2->crc)
242 return (1);
243
244 return (0);
245}
246
247int
248hammer_dedup_off_rb_compare(hammer_dedup_cache_t dc1,
249 hammer_dedup_cache_t dc2)
250{
251 if (dc1->data_offset < dc2->data_offset)
252 return (-1);
253 if (dc1->data_offset > dc2->data_offset)
254 return (1);
255
256 return (0);
257}
258
259static int
260hammer_dedup_scancmp(hammer_dedup_cache_t dc, void *data)
261{
262 hammer_off_t off = ((struct hammer_dedup_inval *)data)->base_offset;
263
264 if (dc->data_offset < off)
265 return (-1);
266 if (dc->data_offset >= off + HAMMER_LARGEBLOCK_SIZE)
267 return (1);
268
269 return (0);
270}
271
272static int
273hammer_dedup_cache_inval_callback(hammer_dedup_cache_t dc, void *data)
274{
275 hammer_mount_t hmp = ((struct hammer_dedup_inval *)data)->hmp;
276
277 RB_REMOVE(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dc);
278 RB_REMOVE(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dc);
279 TAILQ_REMOVE(&hmp->dedup_lru_list, dc, lru_entry);
280
281 --hmp->dedup_cache_count;
282 kfree(dc, hmp->m_misc);
283
284 return (0);
285}
286
287hammer_dedup_cache_t
288hammer_dedup_cache_add(hammer_inode_t ip, hammer_btree_leaf_elm_t leaf)
289{
290 hammer_dedup_cache_t dcp, tmp;
291 hammer_mount_t hmp = ip->hmp;
292
293 if (hmp->dedup_free_cache == NULL) {
294 tmp = kmalloc(sizeof(*tmp), hmp->m_misc, M_WAITOK | M_ZERO);
295 if (hmp->dedup_free_cache == NULL)
296 hmp->dedup_free_cache = tmp;
297 else
298 kfree(tmp, hmp->m_misc);
299 }
300
301 KKASSERT(leaf != NULL);
302
303 dcp = RB_LOOKUP(hammer_dedup_crc_rb_tree,
304 &hmp->rb_dedup_crc_root, leaf->data_crc);
305 if (dcp != NULL) {
306 RB_REMOVE(hammer_dedup_off_rb_tree,
307 &hmp->rb_dedup_off_root, dcp);
308 TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry);
309 goto populate;
310 }
311
312 if (hmp->dedup_cache_count < DEDUP_CACHE_SIZE) {
313 dcp = hmp->dedup_free_cache;
314 hmp->dedup_free_cache = NULL;
315 ++hmp->dedup_cache_count;
316 } else {
317 dcp = TAILQ_FIRST(&hmp->dedup_lru_list);
318 RB_REMOVE(hammer_dedup_crc_rb_tree,
319 &hmp->rb_dedup_crc_root, dcp);
320 RB_REMOVE(hammer_dedup_off_rb_tree,
321 &hmp->rb_dedup_off_root, dcp);
322 TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry);
323 }
324
325 dcp->crc = leaf->data_crc;
326 tmp = RB_INSERT(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dcp);
327 KKASSERT(tmp == NULL);
328
329populate:
330 dcp->hmp = ip->hmp;
331 dcp->obj_id = ip->obj_id;
332 dcp->localization = ip->obj_localization;
333 dcp->file_offset = leaf->base.key - leaf->data_len;
334 dcp->bytes = leaf->data_len;
335 dcp->data_offset = leaf->data_offset;
336
337 tmp = RB_INSERT(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dcp);
338 KKASSERT(tmp == NULL);
339 TAILQ_INSERT_TAIL(&hmp->dedup_lru_list, dcp, lru_entry);
340
341 return (dcp);
342}
343
344__inline hammer_dedup_cache_t
345hammer_dedup_cache_lookup(hammer_mount_t hmp, hammer_crc_t crc)
346{
347 hammer_dedup_cache_t dcp;
348
349 dcp = RB_LOOKUP(hammer_dedup_crc_rb_tree,
350 &hmp->rb_dedup_crc_root, crc);
351 return dcp;
352}
353
354void hammer_dedup_cache_inval(hammer_mount_t hmp, hammer_off_t base_offset)
355{
356 struct hammer_dedup_inval di;
357
358 di.hmp = hmp;
359 di.base_offset = base_offset;
360
361 RB_SCAN(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root,
362 hammer_dedup_scancmp, hammer_dedup_cache_inval_callback, &di);
363}
364
365void
366hammer_destroy_dedup_cache(hammer_mount_t hmp)
367{
368 hammer_dedup_cache_t dcp;
369
370 while ((dcp = TAILQ_FIRST(&hmp->dedup_lru_list)) != NULL) {
371 RB_REMOVE(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dcp);
372 RB_REMOVE(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dcp);
373 TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry);
374 --hmp->dedup_cache_count;
375 kfree(dcp, hmp->m_misc);
376 }
377
378 KKASSERT(RB_EMPTY(&hmp->rb_dedup_crc_root));
379 KKASSERT(RB_EMPTY(&hmp->rb_dedup_off_root));
380 KKASSERT(TAILQ_EMPTY(&hmp->dedup_lru_list));
381
382 KKASSERT(hmp->dedup_cache_count == 0);
383}
384
385int
386hammer_dedup_validate(hammer_dedup_cache_t dcp, int zone, int bytes,
387 void *data)
388{
389 int error;
390
391 /*
392 * Zone validation
393 */
394 if (HAMMER_ZONE_DECODE(dcp->data_offset) != zone)
395 return (0);
396
397 /*
398 * Block length validation
399 */
400 if (dcp->bytes != bytes)
401 return (0);
402
403 /*
404 * Byte-by-byte data comparison
405 *
406 * The data we need for validation may already be present in the
407 * buffer cache in two flavours: vnode-based buffer or
408 * block-device-based buffer. In case vnode-based buffer wasn't
409 * there or if a non-blocking attempt to acquire it failed use
410 * device-based buffer (for large-zone data blocks it will
411 * generate a separate read).
412 */
413 if (_vnode_validate(dcp, data, &error)) {
414 hammer_live_dedup_vnode_bcmps++;
415 return (1);
416 } else {
417 if (error == 3)
418 hammer_live_dedup_findblk_failures++;
419 }
420
421 if (error) { /* yes, if there was an error previously */
422 if (_dev_validate(dcp, data, &error)) {
423 hammer_live_dedup_device_bcmps++;
424 return (1);
425 }
426 }
427
428 return (0);
429}
430
431static __inline int
432_vnode_validate(hammer_dedup_cache_t dcp, void *data, int *errorp)
433{
434 struct hammer_transaction trans;
435 hammer_inode_t ip;
436 struct vnode *vp;
437 struct buf *bp;
438 off_t dooffset;
439 int result, error;
440
441 result = error = 0;
442 *errorp = 0;
443
444 hammer_simple_transaction(&trans, dcp->hmp);
445
446 ip = hammer_get_inode(&trans, NULL, dcp->obj_id, HAMMER_MAX_TID,
447 dcp->localization, 0, &error);
448 if (ip == NULL) {
449 kprintf("dedup: unable to find objid %016jx:%08x\n",
450 (intmax_t)dcp->obj_id, dcp->localization);
451 *errorp = 1;
452 goto failed2;
453 }
454
455 error = hammer_get_vnode(ip, &vp);
456 if (error) {
457 kprintf("dedup: unable to acquire vnode for %016jx:%08x\n",
458 (intmax_t)dcp->obj_id, dcp->localization);
459 *errorp = 2;
460 goto failed;
461 }
462
463 if ((bp = findblk(ip->vp, dcp->file_offset, FINDBLK_NBLOCK)) != NULL) {
464 bremfree(bp);
465
466 /* XXX if (mapped to userspace) goto done, *errorp = 4 */
467
468 if ((bp->b_flags & B_CACHE) == 0 || bp->b_flags & B_DIRTY) {
469 *errorp = 5;
470 goto done;
471 }
472
473 if (bp->b_bio2.bio_offset != dcp->data_offset) {
474 error = VOP_BMAP(ip->vp, dcp->file_offset, &dooffset,
475 NULL, NULL, BUF_CMD_READ);
476 if (error) {
477 *errorp = 6;
478 goto done;
479 }
480
481 if (dooffset != dcp->data_offset) {
482 *errorp = 7;
483 goto done;
484 }
485 hammer_live_dedup_bmap_saves++;
486 }
487
488 if (bcmp(data, bp->b_data, dcp->bytes) == 0)
489 result = 1;
490
491done:
492 bqrelse(bp);
493 } else {
494 *errorp = 3;
495 }
496 vput(vp);
497
498failed:
499 hammer_rel_inode(ip, 0);
500failed2:
501 hammer_done_transaction(&trans);
502 return (result);
503}
504
505static __inline int
506_dev_validate(hammer_dedup_cache_t dcp, void *data, int *errorp)
507{
508 hammer_buffer_t buffer = NULL;
509 void *ondisk_data;
510 int result, error;
511
512 result = error = 0;
513 *errorp = 0;
514
515 ondisk_data = hammer_bread_ext(dcp->hmp, dcp->data_offset,
516 dcp->bytes, &error, &buffer);
517 if (error) {
518 *errorp = 1;
519 goto failed;
520 }
521
522 if (bcmp(data, ondisk_data, dcp->bytes) == 0)
523 result = 1;
524
525failed:
526 if (buffer)
527 hammer_rel_buffer(buffer, 0);
528
529 return (result);
530}