f4385a368955b876036e6d8ff335632ac1aa68a5
[dragonfly.git] / sys / vfs / hammer / hammer_dedup.c
1 /*
2  * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Ilya Dryomov <idryomov@gmail.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34
35 #include "hammer.h"
36
37 static __inline int validate_zone(hammer_off_t data_offset);
38
/*
 * Backend for the batched (offline) dedup ioctl.
 *
 * Looks up the two B-Tree elements given in dedup->elm1 and
 * dedup->elm2, verifies that they reference byte-identical data
 * blocks in a dedup-able zone, then rewrites elm2's leaf to point at
 * elm1's data: hammer_blockmap_dedup() accounts the extra reference
 * on elm1's block and hammer_blockmap_free() releases elm2's block.
 *
 * Returns 0 on success and on "soft" failures, in which case one of
 * the INVALID_ZONE / CMP_FAILURE / UNDERFLOW flags is set in
 * dedup->head.flags.  A non-zero return tells the caller to queue the
 * candidate for a second pass.
 */
int
hammer_ioc_dedup(hammer_transaction_t trans, hammer_inode_t ip,
		 struct hammer_ioc_dedup *dedup)
{
	struct hammer_cursor cursor1, cursor2;
	int error;
	int seq;

	/*
	 * Enforce hammer filesystem version requirements
	 */
	if (trans->hmp->version < HAMMER_VOL_VERSION_FIVE) {
		kprintf("hammer: Filesystem must be upgraded to v5 "
			"before you can run dedup\n");
		return (EOPNOTSUPP);
	}

	/*
	 * Cursor1, return an error -> candidate goes to pass2 list
	 */
	error = hammer_init_cursor(trans, &cursor1, NULL, NULL);
	if (error)
		goto done_cursor;
	cursor1.key_beg = dedup->elm1;
	cursor1.flags |= HAMMER_CURSOR_BACKEND;

	error = hammer_btree_lookup(&cursor1);
	if (error)
		goto done_cursor;
	error = hammer_btree_extract(&cursor1, HAMMER_CURSOR_GET_LEAF |
						HAMMER_CURSOR_GET_DATA);
	if (error)
		goto done_cursor;

	/*
	 * Cursor2, return an error -> candidate goes to pass2 list
	 */
	error = hammer_init_cursor(trans, &cursor2, NULL, NULL);
	if (error)
		goto done_cursors;
	cursor2.key_beg = dedup->elm2;
	cursor2.flags |= HAMMER_CURSOR_BACKEND;

	error = hammer_btree_lookup(&cursor2);
	if (error)
		goto done_cursors;
	error = hammer_btree_extract(&cursor2, HAMMER_CURSOR_GET_LEAF |
						HAMMER_CURSOR_GET_DATA);
	if (error)
		goto done_cursors;

	/*
	 * Zone validation. We can't de-dup any of the other zones
	 * (BTREE or META) or bad things will happen.
	 *
	 * Return with error = 0, but set an INVALID_ZONE flag.
	 */
	error = validate_zone(cursor1.leaf->data_offset) +
			    validate_zone(cursor2.leaf->data_offset);
	if (error) {
		dedup->head.flags |= HAMMER_IOC_DEDUP_INVALID_ZONE;
		error = 0;
		goto done_cursors;
	}

	/*
	 * Comparison checks
	 *
	 * If zones don't match or data_len fields aren't the same
	 * we consider it to be a comparison failure.
	 *
	 * Return with error = 0, but set a CMP_FAILURE flag.
	 */
	if ((cursor1.leaf->data_offset & HAMMER_OFF_ZONE_MASK) !=
	    (cursor2.leaf->data_offset & HAMMER_OFF_ZONE_MASK)) {
		dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE;
		goto done_cursors;
	}
	if (cursor1.leaf->data_len != cursor2.leaf->data_len) {
		dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE;
		goto done_cursors;
	}

	/* byte-by-byte comparison to be sure */
	if (bcmp(cursor1.data, cursor2.data, cursor1.leaf->data_len)) {
		dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE;
		goto done_cursors;
	}

	/*
	 * Upgrade both cursors together to an exclusive lock
	 *
	 * Return an error -> candidate goes to pass2 list
	 */
	hammer_sync_lock_sh(trans);
	error = hammer_cursor_upgrade2(&cursor1, &cursor2);
	if (error) {
		hammer_sync_unlock(trans);
		goto done_cursors;
	}

	/*
	 * Account the additional reference on elm1's data block.
	 * ERANGE means the reference count cannot be bumped further
	 * (underflow of the free-space accounting) -- a soft failure.
	 */
	error = hammer_blockmap_dedup(cursor1.trans,
			cursor1.leaf->data_offset, cursor1.leaf->data_len);
	if (error) {
		if (error == ERANGE) {
			/*
			 * Return with error = 0, but set an UNDERFLOW flag
			 */
			dedup->head.flags |= HAMMER_IOC_DEDUP_UNDERFLOW;
			error = 0;
			goto downgrade_cursors;
		} else {
			/*
			 * Return an error -> block goes to pass2 list
			 */
			goto downgrade_cursors;
		}
	}

	/*
	 * The cursor2's cache must be invalidated before calling
	 * hammer_blockmap_free(), otherwise it will not be able to
	 * invalidate the underlying data buffer.
	 */
	hammer_cursor_invalidate_cache(&cursor2);
	hammer_blockmap_free(cursor2.trans,
			cursor2.leaf->data_offset, cursor2.leaf->data_len);

	/*
	 * Redirect elm2's leaf at elm1's (now shared) data block.
	 */
	hammer_modify_node(cursor2.trans, cursor2.node,
			&cursor2.leaf->data_offset, sizeof(hammer_off_t));
	cursor2.leaf->data_offset = cursor1.leaf->data_offset;
	hammer_modify_node_done(cursor2.node);

downgrade_cursors:
	hammer_cursor_downgrade2(&cursor1, &cursor2);
	hammer_sync_unlock(trans);
done_cursors:
	hammer_done_cursor(&cursor2);
done_cursor:
	hammer_done_cursor(&cursor1);

	/*
	 * Avoid deadlocking the buffer cache
	 */
	seq = trans->hmp->flusher.done;
	while (hammer_flusher_meta_halflimit(trans->hmp) ||
	       hammer_flusher_undo_exhausted(trans, 2)) {
		hammer_flusher_wait(trans->hmp, seq);
		seq = hammer_flusher_async_one(trans->hmp);
	}
	return (error);
}
191
192 static __inline int
193 validate_zone(hammer_off_t data_offset)
194 {
195         switch(data_offset & HAMMER_OFF_ZONE_MASK) {
196         case HAMMER_ZONE_LARGE_DATA:
197         case HAMMER_ZONE_SMALL_DATA:
198                 return (0);
199         default:
200                 return (1);
201         }
202 }
203
204 /************************************************************************
205  *                              LIVE DEDUP                              *
206  ************************************************************************
207  *
 208  * HAMMER Live Dedup (aka an efficient cp(1) implementation)
209  *
 210  * The dedup cache is operated in an LRU fashion and destroyed on
211  * unmount, so essentially this is a live dedup on a cached dataset and
212  * not a full-fledged fs-wide one - we have a batched dedup for that.
213  * We allow duplicate entries in the buffer cache, data blocks are
214  * deduplicated only on their way to media.  By default the cache is
215  * populated on reads only, but it can be populated on writes too.
216  *
217  * The main implementation gotcha is on-media requirement - in order for
218  * a data block to be added to a dedup cache it has to be present on
219  * disk.  This simplifies cache logic a lot - once data is laid out on
220  * media it remains valid on media all the way up to the point where the
221  * related big-block the data was stored in is freed - so there is only
222  * one place we need to bother with invalidation code.
223  */
224
225 /*
226  * RB-Tree support for dedup cache structures
227  */
228 RB_GENERATE2(hammer_dedup_crc_rb_tree, hammer_dedup_cache, crc_entry,
229                 hammer_dedup_crc_rb_compare, hammer_crc_t, crc);
230 RB_GENERATE2(hammer_dedup_off_rb_tree, hammer_dedup_cache, off_entry,
231                 hammer_dedup_off_rb_compare, hammer_off_t, data_offset);
232
/*
 * Argument bundle threaded through the RB_SCAN callbacks when
 * invalidating every dedup cache entry residing in the big-block
 * that starts at base_offset.
 */
struct hammer_dedup_inval {
	struct hammer_mount *hmp;	/* mount owning the dedup caches */
	hammer_off_t base_offset;	/* base of the freed big-block */
};
237
238 static int hammer_dedup_scancmp(hammer_dedup_cache_t dc, void *data);
239 static int hammer_dedup_cache_inval_callback(hammer_dedup_cache_t dc,
240                 void *data);
241 static __inline int _vnode_validate(hammer_dedup_cache_t dcp, void *data,
242                 int *errorp);
243 static __inline int _dev_validate(hammer_dedup_cache_t dcp, void *data,
244                 int *errorp);
245
246 int
247 hammer_dedup_crc_rb_compare(hammer_dedup_cache_t dc1, hammer_dedup_cache_t dc2)
248 {
249         if (dc1->crc < dc2->crc)
250                 return (-1);
251         if (dc1->crc > dc2->crc)
252                 return (1);
253
254         return (0);
255 }
256
257 int
258 hammer_dedup_off_rb_compare(hammer_dedup_cache_t dc1, hammer_dedup_cache_t dc2)
259 {
260         if (dc1->data_offset < dc2->data_offset)
261                 return (-1);
262         if (dc1->data_offset > dc2->data_offset)
263                 return (1);
264
265         return (0);
266 }
267
268 static int
269 hammer_dedup_scancmp(hammer_dedup_cache_t dc, void *data)
270 {
271         hammer_off_t off = ((struct hammer_dedup_inval *)data)->base_offset;
272
273         if (dc->data_offset < off)
274                 return (-1);
275         if (dc->data_offset >= off + HAMMER_BIGBLOCK_SIZE)
276                 return (1);
277
278         return (0);
279 }
280
281 static int
282 hammer_dedup_cache_inval_callback(hammer_dedup_cache_t dc, void *data)
283 {
284         hammer_mount_t hmp = ((struct hammer_dedup_inval *)data)->hmp;
285
286         RB_REMOVE(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dc);
287         RB_REMOVE(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dc);
288         TAILQ_REMOVE(&hmp->dedup_lru_list, dc, lru_entry);
289
290         --hmp->dedup_cache_count;
291         kfree(dc, hmp->m_misc);
292
293         return (0);
294 }
295
/*
 * Insert (or refresh) a dedup cache entry describing the on-media
 * data block referenced by @leaf, belonging to inode @ip.  Returns
 * the populated entry.
 *
 * Entries are keyed on the data CRC.  If an entry with the same CRC
 * already exists it is recycled in place; otherwise a new entry is
 * consumed while the cache is below hammer_live_dedup_cache_size,
 * after which the least-recently-used entry is recycled.
 */
hammer_dedup_cache_t
hammer_dedup_cache_add(hammer_inode_t ip, hammer_btree_leaf_elm_t leaf)
{
	hammer_dedup_cache_t dcp, tmp;
	hammer_mount_t hmp = ip->hmp;

	/*
	 * Make sure a pre-allocated spare entry is available.
	 * kmalloc(M_WAITOK) can block, so dedup_free_cache is
	 * re-checked after it returns in case the spare was
	 * replenished while we slept; the extra allocation is then
	 * simply discarded.
	 */
	if (hmp->dedup_free_cache == NULL) {
		tmp = kmalloc(sizeof(*tmp), hmp->m_misc, M_WAITOK | M_ZERO);
		if (hmp->dedup_free_cache == NULL)
			hmp->dedup_free_cache = tmp;
		else
			kfree(tmp, hmp->m_misc);
	}

	KKASSERT(leaf != NULL);

	/*
	 * An entry with this CRC already exists: detach it from the
	 * offset index and the LRU; it is re-populated and re-linked
	 * below (its position in the CRC tree is unchanged).
	 */
	dcp = RB_LOOKUP(hammer_dedup_crc_rb_tree,
				&hmp->rb_dedup_crc_root, leaf->data_crc);
	if (dcp != NULL) {
		RB_REMOVE(hammer_dedup_off_rb_tree,
				&hmp->rb_dedup_off_root, dcp);
		TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry);
		goto populate;
	}

	/*
	 * New CRC: consume the spare while below the size limit,
	 * otherwise recycle the LRU entry (head of the list) after
	 * unhooking it from both indices.
	 */
	if (hmp->dedup_cache_count < hammer_live_dedup_cache_size) {
		dcp = hmp->dedup_free_cache;
		hmp->dedup_free_cache = NULL;
		++hmp->dedup_cache_count;
	} else {
		dcp = TAILQ_FIRST(&hmp->dedup_lru_list);
		RB_REMOVE(hammer_dedup_crc_rb_tree,
				&hmp->rb_dedup_crc_root, dcp);
		RB_REMOVE(hammer_dedup_off_rb_tree,
				&hmp->rb_dedup_off_root, dcp);
		TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry);
	}

	dcp->crc = leaf->data_crc;
	tmp = RB_INSERT(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dcp);
	KKASSERT(tmp == NULL);

populate:
	/*
	 * Fill in the location info and hook the entry back into the
	 * offset index and the MRU end (tail) of the LRU list.
	 */
	dcp->hmp = ip->hmp;
	dcp->obj_id = ip->obj_id;
	dcp->localization = ip->obj_localization;
	dcp->file_offset = leaf->base.key - leaf->data_len;
	dcp->bytes = leaf->data_len;
	dcp->data_offset = leaf->data_offset;

	tmp = RB_INSERT(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dcp);
	KKASSERT(tmp == NULL);
	TAILQ_INSERT_TAIL(&hmp->dedup_lru_list, dcp, lru_entry);

	return (dcp);
}
352
353 __inline hammer_dedup_cache_t
354 hammer_dedup_cache_lookup(hammer_mount_t hmp, hammer_crc_t crc)
355 {
356         hammer_dedup_cache_t dcp;
357
358         dcp = RB_LOOKUP(hammer_dedup_crc_rb_tree,
359                                 &hmp->rb_dedup_crc_root, crc);
360         return dcp;
361 }
362
363 void hammer_dedup_cache_inval(hammer_mount_t hmp, hammer_off_t base_offset)
364 {
365         struct hammer_dedup_inval di;
366
367         di.hmp = hmp;
368         di.base_offset = base_offset;
369
370         RB_SCAN(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root,
371                 hammer_dedup_scancmp, hammer_dedup_cache_inval_callback, &di);
372 }
373
374 void
375 hammer_destroy_dedup_cache(hammer_mount_t hmp)
376 {
377         hammer_dedup_cache_t dcp;
378
379         while ((dcp = TAILQ_FIRST(&hmp->dedup_lru_list)) != NULL) {
380                 RB_REMOVE(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dcp);
381                 RB_REMOVE(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dcp);
382                 TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry);
383                 --hmp->dedup_cache_count;
384                 kfree(dcp, hmp->m_misc);
385         }
386
387         KKASSERT(RB_EMPTY(&hmp->rb_dedup_crc_root));
388         KKASSERT(RB_EMPTY(&hmp->rb_dedup_off_root));
389         KKASSERT(TAILQ_EMPTY(&hmp->dedup_lru_list));
390
391         KKASSERT(hmp->dedup_cache_count == 0);
392 }
393
/*
 * Decide whether the cached entry @dcp still describes a data block
 * identical to the @bytes-long candidate block @data destined for
 * blockmap zone @zone.
 *
 * Returns 1 if the on-media bytes compare equal (the candidate may be
 * deduplicated against dcp->data_offset), 0 on any mismatch or read
 * error.
 */
int
hammer_dedup_validate(hammer_dedup_cache_t dcp, int zone, int bytes, void *data)
{
	int error;

	/*
	 * Zone validation
	 */
	if (HAMMER_ZONE_DECODE(dcp->data_offset) != zone)
		return (0);

	/*
	 * Block length validation
	 */
	if (dcp->bytes != bytes)
		return (0);

	/*
	 * Byte-by-byte data comparison
	 *
	 * The data we need for validation may already be present in the
	 * buffer cache in two flavours: vnode-based buffer or
	 * block-device-based buffer.  In case vnode-based buffer wasn't
	 * there or if a non-blocking attempt to acquire it failed use
	 * device-based buffer (for large-zone data blocks it will
	 * generate a separate read).
	 *
	 * XXX vnode-based checking is not MP safe so when live-dedup
	 *     is turned on we must always use the device buffer.
	 */
#if 0
	if (hammer_double_buffer) {
		error = 1;
	} else if (_vnode_validate(dcp, data, &error)) {
		hammer_live_dedup_vnode_bcmps++;
		return (1);
	} else {
		if (error == 3)
			hammer_live_dedup_findblk_failures++;
	}

	/*
	 * If there was an error previously or if double buffering is
	 * enabled.
	 */
	if (error) {
		if (_dev_validate(dcp, data, &error)) {
			hammer_live_dedup_device_bcmps++;
			return (1);
		}
	}
#endif
	/* see XXX above: only the device buffer path is used for now */
	if (_dev_validate(dcp, data, &error)) {
		hammer_live_dedup_device_bcmps++;
		return (1);
	}

	return (0);
}
453
/*
 * Attempt to validate @data against the copy held in the vnode-based
 * buffer cache for the file described by @dcp.
 *
 * Returns 1 if a matching, clean, fully-cached buffer was found and
 * compared equal; 0 otherwise.  *errorp receives a diagnostic code
 * explaining why validation could not be completed:
 *   1 - inode lookup failed
 *   2 - vnode could not be acquired
 *   3 - non-blocking findblk() found no buffer
 *   5 - buffer not fully cached, or dirty
 *   6 - VOP_BMAP() failed
 *   7 - buffer's media offset does not match the cached data_offset
 * (4 is reserved for a buffer mapped to userspace; see XXX below.)
 *
 * NOTE(review): currently compiled out by the caller (#if 0) -- not
 * MP safe; see the XXX in hammer_dedup_validate().
 */
static __inline int
_vnode_validate(hammer_dedup_cache_t dcp, void *data, int *errorp)
{
	struct hammer_transaction trans;
	hammer_inode_t ip;
	struct vnode *vp;
	struct buf *bp;
	off_t dooffset;
	int result, error;

	result = error = 0;
	*errorp = 0;

	hammer_simple_transaction(&trans, dcp->hmp);

	/* re-acquire the inode the cache entry was created from */
	ip = hammer_get_inode(&trans, NULL, dcp->obj_id, HAMMER_MAX_TID,
	    dcp->localization, 0, &error);
	if (ip == NULL) {
		kprintf("dedup: unable to find objid %016jx:%08x\n",
		    (intmax_t)dcp->obj_id, dcp->localization);
		*errorp = 1;
		goto failed2;
	}

	error = hammer_get_vnode(ip, &vp);
	if (error) {
		kprintf("dedup: unable to acquire vnode for %016jx:%08x\n",
		    (intmax_t)dcp->obj_id, dcp->localization);
		*errorp = 2;
		goto failed;
	}

	/* non-blocking lookup only: never stall the dedup path on a lock */
	if ((bp = findblk(ip->vp, dcp->file_offset, FINDBLK_NBLOCK)) != NULL) {
		bremfree(bp);

		/* XXX if (mapped to userspace) goto done, *errorp = 4 */

		if ((bp->b_flags & B_CACHE) == 0 || bp->b_flags & B_DIRTY) {
			*errorp = 5;
			goto done;
		}

		/*
		 * Verify the buffer really maps to the cached media
		 * offset; when bio2 doesn't carry it, translate via
		 * VOP_BMAP (saves a device read if it matches).
		 */
		if (bp->b_bio2.bio_offset != dcp->data_offset) {
			error = VOP_BMAP(ip->vp, dcp->file_offset, &dooffset,
			    NULL, NULL, BUF_CMD_READ);
			if (error) {
				*errorp = 6;
				goto done;
			}

			if (dooffset != dcp->data_offset) {
				*errorp = 7;
				goto done;
			}
			hammer_live_dedup_bmap_saves++;
		}

		if (bcmp(data, bp->b_data, dcp->bytes) == 0)
			result = 1;

done:
		bqrelse(bp);
	} else {
		*errorp = 3;
	}
	vput(vp);

failed:
	hammer_rel_inode(ip, 0);
failed2:
	hammer_done_transaction(&trans);
	return (result);
}
527
528 static __inline int
529 _dev_validate(hammer_dedup_cache_t dcp, void *data, int *errorp)
530 {
531         hammer_buffer_t buffer = NULL;
532         void *ondisk_data;
533         int result, error;
534
535         result = error = 0;
536         *errorp = 0;
537
538         ondisk_data = hammer_bread_ext(dcp->hmp, dcp->data_offset,
539             dcp->bytes, &error, &buffer);
540         if (error) {
541                 *errorp = 1;
542                 goto failed;
543         }
544
545         if (bcmp(data, ondisk_data, dcp->bytes) == 0)
546                 result = 1;
547
548 failed:
549         if (buffer)
550                 hammer_rel_buffer(buffer, 0);
551
552         return (result);
553 }