Initial import of binutils 2.22 on the new vendor branch
[dragonfly.git] / sys / vfs / hammer / hammer_dedup.c
1 /*
2  * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Ilya Dryomov <idryomov@gmail.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34
35 #include "hammer.h"
36
37 static __inline int validate_zone(hammer_off_t data_offset);
38
/*
 * Batched dedup ioctl backend.
 *
 * Given two B-Tree element keys (dedup->elm1 and dedup->elm2) that the
 * userland dedup utility believes reference identical data, verify the
 * claim and, if it holds, make elm2's leaf point at elm1's data block,
 * bumping elm1's block reference count and freeing elm2's old block.
 *
 * Returns 0 on success or "soft" failure (soft failures are reported to
 * userland via dedup->head.flags: INVALID_ZONE, CMP_FAILURE, UNDERFLOW).
 * A non-zero return tells the caller to queue the candidate for pass2.
 */
int
hammer_ioc_dedup(hammer_transaction_t trans, hammer_inode_t ip,
		 struct hammer_ioc_dedup *dedup)
{
	struct hammer_cursor cursor1, cursor2;
	int error;

	/*
	 * Enforce hammer filesystem version requirements
	 */
	if (trans->hmp->version < HAMMER_VOL_VERSION_FIVE) {
		kprintf("hammer: Filesystem must be upgraded to v5 "
			"before you can run dedup\n");
		return (EOPNOTSUPP);
	}

	/*
	 * Cursor1, return an error -> candidate goes to pass2 list
	 */
	error = hammer_init_cursor(trans, &cursor1, NULL, NULL);
	if (error)
		goto done_cursor;
	cursor1.key_beg = dedup->elm1;
	cursor1.flags |= HAMMER_CURSOR_BACKEND;

	error = hammer_btree_lookup(&cursor1);
	if (error)
		goto done_cursor;
	error = hammer_btree_extract(&cursor1, HAMMER_CURSOR_GET_LEAF |
						HAMMER_CURSOR_GET_DATA);
	if (error)
		goto done_cursor;

	/*
	 * Cursor2, return an error -> candidate goes to pass2 list
	 */
	error = hammer_init_cursor(trans, &cursor2, NULL, NULL);
	if (error)
		goto done_cursors;
	cursor2.key_beg = dedup->elm2;
	cursor2.flags |= HAMMER_CURSOR_BACKEND;

	error = hammer_btree_lookup(&cursor2);
	if (error)
		goto done_cursors;
	error = hammer_btree_extract(&cursor2, HAMMER_CURSOR_GET_LEAF |
						HAMMER_CURSOR_GET_DATA);
	if (error)
		goto done_cursors;

	/*
	 * Zone validation. We can't de-dup any of the other zones
	 * (BTREE or META) or bad things will happen.
	 *
	 * Return with error = 0, but set an INVALID_ZONE flag.
	 */
	error = validate_zone(cursor1.leaf->data_offset) +
			    validate_zone(cursor2.leaf->data_offset);
	if (error) {
		dedup->head.flags |= HAMMER_IOC_DEDUP_INVALID_ZONE;
		error = 0;
		goto done_cursors;
	}

	/*
	 * Comparison checks
	 *
	 * If zones don't match or data_len fields aren't the same
	 * we consider it to be a comparison failure.
	 *
	 * Return with error = 0, but set a CMP_FAILURE flag.
	 */
	if ((cursor1.leaf->data_offset & HAMMER_OFF_ZONE_MASK) !=
	    (cursor2.leaf->data_offset & HAMMER_OFF_ZONE_MASK)) {
		dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE;
		goto done_cursors;
	}
	if (cursor1.leaf->data_len != cursor2.leaf->data_len) {
		dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE;
		goto done_cursors;
	}

	/* byte-by-byte comparison to be sure */
	if (bcmp(cursor1.data, cursor2.data, cursor1.leaf->data_len)) {
		dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE;
		goto done_cursors;
	}

	/*
	 * Upgrade both cursors together to an exclusive lock
	 *
	 * Return an error -> candidate goes to pass2 list
	 */
	hammer_sync_lock_sh(trans);
	error = hammer_cursor_upgrade2(&cursor1, &cursor2);
	if (error) {
		hammer_sync_unlock(trans);
		goto done_cursors;
	}

	/*
	 * Bump the dedup (reference) count on elm1's data block.  ERANGE
	 * means the on-media reference counter would underflow/overflow.
	 */
	error = hammer_blockmap_dedup(cursor1.trans,
			cursor1.leaf->data_offset, cursor1.leaf->data_len);
	if (error) {
		if (error == ERANGE) {
			/*
			 * Return with error = 0, but set an UNDERFLOW flag
			 */
			dedup->head.flags |= HAMMER_IOC_DEDUP_UNDERFLOW;
			error = 0;
			goto downgrade_cursors;
		} else {
			/*
			 * Return an error -> block goes to pass2 list
			 */
			goto downgrade_cursors;
		}
	}

	/*
	 * The cursor2's cache must be invalidated before calling
	 * hammer_blockmap_free(), otherwise it will not be able to
	 * invalidate the underlying data buffer.
	 */
	hammer_cursor_invalidate_cache(&cursor2);
	hammer_blockmap_free(cursor2.trans,
			cursor2.leaf->data_offset, cursor2.leaf->data_len);

	/*
	 * Retarget elm2's leaf at elm1's data block.  Only the
	 * data_offset field of the leaf is modified on media.
	 */
	hammer_modify_node(cursor2.trans, cursor2.node,
			&cursor2.leaf->data_offset, sizeof(hammer_off_t));
	cursor2.leaf->data_offset = cursor1.leaf->data_offset;
	hammer_modify_node_done(cursor2.node);

downgrade_cursors:
	hammer_cursor_downgrade2(&cursor1, &cursor2);
	hammer_sync_unlock(trans);
done_cursors:
	hammer_done_cursor(&cursor2);
done_cursor:
	hammer_done_cursor(&cursor1);
	return (error);
}
180
181 static __inline int
182 validate_zone(hammer_off_t data_offset)
183 {
184         switch(data_offset & HAMMER_OFF_ZONE_MASK) {
185         case HAMMER_ZONE_LARGE_DATA:
186         case HAMMER_ZONE_SMALL_DATA:
187                 return (0);
188         default:
189                 return (1);
190         }
191 }
192
193 /************************************************************************
194  *                              LIVE DEDUP                              *
195  ************************************************************************
196  *
197  * HAMMER Live Dedup (aka as efficient cp(1) implementation)
198  *
199  * The dedup cache is operated in a LRU fashion and destroyed on
200  * unmount, so essentially this is a live dedup on a cached dataset and
201  * not a full-fledged fs-wide one - we have a batched dedup for that.
202  * We allow duplicate entries in the buffer cache, data blocks are
203  * deduplicated only on their way to media.  By default the cache is
204  * populated on reads only, but it can be populated on writes too.
205  *
206  * The main implementation gotcha is on-media requirement - in order for
207  * a data block to be added to a dedup cache it has to be present on
208  * disk.  This simplifies cache logic a lot - once data is laid out on
209  * media it remains valid on media all the way up to the point where the
210  * related big block the data was stored in is freed - so there is only
211  * one place we need to bother with invalidation code.
212  */
213
214 /*
215  * RB-Tree support for dedup cache structures
216  */
217 RB_GENERATE2(hammer_dedup_crc_rb_tree, hammer_dedup_cache, crc_entry,
218                 hammer_dedup_crc_rb_compare, hammer_crc_t, crc);
219 RB_GENERATE2(hammer_dedup_off_rb_tree, hammer_dedup_cache, off_entry,
220                 hammer_dedup_off_rb_compare, hammer_off_t, data_offset);
221
/*
 * Scan argument bundle for the dedup-cache invalidation RB_SCAN below.
 */
struct hammer_dedup_inval {
	struct hammer_mount *hmp;	/* mount whose dedup cache is scanned */
	hammer_off_t base_offset;	/* base of the big block being freed */
};
226
227 static int hammer_dedup_scancmp(hammer_dedup_cache_t dc, void *data);
228 static int hammer_dedup_cache_inval_callback(hammer_dedup_cache_t dc,
229                 void *data);
230 static __inline int _vnode_validate(hammer_dedup_cache_t dcp, void *data,
231                 int *errorp);
232 static __inline int _dev_validate(hammer_dedup_cache_t dcp, void *data,
233                 int *errorp);
234
235 int
236 hammer_dedup_crc_rb_compare(hammer_dedup_cache_t dc1,
237                                 hammer_dedup_cache_t dc2)
238 {
239         if (dc1->crc < dc2->crc)
240                 return (-1);
241         if (dc1->crc > dc2->crc)
242                 return (1);
243
244         return (0);
245 }
246
247 int
248 hammer_dedup_off_rb_compare(hammer_dedup_cache_t dc1,
249                                 hammer_dedup_cache_t dc2)
250 {
251         if (dc1->data_offset < dc2->data_offset)
252                 return (-1);
253         if (dc1->data_offset > dc2->data_offset)
254                 return (1);
255
256         return (0);
257 }
258
259 static int
260 hammer_dedup_scancmp(hammer_dedup_cache_t dc, void *data)
261 {
262         hammer_off_t off = ((struct hammer_dedup_inval *)data)->base_offset;
263
264         if (dc->data_offset < off)
265                 return (-1);
266         if (dc->data_offset >= off + HAMMER_LARGEBLOCK_SIZE)
267                 return (1);
268
269         return (0);
270 }
271
/*
 * RB_SCAN callback: destroy one dedup cache entry selected by
 * hammer_dedup_scancmp().  The entry must be unlinked from both
 * RB trees and the LRU list before it can be freed.
 */
static int
hammer_dedup_cache_inval_callback(hammer_dedup_cache_t dc, void *data)
{
	hammer_mount_t hmp = ((struct hammer_dedup_inval *)data)->hmp;

	RB_REMOVE(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dc);
	RB_REMOVE(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dc);
	TAILQ_REMOVE(&hmp->dedup_lru_list, dc, lru_entry);

	--hmp->dedup_cache_count;
	kfree(dc, hmp->m_misc);

	return (0);
}
286
/*
 * Add (or refresh) a dedup cache entry for the data block described by
 * @leaf.  Returns the entry, which is always placed at the MRU end of
 * the LRU list.
 *
 * Entries are keyed by data CRC; a CRC hit re-targets the existing
 * entry at the new leaf rather than allocating a duplicate.
 */
hammer_dedup_cache_t
hammer_dedup_cache_add(hammer_inode_t ip, hammer_btree_leaf_elm_t leaf)
{
	hammer_dedup_cache_t dcp, tmp;
	hammer_mount_t hmp = ip->hmp;

	/*
	 * Pre-stock the single-entry free reserve so the allocation
	 * below never has to block at an inconvenient time.  kmalloc()
	 * may have blocked, so re-check the reserve afterwards and
	 * discard the extra allocation if somebody else refilled it.
	 */
	if (hmp->dedup_free_cache == NULL) {
		tmp = kmalloc(sizeof(*tmp), hmp->m_misc, M_WAITOK | M_ZERO);
		if (hmp->dedup_free_cache == NULL)
			hmp->dedup_free_cache = tmp;
		else
			kfree(tmp, hmp->m_misc);
	}

	KKASSERT(leaf != NULL);

	/*
	 * CRC hit: reuse the existing entry.  Pull it off the offset
	 * tree and LRU list; both are re-inserted at 'populate' with
	 * the new leaf's values.  The CRC tree linkage is unchanged.
	 */
	dcp = RB_LOOKUP(hammer_dedup_crc_rb_tree,
				&hmp->rb_dedup_crc_root, leaf->data_crc);
	if (dcp != NULL) {
		RB_REMOVE(hammer_dedup_off_rb_tree,
				&hmp->rb_dedup_off_root, dcp);
		TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry);
		goto populate;
	}

	/*
	 * New CRC: either consume the free reserve (cache not yet at
	 * capacity) or recycle the least-recently-used entry.
	 */
	if (hmp->dedup_cache_count < hammer_live_dedup_cache_size) {
		dcp = hmp->dedup_free_cache;
		hmp->dedup_free_cache = NULL;
		++hmp->dedup_cache_count;
	} else {
		dcp = TAILQ_FIRST(&hmp->dedup_lru_list);
		RB_REMOVE(hammer_dedup_crc_rb_tree,
				&hmp->rb_dedup_crc_root, dcp);
		RB_REMOVE(hammer_dedup_off_rb_tree,
				&hmp->rb_dedup_off_root, dcp);
		TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry);
	}

	dcp->crc = leaf->data_crc;
	tmp = RB_INSERT(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dcp);
	KKASSERT(tmp == NULL);

populate:
	dcp->hmp = ip->hmp;
	dcp->obj_id = ip->obj_id;
	dcp->localization = ip->obj_localization;
	/* leaf->base.key is the end of the record's logical range */
	dcp->file_offset = leaf->base.key - leaf->data_len;
	dcp->bytes = leaf->data_len;
	dcp->data_offset = leaf->data_offset;

	tmp = RB_INSERT(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dcp);
	KKASSERT(tmp == NULL);
	TAILQ_INSERT_TAIL(&hmp->dedup_lru_list, dcp, lru_entry);

	return (dcp);
}
343
344 __inline hammer_dedup_cache_t
345 hammer_dedup_cache_lookup(hammer_mount_t hmp, hammer_crc_t crc)
346 {
347         hammer_dedup_cache_t dcp;
348
349         dcp = RB_LOOKUP(hammer_dedup_crc_rb_tree,
350                                 &hmp->rb_dedup_crc_root, crc);
351         return dcp;
352 }
353
354 void hammer_dedup_cache_inval(hammer_mount_t hmp, hammer_off_t base_offset)
355 {
356         struct hammer_dedup_inval di;
357
358         di.hmp = hmp;
359         di.base_offset = base_offset;
360
361         RB_SCAN(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root,
362                 hammer_dedup_scancmp, hammer_dedup_cache_inval_callback, &di);
363 }
364
365 void
366 hammer_destroy_dedup_cache(hammer_mount_t hmp)
367 {
368         hammer_dedup_cache_t dcp;
369
370         while ((dcp = TAILQ_FIRST(&hmp->dedup_lru_list)) != NULL) {
371                 RB_REMOVE(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dcp);
372                 RB_REMOVE(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dcp);
373                 TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry);
374                 --hmp->dedup_cache_count;
375                 kfree(dcp, hmp->m_misc);
376         }
377
378         KKASSERT(RB_EMPTY(&hmp->rb_dedup_crc_root));
379         KKASSERT(RB_EMPTY(&hmp->rb_dedup_off_root));
380         KKASSERT(TAILQ_EMPTY(&hmp->dedup_lru_list));
381
382         KKASSERT(hmp->dedup_cache_count == 0);
383 }
384
385 int
386 hammer_dedup_validate(hammer_dedup_cache_t dcp, int zone, int bytes,
387                       void *data)
388 {
389         int error;
390
391         /*
392          * Zone validation
393          */
394         if (HAMMER_ZONE_DECODE(dcp->data_offset) != zone)
395                 return (0);
396
397         /*
398          * Block length validation
399          */
400         if (dcp->bytes != bytes)
401                 return (0);
402
403         /*
404          * Byte-by-byte data comparison
405          *
406          * The data we need for validation may already be present in the
407          * buffer cache in two flavours: vnode-based buffer or
408          * block-device-based buffer.  In case vnode-based buffer wasn't
409          * there or if a non-blocking attempt to acquire it failed use
410          * device-based buffer (for large-zone data blocks it will
411          * generate a separate read).
412          *
413          * XXX vnode-based checking is not MP safe so when live-dedup
414          *     is turned on we must always use the device buffer.
415          */
416 #if 0
417         if (hammer_double_buffer) {
418                 error = 1;
419         } else if (_vnode_validate(dcp, data, &error)) {
420                 hammer_live_dedup_vnode_bcmps++;
421                 return (1);
422         } else {
423                 if (error == 3)
424                         hammer_live_dedup_findblk_failures++;
425         }
426
427         /*
428          * If there was an error previously or if double buffering is
429          * enabled.
430          */
431         if (error) {
432                 if (_dev_validate(dcp, data, &error)) {
433                         hammer_live_dedup_device_bcmps++;
434                         return (1);
435                 }
436         }
437 #endif
438         if (_dev_validate(dcp, data, &error)) {
439                 hammer_live_dedup_device_bcmps++;
440                 return (1);
441         }
442
443         return (0);
444 }
445
/*
 * Validate @dcp's cached data against @data using the vnode-based
 * buffer cache (non-blocking).  Returns 1 on a confirmed byte-for-byte
 * match, 0 otherwise; *errorp encodes why validation could not be
 * performed:
 *
 *	1 - inode lookup failed
 *	2 - vnode could not be acquired
 *	3 - no vnode buffer present (findblk miss / would block)
 *	5 - buffer not cached or dirty (contents untrustworthy)
 *	6 - VOP_BMAP failed
 *	7 - buffer maps a different media offset than the cache entry
 *
 * NOTE: currently unused - the caller's vnode path was disabled as
 * not MP safe; kept for reference.
 */
static __inline int
_vnode_validate(hammer_dedup_cache_t dcp, void *data, int *errorp)
{
	struct hammer_transaction trans;
	hammer_inode_t ip;
	struct vnode *vp;
	struct buf *bp;
	off_t dooffset;
	int result, error;

	result = error = 0;
	*errorp = 0;

	hammer_simple_transaction(&trans, dcp->hmp);

	ip = hammer_get_inode(&trans, NULL, dcp->obj_id, HAMMER_MAX_TID,
	    dcp->localization, 0, &error);
	if (ip == NULL) {
		kprintf("dedup: unable to find objid %016jx:%08x\n",
		    (intmax_t)dcp->obj_id, dcp->localization);
		*errorp = 1;
		goto failed2;
	}

	error = hammer_get_vnode(ip, &vp);
	if (error) {
		kprintf("dedup: unable to acquire vnode for %016jx:%08x\n",
		    (intmax_t)dcp->obj_id, dcp->localization);
		*errorp = 2;
		goto failed;
	}

	/* Non-blocking lookup; a miss sends the caller to the device path */
	if ((bp = findblk(ip->vp, dcp->file_offset, FINDBLK_NBLOCK)) != NULL) {
		bremfree(bp);

		/* XXX if (mapped to userspace) goto done, *errorp = 4 */

		if ((bp->b_flags & B_CACHE) == 0 || bp->b_flags & B_DIRTY) {
			*errorp = 5;
			goto done;
		}

		/*
		 * The buffer must be backed by the same media offset the
		 * cache entry references; fall back to VOP_BMAP when the
		 * cached bio translation is not already set up.
		 */
		if (bp->b_bio2.bio_offset != dcp->data_offset) {
			error = VOP_BMAP(ip->vp, dcp->file_offset, &dooffset,
			    NULL, NULL, BUF_CMD_READ);
			if (error) {
				*errorp = 6;
				goto done;
			}

			if (dooffset != dcp->data_offset) {
				*errorp = 7;
				goto done;
			}
			hammer_live_dedup_bmap_saves++;
		}

		if (bcmp(data, bp->b_data, dcp->bytes) == 0)
			result = 1;

done:
		bqrelse(bp);
	} else {
		*errorp = 3;
	}
	vput(vp);

failed:
	hammer_rel_inode(ip, 0);
failed2:
	hammer_done_transaction(&trans);
	return (result);
}
519
520 static __inline int
521 _dev_validate(hammer_dedup_cache_t dcp, void *data, int *errorp)
522 {
523         hammer_buffer_t buffer = NULL;
524         void *ondisk_data;
525         int result, error;
526
527         result = error = 0;
528         *errorp = 0;
529
530         ondisk_data = hammer_bread_ext(dcp->hmp, dcp->data_offset,
531             dcp->bytes, &error, &buffer);
532         if (error) {
533                 *errorp = 1;
534                 goto failed;
535         }
536
537         if (bcmp(data, ondisk_data, dcp->bytes) == 0)
538                 result = 1;
539
540 failed:
541         if (buffer)
542                 hammer_rel_buffer(buffer, 0);
543
544         return (result);
545 }