5b4d43d533e7aaee147d0c6fa92076b1a119c8a6
[dragonfly.git] / sbin / hammer / cmd_dedup.c
1 /*
2  * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Ilya Dryomov <idryomov@gmail.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34
35 #include "hammer.h"
36 #include <libutil.h>
37 #include <crypto/sha2/sha2.h>
38
39 #define DEDUP_BUF (64 * 1024)
40
41 /* Sorted list of block CRCs - light version for dedup-simulate */
42 struct sim_dedup_entry_rb_tree;
43 RB_HEAD(sim_dedup_entry_rb_tree, sim_dedup_entry) sim_dedup_tree =
44                                         RB_INITIALIZER(&sim_dedup_tree);
45 RB_PROTOTYPE2(sim_dedup_entry_rb_tree, sim_dedup_entry, rb_entry,
46                 rb_sim_dedup_entry_compare, hammer_crc_t);
47
48 struct sim_dedup_entry {
49         hammer_crc_t    crc;
50         u_int64_t       ref_blks; /* number of blocks referenced */
51         u_int64_t       ref_size; /* size of data referenced */
52         RB_ENTRY(sim_dedup_entry) rb_entry;
53 };
54
55 struct dedup_entry {
56         struct hammer_btree_leaf_elm leaf;
57         union {
58                 struct {
59                         u_int64_t ref_blks;
60                         u_int64_t ref_size;
61                 } de;
62                 RB_HEAD(sha_dedup_entry_rb_tree, sha_dedup_entry) fict_root;
63         } u;
64         u_int8_t flags;
65         RB_ENTRY(dedup_entry) rb_entry;
66 };
67
68 #define HAMMER_DEDUP_ENTRY_FICTITIOUS   0x0001
69
70 struct sha_dedup_entry {
71         struct hammer_btree_leaf_elm    leaf;
72         u_int64_t                       ref_blks;
73         u_int64_t                       ref_size;
74         u_int8_t                        sha_hash[SHA256_DIGEST_LENGTH];
75         RB_ENTRY(sha_dedup_entry)       fict_entry;
76 };
77
78 /* Sorted list of HAMMER B-Tree keys */
79 struct dedup_entry_rb_tree;
80 struct sha_dedup_entry_rb_tree;
81
82 RB_HEAD(dedup_entry_rb_tree, dedup_entry) dedup_tree =
83                                         RB_INITIALIZER(&dedup_tree);
84 RB_PROTOTYPE2(dedup_entry_rb_tree, dedup_entry, rb_entry,
85                 rb_dedup_entry_compare, hammer_crc_t);
86
87 RB_PROTOTYPE(sha_dedup_entry_rb_tree, sha_dedup_entry, fict_entry,
88                 rb_sha_dedup_entry_compare);
89
90 /*
91  * Pass2 list - contains entries that were not dedup'ed because ioctl failed
92  */
93 STAILQ_HEAD(, pass2_dedup_entry) pass2_dedup_queue =
94                                 STAILQ_HEAD_INITIALIZER(pass2_dedup_queue);
95
96 struct pass2_dedup_entry {
97         struct hammer_btree_leaf_elm    leaf;
98         STAILQ_ENTRY(pass2_dedup_entry) sq_entry;
99 };
100
101 #define DEDUP_PASS2     0x0001 /* process_btree_elm() mode */
102
103 static int SigInfoFlag;
104 static int64_t DedupDataReads;
105 static int64_t DedupCurrentRecords;
106 static int64_t DedupTotalRecords;
107 static u_int32_t DedupCrcStart;
108 static u_int32_t DedupCrcEnd;
109 static u_int64_t MemoryUse;
110
111 /* PFS global ids - we deal with just one PFS at a run */
112 int glob_fd;
113 struct hammer_ioc_pseudofs_rw glob_pfs;
114
115 /*
116  * Global accounting variables
117  *
118  * Last three don't have to be 64-bit, just to be safe..
119  */
120 u_int64_t dedup_alloc_size = 0;
121 u_int64_t dedup_ref_size = 0;
122 u_int64_t dedup_skipped_size = 0;
123 u_int64_t dedup_crc_failures = 0;
124 u_int64_t dedup_sha_failures = 0;
125 u_int64_t dedup_underflows = 0;
126 u_int64_t dedup_successes_count = 0;
127 u_int64_t dedup_successes_bytes = 0;
128
129 static int rb_sim_dedup_entry_compare(struct sim_dedup_entry *sim_de1,
130                                 struct sim_dedup_entry *sim_de2);
131 static int rb_dedup_entry_compare(struct dedup_entry *de1,
132                                 struct dedup_entry *de2);
133 static int rb_sha_dedup_entry_compare(struct sha_dedup_entry *sha_de1,
134                                 struct sha_dedup_entry *sha_de2);
135 typedef int (*scan_pfs_cb_t)(hammer_btree_leaf_elm_t scan_leaf, int flags);
136 static void scan_pfs(char *filesystem, scan_pfs_cb_t func, const char *id);
137 static int collect_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags);
138 static int count_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags);
139 static int process_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags);
140 static int upgrade_chksum(hammer_btree_leaf_elm_t leaf, u_int8_t *sha_hash);
141 static void dump_simulated_dedup(void);
142 static void dump_real_dedup(void);
143 static void dedup_usage(int code);
144
145 RB_GENERATE2(sim_dedup_entry_rb_tree, sim_dedup_entry, rb_entry,
146                 rb_sim_dedup_entry_compare, hammer_crc_t, crc);
147 RB_GENERATE2(dedup_entry_rb_tree, dedup_entry, rb_entry,
148                 rb_dedup_entry_compare, hammer_crc_t, leaf.data_crc);
149 RB_GENERATE(sha_dedup_entry_rb_tree, sha_dedup_entry, fict_entry,
150                 rb_sha_dedup_entry_compare);
151
152 static int
153 rb_sim_dedup_entry_compare(struct sim_dedup_entry *sim_de1,
154                         struct sim_dedup_entry *sim_de2)
155 {
156         if (sim_de1->crc < sim_de2->crc)
157                 return (-1);
158         if (sim_de1->crc > sim_de2->crc)
159                 return (1);
160
161         return (0);
162 }
163
164 static int
165 rb_dedup_entry_compare(struct dedup_entry *de1, struct dedup_entry *de2)
166 {
167         if (de1->leaf.data_crc < de2->leaf.data_crc)
168                 return (-1);
169         if (de1->leaf.data_crc > de2->leaf.data_crc)
170                 return (1);
171
172         return (0);
173 }
174
175 static int
176 rb_sha_dedup_entry_compare(struct sha_dedup_entry *sha_de1,
177                         struct sha_dedup_entry *sha_de2)
178 {
179         unsigned long *h1 = (unsigned long *)&sha_de1->sha_hash;
180         unsigned long *h2 = (unsigned long *)&sha_de2->sha_hash;
181         int i;
182
183         for (i = 0; i < SHA256_DIGEST_LENGTH / (int)sizeof(unsigned long); ++i) {
184                 if (h1[i] < h2[i])
185                         return (-1);
186                 if (h1[i] > h2[i])
187                         return (1);
188         }
189
190         return (0);
191 }
192
193 /*
194  * dedup-simulate <filesystem>
195  */
196 void
197 hammer_cmd_dedup_simulate(char **av, int ac)
198 {
199         struct sim_dedup_entry *sim_de;
200
201         if (ac != 1)
202                 dedup_usage(1);
203
204         glob_fd = getpfs(&glob_pfs, av[0]);
205
206         /*
207          * Collection passes (memory limited)
208          */
209         printf("Dedup-simulate running\n");
210         do {
211                 DedupCrcStart = DedupCrcEnd;
212                 DedupCrcEnd = 0;
213                 MemoryUse = 0;
214
215                 if (VerboseOpt) {
216                         printf("b-tree pass  crc-range %08x-max\n",
217                                 DedupCrcStart);
218                         fflush(stdout);
219                 }
220                 scan_pfs(av[0], collect_btree_elm, "simu-pass");
221
222                 if (VerboseOpt >= 2)
223                         dump_simulated_dedup();
224
225                 /*
226                  * Calculate simulated dedup ratio and get rid of the tree
227                  */
228                 while ((sim_de = RB_ROOT(&sim_dedup_tree)) != NULL) {
229                         assert(sim_de->ref_blks != 0);
230                         dedup_ref_size += sim_de->ref_size;
231                         dedup_alloc_size += sim_de->ref_size / sim_de->ref_blks;
232
233                         RB_REMOVE(sim_dedup_entry_rb_tree, &sim_dedup_tree, sim_de);
234                         free(sim_de);
235                 }
236                 if (DedupCrcEnd && VerboseOpt == 0)
237                         printf(".");
238         } while (DedupCrcEnd);
239
240         printf("Dedup-simulate %s succeeded\n", av[0]);
241         relpfs(glob_fd, &glob_pfs);
242
243         printf("Simulated dedup ratio = %.2f\n",
244             (dedup_alloc_size != 0) ?
245                 (double)dedup_ref_size / dedup_alloc_size : 0);
246 }
247
248 /*
249  * dedup <filesystem>
250  */
251 void
252 hammer_cmd_dedup(char **av, int ac)
253 {
254         struct dedup_entry *de;
255         struct sha_dedup_entry *sha_de;
256         struct pass2_dedup_entry *pass2_de;
257         char buf[8];
258
259         if (TimeoutOpt > 0)
260                 alarm(TimeoutOpt);
261
262         if (ac != 1)
263                 dedup_usage(1);
264
265         STAILQ_INIT(&pass2_dedup_queue);
266
267         glob_fd = getpfs(&glob_pfs, av[0]);
268
269         /*
270          * Pre-pass to cache the btree
271          */
272         scan_pfs(av[0], count_btree_elm, "pre-pass ");
273         DedupTotalRecords = DedupCurrentRecords;
274
275         /*
276          * Collection passes (memory limited)
277          */
278         printf("Dedup running\n");
279         do {
280                 DedupCrcStart = DedupCrcEnd;
281                 DedupCrcEnd = 0;
282                 MemoryUse = 0;
283
284                 if (VerboseOpt) {
285                         printf("b-tree pass  crc-range %08x-max\n",
286                                 DedupCrcStart);
287                         fflush(stdout);
288                 }
289                 scan_pfs(av[0], process_btree_elm, "main-pass");
290
291                 while ((pass2_de = STAILQ_FIRST(&pass2_dedup_queue)) != NULL) {
292                         if (process_btree_elm(&pass2_de->leaf, DEDUP_PASS2))
293                                 dedup_skipped_size -= pass2_de->leaf.data_len;
294
295                         STAILQ_REMOVE_HEAD(&pass2_dedup_queue, sq_entry);
296                         free(pass2_de);
297                 }
298                 assert(STAILQ_EMPTY(&pass2_dedup_queue));
299
300                 if (VerboseOpt >= 2)
301                         dump_real_dedup();
302
303                 /*
304                  * Calculate dedup ratio and get rid of the trees
305                  */
306                 while ((de = RB_ROOT(&dedup_tree)) != NULL) {
307                         if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
308                                 while ((sha_de = RB_ROOT(&de->u.fict_root)) != NULL) {
309                                         assert(sha_de->ref_blks != 0);
310                                         dedup_ref_size += sha_de->ref_size;
311                                         dedup_alloc_size += sha_de->ref_size / sha_de->ref_blks;
312
313                                         RB_REMOVE(sha_dedup_entry_rb_tree,
314                                                         &de->u.fict_root, sha_de);
315                                         free(sha_de);
316                                 }
317                                 assert(RB_EMPTY(&de->u.fict_root));
318                         } else {
319                                 assert(de->u.de.ref_blks != 0);
320                                 dedup_ref_size += de->u.de.ref_size;
321                                 dedup_alloc_size += de->u.de.ref_size / de->u.de.ref_blks;
322                         }
323
324                         RB_REMOVE(dedup_entry_rb_tree, &dedup_tree, de);
325                         free(de);
326                 }
327                 assert(RB_EMPTY(&dedup_tree));
328                 if (DedupCrcEnd && VerboseOpt == 0)
329                         printf(".");
330         } while (DedupCrcEnd);
331
332         printf("Dedup %s succeeded\n", av[0]);
333         relpfs(glob_fd, &glob_pfs);
334
335         humanize_unsigned(buf, sizeof(buf), dedup_ref_size, "B", 1024);
336         printf("Dedup ratio = %.2f\n"
337                "    %8s referenced\n",
338                ((dedup_alloc_size != 0) ?
339                         (double)dedup_ref_size / dedup_alloc_size : 0),
340                buf
341         );
342         humanize_unsigned(buf, sizeof(buf), dedup_alloc_size, "B", 1024);
343         printf("    %8s allocated\n", buf);
344         humanize_unsigned(buf, sizeof(buf), dedup_skipped_size, "B", 1024);
345         printf("    %8s skipped\n", buf);
346         printf("    %8jd CRC collisions\n"
347                "    %8jd SHA collisions\n"
348                "    %8jd bigblock underflows\n"
349                "    %8jd new dedup records\n"
350                "    %8jd new dedup bytes\n",
351                (intmax_t)dedup_crc_failures,
352                (intmax_t)dedup_sha_failures,
353                (intmax_t)dedup_underflows,
354                (intmax_t)dedup_successes_count,
355                (intmax_t)dedup_successes_bytes
356         );
357 }
358
359 static int
360 count_btree_elm(hammer_btree_leaf_elm_t scan_leaf __unused, int flags __unused)
361 {
362         return(1);
363 }
364
365 static int
366 collect_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags __unused)
367 {
368         struct sim_dedup_entry *sim_de;
369
370         /*
371          * If we are using too much memory we have to clean some out, which
372          * will cause the run to use multiple passes.  Be careful of integer
373          * overflows!
374          */
375         if (MemoryUse > MemoryLimit) {
376                 DedupCrcEnd = DedupCrcStart +
377                               (u_int32_t)(DedupCrcEnd - DedupCrcStart - 1) / 2;
378                 if (VerboseOpt) {
379                         printf("memory limit  crc-range %08x-%08x\n",
380                                 DedupCrcStart, DedupCrcEnd);
381                         fflush(stdout);
382                 }
383                 for (;;) {
384                         sim_de = RB_MAX(sim_dedup_entry_rb_tree,
385                                         &sim_dedup_tree);
386                         if (sim_de == NULL || sim_de->crc < DedupCrcEnd)
387                                 break;
388                         RB_REMOVE(sim_dedup_entry_rb_tree,
389                                   &sim_dedup_tree, sim_de);
390                         MemoryUse -= sizeof(*sim_de);
391                         free(sim_de);
392                 }
393         }
394
395         /*
396          * Collect statistics based on the CRC only, do not try to read
397          * any data blocks or run SHA hashes.
398          */
399         sim_de = RB_LOOKUP(sim_dedup_entry_rb_tree, &sim_dedup_tree,
400                            scan_leaf->data_crc);
401
402         if (sim_de == NULL) {
403                 sim_de = calloc(sizeof(*sim_de), 1);
404                 sim_de->crc = scan_leaf->data_crc;
405                 RB_INSERT(sim_dedup_entry_rb_tree, &sim_dedup_tree, sim_de);
406                 MemoryUse += sizeof(*sim_de);
407         }
408
409         sim_de->ref_blks += 1;
410         sim_de->ref_size += scan_leaf->data_len;
411         return (1);
412 }
413
414 static __inline int
415 validate_dedup_pair(hammer_btree_leaf_elm_t p, hammer_btree_leaf_elm_t q)
416 {
417         if ((p->data_offset & HAMMER_OFF_ZONE_MASK) !=
418             (q->data_offset & HAMMER_OFF_ZONE_MASK)) {
419                 return (1);
420         }
421         if (p->data_len != q->data_len) {
422                 return (1);
423         }
424
425         return (0);
426 }
427
428 #define DEDUP_TECH_FAILURE      1
429 #define DEDUP_CMP_FAILURE       2
430 #define DEDUP_INVALID_ZONE      3
431 #define DEDUP_UNDERFLOW         4
432 #define DEDUP_VERS_FAILURE      5
433
434 static __inline int
435 deduplicate(hammer_btree_leaf_elm_t p, hammer_btree_leaf_elm_t q)
436 {
437         struct hammer_ioc_dedup dedup;
438
439         bzero(&dedup, sizeof(dedup));
440
441         /*
442          * If data_offset fields are the same there is no need to run ioctl,
443          * candidate is already dedup'ed.
444          */
445         if (p->data_offset == q->data_offset) {
446                 return (0);
447         }
448
449         dedup.elm1 = p->base;
450         dedup.elm2 = q->base;
451         RunningIoctl = 1;
452         if (ioctl(glob_fd, HAMMERIOC_DEDUP, &dedup) < 0) {
453                 if (errno == EOPNOTSUPP) {
454                         /* must be at least version 5 */
455                         return (DEDUP_VERS_FAILURE);
456                 }
457                 /* Technical failure - locking or w/e */
458                 return (DEDUP_TECH_FAILURE);
459         }
460         if (dedup.head.flags & HAMMER_IOC_DEDUP_CMP_FAILURE) {
461                 return (DEDUP_CMP_FAILURE);
462         }
463         if (dedup.head.flags & HAMMER_IOC_DEDUP_INVALID_ZONE) {
464                 return (DEDUP_INVALID_ZONE);
465         }
466         if (dedup.head.flags & HAMMER_IOC_DEDUP_UNDERFLOW) {
467                 return (DEDUP_UNDERFLOW);
468         }
469         RunningIoctl = 0;
470         ++dedup_successes_count;
471         dedup_successes_bytes += p->data_len;
472         return (0);
473 }
474
475 static int
476 process_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags)
477 {
478         struct dedup_entry *de;
479         struct sha_dedup_entry *sha_de, temp;
480         struct pass2_dedup_entry *pass2_de;
481         int error;
482
483         /*
484          * If we are using too much memory we have to clean some out, which
485          * will cause the run to use multiple passes.  Be careful of integer
486          * overflows!
487          */
488         while (MemoryUse > MemoryLimit) {
489                 DedupCrcEnd = DedupCrcStart +
490                               (u_int32_t)(DedupCrcEnd - DedupCrcStart - 1) / 2;
491                 if (VerboseOpt) {
492                         printf("memory limit  crc-range %08x-%08x\n",
493                                 DedupCrcStart, DedupCrcEnd);
494                         fflush(stdout);
495                 }
496
497                 for (;;) {
498                         de = RB_MAX(dedup_entry_rb_tree, &dedup_tree);
499                         if (de == NULL || de->leaf.data_crc < DedupCrcEnd)
500                                 break;
501                         if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
502                                 while ((sha_de = RB_ROOT(&de->u.fict_root)) !=
503                                        NULL) {
504                                         RB_REMOVE(sha_dedup_entry_rb_tree,
505                                                   &de->u.fict_root, sha_de);
506                                         MemoryUse -= sizeof(*sha_de);
507                                         free(sha_de);
508                                 }
509                         }
510                         RB_REMOVE(dedup_entry_rb_tree, &dedup_tree, de);
511                         MemoryUse -= sizeof(*de);
512                         free(de);
513                 }
514         }
515
516         /*
517          * Collect statistics based on the CRC.  Colliding CRCs usually
518          * cause a SHA sub-tree to be created under the de.
519          *
520          * Trivial case if de not found.
521          */
522         de = RB_LOOKUP(dedup_entry_rb_tree, &dedup_tree, scan_leaf->data_crc);
523         if (de == NULL) {
524                 de = calloc(sizeof(*de), 1);
525                 de->leaf = *scan_leaf;
526                 RB_INSERT(dedup_entry_rb_tree, &dedup_tree, de);
527                 MemoryUse += sizeof(*de);
528                 goto upgrade_stats;
529         }
530
531         /*
532          * Found entry in CRC tree
533          */
534         if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
535                 /*
536                  * Optimize the case where a CRC failure results in multiple
537                  * SHA entries.  If we unconditionally issue a data-read a
538                  * degenerate situation where a colliding CRC's second SHA
539                  * entry contains the lion's share of the deduplication
540                  * candidates will result in excessive data block reads.
541                  *
542                  * Deal with the degenerate case by looking for a matching
543                  * data_offset/data_len in the SHA elements we already have
544                  * before reading the data block and generating a new SHA.
545                  */
546                 RB_FOREACH(sha_de, sha_dedup_entry_rb_tree, &de->u.fict_root) {
547                         if (sha_de->leaf.data_offset ==
548                                                 scan_leaf->data_offset &&
549                             sha_de->leaf.data_len == scan_leaf->data_len) {
550                                 memcpy(temp.sha_hash, sha_de->sha_hash,
551                                         SHA256_DIGEST_LENGTH);
552                                 break;
553                         }
554                 }
555
556                 /*
557                  * Entry in CRC tree is fictitious, so we already had problems
558                  * with this CRC. Upgrade (compute SHA) the candidate and
559                  * dive into SHA subtree. If upgrade fails insert the candidate
560                  * into Pass2 list (it will be processed later).
561                  */
562                 if (sha_de == NULL) {
563                         if (upgrade_chksum(scan_leaf, temp.sha_hash))
564                                 goto pass2_insert;
565
566                         sha_de = RB_FIND(sha_dedup_entry_rb_tree,
567                                          &de->u.fict_root, &temp);
568                 }
569
570                 /*
571                  * Nothing in SHA subtree so far, so this is a new
572                  * 'dataset'. Insert new entry into SHA subtree.
573                  */
574                 if (sha_de == NULL) {
575                         sha_de = calloc(sizeof(*sha_de), 1);
576                         sha_de->leaf = *scan_leaf;
577                         memcpy(sha_de->sha_hash, temp.sha_hash,
578                                SHA256_DIGEST_LENGTH);
579                         RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root,
580                                   sha_de);
581                         MemoryUse += sizeof(*sha_de);
582                         goto upgrade_stats_sha;
583                 }
584
585                 /*
586                  * Found entry in SHA subtree, it means we have a potential
587                  * dedup pair. Validate it (zones have to match and data_len
588                  * field have to be the same too. If validation fails, treat
589                  * it as a SHA collision (jump to sha256_failure).
590                  */
591                 if (validate_dedup_pair(&sha_de->leaf, scan_leaf))
592                         goto sha256_failure;
593
594                 /*
595                  * We have a valid dedup pair (SHA match, validated).
596                  *
597                  * In case of technical failure (dedup pair was good, but
598                  * ioctl failed anyways) insert the candidate into Pass2 list
599                  * (we will try to dedup it after we are done with the rest of
600                  * the tree).
601                  *
602                  * If ioctl fails because either of blocks is in the non-dedup
603                  * zone (we can dedup only in LARGE_DATA and SMALL_DATA) don't
604                  * bother with the candidate and terminate early.
605                  *
606                  * If ioctl fails because of bigblock underflow replace the
607                  * leaf node that found dedup entry represents with scan_leaf.
608                  */
609                 error = deduplicate(&sha_de->leaf, scan_leaf);
610                 switch(error) {
611                 case 0:
612                         goto upgrade_stats_sha;
613                 case DEDUP_TECH_FAILURE:
614                         goto pass2_insert;
615                 case DEDUP_CMP_FAILURE:
616                         goto sha256_failure;
617                 case DEDUP_INVALID_ZONE:
618                         goto terminate_early;
619                 case DEDUP_UNDERFLOW:
620                         ++dedup_underflows;
621                         sha_de->leaf = *scan_leaf;
622                         memcpy(sha_de->sha_hash, temp.sha_hash,
623                                 SHA256_DIGEST_LENGTH);
624                         goto upgrade_stats_sha;
625                 case DEDUP_VERS_FAILURE:
626                         fprintf(stderr,
627                                 "HAMMER filesystem must be at least "
628                                 "version 5 to dedup\n");
629                         exit (1);
630                 default:
631                         fprintf(stderr, "Unknown error\n");
632                         goto terminate_early;
633                 }
634
635                 /*
636                  * Ooh la la.. SHA-256 collision. Terminate early, there's
637                  * nothing we can do here.
638                  */
639 sha256_failure:
640                 ++dedup_sha_failures;
641                 goto terminate_early;
642         } else {
643                 /*
644                  * Candidate CRC is good for now (we found an entry in CRC
645                  * tree and it's not fictitious). This means we have a
646                  * potential dedup pair.
647                  */
648                 if (validate_dedup_pair(&de->leaf, scan_leaf))
649                         goto crc_failure;
650
651                 /*
652                  * We have a valid dedup pair (CRC match, validated)
653                  */
654                 error = deduplicate(&de->leaf, scan_leaf);
655                 switch(error) {
656                 case 0:
657                         goto upgrade_stats;
658                 case DEDUP_TECH_FAILURE:
659                         goto pass2_insert;
660                 case DEDUP_CMP_FAILURE:
661                         goto crc_failure;
662                 case DEDUP_INVALID_ZONE:
663                         goto terminate_early;
664                 case DEDUP_UNDERFLOW:
665                         ++dedup_underflows;
666                         de->leaf = *scan_leaf;
667                         goto upgrade_stats;
668                 case DEDUP_VERS_FAILURE:
669                         fprintf(stderr,
670                                 "HAMMER filesystem must be at least "
671                                 "version 5 to dedup\n");
672                         exit (1);
673                 default:
674                         fprintf(stderr, "Unknown error\n");
675                         goto terminate_early;
676                 }
677
678 crc_failure:
679                 /*
680                  * We got a CRC collision - either ioctl failed because of
681                  * the comparison failure or validation of the potential
682                  * dedup pair went bad. In all cases insert both blocks
683                  * into SHA subtree (this requires checksum upgrade) and mark
684                  * entry that corresponds to this CRC in the CRC tree
685                  * fictitious, so that all futher operations with this CRC go
686                  * through SHA subtree.
687                  */
688                 ++dedup_crc_failures;
689
690                 /*
691                  * Insert block that was represented by now fictitious dedup
692                  * entry (create a new SHA entry and preserve stats of the
693                  * old CRC one). If checksum upgrade fails insert the
694                  * candidate into Pass2 list and return - keep both trees
695                  * unmodified.
696                  */
697                 sha_de = calloc(sizeof(*sha_de), 1);
698                 sha_de->leaf = de->leaf;
699                 sha_de->ref_blks = de->u.de.ref_blks;
700                 sha_de->ref_size = de->u.de.ref_size;
701                 if (upgrade_chksum(&sha_de->leaf, sha_de->sha_hash)) {
702                         free(sha_de);
703                         goto pass2_insert;
704                 }
705                 MemoryUse += sizeof(*sha_de);
706
707                 RB_INIT(&de->u.fict_root);
708                 /*
709                  * Here we can insert without prior checking because the tree
710                  * is empty at this point
711                  */
712                 RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, sha_de);
713
714                 /*
715                  * Mark entry in CRC tree fictitious
716                  */
717                 de->flags |= HAMMER_DEDUP_ENTRY_FICTITIOUS;
718
719                 /*
720                  * Upgrade checksum of the candidate and insert it into
721                  * SHA subtree. If upgrade fails insert the candidate into
722                  * Pass2 list.
723                  */
724                 if (upgrade_chksum(scan_leaf, temp.sha_hash)) {
725                         goto pass2_insert;
726                 }
727                 sha_de = RB_FIND(sha_dedup_entry_rb_tree, &de->u.fict_root,
728                                  &temp);
729                 if (sha_de != NULL)
730                         /* There is an entry with this SHA already, but the only
731                          * RB-tree element at this point is that entry we just
732                          * added. We know for sure these blocks are different
733                          * (this is crc_failure branch) so treat it as SHA
734                          * collision.
735                          */
736                         goto sha256_failure;
737
738                 sha_de = calloc(sizeof(*sha_de), 1);
739                 sha_de->leaf = *scan_leaf;
740                 memcpy(sha_de->sha_hash, temp.sha_hash, SHA256_DIGEST_LENGTH);
741                 RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, sha_de);
742                 MemoryUse += sizeof(*sha_de);
743                 goto upgrade_stats_sha;
744         }
745
746 upgrade_stats:
747         de->u.de.ref_blks += 1;
748         de->u.de.ref_size += scan_leaf->data_len;
749         return (1);
750
751 upgrade_stats_sha:
752         sha_de->ref_blks += 1;
753         sha_de->ref_size += scan_leaf->data_len;
754         return (1);
755
756 pass2_insert:
757         /*
758          * If in pass2 mode don't insert anything, fall through to
759          * terminate_early
760          */
761         if ((flags & DEDUP_PASS2) == 0) {
762                 pass2_de = calloc(sizeof(*pass2_de), 1);
763                 pass2_de->leaf = *scan_leaf;
764                 STAILQ_INSERT_TAIL(&pass2_dedup_queue, pass2_de, sq_entry);
765                 dedup_skipped_size += scan_leaf->data_len;
766                 return (1);
767         }
768
769 terminate_early:
770         /*
771          * Early termination path. Fixup stats.
772          */
773         dedup_alloc_size += scan_leaf->data_len;
774         dedup_ref_size += scan_leaf->data_len;
775         return (0);
776 }
777
778 static int
779 upgrade_chksum(hammer_btree_leaf_elm_t leaf, u_int8_t *sha_hash)
780 {
781         struct hammer_ioc_data data;
782         char *buf = malloc(DEDUP_BUF);
783         SHA256_CTX ctx;
784         int error;
785
786         bzero(&data, sizeof(data));
787         data.elm = leaf->base;
788         data.ubuf = buf;
789         data.size = DEDUP_BUF;
790
791         error = 0;
792         if (ioctl(glob_fd, HAMMERIOC_GET_DATA, &data) < 0) {
793                 fprintf(stderr, "Get-data failed: %s\n", strerror(errno));
794                 error = 1;
795                 goto done;
796         }
797         DedupDataReads += leaf->data_len;
798
799         if (data.leaf.data_len != leaf->data_len) {
800                 error = 1;
801                 goto done;
802         }
803
804         if (data.leaf.base.btype == HAMMER_BTREE_TYPE_RECORD &&
805             data.leaf.base.rec_type == HAMMER_RECTYPE_DATA) {
806                 SHA256_Init(&ctx);
807                 SHA256_Update(&ctx, (void *)buf, data.leaf.data_len);
808                 SHA256_Final(sha_hash, &ctx);
809         }
810
811 done:
812         free(buf);
813         return (error);
814 }
815
816 static void
817 sigInfo(int signo __unused)
818 {
819         SigInfoFlag = 1;
820 }
821
822 static void
823 scan_pfs(char *filesystem, scan_pfs_cb_t func, const char *id)
824 {
825         struct hammer_ioc_mirror_rw mirror;
826         hammer_ioc_mrecord_any_t mrec;
827         struct hammer_btree_leaf_elm elm;
828         char *buf = malloc(DEDUP_BUF);
829         char buf1[8];
830         char buf2[8];
831         int offset, bytes;
832
833         SigInfoFlag = 0;
834         DedupDataReads = 0;
835         DedupCurrentRecords = 0;
836         signal(SIGINFO, sigInfo);
837
838         bzero(&mirror, sizeof(mirror));
839         hammer_key_beg_init(&mirror.key_beg);
840         hammer_key_end_init(&mirror.key_end);
841
842         mirror.tid_beg = glob_pfs.ondisk->sync_beg_tid;
843         mirror.tid_end = glob_pfs.ondisk->sync_end_tid;
844         mirror.head.flags |= HAMMER_IOC_MIRROR_NODATA; /* we want only keys */
845         mirror.ubuf = buf;
846         mirror.size = DEDUP_BUF;
847         mirror.pfs_id = glob_pfs.pfs_id;
848         mirror.shared_uuid = glob_pfs.ondisk->shared_uuid;
849
850         if (VerboseOpt && DedupCrcStart == 0) {
851                 printf("%s %s: objspace %016jx:%04x %016jx:%04x\n",
852                         id, filesystem,
853                         (uintmax_t)mirror.key_beg.obj_id,
854                         mirror.key_beg.localization,
855                         (uintmax_t)mirror.key_end.obj_id,
856                         mirror.key_end.localization);
857                 printf("%s %s: pfs_id %d\n",
858                         id, filesystem, glob_pfs.pfs_id);
859         }
860         fflush(stdout);
861         fflush(stderr);
862
863         do {
864                 mirror.count = 0;
865                 mirror.pfs_id = glob_pfs.pfs_id;
866                 mirror.shared_uuid = glob_pfs.ondisk->shared_uuid;
867                 if (ioctl(glob_fd, HAMMERIOC_MIRROR_READ, &mirror) < 0) {
868                         fprintf(stderr, "Mirror-read %s failed: %s\n",
869                                 filesystem, strerror(errno));
870                         exit(1);
871                 }
872                 if (mirror.head.flags & HAMMER_IOC_HEAD_ERROR) {
873                         fprintf(stderr, "Mirror-read %s fatal error %d\n",
874                                 filesystem, mirror.head.error);
875                         exit(1);
876                 }
877                 if (mirror.count) {
878                         offset = 0;
879                         while (offset < mirror.count) {
880                                 mrec = (void *)((char *)buf + offset);
881                                 bytes = HAMMER_HEAD_DOALIGN(mrec->head.rec_size);
882                                 if (offset + bytes > mirror.count) {
883                                         fprintf(stderr, "Misaligned record\n");
884                                         exit(1);
885                                 }
886                                 assert((mrec->head.type &
887                                        HAMMER_MRECF_TYPE_MASK) ==
888                                        HAMMER_MREC_TYPE_REC);
889                                 offset += bytes;
890                                 elm = mrec->rec.leaf;
891                                 if (elm.base.btype != HAMMER_BTREE_TYPE_RECORD)
892                                         continue;
893                                 if (elm.base.rec_type != HAMMER_RECTYPE_DATA)
894                                         continue;
895                                 ++DedupCurrentRecords;
896                                 if (DedupCrcStart != DedupCrcEnd) {
897                                         if (elm.data_crc < DedupCrcStart)
898                                                 continue;
899                                         if (DedupCrcEnd &&
900                                             elm.data_crc >= DedupCrcEnd) {
901                                                 continue;
902                                         }
903                                 }
904                                 func(&elm, 0);
905                         }
906                 }
907                 mirror.key_beg = mirror.key_cur;
908                 if (DidInterrupt) {
909                         fprintf(stderr, "Interrupted\n");
910                         exit(1);
911                 }
912                 if (SigInfoFlag) {
913                         if (DedupTotalRecords) {
914                                 humanize_unsigned(buf1, sizeof(buf1),
915                                                   DedupDataReads,
916                                                   "B", 1024);
917                                 humanize_unsigned(buf2, sizeof(buf2),
918                                                   dedup_successes_bytes,
919                                                   "B", 1024);
920                                 fprintf(stderr, "%s count %7jd/%jd "
921                                                 "(%02d.%02d%%) "
922                                                 "ioread %s newddup %s\n",
923                                         id,
924                                         (intmax_t)DedupCurrentRecords,
925                                         (intmax_t)DedupTotalRecords,
926                                         (int)(DedupCurrentRecords * 100 /
927                                                 DedupTotalRecords),
928                                         (int)(DedupCurrentRecords * 10000 /
929                                                 DedupTotalRecords % 100),
930                                         buf1, buf2);
931                         } else {
932                                 fprintf(stderr, "%s count %-7jd\n",
933                                         id,
934                                         (intmax_t)DedupCurrentRecords);
935                         }
936                         SigInfoFlag = 0;
937                 }
938         } while (mirror.count != 0);
939
940         signal(SIGINFO, SIG_IGN);
941
942         free(buf);
943 }
944
945 static void
946 dump_simulated_dedup(void)
947 {
948         struct sim_dedup_entry *sim_de;
949
950         printf("=== Dumping simulated dedup entries:\n");
951         RB_FOREACH(sim_de, sim_dedup_entry_rb_tree, &sim_dedup_tree) {
952                 printf("\tcrc=%08x cnt=%ju size=%ju\n",
953                         sim_de->crc,
954                         (intmax_t)sim_de->ref_blks,
955                         (intmax_t)sim_de->ref_size);
956         }
957         printf("end of dump ===\n");
958 }
959
960 static void
961 dump_real_dedup(void)
962 {
963         struct dedup_entry *de;
964         struct sha_dedup_entry *sha_de;
965         int i;
966
967         printf("=== Dumping dedup entries:\n");
968         RB_FOREACH(de, dedup_entry_rb_tree, &dedup_tree) {
969                 if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
970                         printf("\tcrc=%08x fictitious\n", de->leaf.data_crc);
971
972                         RB_FOREACH(sha_de, sha_dedup_entry_rb_tree, &de->u.fict_root) {
973                                 printf("\t\tcrc=%08x cnt=%ju size=%ju\n\t"
974                                        "\t\tsha=",
975                                        sha_de->leaf.data_crc,
976                                        (intmax_t)sha_de->ref_blks,
977                                        (intmax_t)sha_de->ref_size);
978                                 for (i = 0; i < SHA256_DIGEST_LENGTH; ++i)
979                                         printf("%02x", sha_de->sha_hash[i]);
980                                 printf("\n");
981                         }
982                 } else {
983                         printf("\tcrc=%08x cnt=%ju size=%ju\n",
984                                de->leaf.data_crc,
985                                (intmax_t)de->u.de.ref_blks,
986                                (intmax_t)de->u.de.ref_size);
987                 }
988         }
989         printf("end of dump ===\n");
990 }
991
992 static void
993 dedup_usage(int code)
994 {
995         fprintf(stderr,
996                 "hammer dedup-simulate <filesystem>\n"
997                 "hammer dedup <filesystem>\n"
998         );
999         exit(code);
1000 }