hammer2 - stabilization, sequencing
[dragonfly.git] / sbin / hammer / cmd_dedup.c
1 /*
2  * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Ilya Dryomov <idryomov@gmail.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34
35 #include <libutil.h>
36 #include <crypto/sha2/sha2.h>
37
38 #include "hammer.h"
39
40 #define DEDUP_BUF (64 * 1024)
41
42 /* Sorted list of block CRCs - light version for dedup-simulate */
43 struct sim_dedup_entry_rb_tree;
44 RB_HEAD(sim_dedup_entry_rb_tree, sim_dedup_entry) sim_dedup_tree =
45                                         RB_INITIALIZER(&sim_dedup_tree);
46 RB_PROTOTYPE2(sim_dedup_entry_rb_tree, sim_dedup_entry, rb_entry,
47                 rb_sim_dedup_entry_compare, hammer_crc_t);
48
49 struct sim_dedup_entry {
50         hammer_crc_t    crc;
51         u_int64_t       ref_blks; /* number of blocks referenced */
52         u_int64_t       ref_size; /* size of data referenced */
53         RB_ENTRY(sim_dedup_entry) rb_entry;
54 };
55
56 struct dedup_entry {
57         struct hammer_btree_leaf_elm leaf;
58         union {
59                 struct {
60                         u_int64_t ref_blks;
61                         u_int64_t ref_size;
62                 } de;
63                 RB_HEAD(sha_dedup_entry_rb_tree, sha_dedup_entry) fict_root;
64         } u;
65         u_int8_t flags;
66         RB_ENTRY(dedup_entry) rb_entry;
67 };
68
69 #define HAMMER_DEDUP_ENTRY_FICTITIOUS   0x0001
70
71 struct sha_dedup_entry {
72         struct hammer_btree_leaf_elm    leaf;
73         u_int64_t                       ref_blks;
74         u_int64_t                       ref_size;
75         u_int8_t                        sha_hash[SHA256_DIGEST_LENGTH];
76         RB_ENTRY(sha_dedup_entry)       fict_entry;
77 };
78
79 /* Sorted list of HAMMER B-Tree keys */
80 struct dedup_entry_rb_tree;
81 struct sha_dedup_entry_rb_tree;
82
83 RB_HEAD(dedup_entry_rb_tree, dedup_entry) dedup_tree =
84                                         RB_INITIALIZER(&dedup_tree);
85 RB_PROTOTYPE2(dedup_entry_rb_tree, dedup_entry, rb_entry,
86                 rb_dedup_entry_compare, hammer_crc_t);
87
88 RB_PROTOTYPE(sha_dedup_entry_rb_tree, sha_dedup_entry, fict_entry,
89                 rb_sha_dedup_entry_compare);
90
91 /*
92  * Pass2 list - contains entries that were not dedup'ed because ioctl failed
93  */
94 STAILQ_HEAD(, pass2_dedup_entry) pass2_dedup_queue =
95                                 STAILQ_HEAD_INITIALIZER(pass2_dedup_queue);
96
97 struct pass2_dedup_entry {
98         struct hammer_btree_leaf_elm    leaf;
99         STAILQ_ENTRY(pass2_dedup_entry) sq_entry;
100 };
101
102 #define DEDUP_PASS2     0x0001 /* process_btree_elm() mode */
103
104 static int SigInfoFlag;
105 static int SigAlrmFlag;
106 static int64_t DedupDataReads;
107 static int64_t DedupCurrentRecords;
108 static int64_t DedupTotalRecords;
109 static u_int32_t DedupCrcStart;
110 static u_int32_t DedupCrcEnd;
111 static u_int64_t MemoryUse;
112
113 /* PFS global ids - we deal with just one PFS at a run */
114 int glob_fd;
115 struct hammer_ioc_pseudofs_rw glob_pfs;
116
117 /*
118  * Global accounting variables
119  *
120  * Last three don't have to be 64-bit, just to be safe..
121  */
122 u_int64_t dedup_alloc_size = 0;
123 u_int64_t dedup_ref_size = 0;
124 u_int64_t dedup_skipped_size = 0;
125 u_int64_t dedup_crc_failures = 0;
126 u_int64_t dedup_sha_failures = 0;
127 u_int64_t dedup_underflows = 0;
128 u_int64_t dedup_successes_count = 0;
129 u_int64_t dedup_successes_bytes = 0;
130
131 static int rb_sim_dedup_entry_compare(struct sim_dedup_entry *sim_de1,
132                                 struct sim_dedup_entry *sim_de2);
133 static int rb_dedup_entry_compare(struct dedup_entry *de1,
134                                 struct dedup_entry *de2);
135 static int rb_sha_dedup_entry_compare(struct sha_dedup_entry *sha_de1,
136                                 struct sha_dedup_entry *sha_de2);
137 typedef int (*scan_pfs_cb_t)(hammer_btree_leaf_elm_t scan_leaf, int flags);
138 static void scan_pfs(char *filesystem, scan_pfs_cb_t func, const char *id);
139 static int collect_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags);
140 static int count_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags);
141 static int process_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags);
142 static int upgrade_chksum(hammer_btree_leaf_elm_t leaf, u_int8_t *sha_hash);
143 static void dump_simulated_dedup(void);
144 static void dump_real_dedup(void);
145 static void dedup_usage(int code);
146
147 RB_GENERATE2(sim_dedup_entry_rb_tree, sim_dedup_entry, rb_entry,
148                 rb_sim_dedup_entry_compare, hammer_crc_t, crc);
149 RB_GENERATE2(dedup_entry_rb_tree, dedup_entry, rb_entry,
150                 rb_dedup_entry_compare, hammer_crc_t, leaf.data_crc);
151 RB_GENERATE(sha_dedup_entry_rb_tree, sha_dedup_entry, fict_entry,
152                 rb_sha_dedup_entry_compare);
153
154 static int
155 rb_sim_dedup_entry_compare(struct sim_dedup_entry *sim_de1,
156                         struct sim_dedup_entry *sim_de2)
157 {
158         if (sim_de1->crc < sim_de2->crc)
159                 return (-1);
160         if (sim_de1->crc > sim_de2->crc)
161                 return (1);
162
163         return (0);
164 }
165
166 static int
167 rb_dedup_entry_compare(struct dedup_entry *de1, struct dedup_entry *de2)
168 {
169         if (de1->leaf.data_crc < de2->leaf.data_crc)
170                 return (-1);
171         if (de1->leaf.data_crc > de2->leaf.data_crc)
172                 return (1);
173
174         return (0);
175 }
176
177 static int
178 rb_sha_dedup_entry_compare(struct sha_dedup_entry *sha_de1,
179                         struct sha_dedup_entry *sha_de2)
180 {
181         unsigned long *h1 = (unsigned long *)&sha_de1->sha_hash;
182         unsigned long *h2 = (unsigned long *)&sha_de2->sha_hash;
183         int i;
184
185         for (i = 0; i < SHA256_DIGEST_LENGTH / (int)sizeof(unsigned long); ++i) {
186                 if (h1[i] < h2[i])
187                         return (-1);
188                 if (h1[i] > h2[i])
189                         return (1);
190         }
191
192         return (0);
193 }
194
195 /*
196  * dedup-simulate <filesystem>
197  */
198 void
199 hammer_cmd_dedup_simulate(char **av, int ac)
200 {
201         struct sim_dedup_entry *sim_de;
202
203         if (ac != 1)
204                 dedup_usage(1);
205
206         glob_fd = getpfs(&glob_pfs, av[0]);
207
208         /*
209          * Collection passes (memory limited)
210          */
211         printf("Dedup-simulate running\n");
212         do {
213                 DedupCrcStart = DedupCrcEnd;
214                 DedupCrcEnd = 0;
215                 MemoryUse = 0;
216
217                 if (VerboseOpt) {
218                         printf("b-tree pass  crc-range %08x-max\n",
219                                 DedupCrcStart);
220                         fflush(stdout);
221                 }
222                 scan_pfs(av[0], collect_btree_elm, "simu-pass");
223
224                 if (VerboseOpt >= 2)
225                         dump_simulated_dedup();
226
227                 /*
228                  * Calculate simulated dedup ratio and get rid of the tree
229                  */
230                 while ((sim_de = RB_ROOT(&sim_dedup_tree)) != NULL) {
231                         assert(sim_de->ref_blks != 0);
232                         dedup_ref_size += sim_de->ref_size;
233                         dedup_alloc_size += sim_de->ref_size / sim_de->ref_blks;
234
235                         RB_REMOVE(sim_dedup_entry_rb_tree, &sim_dedup_tree, sim_de);
236                         free(sim_de);
237                 }
238                 if (DedupCrcEnd && VerboseOpt == 0)
239                         printf(".");
240         } while (DedupCrcEnd);
241
242         printf("Dedup-simulate %s succeeded\n", av[0]);
243         relpfs(glob_fd, &glob_pfs);
244
245         printf("Simulated dedup ratio = %.2f\n",
246             (dedup_alloc_size != 0) ?
247                 (double)dedup_ref_size / dedup_alloc_size : 0);
248 }
249
250 /*
251  * dedup <filesystem>
252  */
253 void
254 hammer_cmd_dedup(char **av, int ac)
255 {
256         struct dedup_entry *de;
257         struct sha_dedup_entry *sha_de;
258         struct pass2_dedup_entry *pass2_de;
259         char *tmp;
260         char buf[8];
261         int needfree = 0;
262
263         if (TimeoutOpt > 0)
264                 alarm(TimeoutOpt);
265
266         if (ac != 1)
267                 dedup_usage(1);
268
269         STAILQ_INIT(&pass2_dedup_queue);
270
271         glob_fd = getpfs(&glob_pfs, av[0]);
272
273         /*
274          * A cycle file is _required_ for resuming dedup after the timeout
275          * specified with -t has expired. If no -c option, then place a
276          * .dedup.cycle file either in the PFS snapshots directory or in
277          * the default snapshots directory.
278          */
279         if (!CyclePath) {
280                 if (glob_pfs.ondisk->snapshots[0] != '/')
281                         asprintf(&tmp, "%s/%s/.dedup.cycle",
282                             SNAPSHOTS_BASE, av[0]);
283                 else
284                         asprintf(&tmp, "%s/.dedup.cycle",
285                             glob_pfs.ondisk->snapshots);
286                 CyclePath = tmp;
287                 needfree = 1;
288         }
289
290         /*
291          * Pre-pass to cache the btree
292          */
293         scan_pfs(av[0], count_btree_elm, "pre-pass ");
294         DedupTotalRecords = DedupCurrentRecords;
295
296         /*
297          * Collection passes (memory limited)
298          */
299         printf("Dedup running\n");
300         do {
301                 DedupCrcStart = DedupCrcEnd;
302                 DedupCrcEnd = 0;
303                 MemoryUse = 0;
304
305                 if (VerboseOpt) {
306                         printf("b-tree pass  crc-range %08x-max\n",
307                                 DedupCrcStart);
308                         fflush(stdout);
309                 }
310                 scan_pfs(av[0], process_btree_elm, "main-pass");
311
312                 while ((pass2_de = STAILQ_FIRST(&pass2_dedup_queue)) != NULL) {
313                         if (process_btree_elm(&pass2_de->leaf, DEDUP_PASS2))
314                                 dedup_skipped_size -= pass2_de->leaf.data_len;
315
316                         STAILQ_REMOVE_HEAD(&pass2_dedup_queue, sq_entry);
317                         free(pass2_de);
318                 }
319                 assert(STAILQ_EMPTY(&pass2_dedup_queue));
320
321                 if (VerboseOpt >= 2)
322                         dump_real_dedup();
323
324                 /*
325                  * Calculate dedup ratio and get rid of the trees
326                  */
327                 while ((de = RB_ROOT(&dedup_tree)) != NULL) {
328                         if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
329                                 while ((sha_de = RB_ROOT(&de->u.fict_root)) != NULL) {
330                                         assert(sha_de->ref_blks != 0);
331                                         dedup_ref_size += sha_de->ref_size;
332                                         dedup_alloc_size += sha_de->ref_size / sha_de->ref_blks;
333
334                                         RB_REMOVE(sha_dedup_entry_rb_tree,
335                                                         &de->u.fict_root, sha_de);
336                                         free(sha_de);
337                                 }
338                                 assert(RB_EMPTY(&de->u.fict_root));
339                         } else {
340                                 assert(de->u.de.ref_blks != 0);
341                                 dedup_ref_size += de->u.de.ref_size;
342                                 dedup_alloc_size += de->u.de.ref_size / de->u.de.ref_blks;
343                         }
344
345                         RB_REMOVE(dedup_entry_rb_tree, &dedup_tree, de);
346                         free(de);
347                 }
348                 assert(RB_EMPTY(&dedup_tree));
349                 if (DedupCrcEnd && VerboseOpt == 0)
350                         printf(".");
351         } while (DedupCrcEnd);
352
353         printf("Dedup %s succeeded\n", av[0]);
354         relpfs(glob_fd, &glob_pfs);
355
356         humanize_unsigned(buf, sizeof(buf), dedup_ref_size, "B", 1024);
357         printf("Dedup ratio = %.2f (in this run)\n"
358                "    %8s referenced\n",
359                ((dedup_alloc_size != 0) ?
360                         (double)dedup_ref_size / dedup_alloc_size : 0),
361                buf
362         );
363         humanize_unsigned(buf, sizeof(buf), dedup_alloc_size, "B", 1024);
364         printf("    %8s allocated\n", buf);
365         humanize_unsigned(buf, sizeof(buf), dedup_skipped_size, "B", 1024);
366         printf("    %8s skipped\n", buf);
367         printf("    %8jd CRC collisions\n"
368                "    %8jd SHA collisions\n"
369                "    %8jd big-block underflows\n"
370                "    %8jd new dedup records\n"
371                "    %8jd new dedup bytes\n",
372                (intmax_t)dedup_crc_failures,
373                (intmax_t)dedup_sha_failures,
374                (intmax_t)dedup_underflows,
375                (intmax_t)dedup_successes_count,
376                (intmax_t)dedup_successes_bytes
377         );
378
379         /* Once completed remove cycle file */
380         hammer_reset_cycle();
381
382         /* We don't want to mess up with other directives */
383         if (needfree) {
384                 free(tmp);
385                 CyclePath = NULL;
386         }
387 }
388
389 static int
390 count_btree_elm(hammer_btree_leaf_elm_t scan_leaf __unused, int flags __unused)
391 {
392         return(1);
393 }
394
395 static int
396 collect_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags __unused)
397 {
398         struct sim_dedup_entry *sim_de;
399
400         /*
401          * If we are using too much memory we have to clean some out, which
402          * will cause the run to use multiple passes.  Be careful of integer
403          * overflows!
404          */
405         if (MemoryUse > MemoryLimit) {
406                 DedupCrcEnd = DedupCrcStart +
407                               (u_int32_t)(DedupCrcEnd - DedupCrcStart - 1) / 2;
408                 if (VerboseOpt) {
409                         printf("memory limit  crc-range %08x-%08x\n",
410                                 DedupCrcStart, DedupCrcEnd);
411                         fflush(stdout);
412                 }
413                 for (;;) {
414                         sim_de = RB_MAX(sim_dedup_entry_rb_tree,
415                                         &sim_dedup_tree);
416                         if (sim_de == NULL || sim_de->crc < DedupCrcEnd)
417                                 break;
418                         RB_REMOVE(sim_dedup_entry_rb_tree,
419                                   &sim_dedup_tree, sim_de);
420                         MemoryUse -= sizeof(*sim_de);
421                         free(sim_de);
422                 }
423         }
424
425         /*
426          * Collect statistics based on the CRC only, do not try to read
427          * any data blocks or run SHA hashes.
428          */
429         sim_de = RB_LOOKUP(sim_dedup_entry_rb_tree, &sim_dedup_tree,
430                            scan_leaf->data_crc);
431
432         if (sim_de == NULL) {
433                 sim_de = calloc(sizeof(*sim_de), 1);
434                 sim_de->crc = scan_leaf->data_crc;
435                 RB_INSERT(sim_dedup_entry_rb_tree, &sim_dedup_tree, sim_de);
436                 MemoryUse += sizeof(*sim_de);
437         }
438
439         sim_de->ref_blks += 1;
440         sim_de->ref_size += scan_leaf->data_len;
441         return (1);
442 }
443
444 static __inline int
445 validate_dedup_pair(hammer_btree_leaf_elm_t p, hammer_btree_leaf_elm_t q)
446 {
447         if ((p->data_offset & HAMMER_OFF_ZONE_MASK) !=
448             (q->data_offset & HAMMER_OFF_ZONE_MASK)) {
449                 return (1);
450         }
451         if (p->data_len != q->data_len) {
452                 return (1);
453         }
454
455         return (0);
456 }
457
458 #define DEDUP_TECH_FAILURE      1
459 #define DEDUP_CMP_FAILURE       2
460 #define DEDUP_INVALID_ZONE      3
461 #define DEDUP_UNDERFLOW         4
462 #define DEDUP_VERS_FAILURE      5
463
464 static __inline int
465 deduplicate(hammer_btree_leaf_elm_t p, hammer_btree_leaf_elm_t q)
466 {
467         struct hammer_ioc_dedup dedup;
468
469         bzero(&dedup, sizeof(dedup));
470
471         /*
472          * If data_offset fields are the same there is no need to run ioctl,
473          * candidate is already dedup'ed.
474          */
475         if (p->data_offset == q->data_offset) {
476                 return (0);
477         }
478
479         dedup.elm1 = p->base;
480         dedup.elm2 = q->base;
481         RunningIoctl = 1;
482         if (ioctl(glob_fd, HAMMERIOC_DEDUP, &dedup) < 0) {
483                 if (errno == EOPNOTSUPP) {
484                         /* must be at least version 5 */
485                         return (DEDUP_VERS_FAILURE);
486                 }
487                 /* Technical failure - locking or w/e */
488                 return (DEDUP_TECH_FAILURE);
489         }
490         if (dedup.head.flags & HAMMER_IOC_DEDUP_CMP_FAILURE) {
491                 return (DEDUP_CMP_FAILURE);
492         }
493         if (dedup.head.flags & HAMMER_IOC_DEDUP_INVALID_ZONE) {
494                 return (DEDUP_INVALID_ZONE);
495         }
496         if (dedup.head.flags & HAMMER_IOC_DEDUP_UNDERFLOW) {
497                 return (DEDUP_UNDERFLOW);
498         }
499         RunningIoctl = 0;
500         ++dedup_successes_count;
501         dedup_successes_bytes += p->data_len;
502         return (0);
503 }
504
505 static int
506 process_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags)
507 {
508         struct dedup_entry *de;
509         struct sha_dedup_entry *sha_de, temp;
510         struct pass2_dedup_entry *pass2_de;
511         int error;
512
513         /*
514          * If we are using too much memory we have to clean some out, which
515          * will cause the run to use multiple passes.  Be careful of integer
516          * overflows!
517          */
518         while (MemoryUse > MemoryLimit) {
519                 DedupCrcEnd = DedupCrcStart +
520                               (u_int32_t)(DedupCrcEnd - DedupCrcStart - 1) / 2;
521                 if (VerboseOpt) {
522                         printf("memory limit  crc-range %08x-%08x\n",
523                                 DedupCrcStart, DedupCrcEnd);
524                         fflush(stdout);
525                 }
526
527                 for (;;) {
528                         de = RB_MAX(dedup_entry_rb_tree, &dedup_tree);
529                         if (de == NULL || de->leaf.data_crc < DedupCrcEnd)
530                                 break;
531                         if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
532                                 while ((sha_de = RB_ROOT(&de->u.fict_root)) !=
533                                        NULL) {
534                                         RB_REMOVE(sha_dedup_entry_rb_tree,
535                                                   &de->u.fict_root, sha_de);
536                                         MemoryUse -= sizeof(*sha_de);
537                                         free(sha_de);
538                                 }
539                         }
540                         RB_REMOVE(dedup_entry_rb_tree, &dedup_tree, de);
541                         MemoryUse -= sizeof(*de);
542                         free(de);
543                 }
544         }
545
546         /*
547          * Collect statistics based on the CRC.  Colliding CRCs usually
548          * cause a SHA sub-tree to be created under the de.
549          *
550          * Trivial case if de not found.
551          */
552         de = RB_LOOKUP(dedup_entry_rb_tree, &dedup_tree, scan_leaf->data_crc);
553         if (de == NULL) {
554                 de = calloc(sizeof(*de), 1);
555                 de->leaf = *scan_leaf;
556                 RB_INSERT(dedup_entry_rb_tree, &dedup_tree, de);
557                 MemoryUse += sizeof(*de);
558                 goto upgrade_stats;
559         }
560
561         /*
562          * Found entry in CRC tree
563          */
564         if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
565                 /*
566                  * Optimize the case where a CRC failure results in multiple
567                  * SHA entries.  If we unconditionally issue a data-read a
568                  * degenerate situation where a colliding CRC's second SHA
569                  * entry contains the lion's share of the deduplication
570                  * candidates will result in excessive data block reads.
571                  *
572                  * Deal with the degenerate case by looking for a matching
573                  * data_offset/data_len in the SHA elements we already have
574                  * before reading the data block and generating a new SHA.
575                  */
576                 RB_FOREACH(sha_de, sha_dedup_entry_rb_tree, &de->u.fict_root) {
577                         if (sha_de->leaf.data_offset ==
578                                                 scan_leaf->data_offset &&
579                             sha_de->leaf.data_len == scan_leaf->data_len) {
580                                 memcpy(temp.sha_hash, sha_de->sha_hash,
581                                         SHA256_DIGEST_LENGTH);
582                                 break;
583                         }
584                 }
585
586                 /*
587                  * Entry in CRC tree is fictitious, so we already had problems
588                  * with this CRC. Upgrade (compute SHA) the candidate and
589                  * dive into SHA subtree. If upgrade fails insert the candidate
590                  * into Pass2 list (it will be processed later).
591                  */
592                 if (sha_de == NULL) {
593                         if (upgrade_chksum(scan_leaf, temp.sha_hash))
594                                 goto pass2_insert;
595
596                         sha_de = RB_FIND(sha_dedup_entry_rb_tree,
597                                          &de->u.fict_root, &temp);
598                 }
599
600                 /*
601                  * Nothing in SHA subtree so far, so this is a new
602                  * 'dataset'. Insert new entry into SHA subtree.
603                  */
604                 if (sha_de == NULL) {
605                         sha_de = calloc(sizeof(*sha_de), 1);
606                         sha_de->leaf = *scan_leaf;
607                         memcpy(sha_de->sha_hash, temp.sha_hash,
608                                SHA256_DIGEST_LENGTH);
609                         RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root,
610                                   sha_de);
611                         MemoryUse += sizeof(*sha_de);
612                         goto upgrade_stats_sha;
613                 }
614
615                 /*
616                  * Found entry in SHA subtree, it means we have a potential
617                  * dedup pair. Validate it (zones have to match and data_len
618                  * field have to be the same too. If validation fails, treat
619                  * it as a SHA collision (jump to sha256_failure).
620                  */
621                 if (validate_dedup_pair(&sha_de->leaf, scan_leaf))
622                         goto sha256_failure;
623
624                 /*
625                  * We have a valid dedup pair (SHA match, validated).
626                  *
627                  * In case of technical failure (dedup pair was good, but
628                  * ioctl failed anyways) insert the candidate into Pass2 list
629                  * (we will try to dedup it after we are done with the rest of
630                  * the tree).
631                  *
632                  * If ioctl fails because either of blocks is in the non-dedup
633                  * zone (we can dedup only in LARGE_DATA and SMALL_DATA) don't
634                  * bother with the candidate and terminate early.
635                  *
636                  * If ioctl fails because of big-block underflow replace the
637                  * leaf node that found dedup entry represents with scan_leaf.
638                  */
639                 error = deduplicate(&sha_de->leaf, scan_leaf);
640                 switch(error) {
641                 case 0:
642                         goto upgrade_stats_sha;
643                 case DEDUP_TECH_FAILURE:
644                         goto pass2_insert;
645                 case DEDUP_CMP_FAILURE:
646                         goto sha256_failure;
647                 case DEDUP_INVALID_ZONE:
648                         goto terminate_early;
649                 case DEDUP_UNDERFLOW:
650                         ++dedup_underflows;
651                         sha_de->leaf = *scan_leaf;
652                         memcpy(sha_de->sha_hash, temp.sha_hash,
653                                 SHA256_DIGEST_LENGTH);
654                         goto upgrade_stats_sha;
655                 case DEDUP_VERS_FAILURE:
656                         fprintf(stderr,
657                                 "HAMMER filesystem must be at least "
658                                 "version 5 to dedup\n");
659                         exit (1);
660                 default:
661                         fprintf(stderr, "Unknown error\n");
662                         goto terminate_early;
663                 }
664
665                 /*
666                  * Ooh la la.. SHA-256 collision. Terminate early, there's
667                  * nothing we can do here.
668                  */
669 sha256_failure:
670                 ++dedup_sha_failures;
671                 goto terminate_early;
672         } else {
673                 /*
674                  * Candidate CRC is good for now (we found an entry in CRC
675                  * tree and it's not fictitious). This means we have a
676                  * potential dedup pair.
677                  */
678                 if (validate_dedup_pair(&de->leaf, scan_leaf))
679                         goto crc_failure;
680
681                 /*
682                  * We have a valid dedup pair (CRC match, validated)
683                  */
684                 error = deduplicate(&de->leaf, scan_leaf);
685                 switch(error) {
686                 case 0:
687                         goto upgrade_stats;
688                 case DEDUP_TECH_FAILURE:
689                         goto pass2_insert;
690                 case DEDUP_CMP_FAILURE:
691                         goto crc_failure;
692                 case DEDUP_INVALID_ZONE:
693                         goto terminate_early;
694                 case DEDUP_UNDERFLOW:
695                         ++dedup_underflows;
696                         de->leaf = *scan_leaf;
697                         goto upgrade_stats;
698                 case DEDUP_VERS_FAILURE:
699                         fprintf(stderr,
700                                 "HAMMER filesystem must be at least "
701                                 "version 5 to dedup\n");
702                         exit (1);
703                 default:
704                         fprintf(stderr, "Unknown error\n");
705                         goto terminate_early;
706                 }
707
708 crc_failure:
709                 /*
710                  * We got a CRC collision - either ioctl failed because of
711                  * the comparison failure or validation of the potential
712                  * dedup pair went bad. In all cases insert both blocks
713                  * into SHA subtree (this requires checksum upgrade) and mark
714                  * entry that corresponds to this CRC in the CRC tree
715                  * fictitious, so that all futher operations with this CRC go
716                  * through SHA subtree.
717                  */
718                 ++dedup_crc_failures;
719
720                 /*
721                  * Insert block that was represented by now fictitious dedup
722                  * entry (create a new SHA entry and preserve stats of the
723                  * old CRC one). If checksum upgrade fails insert the
724                  * candidate into Pass2 list and return - keep both trees
725                  * unmodified.
726                  */
727                 sha_de = calloc(sizeof(*sha_de), 1);
728                 sha_de->leaf = de->leaf;
729                 sha_de->ref_blks = de->u.de.ref_blks;
730                 sha_de->ref_size = de->u.de.ref_size;
731                 if (upgrade_chksum(&sha_de->leaf, sha_de->sha_hash)) {
732                         free(sha_de);
733                         goto pass2_insert;
734                 }
735                 MemoryUse += sizeof(*sha_de);
736
737                 RB_INIT(&de->u.fict_root);
738                 /*
739                  * Here we can insert without prior checking because the tree
740                  * is empty at this point
741                  */
742                 RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, sha_de);
743
744                 /*
745                  * Mark entry in CRC tree fictitious
746                  */
747                 de->flags |= HAMMER_DEDUP_ENTRY_FICTITIOUS;
748
749                 /*
750                  * Upgrade checksum of the candidate and insert it into
751                  * SHA subtree. If upgrade fails insert the candidate into
752                  * Pass2 list.
753                  */
754                 if (upgrade_chksum(scan_leaf, temp.sha_hash)) {
755                         goto pass2_insert;
756                 }
757                 sha_de = RB_FIND(sha_dedup_entry_rb_tree, &de->u.fict_root,
758                                  &temp);
759                 if (sha_de != NULL)
760                         /* There is an entry with this SHA already, but the only
761                          * RB-tree element at this point is that entry we just
762                          * added. We know for sure these blocks are different
763                          * (this is crc_failure branch) so treat it as SHA
764                          * collision.
765                          */
766                         goto sha256_failure;
767
768                 sha_de = calloc(sizeof(*sha_de), 1);
769                 sha_de->leaf = *scan_leaf;
770                 memcpy(sha_de->sha_hash, temp.sha_hash, SHA256_DIGEST_LENGTH);
771                 RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, sha_de);
772                 MemoryUse += sizeof(*sha_de);
773                 goto upgrade_stats_sha;
774         }
775
776 upgrade_stats:
777         de->u.de.ref_blks += 1;
778         de->u.de.ref_size += scan_leaf->data_len;
779         return (1);
780
781 upgrade_stats_sha:
782         sha_de->ref_blks += 1;
783         sha_de->ref_size += scan_leaf->data_len;
784         return (1);
785
786 pass2_insert:
787         /*
788          * If in pass2 mode don't insert anything, fall through to
789          * terminate_early
790          */
791         if ((flags & DEDUP_PASS2) == 0) {
792                 pass2_de = calloc(sizeof(*pass2_de), 1);
793                 pass2_de->leaf = *scan_leaf;
794                 STAILQ_INSERT_TAIL(&pass2_dedup_queue, pass2_de, sq_entry);
795                 dedup_skipped_size += scan_leaf->data_len;
796                 return (1);
797         }
798
799 terminate_early:
800         /*
801          * Early termination path. Fixup stats.
802          */
803         dedup_alloc_size += scan_leaf->data_len;
804         dedup_ref_size += scan_leaf->data_len;
805         return (0);
806 }
807
808 static int
809 upgrade_chksum(hammer_btree_leaf_elm_t leaf, u_int8_t *sha_hash)
810 {
811         struct hammer_ioc_data data;
812         char *buf = malloc(DEDUP_BUF);
813         SHA256_CTX ctx;
814         int error;
815
816         bzero(&data, sizeof(data));
817         data.elm = leaf->base;
818         data.ubuf = buf;
819         data.size = DEDUP_BUF;
820
821         error = 0;
822         if (ioctl(glob_fd, HAMMERIOC_GET_DATA, &data) < 0) {
823                 fprintf(stderr, "Get-data failed: %s\n", strerror(errno));
824                 error = 1;
825                 goto done;
826         }
827         DedupDataReads += leaf->data_len;
828
829         if (data.leaf.data_len != leaf->data_len) {
830                 error = 1;
831                 goto done;
832         }
833
834         if (data.leaf.base.btype == HAMMER_BTREE_TYPE_RECORD &&
835             data.leaf.base.rec_type == HAMMER_RECTYPE_DATA) {
836                 SHA256_Init(&ctx);
837                 SHA256_Update(&ctx, (void *)buf, data.leaf.data_len);
838                 SHA256_Final(sha_hash, &ctx);
839         }
840
841 done:
842         free(buf);
843         return (error);
844 }
845
846 static void
847 sigAlrm(int signo __unused)
848 {
849         SigAlrmFlag = 1;
850 }
851
852 static void
853 sigInfo(int signo __unused)
854 {
855         SigInfoFlag = 1;
856 }
857
858 static void
859 scan_pfs(char *filesystem, scan_pfs_cb_t func, const char *id)
860 {
861         struct hammer_ioc_mirror_rw mirror;
862         hammer_ioc_mrecord_any_t mrec;
863         struct hammer_btree_leaf_elm elm;
864         char *buf = malloc(DEDUP_BUF);
865         char buf1[8];
866         char buf2[8];
867         int offset, bytes;
868
869         SigInfoFlag = 0;
870         DedupDataReads = 0;
871         DedupCurrentRecords = 0;
872         signal(SIGINFO, sigInfo);
873         signal(SIGALRM, sigAlrm);
874
875         /*
876          * Deduplication happens per element so hammer(8) is in full
877          * control of the ioctl()s to actually perform it. SIGALRM
878          * needs to be handled within hammer(8) but a checkpoint
879          * is needed for resuming. Use cycle file for that.
880          *
881          * Try to obtain the previous obj_id from the cycle file and
882          * if not available just start from the beginning.
883          */
884         bzero(&mirror, sizeof(mirror));
885         hammer_key_beg_init(&mirror.key_beg);
886         hammer_get_cycle(&mirror.key_beg, &mirror.tid_beg);
887
888         if (mirror.key_beg.obj_id != (int64_t)HAMMER_MIN_OBJID) {
889                 if (VerboseOpt)
890                         fprintf(stderr, "%s: mirror-read: Resuming at object %016jx\n",
891                             id, (uintmax_t)mirror.key_beg.obj_id);
892         }
893
894         hammer_key_end_init(&mirror.key_end);
895
896         mirror.tid_beg = glob_pfs.ondisk->sync_beg_tid;
897         mirror.tid_end = glob_pfs.ondisk->sync_end_tid;
898         mirror.head.flags |= HAMMER_IOC_MIRROR_NODATA; /* we want only keys */
899         mirror.ubuf = buf;
900         mirror.size = DEDUP_BUF;
901         mirror.pfs_id = glob_pfs.pfs_id;
902         mirror.shared_uuid = glob_pfs.ondisk->shared_uuid;
903
904         if (VerboseOpt && DedupCrcStart == 0) {
905                 printf("%s %s: objspace %016jx:%04x %016jx:%04x\n",
906                         id, filesystem,
907                         (uintmax_t)mirror.key_beg.obj_id,
908                         mirror.key_beg.localization,
909                         (uintmax_t)mirror.key_end.obj_id,
910                         mirror.key_end.localization);
911                 printf("%s %s: pfs_id %d\n",
912                         id, filesystem, glob_pfs.pfs_id);
913         }
914         fflush(stdout);
915         fflush(stderr);
916
917         do {
918                 mirror.count = 0;
919                 mirror.pfs_id = glob_pfs.pfs_id;
920                 mirror.shared_uuid = glob_pfs.ondisk->shared_uuid;
921                 if (ioctl(glob_fd, HAMMERIOC_MIRROR_READ, &mirror) < 0) {
922                         fprintf(stderr, "Mirror-read %s failed: %s\n",
923                                 filesystem, strerror(errno));
924                         exit(1);
925                 }
926                 if (mirror.head.flags & HAMMER_IOC_HEAD_ERROR) {
927                         fprintf(stderr, "Mirror-read %s fatal error %d\n",
928                                 filesystem, mirror.head.error);
929                         exit(1);
930                 }
931                 if (mirror.count) {
932                         offset = 0;
933                         while (offset < mirror.count) {
934                                 mrec = (void *)((char *)buf + offset);
935                                 bytes = HAMMER_HEAD_DOALIGN(mrec->head.rec_size);
936                                 if (offset + bytes > mirror.count) {
937                                         fprintf(stderr, "Misaligned record\n");
938                                         exit(1);
939                                 }
940                                 assert((mrec->head.type &
941                                        HAMMER_MRECF_TYPE_MASK) ==
942                                        HAMMER_MREC_TYPE_REC);
943                                 offset += bytes;
944                                 elm = mrec->rec.leaf;
945                                 if (elm.base.btype != HAMMER_BTREE_TYPE_RECORD)
946                                         continue;
947                                 if (elm.base.rec_type != HAMMER_RECTYPE_DATA)
948                                         continue;
949                                 ++DedupCurrentRecords;
950                                 if (DedupCrcStart != DedupCrcEnd) {
951                                         if (elm.data_crc < DedupCrcStart)
952                                                 continue;
953                                         if (DedupCrcEnd &&
954                                             elm.data_crc >= DedupCrcEnd) {
955                                                 continue;
956                                         }
957                                 }
958                                 func(&elm, 0);
959                         }
960                 }
961                 mirror.key_beg = mirror.key_cur;
962                 if (DidInterrupt || SigAlrmFlag) {
963                         if (VerboseOpt)
964                                 fprintf(stderr, "%s\n",
965                                     (DidInterrupt ? "Interrupted" : "Timeout"));
966                         hammer_set_cycle(&mirror.key_cur, mirror.tid_beg);
967                         if (VerboseOpt)
968                                 fprintf(stderr, "Cyclefile %s updated for "
969                                     "continuation\n", CyclePath);
970                         exit(1);
971                 }
972                 if (SigInfoFlag) {
973                         if (DedupTotalRecords) {
974                                 humanize_unsigned(buf1, sizeof(buf1),
975                                                   DedupDataReads,
976                                                   "B", 1024);
977                                 humanize_unsigned(buf2, sizeof(buf2),
978                                                   dedup_successes_bytes,
979                                                   "B", 1024);
980                                 fprintf(stderr, "%s count %7jd/%jd "
981                                                 "(%02d.%02d%%) "
982                                                 "ioread %s newddup %s\n",
983                                         id,
984                                         (intmax_t)DedupCurrentRecords,
985                                         (intmax_t)DedupTotalRecords,
986                                         (int)(DedupCurrentRecords * 100 /
987                                                 DedupTotalRecords),
988                                         (int)(DedupCurrentRecords * 10000 /
989                                                 DedupTotalRecords % 100),
990                                         buf1, buf2);
991                         } else {
992                                 fprintf(stderr, "%s count %-7jd\n",
993                                         id,
994                                         (intmax_t)DedupCurrentRecords);
995                         }
996                         SigInfoFlag = 0;
997                 }
998         } while (mirror.count != 0);
999
1000         signal(SIGINFO, SIG_IGN);
1001         signal(SIGALRM, SIG_IGN);
1002
1003         free(buf);
1004 }
1005
1006 static void
1007 dump_simulated_dedup(void)
1008 {
1009         struct sim_dedup_entry *sim_de;
1010
1011         printf("=== Dumping simulated dedup entries:\n");
1012         RB_FOREACH(sim_de, sim_dedup_entry_rb_tree, &sim_dedup_tree) {
1013                 printf("\tcrc=%08x cnt=%ju size=%ju\n",
1014                         sim_de->crc,
1015                         (intmax_t)sim_de->ref_blks,
1016                         (intmax_t)sim_de->ref_size);
1017         }
1018         printf("end of dump ===\n");
1019 }
1020
1021 static void
1022 dump_real_dedup(void)
1023 {
1024         struct dedup_entry *de;
1025         struct sha_dedup_entry *sha_de;
1026         int i;
1027
1028         printf("=== Dumping dedup entries:\n");
1029         RB_FOREACH(de, dedup_entry_rb_tree, &dedup_tree) {
1030                 if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
1031                         printf("\tcrc=%08x fictitious\n", de->leaf.data_crc);
1032
1033                         RB_FOREACH(sha_de, sha_dedup_entry_rb_tree, &de->u.fict_root) {
1034                                 printf("\t\tcrc=%08x cnt=%ju size=%ju\n\t"
1035                                        "\t\tsha=",
1036                                        sha_de->leaf.data_crc,
1037                                        (intmax_t)sha_de->ref_blks,
1038                                        (intmax_t)sha_de->ref_size);
1039                                 for (i = 0; i < SHA256_DIGEST_LENGTH; ++i)
1040                                         printf("%02x", sha_de->sha_hash[i]);
1041                                 printf("\n");
1042                         }
1043                 } else {
1044                         printf("\tcrc=%08x cnt=%ju size=%ju\n",
1045                                de->leaf.data_crc,
1046                                (intmax_t)de->u.de.ref_blks,
1047                                (intmax_t)de->u.de.ref_size);
1048                 }
1049         }
1050         printf("end of dump ===\n");
1051 }
1052
1053 static void
1054 dedup_usage(int code)
1055 {
1056         fprintf(stderr,
1057                 "hammer dedup-simulate <filesystem>\n"
1058                 "hammer dedup <filesystem>\n"
1059         );
1060         exit(code);
1061 }