ipfw3_nat: move housekeeping into callout func
[dragonfly.git] / sbin / hammer / cmd_dedup.c
1 /*
2  * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Ilya Dryomov <idryomov@gmail.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34
35 #include <libutil.h>
36 #include <crypto/sha2/sha2.h>
37
38 #include "hammer.h"
39
40 #define DEDUP_BUF (64 * 1024)
41
42 /* Sorted list of block CRCs - light version for dedup-simulate */
43 struct sim_dedup_entry_rb_tree;
44 RB_HEAD(sim_dedup_entry_rb_tree, sim_dedup_entry) sim_dedup_tree =
45                                         RB_INITIALIZER(&sim_dedup_tree);
46 RB_PROTOTYPE2(sim_dedup_entry_rb_tree, sim_dedup_entry, rb_entry,
47                 rb_sim_dedup_entry_compare, hammer_crc_t);
48
49 struct sim_dedup_entry {
50         hammer_crc_t    crc;
51         uint64_t        ref_blks; /* number of blocks referenced */
52         uint64_t        ref_size; /* size of data referenced */
53         RB_ENTRY(sim_dedup_entry) rb_entry;
54 };
55
56 struct dedup_entry {
57         struct hammer_btree_leaf_elm leaf;
58         union {
59                 struct {
60                         uint64_t ref_blks;
61                         uint64_t ref_size;
62                 } de;
63                 RB_HEAD(sha_dedup_entry_rb_tree, sha_dedup_entry) fict_root;
64         } u;
65         uint8_t flags;
66         RB_ENTRY(dedup_entry) rb_entry;
67 };
68
69 #define HAMMER_DEDUP_ENTRY_FICTITIOUS   0x0001
70
71 struct sha_dedup_entry {
72         struct hammer_btree_leaf_elm    leaf;
73         uint64_t                        ref_blks;
74         uint64_t                        ref_size;
75         uint8_t                         sha_hash[SHA256_DIGEST_LENGTH];
76         RB_ENTRY(sha_dedup_entry)       fict_entry;
77 };
78
79 /* Sorted list of HAMMER B-Tree keys */
80 struct dedup_entry_rb_tree;
81 struct sha_dedup_entry_rb_tree;
82
83 RB_HEAD(dedup_entry_rb_tree, dedup_entry) dedup_tree =
84                                         RB_INITIALIZER(&dedup_tree);
85 RB_PROTOTYPE2(dedup_entry_rb_tree, dedup_entry, rb_entry,
86                 rb_dedup_entry_compare, hammer_crc_t);
87
88 RB_PROTOTYPE(sha_dedup_entry_rb_tree, sha_dedup_entry, fict_entry,
89                 rb_sha_dedup_entry_compare);
90
91 /*
92  * Pass2 list - contains entries that were not dedup'ed because ioctl failed
93  */
94 STAILQ_HEAD(, pass2_dedup_entry) pass2_dedup_queue =
95                                 STAILQ_HEAD_INITIALIZER(pass2_dedup_queue);
96
97 struct pass2_dedup_entry {
98         struct hammer_btree_leaf_elm    leaf;
99         STAILQ_ENTRY(pass2_dedup_entry) sq_entry;
100 };
101
102 #define DEDUP_PASS2     0x0001 /* process_btree_elm() mode */
103
104 static int SigInfoFlag;
105 static int SigAlrmFlag;
106 static int64_t DedupDataReads;
107 static int64_t DedupCurrentRecords;
108 static int64_t DedupTotalRecords;
109 static uint32_t DedupCrcStart;
110 static uint32_t DedupCrcEnd;
111 static uint64_t MemoryUse;
112
113 /* PFS global ids - we deal with just one PFS at a run */
114 static int glob_fd;
115 static struct hammer_ioc_pseudofs_rw glob_pfs;
116
117 /*
118  * Global accounting variables
119  *
120  * Last three don't have to be 64-bit, just to be safe..
121  */
122 static uint64_t dedup_alloc_size;
123 static uint64_t dedup_ref_size;
124 static uint64_t dedup_skipped_size;
125 static uint64_t dedup_crc_failures;
126 static uint64_t dedup_sha_failures;
127 static uint64_t dedup_underflows;
128 static uint64_t dedup_successes_count;
129 static uint64_t dedup_successes_bytes;
130
131 static int rb_sim_dedup_entry_compare(struct sim_dedup_entry *sim_de1,
132                                 struct sim_dedup_entry *sim_de2);
133 static int rb_dedup_entry_compare(struct dedup_entry *de1,
134                                 struct dedup_entry *de2);
135 static int rb_sha_dedup_entry_compare(struct sha_dedup_entry *sha_de1,
136                                 struct sha_dedup_entry *sha_de2);
137 typedef int (*scan_pfs_cb_t)(hammer_btree_leaf_elm_t scan_leaf, int flags);
138 static void scan_pfs(char *filesystem, scan_pfs_cb_t func, const char *id);
139 static int collect_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags);
140 static int count_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags);
141 static int process_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags);
142 static int upgrade_chksum(hammer_btree_leaf_elm_t leaf, uint8_t *sha_hash);
143 static void dump_simulated_dedup(void);
144 static void dump_real_dedup(void);
145 static void dedup_usage(int code);
146
147 RB_GENERATE2(sim_dedup_entry_rb_tree, sim_dedup_entry, rb_entry,
148                 rb_sim_dedup_entry_compare, hammer_crc_t, crc);
149 RB_GENERATE2(dedup_entry_rb_tree, dedup_entry, rb_entry,
150                 rb_dedup_entry_compare, hammer_crc_t, leaf.data_crc);
151 RB_GENERATE(sha_dedup_entry_rb_tree, sha_dedup_entry, fict_entry,
152                 rb_sha_dedup_entry_compare);
153
154 static
155 int
156 rb_sim_dedup_entry_compare(struct sim_dedup_entry *sim_de1,
157                         struct sim_dedup_entry *sim_de2)
158 {
159         if (sim_de1->crc < sim_de2->crc)
160                 return (-1);
161         if (sim_de1->crc > sim_de2->crc)
162                 return (1);
163
164         return (0);
165 }
166
167 static
168 int
169 rb_dedup_entry_compare(struct dedup_entry *de1, struct dedup_entry *de2)
170 {
171         if (de1->leaf.data_crc < de2->leaf.data_crc)
172                 return (-1);
173         if (de1->leaf.data_crc > de2->leaf.data_crc)
174                 return (1);
175
176         return (0);
177 }
178
179 static
180 int
181 rb_sha_dedup_entry_compare(struct sha_dedup_entry *sha_de1,
182                         struct sha_dedup_entry *sha_de2)
183 {
184         unsigned long *h1 = (unsigned long *)&sha_de1->sha_hash;
185         unsigned long *h2 = (unsigned long *)&sha_de2->sha_hash;
186         int i;
187
188         for (i = 0; i < SHA256_DIGEST_LENGTH / (int)sizeof(unsigned long); ++i) {
189                 if (h1[i] < h2[i])
190                         return (-1);
191                 if (h1[i] > h2[i])
192                         return (1);
193         }
194
195         return (0);
196 }
197
198 /*
199  * dedup-simulate <filesystem>
200  */
201 void
202 hammer_cmd_dedup_simulate(char **av, int ac)
203 {
204         struct sim_dedup_entry *sim_de;
205
206         if (ac != 1) {
207                 dedup_usage(1);
208                 /* not reached */
209         }
210
211         glob_fd = getpfs(&glob_pfs, av[0]);
212
213         /*
214          * Collection passes (memory limited)
215          */
216         printf("Dedup-simulate running\n");
217         do {
218                 DedupCrcStart = DedupCrcEnd;
219                 DedupCrcEnd = 0;
220                 MemoryUse = 0;
221
222                 if (VerboseOpt) {
223                         printf("B-Tree pass  crc-range %08x-max\n",
224                                 DedupCrcStart);
225                         fflush(stdout);
226                 }
227                 scan_pfs(av[0], collect_btree_elm, "simu-pass");
228
229                 if (VerboseOpt >= 2)
230                         dump_simulated_dedup();
231
232                 /*
233                  * Calculate simulated dedup ratio and get rid of the tree
234                  */
235                 while ((sim_de = RB_ROOT(&sim_dedup_tree)) != NULL) {
236                         assert(sim_de->ref_blks != 0);
237                         dedup_ref_size += sim_de->ref_size;
238                         dedup_alloc_size += sim_de->ref_size / sim_de->ref_blks;
239
240                         RB_REMOVE(sim_dedup_entry_rb_tree, &sim_dedup_tree, sim_de);
241                         free(sim_de);
242                 }
243                 if (DedupCrcEnd && VerboseOpt == 0)
244                         printf(".");
245         } while (DedupCrcEnd);
246
247         printf("Dedup-simulate %s succeeded\n", av[0]);
248         relpfs(glob_fd, &glob_pfs);
249
250         printf("Simulated dedup ratio = %.2f\n",
251             (dedup_alloc_size != 0) ?
252                 (double)dedup_ref_size / dedup_alloc_size : 0);
253 }
254
255 /*
256  * dedup <filesystem>
257  */
258 void
259 hammer_cmd_dedup(char **av, int ac)
260 {
261         struct dedup_entry *de;
262         struct sha_dedup_entry *sha_de;
263         struct pass2_dedup_entry *pass2_de;
264         char *tmp;
265         char buf[8];
266         int needfree = 0;
267
268         if (TimeoutOpt > 0)
269                 alarm(TimeoutOpt);
270
271         if (ac != 1) {
272                 dedup_usage(1);
273                 /* not reached */
274         }
275
276         STAILQ_INIT(&pass2_dedup_queue);
277
278         glob_fd = getpfs(&glob_pfs, av[0]);
279
280         /*
281          * A cycle file is _required_ for resuming dedup after the timeout
282          * specified with -t has expired. If no -c option, then place a
283          * .dedup.cycle file either in the PFS snapshots directory or in
284          * the default snapshots directory.
285          */
286         if (!CyclePath) {
287                 if (glob_pfs.ondisk->snapshots[0] != '/') {
288                         asprintf(&tmp, "%s/%s/.dedup.cycle",
289                             SNAPSHOTS_BASE, av[0]);
290                 } else {
291                         asprintf(&tmp, "%s/.dedup.cycle",
292                             glob_pfs.ondisk->snapshots);
293                 }
294                 CyclePath = tmp;
295                 needfree = 1;
296         }
297
298         /*
299          * Pre-pass to cache the btree
300          */
301         scan_pfs(av[0], count_btree_elm, "pre-pass ");
302         DedupTotalRecords = DedupCurrentRecords;
303
304         /*
305          * Collection passes (memory limited)
306          */
307         printf("Dedup running\n");
308         do {
309                 DedupCrcStart = DedupCrcEnd;
310                 DedupCrcEnd = 0;
311                 MemoryUse = 0;
312
313                 if (VerboseOpt) {
314                         printf("B-Tree pass  crc-range %08x-max\n",
315                                 DedupCrcStart);
316                         fflush(stdout);
317                 }
318                 scan_pfs(av[0], process_btree_elm, "main-pass");
319
320                 while ((pass2_de = STAILQ_FIRST(&pass2_dedup_queue)) != NULL) {
321                         if (process_btree_elm(&pass2_de->leaf, DEDUP_PASS2))
322                                 dedup_skipped_size -= pass2_de->leaf.data_len;
323
324                         STAILQ_REMOVE_HEAD(&pass2_dedup_queue, sq_entry);
325                         free(pass2_de);
326                 }
327                 assert(STAILQ_EMPTY(&pass2_dedup_queue));
328
329                 if (VerboseOpt >= 2)
330                         dump_real_dedup();
331
332                 /*
333                  * Calculate dedup ratio and get rid of the trees
334                  */
335                 while ((de = RB_ROOT(&dedup_tree)) != NULL) {
336                         if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
337                                 while ((sha_de = RB_ROOT(&de->u.fict_root)) != NULL) {
338                                         assert(sha_de->ref_blks != 0);
339                                         dedup_ref_size += sha_de->ref_size;
340                                         dedup_alloc_size += sha_de->ref_size / sha_de->ref_blks;
341
342                                         RB_REMOVE(sha_dedup_entry_rb_tree,
343                                                         &de->u.fict_root, sha_de);
344                                         free(sha_de);
345                                 }
346                                 assert(RB_EMPTY(&de->u.fict_root));
347                         } else {
348                                 assert(de->u.de.ref_blks != 0);
349                                 dedup_ref_size += de->u.de.ref_size;
350                                 dedup_alloc_size += de->u.de.ref_size / de->u.de.ref_blks;
351                         }
352
353                         RB_REMOVE(dedup_entry_rb_tree, &dedup_tree, de);
354                         free(de);
355                 }
356                 assert(RB_EMPTY(&dedup_tree));
357                 if (DedupCrcEnd && VerboseOpt == 0)
358                         printf(".");
359         } while (DedupCrcEnd);
360
361         printf("Dedup %s succeeded\n", av[0]);
362         relpfs(glob_fd, &glob_pfs);
363
364         humanize_unsigned(buf, sizeof(buf), dedup_ref_size, "B", 1024);
365         printf("Dedup ratio = %.2f (in this run)\n"
366                "    %8s referenced\n",
367                ((dedup_alloc_size != 0) ?
368                         (double)dedup_ref_size / dedup_alloc_size : 0),
369                buf
370         );
371         humanize_unsigned(buf, sizeof(buf), dedup_alloc_size, "B", 1024);
372         printf("    %8s allocated\n", buf);
373         humanize_unsigned(buf, sizeof(buf), dedup_skipped_size, "B", 1024);
374         printf("    %8s skipped\n", buf);
375         printf("    %8jd CRC collisions\n"
376                "    %8jd SHA collisions\n"
377                "    %8jd big-block underflows\n"
378                "    %8jd new dedup records\n"
379                "    %8jd new dedup bytes\n",
380                (intmax_t)dedup_crc_failures,
381                (intmax_t)dedup_sha_failures,
382                (intmax_t)dedup_underflows,
383                (intmax_t)dedup_successes_count,
384                (intmax_t)dedup_successes_bytes
385         );
386
387         /* Once completed remove cycle file */
388         hammer_reset_cycle();
389
390         /* We don't want to mess up with other directives */
391         if (needfree) {
392                 free(tmp);
393                 CyclePath = NULL;
394         }
395 }
396
397 static
398 int
399 count_btree_elm(hammer_btree_leaf_elm_t scan_leaf __unused, int flags __unused)
400 {
401         return(1);
402 }
403
404 static
405 int
406 collect_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags __unused)
407 {
408         struct sim_dedup_entry *sim_de;
409
410         /*
411          * If we are using too much memory we have to clean some out, which
412          * will cause the run to use multiple passes.  Be careful of integer
413          * overflows!
414          */
415         if (MemoryUse > MemoryLimit) {
416                 DedupCrcEnd = DedupCrcStart +
417                               (uint32_t)(DedupCrcEnd - DedupCrcStart - 1) / 2;
418                 if (VerboseOpt) {
419                         printf("memory limit  crc-range %08x-%08x\n",
420                                 DedupCrcStart, DedupCrcEnd);
421                         fflush(stdout);
422                 }
423                 for (;;) {
424                         sim_de = RB_MAX(sim_dedup_entry_rb_tree,
425                                         &sim_dedup_tree);
426                         if (sim_de == NULL || sim_de->crc < DedupCrcEnd)
427                                 break;
428                         RB_REMOVE(sim_dedup_entry_rb_tree,
429                                   &sim_dedup_tree, sim_de);
430                         MemoryUse -= sizeof(*sim_de);
431                         free(sim_de);
432                 }
433         }
434
435         /*
436          * Collect statistics based on the CRC only, do not try to read
437          * any data blocks or run SHA hashes.
438          */
439         sim_de = RB_LOOKUP(sim_dedup_entry_rb_tree, &sim_dedup_tree,
440                            scan_leaf->data_crc);
441
442         if (sim_de == NULL) {
443                 sim_de = calloc(1, sizeof(*sim_de));
444                 sim_de->crc = scan_leaf->data_crc;
445                 RB_INSERT(sim_dedup_entry_rb_tree, &sim_dedup_tree, sim_de);
446                 MemoryUse += sizeof(*sim_de);
447         }
448
449         sim_de->ref_blks += 1;
450         sim_de->ref_size += scan_leaf->data_len;
451         return (1);
452 }
453
454 static __inline
455 int
456 validate_dedup_pair(hammer_btree_leaf_elm_t p, hammer_btree_leaf_elm_t q)
457 {
458         if (HAMMER_ZONE(p->data_offset) != HAMMER_ZONE(q->data_offset))
459                 return (1);
460         if (p->data_len != q->data_len)
461                 return (1);
462
463         return (0);
464 }
465
466 #define DEDUP_TECH_FAILURE      1
467 #define DEDUP_CMP_FAILURE       2
468 #define DEDUP_INVALID_ZONE      3
469 #define DEDUP_UNDERFLOW         4
470 #define DEDUP_VERS_FAILURE      5
471
472 static __inline
473 int
474 deduplicate(hammer_btree_leaf_elm_t p, hammer_btree_leaf_elm_t q)
475 {
476         struct hammer_ioc_dedup dedup;
477
478         bzero(&dedup, sizeof(dedup));
479
480         /*
481          * If data_offset fields are the same there is no need to run ioctl,
482          * candidate is already dedup'ed.
483          */
484         if (p->data_offset == q->data_offset)
485                 return (0);
486
487         dedup.elm1 = p->base;
488         dedup.elm2 = q->base;
489         RunningIoctl = 1;
490         if (ioctl(glob_fd, HAMMERIOC_DEDUP, &dedup) < 0) {
491                 if (errno == EOPNOTSUPP)
492                         return (DEDUP_VERS_FAILURE); /* must be at least version 5 */
493                 /* Technical failure - locking or w/e */
494                 return (DEDUP_TECH_FAILURE);
495         }
496         if (dedup.head.flags & HAMMER_IOC_DEDUP_CMP_FAILURE)
497                 return (DEDUP_CMP_FAILURE);
498         if (dedup.head.flags & HAMMER_IOC_DEDUP_INVALID_ZONE)
499                 return (DEDUP_INVALID_ZONE);
500         if (dedup.head.flags & HAMMER_IOC_DEDUP_UNDERFLOW)
501                 return (DEDUP_UNDERFLOW);
502         RunningIoctl = 0;
503         ++dedup_successes_count;
504         dedup_successes_bytes += p->data_len;
505         return (0);
506 }
507
508 static
509 int
510 process_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags)
511 {
512         struct dedup_entry *de;
513         struct sha_dedup_entry *sha_de, temp;
514         struct pass2_dedup_entry *pass2_de;
515         int error;
516
517         /*
518          * If we are using too much memory we have to clean some out, which
519          * will cause the run to use multiple passes.  Be careful of integer
520          * overflows!
521          */
522         while (MemoryUse > MemoryLimit) {
523                 DedupCrcEnd = DedupCrcStart +
524                               (uint32_t)(DedupCrcEnd - DedupCrcStart - 1) / 2;
525                 if (VerboseOpt) {
526                         printf("memory limit  crc-range %08x-%08x\n",
527                                 DedupCrcStart, DedupCrcEnd);
528                         fflush(stdout);
529                 }
530
531                 for (;;) {
532                         de = RB_MAX(dedup_entry_rb_tree, &dedup_tree);
533                         if (de == NULL || de->leaf.data_crc < DedupCrcEnd)
534                                 break;
535                         if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
536                                 while ((sha_de = RB_ROOT(&de->u.fict_root)) !=
537                                        NULL) {
538                                         RB_REMOVE(sha_dedup_entry_rb_tree,
539                                                   &de->u.fict_root, sha_de);
540                                         MemoryUse -= sizeof(*sha_de);
541                                         free(sha_de);
542                                 }
543                         }
544                         RB_REMOVE(dedup_entry_rb_tree, &dedup_tree, de);
545                         MemoryUse -= sizeof(*de);
546                         free(de);
547                 }
548         }
549
550         /*
551          * Collect statistics based on the CRC.  Colliding CRCs usually
552          * cause a SHA sub-tree to be created under the de.
553          *
554          * Trivial case if de not found.
555          */
556         de = RB_LOOKUP(dedup_entry_rb_tree, &dedup_tree, scan_leaf->data_crc);
557         if (de == NULL) {
558                 de = calloc(1, sizeof(*de));
559                 de->leaf = *scan_leaf;
560                 RB_INSERT(dedup_entry_rb_tree, &dedup_tree, de);
561                 MemoryUse += sizeof(*de);
562                 goto upgrade_stats;
563         }
564
565         /*
566          * Found entry in CRC tree
567          */
568         if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
569                 /*
570                  * Optimize the case where a CRC failure results in multiple
571                  * SHA entries.  If we unconditionally issue a data-read a
572                  * degenerate situation where a colliding CRC's second SHA
573                  * entry contains the lion's share of the deduplication
574                  * candidates will result in excessive data block reads.
575                  *
576                  * Deal with the degenerate case by looking for a matching
577                  * data_offset/data_len in the SHA elements we already have
578                  * before reading the data block and generating a new SHA.
579                  */
580                 RB_FOREACH(sha_de, sha_dedup_entry_rb_tree, &de->u.fict_root) {
581                         if (sha_de->leaf.data_offset ==
582                                                 scan_leaf->data_offset &&
583                             sha_de->leaf.data_len == scan_leaf->data_len) {
584                                 memcpy(temp.sha_hash, sha_de->sha_hash,
585                                         SHA256_DIGEST_LENGTH);
586                                 break;
587                         }
588                 }
589
590                 /*
591                  * Entry in CRC tree is fictitious, so we already had problems
592                  * with this CRC. Upgrade (compute SHA) the candidate and
593                  * dive into SHA subtree. If upgrade fails insert the candidate
594                  * into Pass2 list (it will be processed later).
595                  */
596                 if (sha_de == NULL) {
597                         if (upgrade_chksum(scan_leaf, temp.sha_hash))
598                                 goto pass2_insert;
599
600                         sha_de = RB_FIND(sha_dedup_entry_rb_tree,
601                                          &de->u.fict_root, &temp);
602                 }
603
604                 /*
605                  * Nothing in SHA subtree so far, so this is a new
606                  * 'dataset'. Insert new entry into SHA subtree.
607                  */
608                 if (sha_de == NULL) {
609                         sha_de = calloc(1, sizeof(*sha_de));
610                         sha_de->leaf = *scan_leaf;
611                         memcpy(sha_de->sha_hash, temp.sha_hash,
612                                SHA256_DIGEST_LENGTH);
613                         RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root,
614                                   sha_de);
615                         MemoryUse += sizeof(*sha_de);
616                         goto upgrade_stats_sha;
617                 }
618
619                 /*
620                  * Found entry in SHA subtree, it means we have a potential
621                  * dedup pair. Validate it (zones have to match and data_len
622                  * field have to be the same too. If validation fails, treat
623                  * it as a SHA collision (jump to sha256_failure).
624                  */
625                 if (validate_dedup_pair(&sha_de->leaf, scan_leaf))
626                         goto sha256_failure;
627
628                 /*
629                  * We have a valid dedup pair (SHA match, validated).
630                  *
631                  * In case of technical failure (dedup pair was good, but
632                  * ioctl failed anyways) insert the candidate into Pass2 list
633                  * (we will try to dedup it after we are done with the rest of
634                  * the tree).
635                  *
636                  * If ioctl fails because either of blocks is in the non-dedup
637                  * zone (we can dedup only in LARGE_DATA and SMALL_DATA) don't
638                  * bother with the candidate and terminate early.
639                  *
640                  * If ioctl fails because of big-block underflow replace the
641                  * leaf node that found dedup entry represents with scan_leaf.
642                  */
643                 error = deduplicate(&sha_de->leaf, scan_leaf);
644                 switch(error) {
645                 case 0:
646                         goto upgrade_stats_sha;
647                 case DEDUP_TECH_FAILURE:
648                         goto pass2_insert;
649                 case DEDUP_CMP_FAILURE:
650                         goto sha256_failure;
651                 case DEDUP_INVALID_ZONE:
652                         goto terminate_early;
653                 case DEDUP_UNDERFLOW:
654                         ++dedup_underflows;
655                         sha_de->leaf = *scan_leaf;
656                         memcpy(sha_de->sha_hash, temp.sha_hash,
657                                 SHA256_DIGEST_LENGTH);
658                         goto upgrade_stats_sha;
659                 case DEDUP_VERS_FAILURE:
660                         errx(1, "HAMMER filesystem must be at least "
661                                 "version 5 to dedup");
662                         /* not reached */
663                 default:
664                         fprintf(stderr, "Unknown error\n");
665                         goto terminate_early;
666                 }
667
668                 /*
669                  * Ooh la la.. SHA-256 collision. Terminate early, there's
670                  * nothing we can do here.
671                  */
672 sha256_failure:
673                 ++dedup_sha_failures;
674                 goto terminate_early;
675         } else {
676                 /*
677                  * Candidate CRC is good for now (we found an entry in CRC
678                  * tree and it's not fictitious). This means we have a
679                  * potential dedup pair.
680                  */
681                 if (validate_dedup_pair(&de->leaf, scan_leaf))
682                         goto crc_failure;
683
684                 /*
685                  * We have a valid dedup pair (CRC match, validated)
686                  */
687                 error = deduplicate(&de->leaf, scan_leaf);
688                 switch(error) {
689                 case 0:
690                         goto upgrade_stats;
691                 case DEDUP_TECH_FAILURE:
692                         goto pass2_insert;
693                 case DEDUP_CMP_FAILURE:
694                         goto crc_failure;
695                 case DEDUP_INVALID_ZONE:
696                         goto terminate_early;
697                 case DEDUP_UNDERFLOW:
698                         ++dedup_underflows;
699                         de->leaf = *scan_leaf;
700                         goto upgrade_stats;
701                 case DEDUP_VERS_FAILURE:
702                         errx(1, "HAMMER filesystem must be at least "
703                                 "version 5 to dedup");
704                         /* not reached */
705                 default:
706                         fprintf(stderr, "Unknown error\n");
707                         goto terminate_early;
708                 }
709
710 crc_failure:
711                 /*
712                  * We got a CRC collision - either ioctl failed because of
713                  * the comparison failure or validation of the potential
714                  * dedup pair went bad. In all cases insert both blocks
715                  * into SHA subtree (this requires checksum upgrade) and mark
716                  * entry that corresponds to this CRC in the CRC tree
717                  * fictitious, so that all futher operations with this CRC go
718                  * through SHA subtree.
719                  */
720                 ++dedup_crc_failures;
721
722                 /*
723                  * Insert block that was represented by now fictitious dedup
724                  * entry (create a new SHA entry and preserve stats of the
725                  * old CRC one). If checksum upgrade fails insert the
726                  * candidate into Pass2 list and return - keep both trees
727                  * unmodified.
728                  */
729                 sha_de = calloc(1, sizeof(*sha_de));
730                 sha_de->leaf = de->leaf;
731                 sha_de->ref_blks = de->u.de.ref_blks;
732                 sha_de->ref_size = de->u.de.ref_size;
733                 if (upgrade_chksum(&sha_de->leaf, sha_de->sha_hash)) {
734                         free(sha_de);
735                         goto pass2_insert;
736                 }
737                 MemoryUse += sizeof(*sha_de);
738
739                 RB_INIT(&de->u.fict_root);
740                 /*
741                  * Here we can insert without prior checking because the tree
742                  * is empty at this point
743                  */
744                 RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, sha_de);
745
746                 /*
747                  * Mark entry in CRC tree fictitious
748                  */
749                 de->flags |= HAMMER_DEDUP_ENTRY_FICTITIOUS;
750
751                 /*
752                  * Upgrade checksum of the candidate and insert it into
753                  * SHA subtree. If upgrade fails insert the candidate into
754                  * Pass2 list.
755                  */
756                 if (upgrade_chksum(scan_leaf, temp.sha_hash))
757                         goto pass2_insert;
758                 sha_de = RB_FIND(sha_dedup_entry_rb_tree, &de->u.fict_root,
759                                  &temp);
760                 if (sha_de != NULL) {
761                         /* There is an entry with this SHA already, but the only
762                          * RB-tree element at this point is that entry we just
763                          * added. We know for sure these blocks are different
764                          * (this is crc_failure branch) so treat it as SHA
765                          * collision.
766                          */
767                         goto sha256_failure;
768                 }
769
770                 sha_de = calloc(1, sizeof(*sha_de));
771                 sha_de->leaf = *scan_leaf;
772                 memcpy(sha_de->sha_hash, temp.sha_hash, SHA256_DIGEST_LENGTH);
773                 RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, sha_de);
774                 MemoryUse += sizeof(*sha_de);
775                 goto upgrade_stats_sha;
776         }
777
778 upgrade_stats:
779         de->u.de.ref_blks += 1;
780         de->u.de.ref_size += scan_leaf->data_len;
781         return (1);
782
783 upgrade_stats_sha:
784         sha_de->ref_blks += 1;
785         sha_de->ref_size += scan_leaf->data_len;
786         return (1);
787
788 pass2_insert:
789         /*
790          * If in pass2 mode don't insert anything, fall through to
791          * terminate_early
792          */
793         if ((flags & DEDUP_PASS2) == 0) {
794                 pass2_de = calloc(1, sizeof(*pass2_de));
795                 pass2_de->leaf = *scan_leaf;
796                 STAILQ_INSERT_TAIL(&pass2_dedup_queue, pass2_de, sq_entry);
797                 dedup_skipped_size += scan_leaf->data_len;
798                 return (1);
799         }
800
801 terminate_early:
802         /*
803          * Early termination path. Fixup stats.
804          */
805         dedup_alloc_size += scan_leaf->data_len;
806         dedup_ref_size += scan_leaf->data_len;
807         return (0);
808 }
809
810 static
811 int
812 upgrade_chksum(hammer_btree_leaf_elm_t leaf, uint8_t *sha_hash)
813 {
814         struct hammer_ioc_data data;
815         char *buf = malloc(DEDUP_BUF);
816         SHA256_CTX ctx;
817         int error;
818
819         bzero(&data, sizeof(data));
820         data.elm = leaf->base;
821         data.ubuf = buf;
822         data.size = DEDUP_BUF;
823
824         error = 0;
825         if (ioctl(glob_fd, HAMMERIOC_GET_DATA, &data) < 0) {
826                 fprintf(stderr, "Get-data failed: %s\n", strerror(errno));
827                 error = 1;
828                 goto done;
829         }
830         DedupDataReads += leaf->data_len;
831
832         if (data.leaf.data_len != leaf->data_len) {
833                 error = 1;
834                 goto done;
835         }
836
837         if (data.leaf.base.btype == HAMMER_BTREE_TYPE_RECORD &&
838             data.leaf.base.rec_type == HAMMER_RECTYPE_DATA) {
839                 SHA256_Init(&ctx);
840                 SHA256_Update(&ctx, (void *)buf, data.leaf.data_len);
841                 SHA256_Final(sha_hash, &ctx);
842         }
843
844 done:
845         free(buf);
846         return (error);
847 }
848
849 static
850 void
851 sigAlrm(int signo __unused)
852 {
853         SigAlrmFlag = 1;
854 }
855
856 static
857 void
858 sigInfo(int signo __unused)
859 {
860         SigInfoFlag = 1;
861 }
862
863 static
864 void
865 scan_pfs(char *filesystem, scan_pfs_cb_t func, const char *id)
866 {
867         struct hammer_ioc_mirror_rw mirror;
868         hammer_ioc_mrecord_any_t mrec;
869         struct hammer_btree_leaf_elm elm;
870         char *buf = malloc(DEDUP_BUF);
871         char buf1[8];
872         char buf2[8];
873         int offset, bytes;
874
875         SigInfoFlag = 0;
876         DedupDataReads = 0;
877         DedupCurrentRecords = 0;
878         signal(SIGINFO, sigInfo);
879         signal(SIGALRM, sigAlrm);
880
881         /*
882          * Deduplication happens per element so hammer(8) is in full
883          * control of the ioctl()s to actually perform it. SIGALRM
884          * needs to be handled within hammer(8) but a checkpoint
885          * is needed for resuming. Use cycle file for that.
886          *
887          * Try to obtain the previous obj_id from the cycle file and
888          * if not available just start from the beginning.
889          */
890         bzero(&mirror, sizeof(mirror));
891         hammer_key_beg_init(&mirror.key_beg);
892         hammer_get_cycle(&mirror.key_beg, &mirror.tid_beg);
893
894         if (mirror.key_beg.obj_id != (int64_t)HAMMER_MIN_OBJID) {
895                 if (VerboseOpt) {
896                         fprintf(stderr, "%s: mirror-read: Resuming at object %016jx\n",
897                             id, (uintmax_t)mirror.key_beg.obj_id);
898                 }
899         }
900
901         hammer_key_end_init(&mirror.key_end);
902
903         mirror.tid_beg = glob_pfs.ondisk->sync_beg_tid;
904         mirror.tid_end = glob_pfs.ondisk->sync_end_tid;
905         mirror.head.flags |= HAMMER_IOC_MIRROR_NODATA; /* we want only keys */
906         mirror.ubuf = buf;
907         mirror.size = DEDUP_BUF;
908         mirror.pfs_id = glob_pfs.pfs_id;
909         mirror.shared_uuid = glob_pfs.ondisk->shared_uuid;
910
911         if (VerboseOpt && DedupCrcStart == 0) {
912                 printf("%s %s: objspace %016jx:%04x %016jx:%04x\n",
913                         id, filesystem,
914                         (uintmax_t)mirror.key_beg.obj_id,
915                         mirror.key_beg.localization,
916                         (uintmax_t)mirror.key_end.obj_id,
917                         mirror.key_end.localization);
918                 printf("%s %s: pfs_id %d\n",
919                         id, filesystem, glob_pfs.pfs_id);
920         }
921         fflush(stdout);
922         fflush(stderr);
923
924         do {
925                 mirror.count = 0;
926                 mirror.pfs_id = glob_pfs.pfs_id;
927                 mirror.shared_uuid = glob_pfs.ondisk->shared_uuid;
928                 if (ioctl(glob_fd, HAMMERIOC_MIRROR_READ, &mirror) < 0) {
929                         err(1, "Mirror-read %s failed", filesystem);
930                         /* not reached */
931                 }
932                 if (mirror.head.flags & HAMMER_IOC_HEAD_ERROR) {
933                         errx(1, "Mirror-read %s fatal error %d",
934                                 filesystem, mirror.head.error);
935                         /* not reached */
936                 }
937                 if (mirror.count) {
938                         offset = 0;
939                         while (offset < mirror.count) {
940                                 mrec = (void *)((char *)buf + offset);
941                                 bytes = HAMMER_HEAD_DOALIGN(mrec->head.rec_size);
942                                 if (offset + bytes > mirror.count) {
943                                         errx(1, "Misaligned record");
944                                         /* not reached */
945                                 }
946                                 assert((mrec->head.type &
947                                        HAMMER_MRECF_TYPE_MASK) ==
948                                        HAMMER_MREC_TYPE_REC);
949                                 offset += bytes;
950                                 elm = mrec->rec.leaf;
951                                 if (elm.base.btype != HAMMER_BTREE_TYPE_RECORD)
952                                         continue;
953                                 if (elm.base.rec_type != HAMMER_RECTYPE_DATA)
954                                         continue;
955                                 ++DedupCurrentRecords;
956                                 if (DedupCrcStart != DedupCrcEnd) {
957                                         if (elm.data_crc < DedupCrcStart)
958                                                 continue;
959                                         if (DedupCrcEnd &&
960                                             elm.data_crc >= DedupCrcEnd) {
961                                                 continue;
962                                         }
963                                 }
964                                 func(&elm, 0);
965                         }
966                 }
967                 mirror.key_beg = mirror.key_cur;
968                 if (DidInterrupt || SigAlrmFlag) {
969                         if (VerboseOpt) {
970                                 fprintf(stderr, "%s\n",
971                                     (DidInterrupt ? "Interrupted" : "Timeout"));
972                         }
973                         hammer_set_cycle(&mirror.key_cur, mirror.tid_beg);
974                         if (VerboseOpt) {
975                                 fprintf(stderr, "Cyclefile %s updated for "
976                                     "continuation\n", CyclePath);
977                         }
978                         exit(1);
979                 }
980                 if (SigInfoFlag) {
981                         if (DedupTotalRecords) {
982                                 humanize_unsigned(buf1, sizeof(buf1),
983                                                   DedupDataReads,
984                                                   "B", 1024);
985                                 humanize_unsigned(buf2, sizeof(buf2),
986                                                   dedup_successes_bytes,
987                                                   "B", 1024);
988                                 fprintf(stderr, "%s count %7jd/%jd "
989                                                 "(%02d.%02d%%) "
990                                                 "ioread %s newddup %s\n",
991                                         id,
992                                         (intmax_t)DedupCurrentRecords,
993                                         (intmax_t)DedupTotalRecords,
994                                         (int)(DedupCurrentRecords * 100 /
995                                                 DedupTotalRecords),
996                                         (int)(DedupCurrentRecords * 10000 /
997                                                 DedupTotalRecords % 100),
998                                         buf1, buf2);
999                         } else {
1000                                 fprintf(stderr, "%s count %-7jd\n",
1001                                         id,
1002                                         (intmax_t)DedupCurrentRecords);
1003                         }
1004                         SigInfoFlag = 0;
1005                 }
1006         } while (mirror.count != 0);
1007
1008         signal(SIGINFO, SIG_IGN);
1009         signal(SIGALRM, SIG_IGN);
1010
1011         free(buf);
1012 }
1013
1014 static
1015 void
1016 dump_simulated_dedup(void)
1017 {
1018         struct sim_dedup_entry *sim_de;
1019
1020         printf("=== Dumping simulated dedup entries:\n");
1021         RB_FOREACH(sim_de, sim_dedup_entry_rb_tree, &sim_dedup_tree) {
1022                 printf("\tcrc=%08x cnt=%ju size=%ju\n",
1023                         sim_de->crc,
1024                         (intmax_t)sim_de->ref_blks,
1025                         (intmax_t)sim_de->ref_size);
1026         }
1027         printf("end of dump ===\n");
1028 }
1029
1030 static
1031 void
1032 dump_real_dedup(void)
1033 {
1034         struct dedup_entry *de;
1035         struct sha_dedup_entry *sha_de;
1036         int i;
1037
1038         printf("=== Dumping dedup entries:\n");
1039         RB_FOREACH(de, dedup_entry_rb_tree, &dedup_tree) {
1040                 if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
1041                         printf("\tcrc=%08x fictitious\n", de->leaf.data_crc);
1042
1043                         RB_FOREACH(sha_de, sha_dedup_entry_rb_tree, &de->u.fict_root) {
1044                                 printf("\t\tcrc=%08x cnt=%ju size=%ju\n\t"
1045                                        "\t\tsha=",
1046                                        sha_de->leaf.data_crc,
1047                                        (intmax_t)sha_de->ref_blks,
1048                                        (intmax_t)sha_de->ref_size);
1049                                 for (i = 0; i < SHA256_DIGEST_LENGTH; ++i)
1050                                         printf("%02x", sha_de->sha_hash[i]);
1051                                 printf("\n");
1052                         }
1053                 } else {
1054                         printf("\tcrc=%08x cnt=%ju size=%ju\n",
1055                                de->leaf.data_crc,
1056                                (intmax_t)de->u.de.ref_blks,
1057                                (intmax_t)de->u.de.ref_size);
1058                 }
1059         }
1060         printf("end of dump ===\n");
1061 }
1062
1063 static
1064 void
1065 dedup_usage(int code)
1066 {
1067         fprintf(stderr,
1068                 "hammer dedup-simulate <filesystem>\n"
1069                 "hammer dedup <filesystem>\n"
1070         );
1071         exit(code);
1072 }