Merge branch 'master' of git://crater.dragonflybsd.org/dragonfly
[dragonfly.git] / sbin / hammer / cmd_dedup.c
1 /*
2  * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Ilya Dryomov <idryomov@gmail.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34
35 #include "hammer.h"
36 #include <libutil.h>
37 #include <crypto/sha2/sha2.h>
38
39 #define DEDUP_BUF (64 * 1024)
40
41 /* Sorted list of block CRCs - light version for dedup-simulate */
42 struct sim_dedup_entry_rb_tree;
43 RB_HEAD(sim_dedup_entry_rb_tree, sim_dedup_entry) sim_dedup_tree =
44                                         RB_INITIALIZER(&sim_dedup_tree);
45 RB_PROTOTYPE2(sim_dedup_entry_rb_tree, sim_dedup_entry, rb_entry,
46                 rb_sim_dedup_entry_compare, hammer_crc_t);
47
48 struct sim_dedup_entry {
49         hammer_crc_t    crc;
50         u_int64_t       ref_blks; /* number of blocks referenced */
51         u_int64_t       ref_size; /* size of data referenced */
52         RB_ENTRY(sim_dedup_entry) rb_entry;
53 };
54
55 /* Sorted list of HAMMER B-Tree keys */
56 struct dedup_entry_rb_tree;
57 struct sha_dedup_entry_rb_tree;
58
59 RB_HEAD(dedup_entry_rb_tree, dedup_entry) dedup_tree =
60                                         RB_INITIALIZER(&dedup_tree);
61 RB_PROTOTYPE2(dedup_entry_rb_tree, dedup_entry, rb_entry,
62                 rb_dedup_entry_compare, hammer_crc_t);
63
64 RB_PROTOTYPE(sha_dedup_entry_rb_tree, sha_dedup_entry, fict_entry,
65                 rb_sha_dedup_entry_compare);
66
67 struct dedup_entry {
68         struct hammer_btree_leaf_elm leaf;
69         union {
70                 struct {
71                         u_int64_t ref_blks;
72                         u_int64_t ref_size;
73                 } de;
74                 RB_HEAD(sha_dedup_entry_rb_tree, sha_dedup_entry) fict_root;
75         } u;
76         u_int8_t flags;
77         RB_ENTRY(dedup_entry) rb_entry;
78 };
79
80 #define HAMMER_DEDUP_ENTRY_FICTITIOUS   0x0001
81
82 struct sha_dedup_entry {
83         struct hammer_btree_leaf_elm    leaf;
84         u_int64_t                       ref_blks;
85         u_int64_t                       ref_size;
86         u_int8_t                        sha_hash[SHA256_DIGEST_LENGTH];
87         RB_ENTRY(sha_dedup_entry)       fict_entry;
88 };
89
90 /*
91  * Pass2 list - contains entries that were not dedup'ed because ioctl failed
92  */
93 STAILQ_HEAD(, pass2_dedup_entry) pass2_dedup_queue =
94                                 STAILQ_HEAD_INITIALIZER(pass2_dedup_queue);
95
96 struct pass2_dedup_entry {
97         struct hammer_btree_leaf_elm    leaf;
98         STAILQ_ENTRY(pass2_dedup_entry) sq_entry;
99 };
100
101 #define DEDUP_PASS2     0x0001 /* process_btree_elm() mode */
102
103 /* PFS global ids - we deal with just one PFS at a run */
104 int glob_fd;
105 struct hammer_ioc_pseudofs_rw glob_pfs;
106
107 /*
108  * Global accounting variables
109  *
110  * Last three don't have to be 64-bit, just to be safe..
111  */
112 u_int64_t dedup_alloc_size = 0;
113 u_int64_t dedup_ref_size = 0;
114 u_int64_t dedup_skipped_size = 0;
115 u_int64_t dedup_crc_failures = 0;
116 u_int64_t dedup_sha_failures = 0;
117 u_int64_t dedup_underflows = 0;
118
119 static int rb_sim_dedup_entry_compare(struct sim_dedup_entry *sim_de1,
120                                 struct sim_dedup_entry *sim_de2);
121 static int rb_dedup_entry_compare(struct dedup_entry *de1,
122                                 struct dedup_entry *de2);
123 static int rb_sha_dedup_entry_compare(struct sha_dedup_entry *sha_de1,
124                                 struct sha_dedup_entry *sha_de2);
125 typedef int (*scan_pfs_cb_t)(hammer_btree_leaf_elm_t scan_leaf, int flags);
126 static void scan_pfs(char *filesystem, scan_pfs_cb_t func);
127 static int collect_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags);
128 static int process_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags);
129 static int upgrade_chksum(hammer_btree_leaf_elm_t leaf, u_int8_t *sha_hash);
130 static void dump_simulated_dedup(void);
131 static void dump_real_dedup(void);
132 static void dedup_usage(int code);
133
134 RB_GENERATE2(sim_dedup_entry_rb_tree, sim_dedup_entry, rb_entry,
135                 rb_sim_dedup_entry_compare, hammer_crc_t, crc);
136 RB_GENERATE2(dedup_entry_rb_tree, dedup_entry, rb_entry,
137                 rb_dedup_entry_compare, hammer_crc_t, leaf.data_crc);
138 RB_GENERATE(sha_dedup_entry_rb_tree, sha_dedup_entry, fict_entry,
139                 rb_sha_dedup_entry_compare);
140
141 static int
142 rb_sim_dedup_entry_compare(struct sim_dedup_entry *sim_de1,
143                         struct sim_dedup_entry *sim_de2)
144 {
145         if (sim_de1->crc < sim_de2->crc)
146                 return (-1);
147         if (sim_de1->crc > sim_de2->crc)
148                 return (1);
149
150         return (0);
151 }
152
153 static int
154 rb_dedup_entry_compare(struct dedup_entry *de1, struct dedup_entry *de2)
155 {
156         if (de1->leaf.data_crc < de2->leaf.data_crc)
157                 return (-1);
158         if (de1->leaf.data_crc > de2->leaf.data_crc)
159                 return (1);
160
161         return (0);
162 }
163
164 static int
165 rb_sha_dedup_entry_compare(struct sha_dedup_entry *sha_de1,
166                         struct sha_dedup_entry *sha_de2)
167 {
168         unsigned long *h1 = (unsigned long *)&sha_de1->sha_hash;
169         unsigned long *h2 = (unsigned long *)&sha_de2->sha_hash;
170         int i;
171
172         for (i = 0; i < SHA256_DIGEST_LENGTH / (int)sizeof(unsigned long); ++i) {
173                 if (h1[i] < h2[i])
174                         return (-1);
175                 if (h1[i] > h2[i])
176                         return (1);
177         }
178
179         return (0);
180 }
181
182 /*
183  * dedup-simulate <filesystem>
184  */
185 void
186 hammer_cmd_dedup_simulate(char **av, int ac)
187 {
188         struct sim_dedup_entry *sim_de;
189
190         if (ac != 1)
191                 dedup_usage(1);
192
193         glob_fd = getpfs(&glob_pfs, av[0]);
194
195         printf("Dedup-simulate ");
196         scan_pfs(av[0], &collect_btree_elm);
197         printf("Dedup-simulate %s succeeded\n", av[0]);
198
199         relpfs(glob_fd, &glob_pfs);
200
201         if (VerboseOpt >= 2)
202                 dump_simulated_dedup();
203
204         /*
205          * Calculate simulated dedup ratio and get rid of the tree
206          */
207         while ((sim_de = RB_ROOT(&sim_dedup_tree)) != NULL) {
208                 assert(sim_de->ref_blks != 0);
209                 dedup_ref_size += sim_de->ref_size;
210                 dedup_alloc_size += sim_de->ref_size / sim_de->ref_blks;
211
212                 RB_REMOVE(sim_dedup_entry_rb_tree, &sim_dedup_tree, sim_de);
213                 free(sim_de);
214         }
215
216         printf("Simulated dedup ratio = %.2f\n",
217             (dedup_alloc_size != 0) ?
218                 (double)dedup_ref_size / dedup_alloc_size : 0);
219 }
220
221 /*
222  * dedup <filesystem>
223  */
224 void
225 hammer_cmd_dedup(char **av, int ac)
226 {
227         struct dedup_entry *de;
228         struct sha_dedup_entry *sha_de;
229         struct pass2_dedup_entry *pass2_de;
230         char buf[8];
231
232         if (TimeoutOpt > 0)
233                 alarm(TimeoutOpt);
234
235         if (ac != 1)
236                 dedup_usage(1);
237
238         STAILQ_INIT(&pass2_dedup_queue);
239
240         glob_fd = getpfs(&glob_pfs, av[0]);
241
242         printf("Dedup ");
243         scan_pfs(av[0], &process_btree_elm);
244
245         while ((pass2_de = STAILQ_FIRST(&pass2_dedup_queue)) != NULL) {
246                 if (process_btree_elm(&pass2_de->leaf, DEDUP_PASS2))
247                         dedup_skipped_size -= pass2_de->leaf.data_len;
248
249                 STAILQ_REMOVE_HEAD(&pass2_dedup_queue, sq_entry);
250                 free(pass2_de);
251         }
252         assert(STAILQ_EMPTY(&pass2_dedup_queue));
253
254         printf("Dedup %s succeeded\n", av[0]);
255
256         relpfs(glob_fd, &glob_pfs);
257
258         if (VerboseOpt >= 2)
259                 dump_real_dedup();
260
261         /*
262          * Calculate dedup ratio and get rid of the trees
263          */
264         while ((de = RB_ROOT(&dedup_tree)) != NULL) {
265                 if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
266                         while ((sha_de = RB_ROOT(&de->u.fict_root)) != NULL) {
267                                 assert(sha_de->ref_blks != 0);
268                                 dedup_ref_size += sha_de->ref_size;
269                                 dedup_alloc_size += sha_de->ref_size / sha_de->ref_blks;
270
271                                 RB_REMOVE(sha_dedup_entry_rb_tree,
272                                                 &de->u.fict_root, sha_de);
273                                 free(sha_de);
274                         }
275                         assert(RB_EMPTY(&de->u.fict_root));
276                 } else {
277                         assert(de->u.de.ref_blks != 0);
278                         dedup_ref_size += de->u.de.ref_size;
279                         dedup_alloc_size += de->u.de.ref_size / de->u.de.ref_blks;
280                 }
281
282                 RB_REMOVE(dedup_entry_rb_tree, &dedup_tree, de);
283                 free(de);
284         }
285         assert(RB_EMPTY(&dedup_tree));
286
287         humanize_unsigned(buf, sizeof(buf), dedup_ref_size, "B", 1024);
288         printf("Dedup ratio = %.2f\n"
289                "    %8s referenced\n",
290                ((dedup_alloc_size != 0) ?
291                         (double)dedup_ref_size / dedup_alloc_size : 0),
292                buf
293         );
294         humanize_unsigned(buf, sizeof(buf), dedup_alloc_size, "B", 1024);
295         printf("    %8s allocated\n", buf);
296         humanize_unsigned(buf, sizeof(buf), dedup_skipped_size, "B", 1024);
297         printf("    %8s skipped\n", buf);
298         printf("    %8jd CRC collisions\n"
299                "    %8jd SHA collisions\n"
300                "    %8jd bigblock underflows\n",
301                (intmax_t)dedup_crc_failures,
302                (intmax_t)dedup_sha_failures,
303                (intmax_t)dedup_underflows
304         );
305 }
306
307 static int
308 collect_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags __unused)
309 {
310         struct sim_dedup_entry *sim_de;
311
312         sim_de = RB_LOOKUP(sim_dedup_entry_rb_tree, &sim_dedup_tree, scan_leaf->data_crc);
313
314         if (sim_de == NULL) {
315                 sim_de = calloc(sizeof(*sim_de), 1);
316                 sim_de->crc = scan_leaf->data_crc;
317                 RB_INSERT(sim_dedup_entry_rb_tree, &sim_dedup_tree, sim_de);
318         }
319
320         sim_de->ref_blks += 1;
321         sim_de->ref_size += scan_leaf->data_len;
322         return (1);
323 }
324
325 static __inline int
326 validate_dedup_pair(hammer_btree_leaf_elm_t p, hammer_btree_leaf_elm_t q)
327 {
328         if ((p->data_offset & HAMMER_OFF_ZONE_MASK) !=
329             (q->data_offset & HAMMER_OFF_ZONE_MASK)) {
330                 return (1);
331         }
332         if (p->data_len != q->data_len) {
333                 return (1);
334         }
335
336         return (0);
337 }
338
339 #define DEDUP_TECH_FAILURE      1
340 #define DEDUP_CMP_FAILURE       2
341 #define DEDUP_INVALID_ZONE      3
342 #define DEDUP_UNDERFLOW         4
343 #define DEDUP_VERS_FAILURE      5
344
345 static __inline int
346 deduplicate(hammer_btree_leaf_elm_t p, hammer_btree_leaf_elm_t q)
347 {
348         struct hammer_ioc_dedup dedup;
349
350         bzero(&dedup, sizeof(dedup));
351
352         /*
353          * If data_offset fields are the same there is no need to run ioctl,
354          * candidate is already dedup'ed.
355          */
356         if (p->data_offset == q->data_offset) {
357                 return (0);
358         }
359
360         dedup.elm1 = p->base;
361         dedup.elm2 = q->base;
362         RunningIoctl = 1;
363         if (ioctl(glob_fd, HAMMERIOC_DEDUP, &dedup) < 0) {
364                 if (errno == EOPNOTSUPP) {
365                         /* must be at least version 5 */
366                         return (DEDUP_VERS_FAILURE);
367                 }
368                 /* Technical failure - locking or w/e */
369                 return (DEDUP_TECH_FAILURE);
370         }
371         if (dedup.head.flags & HAMMER_IOC_DEDUP_CMP_FAILURE) {
372                 return (DEDUP_CMP_FAILURE);
373         }
374         if (dedup.head.flags & HAMMER_IOC_DEDUP_INVALID_ZONE) {
375                 return (DEDUP_INVALID_ZONE);
376         }
377         if (dedup.head.flags & HAMMER_IOC_DEDUP_UNDERFLOW) {
378                 return (DEDUP_UNDERFLOW);
379         }
380         RunningIoctl = 0;
381
382         return (0);
383 }
384
385 static int
386 process_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags)
387 {
388         struct dedup_entry *de;
389         struct sha_dedup_entry *sha_de, temp;
390         struct pass2_dedup_entry *pass2_de;
391         int error;
392
393         de = RB_LOOKUP(dedup_entry_rb_tree, &dedup_tree, scan_leaf->data_crc);
394         if (de == NULL) {
395                 /*
396                  * Insert new entry into CRC tree
397                  */
398                 de = calloc(sizeof(*de), 1);
399                 de->leaf = *scan_leaf;
400                 RB_INSERT(dedup_entry_rb_tree, &dedup_tree, de);
401                 goto upgrade_stats;
402         }
403
404         /*
405          * Found entry in CRC tree
406          */
407         if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
408                 /*
409                  * Entry in CRC tree is fictitious, so we already had problems
410                  * with this CRC. Upgrade (compute SHA) the candidate and
411                  * dive into SHA subtree. If upgrade fails insert the candidate
412                  * into Pass2 list (it will be processed later).
413                  */
414                 if (upgrade_chksum(scan_leaf, temp.sha_hash))
415                         goto pass2_insert;
416
417                 sha_de = RB_FIND(sha_dedup_entry_rb_tree, &de->u.fict_root, &temp);
418                 if (sha_de == NULL) {
419                         /*
420                          * Nothing in SHA subtree so far, so this is a new
421                          * 'dataset'. Insert new entry into SHA subtree.
422                          */
423                         sha_de = calloc(sizeof(*sha_de), 1);
424                         sha_de->leaf = *scan_leaf;
425                         memcpy(sha_de->sha_hash, temp.sha_hash, SHA256_DIGEST_LENGTH);
426                         RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, sha_de);
427                         goto upgrade_stats_sha;
428                 }
429
430                 /*
431                  * Found entry in SHA subtree, it means we have a potential
432                  * dedup pair. Validate it (zones have to match and data_len
433                  * field have to be the same too. If validation fails, treat
434                  * it as a SHA collision (jump to sha256_failure).
435                  */
436                 if (validate_dedup_pair(&sha_de->leaf, scan_leaf))
437                         goto sha256_failure;
438
439                 /*
440                  * We have a valid dedup pair (SHA match, validated).
441                  *
442                  * In case of technical failure (dedup pair was good, but
443                  * ioctl failed anyways) insert the candidate into Pass2 list
444                  * (we will try to dedup it after we are done with the rest of
445                  * the tree).
446                  *
447                  * If ioctl fails because either of blocks is in the non-dedup
448                  * zone (we can dedup only in LARGE_DATA and SMALL_DATA) don't
449                  * bother with the candidate and terminate early.
450                  *
451                  * If ioctl fails because of bigblock underflow replace the
452                  * leaf node that found dedup entry represents with scan_leaf.
453                  */
454                 error = deduplicate(&sha_de->leaf, scan_leaf);
455                 switch(error) {
456                 case 0:
457                         goto upgrade_stats_sha;
458                 case DEDUP_TECH_FAILURE:
459                         goto pass2_insert;
460                 case DEDUP_CMP_FAILURE:
461                         goto sha256_failure;
462                 case DEDUP_INVALID_ZONE:
463                         goto terminate_early;
464                 case DEDUP_UNDERFLOW:
465                         ++dedup_underflows;
466                         sha_de->leaf = *scan_leaf;
467                         memcpy(sha_de->sha_hash, temp.sha_hash, SHA256_DIGEST_LENGTH);
468                         goto upgrade_stats_sha;
469                 case DEDUP_VERS_FAILURE:
470                         fprintf(stderr,
471                                 "HAMMER filesystem must be at least "
472                                 "version 5 to dedup\n");
473                         exit (1);
474                 default:
475                         fprintf(stderr, "Unknown error\n");
476                         goto terminate_early;
477                 }
478
479                 /*
480                  * Ooh la la.. SHA-256 collision. Terminate early, there's
481                  * nothing we can do here.
482                  */
483 sha256_failure:
484                 ++dedup_sha_failures;
485                 goto terminate_early;
486         } else {
487                 /*
488                  * Candidate CRC is good for now (we found an entry in CRC
489                  * tree and it's not fictitious). This means we have a
490                  * potential dedup pair.
491                  */
492                 if (validate_dedup_pair(&de->leaf, scan_leaf))
493                         goto crc_failure;
494
495                 /*
496                  * We have a valid dedup pair (CRC match, validated)
497                  */
498                 error = deduplicate(&de->leaf, scan_leaf);
499                 switch(error) {
500                 case 0:
501                         goto upgrade_stats;
502                 case DEDUP_TECH_FAILURE:
503                         goto pass2_insert;
504                 case DEDUP_CMP_FAILURE:
505                         goto crc_failure;
506                 case DEDUP_INVALID_ZONE:
507                         goto terminate_early;
508                 case DEDUP_UNDERFLOW:
509                         ++dedup_underflows;
510                         de->leaf = *scan_leaf;
511                         goto upgrade_stats;
512                 case DEDUP_VERS_FAILURE:
513                         fprintf(stderr,
514                                 "HAMMER filesystem must be at least "
515                                 "version 5 to dedup\n");
516                         exit (1);
517                 default:
518                         fprintf(stderr, "Unknown error\n");
519                         goto terminate_early;
520                 }
521
522 crc_failure:
523                 /*
524                  * We got a CRC collision - either ioctl failed because of
525                  * the comparison failure or validation of the potential
526                  * dedup pair went bad. In all cases insert both blocks
527                  * into SHA subtree (this requires checksum upgrade) and mark
528                  * entry that corresponds to this CRC in the CRC tree
529                  * fictitious, so that all futher operations with this CRC go
530                  * through SHA subtree.
531                  */
532                 ++dedup_crc_failures;
533                 /*
534                  * Insert block that was represented by now fictitious dedup
535                  * entry (create a new SHA entry and preserve stats of the
536                  * old CRC one). If checksum upgrade fails insert the
537                  * candidate into Pass2 list and return - keep both trees
538                  * unmodified.
539                  */
540                 sha_de = calloc(sizeof(*sha_de), 1);
541                 sha_de->leaf = de->leaf;
542                 sha_de->ref_blks = de->u.de.ref_blks;
543                 sha_de->ref_size = de->u.de.ref_size;
544                 if (upgrade_chksum(&sha_de->leaf, sha_de->sha_hash)) {
545                         free(sha_de);
546                         goto pass2_insert;
547                 }
548
549                 RB_INIT(&de->u.fict_root);
550                 /*
551                  * Here we can insert without prior checking because the tree
552                  * is empty at this point
553                  */
554                 RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, sha_de);
555
556                 /*
557                  * Mark entry in CRC tree fictitious
558                  */
559                 de->flags |= HAMMER_DEDUP_ENTRY_FICTITIOUS;
560
561                 /*
562                  * Upgrade checksum of the candidate and insert it into
563                  * SHA subtree. If upgrade fails insert the candidate into
564                  * Pass2 list.
565                  */
566                 if (upgrade_chksum(scan_leaf, temp.sha_hash)) {
567                         goto pass2_insert;
568                 }
569                 sha_de = RB_FIND(sha_dedup_entry_rb_tree, &de->u.fict_root, &temp);
570                 if (sha_de != NULL)
571                         /* There is an entry with this SHA already, but the only
572                          * RB-tree element at this point is that entry we just
573                          * added. We know for sure these blocks are different
574                          * (this is crc_failure branch) so treat it as SHA
575                          * collision.
576                          */
577                         goto sha256_failure;
578
579                 sha_de = calloc(sizeof(*sha_de), 1);
580                 sha_de->leaf = *scan_leaf;
581                 memcpy(sha_de->sha_hash, temp.sha_hash, SHA256_DIGEST_LENGTH);
582                 RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, sha_de);
583                 goto upgrade_stats_sha;
584         }
585
586 upgrade_stats:
587         de->u.de.ref_blks += 1;
588         de->u.de.ref_size += scan_leaf->data_len;
589         return (1);
590
591 upgrade_stats_sha:
592         sha_de->ref_blks += 1;
593         sha_de->ref_size += scan_leaf->data_len;
594         return (1);
595
596 pass2_insert:
597         /*
598          * If in pass2 mode don't insert anything, fall through to
599          * terminate_early
600          */
601         if ((flags & DEDUP_PASS2) == 0) {
602                 pass2_de = calloc(sizeof(*pass2_de), 1);
603                 pass2_de->leaf = *scan_leaf;
604                 STAILQ_INSERT_TAIL(&pass2_dedup_queue, pass2_de, sq_entry);
605                 dedup_skipped_size += scan_leaf->data_len;
606                 return (1);
607         }
608
609 terminate_early:
610         /*
611          * Early termination path. Fixup stats.
612          */
613         dedup_alloc_size += scan_leaf->data_len;
614         dedup_ref_size += scan_leaf->data_len;
615         return (0);
616 }
617
618 static int
619 upgrade_chksum(hammer_btree_leaf_elm_t leaf, u_int8_t *sha_hash)
620 {
621         struct hammer_ioc_data data;
622         char *buf = malloc(DEDUP_BUF);
623         SHA256_CTX ctx;
624         int error;
625
626         bzero(&data, sizeof(data));
627         data.elm = leaf->base;
628         data.ubuf = buf;
629         data.size = DEDUP_BUF;
630
631         error = 0;
632         if (ioctl(glob_fd, HAMMERIOC_GET_DATA, &data) < 0) {
633                 fprintf(stderr, "Get-data failed: %s\n", strerror(errno));
634                 error = 1;
635                 goto done;
636         }
637
638         if (data.leaf.data_len != leaf->data_len) {
639                 error = 1;
640                 goto done;
641         }
642
643         if (data.leaf.base.btype == HAMMER_BTREE_TYPE_RECORD &&
644             data.leaf.base.rec_type == HAMMER_RECTYPE_DATA) {
645                 SHA256_Init(&ctx);
646                 SHA256_Update(&ctx, (void *)buf, data.leaf.data_len);
647                 SHA256_Final(sha_hash, &ctx);
648         }
649
650 done:
651         free(buf);
652         return (error);
653 }
654
655 static void
656 scan_pfs(char *filesystem, scan_pfs_cb_t func)
657 {
658         struct hammer_ioc_mirror_rw mirror;
659         hammer_ioc_mrecord_any_t mrec;
660         struct hammer_btree_leaf_elm elm;
661         char *buf = malloc(DEDUP_BUF);
662         int offset, bytes;
663
664         bzero(&mirror, sizeof(mirror));
665         hammer_key_beg_init(&mirror.key_beg);
666         hammer_key_end_init(&mirror.key_end);
667
668         mirror.tid_beg = glob_pfs.ondisk->sync_beg_tid;
669         mirror.tid_end = glob_pfs.ondisk->sync_end_tid;
670         mirror.head.flags |= HAMMER_IOC_MIRROR_NODATA; /* we want only keys */
671         mirror.ubuf = buf;
672         mirror.size = DEDUP_BUF;
673         mirror.pfs_id = glob_pfs.pfs_id;
674         mirror.shared_uuid = glob_pfs.ondisk->shared_uuid;
675
676         printf("%s: objspace %016jx:%04x %016jx:%04x pfs_id %d\n",
677                 filesystem,
678                 (uintmax_t)mirror.key_beg.obj_id,
679                 mirror.key_beg.localization,
680                 (uintmax_t)mirror.key_end.obj_id,
681                 mirror.key_end.localization,
682                 glob_pfs.pfs_id);
683         fflush(stdout);
684
685         do {
686                 mirror.count = 0;
687                 mirror.pfs_id = glob_pfs.pfs_id;
688                 mirror.shared_uuid = glob_pfs.ondisk->shared_uuid;
689                 if (ioctl(glob_fd, HAMMERIOC_MIRROR_READ, &mirror) < 0) {
690                         fprintf(stderr, "Mirror-read %s failed: %s\n",
691                                 filesystem, strerror(errno));
692                         exit(1);
693                 }
694                 if (mirror.head.flags & HAMMER_IOC_HEAD_ERROR) {
695                         fprintf(stderr, "Mirror-read %s fatal error %d\n",
696                                 filesystem, mirror.head.error);
697                         exit(1);
698                 }
699                 if (mirror.count) {
700                         offset = 0;
701                         while (offset < mirror.count) {
702                                 mrec = (void *)((char *)buf + offset);
703                                 bytes = HAMMER_HEAD_DOALIGN(mrec->head.rec_size);
704                                 if (offset + bytes > mirror.count) {
705                                         fprintf(stderr, "Misaligned record\n");
706                                         exit(1);
707                                 }
708                                 assert((mrec->head.type &
709                                     HAMMER_MRECF_TYPE_MASK) == HAMMER_MREC_TYPE_REC);
710
711                                 elm = mrec->rec.leaf;
712                                 if (elm.base.btype == HAMMER_BTREE_TYPE_RECORD &&
713                                     elm.base.rec_type == HAMMER_RECTYPE_DATA) {
714                                         func(&elm, 0);
715                                 }
716                                 offset += bytes;
717                         }
718                 }
719                 mirror.key_beg = mirror.key_cur;
720         } while (mirror.count != 0);
721
722         free(buf);
723 }
724
725 static void
726 dump_simulated_dedup(void)
727 {
728         struct sim_dedup_entry *sim_de;
729
730         printf("=== Dumping simulated dedup entries:\n");
731         RB_FOREACH(sim_de, sim_dedup_entry_rb_tree, &sim_dedup_tree) {
732                 printf("\tcrc=%08x cnt=%ju size=%ju\n",
733                         sim_de->crc,
734                         (intmax_t)sim_de->ref_blks,
735                         (intmax_t)sim_de->ref_size);
736         }
737         printf("end of dump ===\n");
738 }
739
740 static void
741 dump_real_dedup(void)
742 {
743         struct dedup_entry *de;
744         struct sha_dedup_entry *sha_de;
745         int i;
746
747         printf("=== Dumping dedup entries:\n");
748         RB_FOREACH(de, dedup_entry_rb_tree, &dedup_tree) {
749                 if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
750                         printf("\tcrc=%08x fictitious\n", de->leaf.data_crc);
751
752                         RB_FOREACH(sha_de, sha_dedup_entry_rb_tree, &de->u.fict_root) {
753                                 printf("\t\tcrc=%08x cnt=%ju size=%ju\n\t"
754                                        "\t\tsha=",
755                                        sha_de->leaf.data_crc,
756                                        (intmax_t)sha_de->ref_blks,
757                                        (intmax_t)sha_de->ref_size);
758                                 for (i = 0; i < SHA256_DIGEST_LENGTH; ++i)
759                                         printf("%02x", sha_de->sha_hash[i]);
760                                 printf("\n");
761                         }
762                 } else {
763                         printf("\tcrc=%08x cnt=%ju size=%ju\n",
764                                de->leaf.data_crc,
765                                (intmax_t)de->u.de.ref_blks,
766                                (intmax_t)de->u.de.ref_size);
767                 }
768         }
769         printf("end of dump ===\n");
770 }
771
772 static void
773 dedup_usage(int code)
774 {
775         fprintf(stderr,
776                 "hammer dedup-simulate <filesystem>\n"
777                 "hammer dedup <filesystem>\n"
778         );
779         exit(code);
780 }