sbin/newfs_hammer: Properly allocate root inode and pfs from meta zone
[dragonfly.git] / sbin / hammer / cmd_dedup.c
1 /*
2  * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Ilya Dryomov <idryomov@gmail.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34
35 #include "hammer.h"
36 #include <libutil.h>
37 #include <crypto/sha2/sha2.h>
38
39 #define DEDUP_BUF (64 * 1024)
40
41 /* Sorted list of block CRCs - light version for dedup-simulate */
42 struct sim_dedup_entry_rb_tree;
43 RB_HEAD(sim_dedup_entry_rb_tree, sim_dedup_entry) sim_dedup_tree =
44                                         RB_INITIALIZER(&sim_dedup_tree);
45 RB_PROTOTYPE2(sim_dedup_entry_rb_tree, sim_dedup_entry, rb_entry,
46                 rb_sim_dedup_entry_compare, hammer_crc_t);
47
/*
 * Light-weight per-CRC statistics node used by dedup-simulate.  One entry
 * exists per distinct data CRC seen during a collection pass.
 */
struct sim_dedup_entry {
        hammer_crc_t    crc;      /* RB-tree key: data block CRC */
        u_int64_t       ref_blks; /* number of blocks referenced */
        u_int64_t       ref_size; /* size of data referenced */
        RB_ENTRY(sim_dedup_entry) rb_entry;
};
54
/*
 * Full dedup entry, keyed by the data CRC stored in leaf.data_crc.
 * Normally u.de carries the reference statistics.  After a CRC collision
 * the entry is marked HAMMER_DEDUP_ENTRY_FICTITIOUS and u becomes the
 * root of a SHA-256 keyed subtree (u.fict_root); all further candidates
 * with this CRC are resolved through that subtree instead.
 */
struct dedup_entry {
        struct hammer_btree_leaf_elm leaf;
        union {
                struct {
                        u_int64_t ref_blks; /* blocks referencing this data */
                        u_int64_t ref_size; /* bytes referencing this data */
                } de;
                RB_HEAD(sha_dedup_entry_rb_tree, sha_dedup_entry) fict_root;
        } u;
        u_int8_t flags; /* HAMMER_DEDUP_ENTRY_FICTITIOUS */
        RB_ENTRY(dedup_entry) rb_entry;
};

#define HAMMER_DEDUP_ENTRY_FICTITIOUS   0x0001
69
/*
 * Node of the SHA subtree hanging off a fictitious dedup_entry; keyed by
 * the SHA-256 hash of the block data (see rb_sha_dedup_entry_compare).
 */
struct sha_dedup_entry {
        struct hammer_btree_leaf_elm    leaf;
        u_int64_t                       ref_blks; /* blocks referencing this data */
        u_int64_t                       ref_size; /* bytes referencing this data */
        u_int8_t                        sha_hash[SHA256_DIGEST_LENGTH];
        RB_ENTRY(sha_dedup_entry)       fict_entry;
};
77
78 /* Sorted list of HAMMER B-Tree keys */
79 struct dedup_entry_rb_tree;
80 struct sha_dedup_entry_rb_tree;
81
82 RB_HEAD(dedup_entry_rb_tree, dedup_entry) dedup_tree =
83                                         RB_INITIALIZER(&dedup_tree);
84 RB_PROTOTYPE2(dedup_entry_rb_tree, dedup_entry, rb_entry,
85                 rb_dedup_entry_compare, hammer_crc_t);
86
87 RB_PROTOTYPE(sha_dedup_entry_rb_tree, sha_dedup_entry, fict_entry,
88                 rb_sha_dedup_entry_compare);
89
/*
 * Pass2 list - contains entries that were not dedup'ed because ioctl failed
 * (technical failure) or a checksum upgrade failed.  They are retried by
 * hammer_cmd_dedup() after the main pass completes.
 */
STAILQ_HEAD(, pass2_dedup_entry) pass2_dedup_queue =
                                STAILQ_HEAD_INITIALIZER(pass2_dedup_queue);

struct pass2_dedup_entry {
        struct hammer_btree_leaf_elm    leaf;   /* copy of the deferred leaf */
        STAILQ_ENTRY(pass2_dedup_entry) sq_entry;
};
100
#define DEDUP_PASS2     0x0001 /* process_btree_elm() mode */

/* Progress / limit state shared with the scan loops */
static int SigInfoFlag;         /* signal flag; handler not in this chunk */
static int SigAlrmFlag;         /* signal flag; handler not in this chunk */
static int64_t DedupDataReads;  /* presumably bytes read for SHA upgrades — confirm in upgrade_chksum() */
static int64_t DedupCurrentRecords;     /* B-Tree records processed so far */
static int64_t DedupTotalRecords;       /* record total captured after the pre-pass */
static u_int32_t DedupCrcStart;         /* first CRC covered by the current pass */
static u_int32_t DedupCrcEnd;           /* non-zero: next pass resumes from this CRC */
static u_int64_t MemoryUse;             /* bytes held by the in-memory dedup trees */

/* PFS global ids - we deal with just one PFS at a run */
int glob_fd;
struct hammer_ioc_pseudofs_rw glob_pfs;

/*
 * Global accounting variables
 *
 * Last three don't have to be 64-bit, just to be safe..
 */
u_int64_t dedup_alloc_size = 0;         /* estimated post-dedup allocation */
u_int64_t dedup_ref_size = 0;           /* total referenced data size */
u_int64_t dedup_skipped_size = 0;       /* data deferred to pass2 / skipped */
u_int64_t dedup_crc_failures = 0;       /* CRC collisions detected */
u_int64_t dedup_sha_failures = 0;       /* SHA-256 collisions detected */
u_int64_t dedup_underflows = 0;         /* bigblock underflow retries */
u_int64_t dedup_successes_count = 0;    /* successful dedup ioctls */
u_int64_t dedup_successes_bytes = 0;    /* bytes deduplicated */
129
130 static int rb_sim_dedup_entry_compare(struct sim_dedup_entry *sim_de1,
131                                 struct sim_dedup_entry *sim_de2);
132 static int rb_dedup_entry_compare(struct dedup_entry *de1,
133                                 struct dedup_entry *de2);
134 static int rb_sha_dedup_entry_compare(struct sha_dedup_entry *sha_de1,
135                                 struct sha_dedup_entry *sha_de2);
136 typedef int (*scan_pfs_cb_t)(hammer_btree_leaf_elm_t scan_leaf, int flags);
137 static void scan_pfs(char *filesystem, scan_pfs_cb_t func, const char *id);
138 static int collect_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags);
139 static int count_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags);
140 static int process_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags);
141 static int upgrade_chksum(hammer_btree_leaf_elm_t leaf, u_int8_t *sha_hash);
142 static void dump_simulated_dedup(void);
143 static void dump_real_dedup(void);
144 static void dedup_usage(int code);
145
146 RB_GENERATE2(sim_dedup_entry_rb_tree, sim_dedup_entry, rb_entry,
147                 rb_sim_dedup_entry_compare, hammer_crc_t, crc);
148 RB_GENERATE2(dedup_entry_rb_tree, dedup_entry, rb_entry,
149                 rb_dedup_entry_compare, hammer_crc_t, leaf.data_crc);
150 RB_GENERATE(sha_dedup_entry_rb_tree, sha_dedup_entry, fict_entry,
151                 rb_sha_dedup_entry_compare);
152
153 static int
154 rb_sim_dedup_entry_compare(struct sim_dedup_entry *sim_de1,
155                         struct sim_dedup_entry *sim_de2)
156 {
157         if (sim_de1->crc < sim_de2->crc)
158                 return (-1);
159         if (sim_de1->crc > sim_de2->crc)
160                 return (1);
161
162         return (0);
163 }
164
165 static int
166 rb_dedup_entry_compare(struct dedup_entry *de1, struct dedup_entry *de2)
167 {
168         if (de1->leaf.data_crc < de2->leaf.data_crc)
169                 return (-1);
170         if (de1->leaf.data_crc > de2->leaf.data_crc)
171                 return (1);
172
173         return (0);
174 }
175
176 static int
177 rb_sha_dedup_entry_compare(struct sha_dedup_entry *sha_de1,
178                         struct sha_dedup_entry *sha_de2)
179 {
180         unsigned long *h1 = (unsigned long *)&sha_de1->sha_hash;
181         unsigned long *h2 = (unsigned long *)&sha_de2->sha_hash;
182         int i;
183
184         for (i = 0; i < SHA256_DIGEST_LENGTH / (int)sizeof(unsigned long); ++i) {
185                 if (h1[i] < h2[i])
186                         return (-1);
187                 if (h1[i] > h2[i])
188                         return (1);
189         }
190
191         return (0);
192 }
193
/*
 * dedup-simulate <filesystem>
 *
 * Estimate the achievable dedup ratio without modifying the filesystem:
 * tally per-CRC reference statistics over the PFS B-Tree and report
 * ref_size / alloc_size.  Runs in multiple passes when the in-memory
 * tree exceeds MemoryLimit (collect_btree_elm() shrinks DedupCrcEnd to
 * schedule the next pass).
 */
void
hammer_cmd_dedup_simulate(char **av, int ac)
{
        struct sim_dedup_entry *sim_de;

        if (ac != 1)
                dedup_usage(1);

        glob_fd = getpfs(&glob_pfs, av[0]);

        /*
         * Collection passes (memory limited)
         */
        printf("Dedup-simulate running\n");
        do {
                /* Each pass covers CRCs from DedupCrcStart upward */
                DedupCrcStart = DedupCrcEnd;
                DedupCrcEnd = 0;
                MemoryUse = 0;

                if (VerboseOpt) {
                        printf("b-tree pass  crc-range %08x-max\n",
                                DedupCrcStart);
                        fflush(stdout);
                }
                scan_pfs(av[0], collect_btree_elm, "simu-pass");

                if (VerboseOpt >= 2)
                        dump_simulated_dedup();

                /*
                 * Calculate simulated dedup ratio and get rid of the tree
                 */
                while ((sim_de = RB_ROOT(&sim_dedup_tree)) != NULL) {
                        assert(sim_de->ref_blks != 0);
                        dedup_ref_size += sim_de->ref_size;
                        /* each distinct CRC would be stored only once */
                        dedup_alloc_size += sim_de->ref_size / sim_de->ref_blks;

                        RB_REMOVE(sim_dedup_entry_rb_tree, &sim_dedup_tree, sim_de);
                        free(sim_de);
                }
                if (DedupCrcEnd && VerboseOpt == 0)
                        printf(".");
        } while (DedupCrcEnd);  /* non-zero means another pass is pending */

        printf("Dedup-simulate %s succeeded\n", av[0]);
        relpfs(glob_fd, &glob_pfs);

        printf("Simulated dedup ratio = %.2f\n",
            (dedup_alloc_size != 0) ?
                (double)dedup_ref_size / dedup_alloc_size : 0);
}
248
/*
 * dedup <filesystem>
 *
 * Perform actual deduplication on the PFS.  A pre-pass counts records,
 * then memory-limited main passes dedup via the HAMMERIOC_DEDUP ioctl;
 * candidates that hit technical failures are queued on pass2_dedup_queue
 * and retried at the end of each pass.  Prints a summary when done.
 */
void
hammer_cmd_dedup(char **av, int ac)
{
        struct dedup_entry *de;
        struct sha_dedup_entry *sha_de;
        struct pass2_dedup_entry *pass2_de;
        char *tmp;
        char buf[8];
        int needfree = 0;       /* non-zero: we own CyclePath and must free it */

        if (TimeoutOpt > 0)
                alarm(TimeoutOpt);

        if (ac != 1)
                dedup_usage(1);

        STAILQ_INIT(&pass2_dedup_queue);

        glob_fd = getpfs(&glob_pfs, av[0]);

        /*
         * A cycle file is _required_ for resuming dedup after the timeout
         * specified with -t has expired. If no -c option, then place a
         * .dedup.cycle file either in the PFS snapshots directory or in
         * the default snapshots directory.
         *
         * NOTE(review): asprintf() return value is not checked; on OOM
         * tmp would be undefined — confirm intended.
         */
        if (!CyclePath) {
                if (glob_pfs.ondisk->snapshots[0] != '/')
                        asprintf(&tmp, "%s/%s/.dedup.cycle",
                            SNAPSHOTS_BASE, av[0]);
                else
                        asprintf(&tmp, "%s/.dedup.cycle",
                            glob_pfs.ondisk->snapshots);
                CyclePath = tmp;
                needfree = 1;
        }

        /*
         * Pre-pass to cache the btree
         */
        scan_pfs(av[0], count_btree_elm, "pre-pass ");
        DedupTotalRecords = DedupCurrentRecords;

        /*
         * Collection passes (memory limited)
         */
        printf("Dedup running\n");
        do {
                DedupCrcStart = DedupCrcEnd;
                DedupCrcEnd = 0;
                MemoryUse = 0;

                if (VerboseOpt) {
                        printf("b-tree pass  crc-range %08x-max\n",
                                DedupCrcStart);
                        fflush(stdout);
                }
                scan_pfs(av[0], process_btree_elm, "main-pass");

                /* Retry entries deferred due to technical ioctl failures */
                while ((pass2_de = STAILQ_FIRST(&pass2_dedup_queue)) != NULL) {
                        if (process_btree_elm(&pass2_de->leaf, DEDUP_PASS2))
                                dedup_skipped_size -= pass2_de->leaf.data_len;

                        STAILQ_REMOVE_HEAD(&pass2_dedup_queue, sq_entry);
                        free(pass2_de);
                }
                assert(STAILQ_EMPTY(&pass2_dedup_queue));

                if (VerboseOpt >= 2)
                        dump_real_dedup();

                /*
                 * Calculate dedup ratio and get rid of the trees
                 */
                while ((de = RB_ROOT(&dedup_tree)) != NULL) {
                        if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
                                /* collided CRC: stats live in the SHA subtree */
                                while ((sha_de = RB_ROOT(&de->u.fict_root)) != NULL) {
                                        assert(sha_de->ref_blks != 0);
                                        dedup_ref_size += sha_de->ref_size;
                                        dedup_alloc_size += sha_de->ref_size / sha_de->ref_blks;

                                        RB_REMOVE(sha_dedup_entry_rb_tree,
                                                        &de->u.fict_root, sha_de);
                                        free(sha_de);
                                }
                                assert(RB_EMPTY(&de->u.fict_root));
                        } else {
                                assert(de->u.de.ref_blks != 0);
                                dedup_ref_size += de->u.de.ref_size;
                                dedup_alloc_size += de->u.de.ref_size / de->u.de.ref_blks;
                        }

                        RB_REMOVE(dedup_entry_rb_tree, &dedup_tree, de);
                        free(de);
                }
                assert(RB_EMPTY(&dedup_tree));
                if (DedupCrcEnd && VerboseOpt == 0)
                        printf(".");
        } while (DedupCrcEnd);  /* non-zero means another pass is pending */

        printf("Dedup %s succeeded\n", av[0]);
        relpfs(glob_fd, &glob_pfs);

        humanize_unsigned(buf, sizeof(buf), dedup_ref_size, "B", 1024);
        printf("Dedup ratio = %.2f (in this run)\n"
               "    %8s referenced\n",
               ((dedup_alloc_size != 0) ?
                        (double)dedup_ref_size / dedup_alloc_size : 0),
               buf
        );
        humanize_unsigned(buf, sizeof(buf), dedup_alloc_size, "B", 1024);
        printf("    %8s allocated\n", buf);
        humanize_unsigned(buf, sizeof(buf), dedup_skipped_size, "B", 1024);
        printf("    %8s skipped\n", buf);
        printf("    %8jd CRC collisions\n"
               "    %8jd SHA collisions\n"
               "    %8jd bigblock underflows\n"
               "    %8jd new dedup records\n"
               "    %8jd new dedup bytes\n",
               (intmax_t)dedup_crc_failures,
               (intmax_t)dedup_sha_failures,
               (intmax_t)dedup_underflows,
               (intmax_t)dedup_successes_count,
               (intmax_t)dedup_successes_bytes
        );

        /* Once completed remove cycle file */
        hammer_reset_cycle();

        /* We don't want to mess up with other directives */
        if (needfree) {
                free(tmp);
                CyclePath = NULL;
        }
}
387
/*
 * B-Tree scan callback for the pre-pass: ignores the element and simply
 * returns 1.  hammer_cmd_dedup() captures the resulting record count in
 * DedupTotalRecords after this pass.
 */
static int
count_btree_elm(hammer_btree_leaf_elm_t scan_leaf __unused, int flags __unused)
{
        return(1);
}
393
394 static int
395 collect_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags __unused)
396 {
397         struct sim_dedup_entry *sim_de;
398
399         /*
400          * If we are using too much memory we have to clean some out, which
401          * will cause the run to use multiple passes.  Be careful of integer
402          * overflows!
403          */
404         if (MemoryUse > MemoryLimit) {
405                 DedupCrcEnd = DedupCrcStart +
406                               (u_int32_t)(DedupCrcEnd - DedupCrcStart - 1) / 2;
407                 if (VerboseOpt) {
408                         printf("memory limit  crc-range %08x-%08x\n",
409                                 DedupCrcStart, DedupCrcEnd);
410                         fflush(stdout);
411                 }
412                 for (;;) {
413                         sim_de = RB_MAX(sim_dedup_entry_rb_tree,
414                                         &sim_dedup_tree);
415                         if (sim_de == NULL || sim_de->crc < DedupCrcEnd)
416                                 break;
417                         RB_REMOVE(sim_dedup_entry_rb_tree,
418                                   &sim_dedup_tree, sim_de);
419                         MemoryUse -= sizeof(*sim_de);
420                         free(sim_de);
421                 }
422         }
423
424         /*
425          * Collect statistics based on the CRC only, do not try to read
426          * any data blocks or run SHA hashes.
427          */
428         sim_de = RB_LOOKUP(sim_dedup_entry_rb_tree, &sim_dedup_tree,
429                            scan_leaf->data_crc);
430
431         if (sim_de == NULL) {
432                 sim_de = calloc(sizeof(*sim_de), 1);
433                 sim_de->crc = scan_leaf->data_crc;
434                 RB_INSERT(sim_dedup_entry_rb_tree, &sim_dedup_tree, sim_de);
435                 MemoryUse += sizeof(*sim_de);
436         }
437
438         sim_de->ref_blks += 1;
439         sim_de->ref_size += scan_leaf->data_len;
440         return (1);
441 }
442
443 static __inline int
444 validate_dedup_pair(hammer_btree_leaf_elm_t p, hammer_btree_leaf_elm_t q)
445 {
446         if ((p->data_offset & HAMMER_OFF_ZONE_MASK) !=
447             (q->data_offset & HAMMER_OFF_ZONE_MASK)) {
448                 return (1);
449         }
450         if (p->data_len != q->data_len) {
451                 return (1);
452         }
453
454         return (0);
455 }
456
/* Failure codes returned by deduplicate() */
#define DEDUP_TECH_FAILURE      1       /* ioctl failed (locking etc.); retry in pass2 */
#define DEDUP_CMP_FAILURE       2       /* kernel byte-compare mismatch (checksum collision) */
#define DEDUP_INVALID_ZONE      3       /* data not in a dedup-capable zone */
#define DEDUP_UNDERFLOW         4       /* bigblock reference underflow */
#define DEDUP_VERS_FAILURE      5       /* filesystem version < 5, no dedup support */
462
463 static __inline int
464 deduplicate(hammer_btree_leaf_elm_t p, hammer_btree_leaf_elm_t q)
465 {
466         struct hammer_ioc_dedup dedup;
467
468         bzero(&dedup, sizeof(dedup));
469
470         /*
471          * If data_offset fields are the same there is no need to run ioctl,
472          * candidate is already dedup'ed.
473          */
474         if (p->data_offset == q->data_offset) {
475                 return (0);
476         }
477
478         dedup.elm1 = p->base;
479         dedup.elm2 = q->base;
480         RunningIoctl = 1;
481         if (ioctl(glob_fd, HAMMERIOC_DEDUP, &dedup) < 0) {
482                 if (errno == EOPNOTSUPP) {
483                         /* must be at least version 5 */
484                         return (DEDUP_VERS_FAILURE);
485                 }
486                 /* Technical failure - locking or w/e */
487                 return (DEDUP_TECH_FAILURE);
488         }
489         if (dedup.head.flags & HAMMER_IOC_DEDUP_CMP_FAILURE) {
490                 return (DEDUP_CMP_FAILURE);
491         }
492         if (dedup.head.flags & HAMMER_IOC_DEDUP_INVALID_ZONE) {
493                 return (DEDUP_INVALID_ZONE);
494         }
495         if (dedup.head.flags & HAMMER_IOC_DEDUP_UNDERFLOW) {
496                 return (DEDUP_UNDERFLOW);
497         }
498         RunningIoctl = 0;
499         ++dedup_successes_count;
500         dedup_successes_bytes += p->data_len;
501         return (0);
502 }
503
/*
 * B-Tree scan callback for the real dedup run (and for pass2 retries when
 * flags has DEDUP_PASS2 set).
 *
 * Looks up scan_leaf's CRC in dedup_tree; on a hit, validates and issues
 * the dedup ioctl.  CRC collisions convert the entry to a fictitious one
 * with a SHA-256 keyed subtree, through which all later candidates for
 * that CRC are resolved.  Candidates hitting technical failures are
 * queued for pass2 (unless already in pass2).
 *
 * Returns 1 when the leaf was accounted in the trees/pass2 queue, 0 on
 * early termination (stats are fixed up as if the block was unique).
 */
static int
process_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags)
{
        struct dedup_entry *de;
        struct sha_dedup_entry *sha_de, temp;
        struct pass2_dedup_entry *pass2_de;
        int error;

        /*
         * If we are using too much memory we have to clean some out, which
         * will cause the run to use multiple passes.  Be careful of integer
         * overflows!
         */
        while (MemoryUse > MemoryLimit) {
                DedupCrcEnd = DedupCrcStart +
                              (u_int32_t)(DedupCrcEnd - DedupCrcStart - 1) / 2;
                if (VerboseOpt) {
                        printf("memory limit  crc-range %08x-%08x\n",
                                DedupCrcStart, DedupCrcEnd);
                        fflush(stdout);
                }

                /* Discard entries at/above the new cutoff; next pass redoes them */
                for (;;) {
                        de = RB_MAX(dedup_entry_rb_tree, &dedup_tree);
                        if (de == NULL || de->leaf.data_crc < DedupCrcEnd)
                                break;
                        if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
                                while ((sha_de = RB_ROOT(&de->u.fict_root)) !=
                                       NULL) {
                                        RB_REMOVE(sha_dedup_entry_rb_tree,
                                                  &de->u.fict_root, sha_de);
                                        MemoryUse -= sizeof(*sha_de);
                                        free(sha_de);
                                }
                        }
                        RB_REMOVE(dedup_entry_rb_tree, &dedup_tree, de);
                        MemoryUse -= sizeof(*de);
                        free(de);
                }
        }

        /*
         * Collect statistics based on the CRC.  Colliding CRCs usually
         * cause a SHA sub-tree to be created under the de.
         *
         * Trivial case if de not found.
         */
        de = RB_LOOKUP(dedup_entry_rb_tree, &dedup_tree, scan_leaf->data_crc);
        if (de == NULL) {
                de = calloc(sizeof(*de), 1);
                de->leaf = *scan_leaf;
                RB_INSERT(dedup_entry_rb_tree, &dedup_tree, de);
                MemoryUse += sizeof(*de);
                goto upgrade_stats;
        }

        /*
         * Found entry in CRC tree
         */
        if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
                /*
                 * Optimize the case where a CRC failure results in multiple
                 * SHA entries.  If we unconditionally issue a data-read a
                 * degenerate situation where a colliding CRC's second SHA
                 * entry contains the lion's share of the deduplication
                 * candidates will result in excessive data block reads.
                 *
                 * Deal with the degenerate case by looking for a matching
                 * data_offset/data_len in the SHA elements we already have
                 * before reading the data block and generating a new SHA.
                 */
                RB_FOREACH(sha_de, sha_dedup_entry_rb_tree, &de->u.fict_root) {
                        if (sha_de->leaf.data_offset ==
                                                scan_leaf->data_offset &&
                            sha_de->leaf.data_len == scan_leaf->data_len) {
                                memcpy(temp.sha_hash, sha_de->sha_hash,
                                        SHA256_DIGEST_LENGTH);
                                break;
                        }
                }

                /*
                 * Entry in CRC tree is fictitious, so we already had problems
                 * with this CRC. Upgrade (compute SHA) the candidate and
                 * dive into SHA subtree. If upgrade fails insert the candidate
                 * into Pass2 list (it will be processed later).
                 */
                if (sha_de == NULL) {
                        if (upgrade_chksum(scan_leaf, temp.sha_hash))
                                goto pass2_insert;

                        sha_de = RB_FIND(sha_dedup_entry_rb_tree,
                                         &de->u.fict_root, &temp);
                }

                /*
                 * Nothing in SHA subtree so far, so this is a new
                 * 'dataset'. Insert new entry into SHA subtree.
                 */
                if (sha_de == NULL) {
                        sha_de = calloc(sizeof(*sha_de), 1);
                        sha_de->leaf = *scan_leaf;
                        memcpy(sha_de->sha_hash, temp.sha_hash,
                               SHA256_DIGEST_LENGTH);
                        RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root,
                                  sha_de);
                        MemoryUse += sizeof(*sha_de);
                        goto upgrade_stats_sha;
                }

                /*
                 * Found entry in SHA subtree, it means we have a potential
                 * dedup pair. Validate it (zones have to match and data_len
                 * field have to be the same too. If validation fails, treat
                 * it as a SHA collision (jump to sha256_failure).
                 */
                if (validate_dedup_pair(&sha_de->leaf, scan_leaf))
                        goto sha256_failure;

                /*
                 * We have a valid dedup pair (SHA match, validated).
                 *
                 * In case of technical failure (dedup pair was good, but
                 * ioctl failed anyways) insert the candidate into Pass2 list
                 * (we will try to dedup it after we are done with the rest of
                 * the tree).
                 *
                 * If ioctl fails because either of blocks is in the non-dedup
                 * zone (we can dedup only in LARGE_DATA and SMALL_DATA) don't
                 * bother with the candidate and terminate early.
                 *
                 * If ioctl fails because of bigblock underflow replace the
                 * leaf node that found dedup entry represents with scan_leaf.
                 */
                error = deduplicate(&sha_de->leaf, scan_leaf);
                switch(error) {
                case 0:
                        goto upgrade_stats_sha;
                case DEDUP_TECH_FAILURE:
                        goto pass2_insert;
                case DEDUP_CMP_FAILURE:
                        goto sha256_failure;
                case DEDUP_INVALID_ZONE:
                        goto terminate_early;
                case DEDUP_UNDERFLOW:
                        ++dedup_underflows;
                        sha_de->leaf = *scan_leaf;
                        memcpy(sha_de->sha_hash, temp.sha_hash,
                                SHA256_DIGEST_LENGTH);
                        goto upgrade_stats_sha;
                case DEDUP_VERS_FAILURE:
                        fprintf(stderr,
                                "HAMMER filesystem must be at least "
                                "version 5 to dedup\n");
                        exit (1);
                default:
                        fprintf(stderr, "Unknown error\n");
                        goto terminate_early;
                }

                /*
                 * Ooh la la.. SHA-256 collision. Terminate early, there's
                 * nothing we can do here.
                 */
sha256_failure:
                ++dedup_sha_failures;
                goto terminate_early;
        } else {
                /*
                 * Candidate CRC is good for now (we found an entry in CRC
                 * tree and it's not fictitious). This means we have a
                 * potential dedup pair.
                 */
                if (validate_dedup_pair(&de->leaf, scan_leaf))
                        goto crc_failure;

                /*
                 * We have a valid dedup pair (CRC match, validated)
                 */
                error = deduplicate(&de->leaf, scan_leaf);
                switch(error) {
                case 0:
                        goto upgrade_stats;
                case DEDUP_TECH_FAILURE:
                        goto pass2_insert;
                case DEDUP_CMP_FAILURE:
                        goto crc_failure;
                case DEDUP_INVALID_ZONE:
                        goto terminate_early;
                case DEDUP_UNDERFLOW:
                        ++dedup_underflows;
                        de->leaf = *scan_leaf;
                        goto upgrade_stats;
                case DEDUP_VERS_FAILURE:
                        fprintf(stderr,
                                "HAMMER filesystem must be at least "
                                "version 5 to dedup\n");
                        exit (1);
                default:
                        fprintf(stderr, "Unknown error\n");
                        goto terminate_early;
                }

crc_failure:
                /*
                 * We got a CRC collision - either ioctl failed because of
                 * the comparison failure or validation of the potential
                 * dedup pair went bad. In all cases insert both blocks
                 * into SHA subtree (this requires checksum upgrade) and mark
                 * entry that corresponds to this CRC in the CRC tree
                 * fictitious, so that all futher operations with this CRC go
                 * through SHA subtree.
                 */
                ++dedup_crc_failures;

                /*
                 * Insert block that was represented by now fictitious dedup
                 * entry (create a new SHA entry and preserve stats of the
                 * old CRC one). If checksum upgrade fails insert the
                 * candidate into Pass2 list and return - keep both trees
                 * unmodified.
                 */
                sha_de = calloc(sizeof(*sha_de), 1);
                sha_de->leaf = de->leaf;
                sha_de->ref_blks = de->u.de.ref_blks;
                sha_de->ref_size = de->u.de.ref_size;
                if (upgrade_chksum(&sha_de->leaf, sha_de->sha_hash)) {
                        free(sha_de);
                        goto pass2_insert;
                }
                MemoryUse += sizeof(*sha_de);

                RB_INIT(&de->u.fict_root);
                /*
                 * Here we can insert without prior checking because the tree
                 * is empty at this point
                 */
                RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, sha_de);

                /*
                 * Mark entry in CRC tree fictitious
                 */
                de->flags |= HAMMER_DEDUP_ENTRY_FICTITIOUS;

                /*
                 * Upgrade checksum of the candidate and insert it into
                 * SHA subtree. If upgrade fails insert the candidate into
                 * Pass2 list.
                 */
                if (upgrade_chksum(scan_leaf, temp.sha_hash)) {
                        goto pass2_insert;
                }
                sha_de = RB_FIND(sha_dedup_entry_rb_tree, &de->u.fict_root,
                                 &temp);
                if (sha_de != NULL)
                        /* There is an entry with this SHA already, but the only
                         * RB-tree element at this point is that entry we just
                         * added. We know for sure these blocks are different
                         * (this is crc_failure branch) so treat it as SHA
                         * collision.
                         */
                        goto sha256_failure;

                sha_de = calloc(sizeof(*sha_de), 1);
                sha_de->leaf = *scan_leaf;
                memcpy(sha_de->sha_hash, temp.sha_hash, SHA256_DIGEST_LENGTH);
                RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, sha_de);
                MemoryUse += sizeof(*sha_de);
                goto upgrade_stats_sha;
        }

upgrade_stats:
        /* Account the candidate against the CRC-keyed entry */
        de->u.de.ref_blks += 1;
        de->u.de.ref_size += scan_leaf->data_len;
        return (1);

upgrade_stats_sha:
        /* Account the candidate against the SHA-keyed entry */
        sha_de->ref_blks += 1;
        sha_de->ref_size += scan_leaf->data_len;
        return (1);

pass2_insert:
        /*
         * If in pass2 mode don't insert anything, fall through to
         * terminate_early
         */
        if ((flags & DEDUP_PASS2) == 0) {
                pass2_de = calloc(sizeof(*pass2_de), 1);
                pass2_de->leaf = *scan_leaf;
                STAILQ_INSERT_TAIL(&pass2_dedup_queue, pass2_de, sq_entry);
                dedup_skipped_size += scan_leaf->data_len;
                return (1);
        }

terminate_early:
        /*
         * Early termination path. Fixup stats.
         */
        dedup_alloc_size += scan_leaf->data_len;
        dedup_ref_size += scan_leaf->data_len;
        return (0);
}
806
807 static int
808 upgrade_chksum(hammer_btree_leaf_elm_t leaf, u_int8_t *sha_hash)
809 {
810         struct hammer_ioc_data data;
811         char *buf = malloc(DEDUP_BUF);
812         SHA256_CTX ctx;
813         int error;
814
815         bzero(&data, sizeof(data));
816         data.elm = leaf->base;
817         data.ubuf = buf;
818         data.size = DEDUP_BUF;
819
820         error = 0;
821         if (ioctl(glob_fd, HAMMERIOC_GET_DATA, &data) < 0) {
822                 fprintf(stderr, "Get-data failed: %s\n", strerror(errno));
823                 error = 1;
824                 goto done;
825         }
826         DedupDataReads += leaf->data_len;
827
828         if (data.leaf.data_len != leaf->data_len) {
829                 error = 1;
830                 goto done;
831         }
832
833         if (data.leaf.base.btype == HAMMER_BTREE_TYPE_RECORD &&
834             data.leaf.base.rec_type == HAMMER_RECTYPE_DATA) {
835                 SHA256_Init(&ctx);
836                 SHA256_Update(&ctx, (void *)buf, data.leaf.data_len);
837                 SHA256_Final(sha_hash, &ctx);
838         }
839
840 done:
841         free(buf);
842         return (error);
843 }
844
/*
 * SIGALRM handler: record that the alarm fired.  Only sets a flag
 * (async-signal-safe); scan_pfs() polls SigAlrmFlag to checkpoint
 * into the cycle file and exit.
 */
static void
sigAlrm(int signo __unused)
{
	SigAlrmFlag = 1;
}
850
/*
 * SIGINFO handler: record that a status report was requested.  Only
 * sets a flag (async-signal-safe); scan_pfs() polls SigInfoFlag and
 * prints progress from its main loop.
 */
static void
sigInfo(int signo __unused)
{
	SigInfoFlag = 1;
}
856
/*
 * Scan the B-Tree of a PFS via the mirror-read ioctl and invoke
 * func() on every DATA record leaf element found.  'id' is a short
 * label prefixed to progress and diagnostic output.
 *
 * SIGINFO prints a progress line; SIGALRM or an interrupt saves the
 * current scan position to the cycle file and exits so a later run
 * can resume from that point.
 */
static void
scan_pfs(char *filesystem, scan_pfs_cb_t func, const char *id)
{
	struct hammer_ioc_mirror_rw mirror;
	hammer_ioc_mrecord_any_t mrec;
	struct hammer_btree_leaf_elm elm;
	char *buf = malloc(DEDUP_BUF);	/* XXX allocation result not checked */
	char buf1[8];
	char buf2[8];
	int offset, bytes;

	SigInfoFlag = 0;
	DedupDataReads = 0;
	DedupCurrentRecords = 0;
	signal(SIGINFO, sigInfo);
	signal(SIGALRM, sigAlrm);

	/*
	 * Deduplication happens per element so hammer(8) is in full
	 * control of the ioctl()s to actually perform it. SIGALRM
	 * needs to be handled within hammer(8) but a checkpoint
	 * is needed for resuming. Use cycle file for that.
	 *
	 * Try to obtain the previous obj_id from the cycle file and
	 * if not available just start from the beginning.
	 */
	bzero(&mirror, sizeof(mirror));
	hammer_key_beg_init(&mirror.key_beg);
	hammer_get_cycle(&mirror.key_beg, &mirror.tid_beg);

	if (mirror.key_beg.obj_id != (int64_t)HAMMER_MIN_OBJID) {
		if (VerboseOpt)
			fprintf(stderr, "%s: mirror-read: Resuming at object %016jx\n",
			    id, (uintmax_t)mirror.key_beg.obj_id);
	}

	hammer_key_end_init(&mirror.key_end);

	/*
	 * NOTE(review): mirror.tid_beg obtained from hammer_get_cycle()
	 * above is overwritten here with the PFS sync_beg_tid — confirm
	 * that only the key (obj_id) is meant to be resumed.
	 */
	mirror.tid_beg = glob_pfs.ondisk->sync_beg_tid;
	mirror.tid_end = glob_pfs.ondisk->sync_end_tid;
	mirror.head.flags |= HAMMER_IOC_MIRROR_NODATA; /* we want only keys */
	mirror.ubuf = buf;
	mirror.size = DEDUP_BUF;
	mirror.pfs_id = glob_pfs.pfs_id;
	mirror.shared_uuid = glob_pfs.ondisk->shared_uuid;

	if (VerboseOpt && DedupCrcStart == 0) {
		printf("%s %s: objspace %016jx:%04x %016jx:%04x\n",
			id, filesystem,
			(uintmax_t)mirror.key_beg.obj_id,
			mirror.key_beg.localization,
			(uintmax_t)mirror.key_end.obj_id,
			mirror.key_end.localization);
		printf("%s %s: pfs_id %d\n",
			id, filesystem, glob_pfs.pfs_id);
	}
	fflush(stdout);
	fflush(stderr);

	/*
	 * Pull record headers in DEDUP_BUF-sized batches until the
	 * mirror-read reports no more records.
	 */
	do {
		mirror.count = 0;
		mirror.pfs_id = glob_pfs.pfs_id;
		mirror.shared_uuid = glob_pfs.ondisk->shared_uuid;
		if (ioctl(glob_fd, HAMMERIOC_MIRROR_READ, &mirror) < 0) {
			fprintf(stderr, "Mirror-read %s failed: %s\n",
				filesystem, strerror(errno));
			exit(1);
		}
		if (mirror.head.flags & HAMMER_IOC_HEAD_ERROR) {
			fprintf(stderr, "Mirror-read %s fatal error %d\n",
				filesystem, mirror.head.error);
			exit(1);
		}
		if (mirror.count) {
			/*
			 * Walk the packed mrecords in the batch buffer.
			 * Each record is aligned; offset is advanced
			 * before the type filters so skipped records
			 * still move the cursor.
			 */
			offset = 0;
			while (offset < mirror.count) {
				mrec = (void *)((char *)buf + offset);
				bytes = HAMMER_HEAD_DOALIGN(mrec->head.rec_size);
				if (offset + bytes > mirror.count) {
					fprintf(stderr, "Misaligned record\n");
					exit(1);
				}
				assert((mrec->head.type &
				       HAMMER_MRECF_TYPE_MASK) ==
				       HAMMER_MREC_TYPE_REC);
				offset += bytes;
				elm = mrec->rec.leaf;
				/* Only plain DATA records are dedup candidates */
				if (elm.base.btype != HAMMER_BTREE_TYPE_RECORD)
					continue;
				if (elm.base.rec_type != HAMMER_RECTYPE_DATA)
					continue;
				++DedupCurrentRecords;
				/*
				 * Optional restriction to a half-open CRC
				 * range [DedupCrcStart, DedupCrcEnd).
				 */
				if (DedupCrcStart != DedupCrcEnd) {
					if (elm.data_crc < DedupCrcStart)
						continue;
					if (DedupCrcEnd &&
					    elm.data_crc >= DedupCrcEnd) {
						continue;
					}
				}
				func(&elm, 0);
			}
		}
		/* Continue the next batch where this one stopped */
		mirror.key_beg = mirror.key_cur;
		if (DidInterrupt || SigAlrmFlag) {
			/* Checkpoint the position for resumption, then bail */
			if (VerboseOpt)
				fprintf(stderr, "%s\n",
				    (DidInterrupt ? "Interrupted" : "Timeout"));
			hammer_set_cycle(&mirror.key_cur, mirror.tid_beg);
			if (VerboseOpt)
				fprintf(stderr, "Cyclefile %s updated for "
				    "continuation\n", CyclePath);
			exit(1);
		}
		if (SigInfoFlag) {
			/* Progress report requested via SIGINFO */
			if (DedupTotalRecords) {
				humanize_unsigned(buf1, sizeof(buf1),
						  DedupDataReads,
						  "B", 1024);
				humanize_unsigned(buf2, sizeof(buf2),
						  dedup_successes_bytes,
						  "B", 1024);
				fprintf(stderr, "%s count %7jd/%jd "
						"(%02d.%02d%%) "
						"ioread %s newddup %s\n",
					id,
					(intmax_t)DedupCurrentRecords,
					(intmax_t)DedupTotalRecords,
					(int)(DedupCurrentRecords * 100 /
						DedupTotalRecords),
					(int)(DedupCurrentRecords * 10000 /
						DedupTotalRecords % 100),
					buf1, buf2);
			} else {
				fprintf(stderr, "%s count %-7jd\n",
					id,
					(intmax_t)DedupCurrentRecords);
			}
			SigInfoFlag = 0;
		}
	} while (mirror.count != 0);

	signal(SIGINFO, SIG_IGN);
	signal(SIGALRM, SIG_IGN);

	free(buf);
}
1004
1005 static void
1006 dump_simulated_dedup(void)
1007 {
1008         struct sim_dedup_entry *sim_de;
1009
1010         printf("=== Dumping simulated dedup entries:\n");
1011         RB_FOREACH(sim_de, sim_dedup_entry_rb_tree, &sim_dedup_tree) {
1012                 printf("\tcrc=%08x cnt=%ju size=%ju\n",
1013                         sim_de->crc,
1014                         (intmax_t)sim_de->ref_blks,
1015                         (intmax_t)sim_de->ref_size);
1016         }
1017         printf("end of dump ===\n");
1018 }
1019
1020 static void
1021 dump_real_dedup(void)
1022 {
1023         struct dedup_entry *de;
1024         struct sha_dedup_entry *sha_de;
1025         int i;
1026
1027         printf("=== Dumping dedup entries:\n");
1028         RB_FOREACH(de, dedup_entry_rb_tree, &dedup_tree) {
1029                 if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
1030                         printf("\tcrc=%08x fictitious\n", de->leaf.data_crc);
1031
1032                         RB_FOREACH(sha_de, sha_dedup_entry_rb_tree, &de->u.fict_root) {
1033                                 printf("\t\tcrc=%08x cnt=%ju size=%ju\n\t"
1034                                        "\t\tsha=",
1035                                        sha_de->leaf.data_crc,
1036                                        (intmax_t)sha_de->ref_blks,
1037                                        (intmax_t)sha_de->ref_size);
1038                                 for (i = 0; i < SHA256_DIGEST_LENGTH; ++i)
1039                                         printf("%02x", sha_de->sha_hash[i]);
1040                                 printf("\n");
1041                         }
1042                 } else {
1043                         printf("\tcrc=%08x cnt=%ju size=%ju\n",
1044                                de->leaf.data_crc,
1045                                (intmax_t)de->u.de.ref_blks,
1046                                (intmax_t)de->u.de.ref_size);
1047                 }
1048         }
1049         printf("end of dump ===\n");
1050 }
1051
/*
 * Print the usage synopsis for both dedup subcommands to stderr and
 * exit with the given status code.  Does not return.
 */
static void
dedup_usage(int code)
{
	static const char usage_msg[] =
		"hammer dedup-simulate <filesystem>\n"
		"hammer dedup <filesystem>\n";

	fputs(usage_msg, stderr);
	exit(code);
}