Correct some casts in printf arguments in various utilities.
[dragonfly.git] / sbin / hammer / cmd_dedup.c
CommitLineData
bb29b5d8
MD
1/*
2 * Copyright (c) 2010 The DragonFly Project. All rights reserved.
3 *
4 * This code is derived from software contributed to The DragonFly Project
5 * by Ilya Dryomov <idryomov@gmail.com>
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 */
34
41ae0862
TK
35#include "hammer.h"
36
37#include <sys/tree.h>
bb29b5d8 38#include <libutil.h>
e0662909 39#include <openssl/sha.h>
bb29b5d8
MD
40
41#define DEDUP_BUF (64 * 1024)
42
43/* Sorted list of block CRCs - light version for dedup-simulate */
44struct sim_dedup_entry_rb_tree;
45RB_HEAD(sim_dedup_entry_rb_tree, sim_dedup_entry) sim_dedup_tree =
46 RB_INITIALIZER(&sim_dedup_tree);
47RB_PROTOTYPE2(sim_dedup_entry_rb_tree, sim_dedup_entry, rb_entry,
48 rb_sim_dedup_entry_compare, hammer_crc_t);
49
50struct sim_dedup_entry {
51 hammer_crc_t crc;
46137e17
TK
52 uint64_t ref_blks; /* number of blocks referenced */
53 uint64_t ref_size; /* size of data referenced */
bb29b5d8
MD
54 RB_ENTRY(sim_dedup_entry) rb_entry;
55};
56
bb29b5d8
MD
57struct dedup_entry {
58 struct hammer_btree_leaf_elm leaf;
59 union {
60 struct {
46137e17
TK
61 uint64_t ref_blks;
62 uint64_t ref_size;
bb29b5d8
MD
63 } de;
64 RB_HEAD(sha_dedup_entry_rb_tree, sha_dedup_entry) fict_root;
65 } u;
46137e17 66 uint8_t flags;
bb29b5d8
MD
67 RB_ENTRY(dedup_entry) rb_entry;
68};
69
70#define HAMMER_DEDUP_ENTRY_FICTITIOUS 0x0001
71
72struct sha_dedup_entry {
73 struct hammer_btree_leaf_elm leaf;
46137e17
TK
74 uint64_t ref_blks;
75 uint64_t ref_size;
76 uint8_t sha_hash[SHA256_DIGEST_LENGTH];
bb29b5d8
MD
77 RB_ENTRY(sha_dedup_entry) fict_entry;
78};
79
4fd111b9
FT
80/* Sorted list of HAMMER B-Tree keys */
81struct dedup_entry_rb_tree;
82struct sha_dedup_entry_rb_tree;
83
84RB_HEAD(dedup_entry_rb_tree, dedup_entry) dedup_tree =
85 RB_INITIALIZER(&dedup_tree);
86RB_PROTOTYPE2(dedup_entry_rb_tree, dedup_entry, rb_entry,
87 rb_dedup_entry_compare, hammer_crc_t);
88
89RB_PROTOTYPE(sha_dedup_entry_rb_tree, sha_dedup_entry, fict_entry,
90 rb_sha_dedup_entry_compare);
91
bb29b5d8
MD
92/*
93 * Pass2 list - contains entries that were not dedup'ed because ioctl failed
94 */
95STAILQ_HEAD(, pass2_dedup_entry) pass2_dedup_queue =
96 STAILQ_HEAD_INITIALIZER(pass2_dedup_queue);
97
98struct pass2_dedup_entry {
99 struct hammer_btree_leaf_elm leaf;
100 STAILQ_ENTRY(pass2_dedup_entry) sq_entry;
101};
102
103#define DEDUP_PASS2 0x0001 /* process_btree_elm() mode */
104
e9c8ad3d 105static int SigInfoFlag;
3a92efe3 106static int SigAlrmFlag;
e9c8ad3d
MD
107static int64_t DedupDataReads;
108static int64_t DedupCurrentRecords;
109static int64_t DedupTotalRecords;
46137e17
TK
110static uint32_t DedupCrcStart;
111static uint32_t DedupCrcEnd;
112static uint64_t MemoryUse;
e9c8ad3d 113
bb29b5d8 114/* PFS global ids - we deal with just one PFS at a run */
07d6666f
TK
115static int glob_fd;
116static struct hammer_ioc_pseudofs_rw glob_pfs;
bb29b5d8
MD
117
118/*
119 * Global accounting variables
120 *
121 * Last three don't have to be 64-bit, just to be safe..
122 */
07d6666f
TK
123static uint64_t dedup_alloc_size;
124static uint64_t dedup_ref_size;
125static uint64_t dedup_skipped_size;
126static uint64_t dedup_crc_failures;
127static uint64_t dedup_sha_failures;
128static uint64_t dedup_underflows;
129static uint64_t dedup_successes_count;
130static uint64_t dedup_successes_bytes;
bb29b5d8
MD
131
132static int rb_sim_dedup_entry_compare(struct sim_dedup_entry *sim_de1,
133 struct sim_dedup_entry *sim_de2);
134static int rb_dedup_entry_compare(struct dedup_entry *de1,
135 struct dedup_entry *de2);
136static int rb_sha_dedup_entry_compare(struct sha_dedup_entry *sha_de1,
137 struct sha_dedup_entry *sha_de2);
138typedef int (*scan_pfs_cb_t)(hammer_btree_leaf_elm_t scan_leaf, int flags);
e9c8ad3d 139static void scan_pfs(char *filesystem, scan_pfs_cb_t func, const char *id);
bb29b5d8 140static int collect_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags);
e9c8ad3d 141static int count_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags);
bb29b5d8 142static int process_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags);
46137e17 143static int upgrade_chksum(hammer_btree_leaf_elm_t leaf, uint8_t *sha_hash);
bb29b5d8
MD
144static void dump_simulated_dedup(void);
145static void dump_real_dedup(void);
146static void dedup_usage(int code);
147
148RB_GENERATE2(sim_dedup_entry_rb_tree, sim_dedup_entry, rb_entry,
149 rb_sim_dedup_entry_compare, hammer_crc_t, crc);
150RB_GENERATE2(dedup_entry_rb_tree, dedup_entry, rb_entry,
151 rb_dedup_entry_compare, hammer_crc_t, leaf.data_crc);
152RB_GENERATE(sha_dedup_entry_rb_tree, sha_dedup_entry, fict_entry,
153 rb_sha_dedup_entry_compare);
154
005a4da7
TK
155static
156int
bb29b5d8
MD
157rb_sim_dedup_entry_compare(struct sim_dedup_entry *sim_de1,
158 struct sim_dedup_entry *sim_de2)
159{
160 if (sim_de1->crc < sim_de2->crc)
161 return (-1);
162 if (sim_de1->crc > sim_de2->crc)
163 return (1);
164
165 return (0);
166}
167
005a4da7
TK
168static
169int
bb29b5d8
MD
170rb_dedup_entry_compare(struct dedup_entry *de1, struct dedup_entry *de2)
171{
172 if (de1->leaf.data_crc < de2->leaf.data_crc)
173 return (-1);
174 if (de1->leaf.data_crc > de2->leaf.data_crc)
175 return (1);
176
177 return (0);
178}
179
005a4da7
TK
180static
181int
bb29b5d8
MD
182rb_sha_dedup_entry_compare(struct sha_dedup_entry *sha_de1,
183 struct sha_dedup_entry *sha_de2)
184{
185 unsigned long *h1 = (unsigned long *)&sha_de1->sha_hash;
186 unsigned long *h2 = (unsigned long *)&sha_de2->sha_hash;
187 int i;
188
189 for (i = 0; i < SHA256_DIGEST_LENGTH / (int)sizeof(unsigned long); ++i) {
190 if (h1[i] < h2[i])
191 return (-1);
192 if (h1[i] > h2[i])
193 return (1);
194 }
195
196 return (0);
197}
198
199/*
200 * dedup-simulate <filesystem>
201 */
202void
203hammer_cmd_dedup_simulate(char **av, int ac)
204{
205 struct sim_dedup_entry *sim_de;
206
88cdee70 207 if (ac != 1) {
bb29b5d8 208 dedup_usage(1);
88cdee70
TK
209 /* not reached */
210 }
bb29b5d8
MD
211
212 glob_fd = getpfs(&glob_pfs, av[0]);
213
fbe1c665
MD
214 /*
215 * Collection passes (memory limited)
216 */
217 printf("Dedup-simulate running\n");
218 do {
219 DedupCrcStart = DedupCrcEnd;
220 DedupCrcEnd = 0;
221 MemoryUse = 0;
222
223 if (VerboseOpt) {
78c4be83 224 printf("B-Tree pass crc-range %08x-max\n",
fbe1c665
MD
225 DedupCrcStart);
226 fflush(stdout);
227 }
228 scan_pfs(av[0], collect_btree_elm, "simu-pass");
bb29b5d8 229
fbe1c665
MD
230 if (VerboseOpt >= 2)
231 dump_simulated_dedup();
bb29b5d8 232
fbe1c665
MD
233 /*
234 * Calculate simulated dedup ratio and get rid of the tree
235 */
236 while ((sim_de = RB_ROOT(&sim_dedup_tree)) != NULL) {
237 assert(sim_de->ref_blks != 0);
238 dedup_ref_size += sim_de->ref_size;
239 dedup_alloc_size += sim_de->ref_size / sim_de->ref_blks;
bb29b5d8 240
fbe1c665
MD
241 RB_REMOVE(sim_dedup_entry_rb_tree, &sim_dedup_tree, sim_de);
242 free(sim_de);
243 }
244 if (DedupCrcEnd && VerboseOpt == 0)
245 printf(".");
246 } while (DedupCrcEnd);
bb29b5d8 247
fbe1c665
MD
248 printf("Dedup-simulate %s succeeded\n", av[0]);
249 relpfs(glob_fd, &glob_pfs);
bb29b5d8
MD
250
251 printf("Simulated dedup ratio = %.2f\n",
eae1bb35
ID
252 (dedup_alloc_size != 0) ?
253 (double)dedup_ref_size / dedup_alloc_size : 0);
bb29b5d8
MD
254}
255
256/*
257 * dedup <filesystem>
258 */
259void
260hammer_cmd_dedup(char **av, int ac)
261{
262 struct dedup_entry *de;
263 struct sha_dedup_entry *sha_de;
264 struct pass2_dedup_entry *pass2_de;
3a92efe3 265 char *tmp;
bb29b5d8 266 char buf[8];
3a92efe3 267 int needfree = 0;
bb29b5d8
MD
268
269 if (TimeoutOpt > 0)
270 alarm(TimeoutOpt);
271
88cdee70 272 if (ac != 1) {
bb29b5d8 273 dedup_usage(1);
88cdee70
TK
274 /* not reached */
275 }
bb29b5d8
MD
276
277 STAILQ_INIT(&pass2_dedup_queue);
278
279 glob_fd = getpfs(&glob_pfs, av[0]);
280
3a92efe3
AHJ
281 /*
282 * A cycle file is _required_ for resuming dedup after the timeout
283 * specified with -t has expired. If no -c option, then place a
284 * .dedup.cycle file either in the PFS snapshots directory or in
285 * the default snapshots directory.
286 */
287 if (!CyclePath) {
9930da22 288 if (glob_pfs.ondisk->snapshots[0] != '/') {
3a92efe3
AHJ
289 asprintf(&tmp, "%s/%s/.dedup.cycle",
290 SNAPSHOTS_BASE, av[0]);
9930da22 291 } else {
3a92efe3
AHJ
292 asprintf(&tmp, "%s/.dedup.cycle",
293 glob_pfs.ondisk->snapshots);
9930da22 294 }
3a92efe3
AHJ
295 CyclePath = tmp;
296 needfree = 1;
297 }
298
fbe1c665
MD
299 /*
300 * Pre-pass to cache the btree
301 */
17882027 302 scan_pfs(av[0], count_btree_elm, "pre-pass ");
e9c8ad3d 303 DedupTotalRecords = DedupCurrentRecords;
bb29b5d8 304
fbe1c665
MD
305 /*
306 * Collection passes (memory limited)
307 */
51d59e22 308 printf("Dedup running\n");
fbe1c665
MD
309 do {
310 DedupCrcStart = DedupCrcEnd;
311 DedupCrcEnd = 0;
312 MemoryUse = 0;
313
314 if (VerboseOpt) {
78c4be83 315 printf("B-Tree pass crc-range %08x-max\n",
fbe1c665
MD
316 DedupCrcStart);
317 fflush(stdout);
318 }
319 scan_pfs(av[0], process_btree_elm, "main-pass");
bb29b5d8 320
fbe1c665
MD
321 while ((pass2_de = STAILQ_FIRST(&pass2_dedup_queue)) != NULL) {
322 if (process_btree_elm(&pass2_de->leaf, DEDUP_PASS2))
323 dedup_skipped_size -= pass2_de->leaf.data_len;
bb29b5d8 324
fbe1c665
MD
325 STAILQ_REMOVE_HEAD(&pass2_dedup_queue, sq_entry);
326 free(pass2_de);
327 }
328 assert(STAILQ_EMPTY(&pass2_dedup_queue));
bb29b5d8 329
fbe1c665
MD
330 if (VerboseOpt >= 2)
331 dump_real_dedup();
bb29b5d8 332
fbe1c665
MD
333 /*
334 * Calculate dedup ratio and get rid of the trees
335 */
336 while ((de = RB_ROOT(&dedup_tree)) != NULL) {
337 if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
338 while ((sha_de = RB_ROOT(&de->u.fict_root)) != NULL) {
339 assert(sha_de->ref_blks != 0);
340 dedup_ref_size += sha_de->ref_size;
341 dedup_alloc_size += sha_de->ref_size / sha_de->ref_blks;
342
343 RB_REMOVE(sha_dedup_entry_rb_tree,
344 &de->u.fict_root, sha_de);
345 free(sha_de);
346 }
347 assert(RB_EMPTY(&de->u.fict_root));
348 } else {
349 assert(de->u.de.ref_blks != 0);
350 dedup_ref_size += de->u.de.ref_size;
351 dedup_alloc_size += de->u.de.ref_size / de->u.de.ref_blks;
bb29b5d8 352 }
fbe1c665
MD
353
354 RB_REMOVE(dedup_entry_rb_tree, &dedup_tree, de);
355 free(de);
bb29b5d8 356 }
fbe1c665
MD
357 assert(RB_EMPTY(&dedup_tree));
358 if (DedupCrcEnd && VerboseOpt == 0)
359 printf(".");
360 } while (DedupCrcEnd);
bb29b5d8 361
fbe1c665
MD
362 printf("Dedup %s succeeded\n", av[0]);
363 relpfs(glob_fd, &glob_pfs);
bb29b5d8 364
bb29b5d8 365 humanize_unsigned(buf, sizeof(buf), dedup_ref_size, "B", 1024);
3a92efe3 366 printf("Dedup ratio = %.2f (in this run)\n"
bb29b5d8 367 " %8s referenced\n",
eae1bb35
ID
368 ((dedup_alloc_size != 0) ?
369 (double)dedup_ref_size / dedup_alloc_size : 0),
bb29b5d8
MD
370 buf
371 );
372 humanize_unsigned(buf, sizeof(buf), dedup_alloc_size, "B", 1024);
373 printf(" %8s allocated\n", buf);
374 humanize_unsigned(buf, sizeof(buf), dedup_skipped_size, "B", 1024);
375 printf(" %8s skipped\n", buf);
376 printf(" %8jd CRC collisions\n"
377 " %8jd SHA collisions\n"
a981af19 378 " %8jd big-block underflows\n"
17882027
MD
379 " %8jd new dedup records\n"
380 " %8jd new dedup bytes\n",
bb29b5d8
MD
381 (intmax_t)dedup_crc_failures,
382 (intmax_t)dedup_sha_failures,
17882027
MD
383 (intmax_t)dedup_underflows,
384 (intmax_t)dedup_successes_count,
385 (intmax_t)dedup_successes_bytes
bb29b5d8 386 );
3a92efe3
AHJ
387
388 /* Once completed remove cycle file */
389 hammer_reset_cycle();
390
391 /* We don't want to mess up with other directives */
392 if (needfree) {
393 free(tmp);
394 CyclePath = NULL;
395 }
bb29b5d8
MD
396}
397
005a4da7
TK
398static
399int
e9c8ad3d
MD
400count_btree_elm(hammer_btree_leaf_elm_t scan_leaf __unused, int flags __unused)
401{
402 return(1);
403}
404
005a4da7
TK
405static
406int
bb29b5d8
MD
407collect_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags __unused)
408{
409 struct sim_dedup_entry *sim_de;
410
fbe1c665
MD
411 /*
412 * If we are using too much memory we have to clean some out, which
413 * will cause the run to use multiple passes. Be careful of integer
414 * overflows!
415 */
416 if (MemoryUse > MemoryLimit) {
417 DedupCrcEnd = DedupCrcStart +
46137e17 418 (uint32_t)(DedupCrcEnd - DedupCrcStart - 1) / 2;
fbe1c665
MD
419 if (VerboseOpt) {
420 printf("memory limit crc-range %08x-%08x\n",
421 DedupCrcStart, DedupCrcEnd);
422 fflush(stdout);
423 }
424 for (;;) {
425 sim_de = RB_MAX(sim_dedup_entry_rb_tree,
426 &sim_dedup_tree);
427 if (sim_de == NULL || sim_de->crc < DedupCrcEnd)
428 break;
429 RB_REMOVE(sim_dedup_entry_rb_tree,
430 &sim_dedup_tree, sim_de);
431 MemoryUse -= sizeof(*sim_de);
432 free(sim_de);
433 }
434 }
435
436 /*
437 * Collect statistics based on the CRC only, do not try to read
438 * any data blocks or run SHA hashes.
439 */
e9c8ad3d
MD
440 sim_de = RB_LOOKUP(sim_dedup_entry_rb_tree, &sim_dedup_tree,
441 scan_leaf->data_crc);
bb29b5d8
MD
442
443 if (sim_de == NULL) {
c8f165e2 444 sim_de = calloc(1, sizeof(*sim_de));
bb29b5d8
MD
445 sim_de->crc = scan_leaf->data_crc;
446 RB_INSERT(sim_dedup_entry_rb_tree, &sim_dedup_tree, sim_de);
fbe1c665 447 MemoryUse += sizeof(*sim_de);
bb29b5d8
MD
448 }
449
450 sim_de->ref_blks += 1;
451 sim_de->ref_size += scan_leaf->data_len;
452 return (1);
453}
454
005a4da7
TK
455static __inline
456int
bb29b5d8
MD
457validate_dedup_pair(hammer_btree_leaf_elm_t p, hammer_btree_leaf_elm_t q)
458{
52e2f1b5 459 if (HAMMER_ZONE(p->data_offset) != HAMMER_ZONE(q->data_offset))
bb29b5d8 460 return (1);
52e2f1b5 461 if (p->data_len != q->data_len)
bb29b5d8 462 return (1);
bb29b5d8
MD
463
464 return (0);
465}
466
467#define DEDUP_TECH_FAILURE 1
468#define DEDUP_CMP_FAILURE 2
469#define DEDUP_INVALID_ZONE 3
470#define DEDUP_UNDERFLOW 4
b9956a62 471#define DEDUP_VERS_FAILURE 5
bb29b5d8 472
005a4da7
TK
473static __inline
474int
bb29b5d8
MD
475deduplicate(hammer_btree_leaf_elm_t p, hammer_btree_leaf_elm_t q)
476{
477 struct hammer_ioc_dedup dedup;
478
479 bzero(&dedup, sizeof(dedup));
480
481 /*
482 * If data_offset fields are the same there is no need to run ioctl,
483 * candidate is already dedup'ed.
484 */
52e2f1b5 485 if (p->data_offset == q->data_offset)
bb29b5d8 486 return (0);
bb29b5d8
MD
487
488 dedup.elm1 = p->base;
489 dedup.elm2 = q->base;
490 RunningIoctl = 1;
491 if (ioctl(glob_fd, HAMMERIOC_DEDUP, &dedup) < 0) {
52e2f1b5
TK
492 if (errno == EOPNOTSUPP)
493 return (DEDUP_VERS_FAILURE); /* must be at least version 5 */
bb29b5d8
MD
494 /* Technical failure - locking or w/e */
495 return (DEDUP_TECH_FAILURE);
496 }
52e2f1b5 497 if (dedup.head.flags & HAMMER_IOC_DEDUP_CMP_FAILURE)
bb29b5d8 498 return (DEDUP_CMP_FAILURE);
52e2f1b5 499 if (dedup.head.flags & HAMMER_IOC_DEDUP_INVALID_ZONE)
bb29b5d8 500 return (DEDUP_INVALID_ZONE);
52e2f1b5 501 if (dedup.head.flags & HAMMER_IOC_DEDUP_UNDERFLOW)
bb29b5d8 502 return (DEDUP_UNDERFLOW);
bb29b5d8 503 RunningIoctl = 0;
17882027
MD
504 ++dedup_successes_count;
505 dedup_successes_bytes += p->data_len;
bb29b5d8
MD
506 return (0);
507}
508
005a4da7
TK
509static
510int
bb29b5d8
MD
511process_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags)
512{
513 struct dedup_entry *de;
514 struct sha_dedup_entry *sha_de, temp;
515 struct pass2_dedup_entry *pass2_de;
516 int error;
517
fbe1c665
MD
518 /*
519 * If we are using too much memory we have to clean some out, which
520 * will cause the run to use multiple passes. Be careful of integer
521 * overflows!
522 */
523 while (MemoryUse > MemoryLimit) {
524 DedupCrcEnd = DedupCrcStart +
46137e17 525 (uint32_t)(DedupCrcEnd - DedupCrcStart - 1) / 2;
fbe1c665
MD
526 if (VerboseOpt) {
527 printf("memory limit crc-range %08x-%08x\n",
528 DedupCrcStart, DedupCrcEnd);
529 fflush(stdout);
530 }
531
532 for (;;) {
533 de = RB_MAX(dedup_entry_rb_tree, &dedup_tree);
534 if (de == NULL || de->leaf.data_crc < DedupCrcEnd)
535 break;
f254e677 536 if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
fbe1c665
MD
537 while ((sha_de = RB_ROOT(&de->u.fict_root)) !=
538 NULL) {
539 RB_REMOVE(sha_dedup_entry_rb_tree,
540 &de->u.fict_root, sha_de);
541 MemoryUse -= sizeof(*sha_de);
542 free(sha_de);
543 }
f254e677 544 }
fbe1c665
MD
545 RB_REMOVE(dedup_entry_rb_tree, &dedup_tree, de);
546 MemoryUse -= sizeof(*de);
547 free(de);
548 }
549 }
550
551 /*
552 * Collect statistics based on the CRC. Colliding CRCs usually
553 * cause a SHA sub-tree to be created under the de.
554 *
555 * Trivial case if de not found.
556 */
bb29b5d8
MD
557 de = RB_LOOKUP(dedup_entry_rb_tree, &dedup_tree, scan_leaf->data_crc);
558 if (de == NULL) {
c8f165e2 559 de = calloc(1, sizeof(*de));
bb29b5d8
MD
560 de->leaf = *scan_leaf;
561 RB_INSERT(dedup_entry_rb_tree, &dedup_tree, de);
fbe1c665 562 MemoryUse += sizeof(*de);
bb29b5d8
MD
563 goto upgrade_stats;
564 }
565
566 /*
567 * Found entry in CRC tree
568 */
569 if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
17882027
MD
570 /*
571 * Optimize the case where a CRC failure results in multiple
572 * SHA entries. If we unconditionally issue a data-read a
573 * degenerate situation where a colliding CRC's second SHA
574 * entry contains the lion's share of the deduplication
575 * candidates will result in excessive data block reads.
576 *
577 * Deal with the degenerate case by looking for a matching
578 * data_offset/data_len in the SHA elements we already have
579 * before reading the data block and generating a new SHA.
580 */
f254e677 581 RB_FOREACH(sha_de, sha_dedup_entry_rb_tree, &de->u.fict_root) {
17882027
MD
582 if (sha_de->leaf.data_offset ==
583 scan_leaf->data_offset &&
584 sha_de->leaf.data_len == scan_leaf->data_len) {
585 memcpy(temp.sha_hash, sha_de->sha_hash,
586 SHA256_DIGEST_LENGTH);
587 break;
588 }
f254e677 589 }
17882027 590
bb29b5d8 591 /*
e8b22b55 592 * Entry in CRC tree is fictitious, so we already had problems
bb29b5d8
MD
593 * with this CRC. Upgrade (compute SHA) the candidate and
594 * dive into SHA subtree. If upgrade fails insert the candidate
595 * into Pass2 list (it will be processed later).
596 */
17882027
MD
597 if (sha_de == NULL) {
598 if (upgrade_chksum(scan_leaf, temp.sha_hash))
599 goto pass2_insert;
600
601 sha_de = RB_FIND(sha_dedup_entry_rb_tree,
602 &de->u.fict_root, &temp);
603 }
bb29b5d8 604
17882027
MD
605 /*
606 * Nothing in SHA subtree so far, so this is a new
607 * 'dataset'. Insert new entry into SHA subtree.
608 */
bb29b5d8 609 if (sha_de == NULL) {
c8f165e2 610 sha_de = calloc(1, sizeof(*sha_de));
bb29b5d8 611 sha_de->leaf = *scan_leaf;
17882027
MD
612 memcpy(sha_de->sha_hash, temp.sha_hash,
613 SHA256_DIGEST_LENGTH);
614 RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root,
615 sha_de);
fbe1c665 616 MemoryUse += sizeof(*sha_de);
bb29b5d8
MD
617 goto upgrade_stats_sha;
618 }
619
620 /*
621 * Found entry in SHA subtree, it means we have a potential
622 * dedup pair. Validate it (zones have to match and data_len
623 * field have to be the same too. If validation fails, treat
624 * it as a SHA collision (jump to sha256_failure).
625 */
626 if (validate_dedup_pair(&sha_de->leaf, scan_leaf))
627 goto sha256_failure;
628
629 /*
630 * We have a valid dedup pair (SHA match, validated).
631 *
632 * In case of technical failure (dedup pair was good, but
633 * ioctl failed anyways) insert the candidate into Pass2 list
634 * (we will try to dedup it after we are done with the rest of
635 * the tree).
636 *
637 * If ioctl fails because either of blocks is in the non-dedup
638 * zone (we can dedup only in LARGE_DATA and SMALL_DATA) don't
639 * bother with the candidate and terminate early.
640 *
a981af19 641 * If ioctl fails because of big-block underflow replace the
bb29b5d8
MD
642 * leaf node that found dedup entry represents with scan_leaf.
643 */
644 error = deduplicate(&sha_de->leaf, scan_leaf);
645 switch(error) {
b9956a62
MD
646 case 0:
647 goto upgrade_stats_sha;
bb29b5d8
MD
648 case DEDUP_TECH_FAILURE:
649 goto pass2_insert;
650 case DEDUP_CMP_FAILURE:
651 goto sha256_failure;
652 case DEDUP_INVALID_ZONE:
653 goto terminate_early;
654 case DEDUP_UNDERFLOW:
655 ++dedup_underflows;
656 sha_de->leaf = *scan_leaf;
17882027
MD
657 memcpy(sha_de->sha_hash, temp.sha_hash,
658 SHA256_DIGEST_LENGTH);
bb29b5d8 659 goto upgrade_stats_sha;
b9956a62 660 case DEDUP_VERS_FAILURE:
f4bb1f26
TK
661 errx(1, "HAMMER filesystem must be at least "
662 "version 5 to dedup");
73cca638 663 /* not reached */
bb29b5d8 664 default:
b9956a62
MD
665 fprintf(stderr, "Unknown error\n");
666 goto terminate_early;
bb29b5d8
MD
667 }
668
669 /*
670 * Ooh la la.. SHA-256 collision. Terminate early, there's
671 * nothing we can do here.
672 */
673sha256_failure:
674 ++dedup_sha_failures;
675 goto terminate_early;
676 } else {
677 /*
678 * Candidate CRC is good for now (we found an entry in CRC
e8b22b55 679 * tree and it's not fictitious). This means we have a
bb29b5d8
MD
680 * potential dedup pair.
681 */
682 if (validate_dedup_pair(&de->leaf, scan_leaf))
683 goto crc_failure;
684
685 /*
686 * We have a valid dedup pair (CRC match, validated)
687 */
688 error = deduplicate(&de->leaf, scan_leaf);
689 switch(error) {
b9956a62
MD
690 case 0:
691 goto upgrade_stats;
bb29b5d8
MD
692 case DEDUP_TECH_FAILURE:
693 goto pass2_insert;
694 case DEDUP_CMP_FAILURE:
695 goto crc_failure;
696 case DEDUP_INVALID_ZONE:
697 goto terminate_early;
698 case DEDUP_UNDERFLOW:
699 ++dedup_underflows;
700 de->leaf = *scan_leaf;
701 goto upgrade_stats;
b9956a62 702 case DEDUP_VERS_FAILURE:
f4bb1f26
TK
703 errx(1, "HAMMER filesystem must be at least "
704 "version 5 to dedup");
73cca638 705 /* not reached */
bb29b5d8 706 default:
b9956a62
MD
707 fprintf(stderr, "Unknown error\n");
708 goto terminate_early;
bb29b5d8
MD
709 }
710
711crc_failure:
712 /*
713 * We got a CRC collision - either ioctl failed because of
714 * the comparison failure or validation of the potential
715 * dedup pair went bad. In all cases insert both blocks
716 * into SHA subtree (this requires checksum upgrade) and mark
717 * entry that corresponds to this CRC in the CRC tree
e8b22b55 718 * fictitious, so that all futher operations with this CRC go
bb29b5d8
MD
719 * through SHA subtree.
720 */
721 ++dedup_crc_failures;
17882027 722
bb29b5d8 723 /*
e8b22b55 724 * Insert block that was represented by now fictitious dedup
bb29b5d8
MD
725 * entry (create a new SHA entry and preserve stats of the
726 * old CRC one). If checksum upgrade fails insert the
727 * candidate into Pass2 list and return - keep both trees
728 * unmodified.
729 */
c8f165e2 730 sha_de = calloc(1, sizeof(*sha_de));
bb29b5d8
MD
731 sha_de->leaf = de->leaf;
732 sha_de->ref_blks = de->u.de.ref_blks;
733 sha_de->ref_size = de->u.de.ref_size;
734 if (upgrade_chksum(&sha_de->leaf, sha_de->sha_hash)) {
735 free(sha_de);
736 goto pass2_insert;
737 }
fbe1c665 738 MemoryUse += sizeof(*sha_de);
bb29b5d8
MD
739
740 RB_INIT(&de->u.fict_root);
741 /*
742 * Here we can insert without prior checking because the tree
743 * is empty at this point
744 */
745 RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, sha_de);
746
747 /*
e8b22b55 748 * Mark entry in CRC tree fictitious
bb29b5d8
MD
749 */
750 de->flags |= HAMMER_DEDUP_ENTRY_FICTITIOUS;
751
752 /*
753 * Upgrade checksum of the candidate and insert it into
754 * SHA subtree. If upgrade fails insert the candidate into
755 * Pass2 list.
756 */
52e2f1b5 757 if (upgrade_chksum(scan_leaf, temp.sha_hash))
bb29b5d8 758 goto pass2_insert;
17882027
MD
759 sha_de = RB_FIND(sha_dedup_entry_rb_tree, &de->u.fict_root,
760 &temp);
9930da22 761 if (sha_de != NULL) {
bb29b5d8
MD
762 /* There is an entry with this SHA already, but the only
763 * RB-tree element at this point is that entry we just
764 * added. We know for sure these blocks are different
765 * (this is crc_failure branch) so treat it as SHA
766 * collision.
767 */
768 goto sha256_failure;
9930da22 769 }
bb29b5d8 770
c8f165e2 771 sha_de = calloc(1, sizeof(*sha_de));
bb29b5d8
MD
772 sha_de->leaf = *scan_leaf;
773 memcpy(sha_de->sha_hash, temp.sha_hash, SHA256_DIGEST_LENGTH);
774 RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, sha_de);
fbe1c665 775 MemoryUse += sizeof(*sha_de);
bb29b5d8
MD
776 goto upgrade_stats_sha;
777 }
778
779upgrade_stats:
780 de->u.de.ref_blks += 1;
781 de->u.de.ref_size += scan_leaf->data_len;
782 return (1);
783
784upgrade_stats_sha:
785 sha_de->ref_blks += 1;
786 sha_de->ref_size += scan_leaf->data_len;
787 return (1);
788
789pass2_insert:
790 /*
791 * If in pass2 mode don't insert anything, fall through to
792 * terminate_early
793 */
794 if ((flags & DEDUP_PASS2) == 0) {
c8f165e2 795 pass2_de = calloc(1, sizeof(*pass2_de));
bb29b5d8
MD
796 pass2_de->leaf = *scan_leaf;
797 STAILQ_INSERT_TAIL(&pass2_dedup_queue, pass2_de, sq_entry);
798 dedup_skipped_size += scan_leaf->data_len;
799 return (1);
800 }
801
802terminate_early:
803 /*
804 * Early termination path. Fixup stats.
805 */
806 dedup_alloc_size += scan_leaf->data_len;
807 dedup_ref_size += scan_leaf->data_len;
808 return (0);
809}
810
005a4da7
TK
811static
812int
46137e17 813upgrade_chksum(hammer_btree_leaf_elm_t leaf, uint8_t *sha_hash)
bb29b5d8
MD
814{
815 struct hammer_ioc_data data;
816 char *buf = malloc(DEDUP_BUF);
817 SHA256_CTX ctx;
818 int error;
819
820 bzero(&data, sizeof(data));
821 data.elm = leaf->base;
822 data.ubuf = buf;
823 data.size = DEDUP_BUF;
824
825 error = 0;
826 if (ioctl(glob_fd, HAMMERIOC_GET_DATA, &data) < 0) {
827 fprintf(stderr, "Get-data failed: %s\n", strerror(errno));
828 error = 1;
829 goto done;
830 }
e9c8ad3d 831 DedupDataReads += leaf->data_len;
bb29b5d8
MD
832
833 if (data.leaf.data_len != leaf->data_len) {
834 error = 1;
835 goto done;
836 }
837
838 if (data.leaf.base.btype == HAMMER_BTREE_TYPE_RECORD &&
839 data.leaf.base.rec_type == HAMMER_RECTYPE_DATA) {
840 SHA256_Init(&ctx);
841 SHA256_Update(&ctx, (void *)buf, data.leaf.data_len);
842 SHA256_Final(sha_hash, &ctx);
843 }
844
845done:
846 free(buf);
847 return (error);
848}
849
005a4da7
TK
850static
851void
3a92efe3
AHJ
852sigAlrm(int signo __unused)
853{
854 SigAlrmFlag = 1;
855}
856
005a4da7
TK
857static
858void
e9c8ad3d
MD
859sigInfo(int signo __unused)
860{
861 SigInfoFlag = 1;
862}
863
005a4da7
TK
864static
865void
e9c8ad3d 866scan_pfs(char *filesystem, scan_pfs_cb_t func, const char *id)
bb29b5d8
MD
867{
868 struct hammer_ioc_mirror_rw mirror;
869 hammer_ioc_mrecord_any_t mrec;
870 struct hammer_btree_leaf_elm elm;
871 char *buf = malloc(DEDUP_BUF);
fbe1c665
MD
872 char buf1[8];
873 char buf2[8];
bb29b5d8
MD
874 int offset, bytes;
875
e9c8ad3d
MD
876 SigInfoFlag = 0;
877 DedupDataReads = 0;
878 DedupCurrentRecords = 0;
879 signal(SIGINFO, sigInfo);
3a92efe3 880 signal(SIGALRM, sigAlrm);
e9c8ad3d 881
3a92efe3
AHJ
882 /*
883 * Deduplication happens per element so hammer(8) is in full
884 * control of the ioctl()s to actually perform it. SIGALRM
885 * needs to be handled within hammer(8) but a checkpoint
886 * is needed for resuming. Use cycle file for that.
887 *
888 * Try to obtain the previous obj_id from the cycle file and
889 * if not available just start from the beginning.
890 */
bb29b5d8
MD
891 bzero(&mirror, sizeof(mirror));
892 hammer_key_beg_init(&mirror.key_beg);
3a92efe3
AHJ
893 hammer_get_cycle(&mirror.key_beg, &mirror.tid_beg);
894
f254e677 895 if (mirror.key_beg.obj_id != (int64_t)HAMMER_MIN_OBJID) {
9930da22 896 if (VerboseOpt) {
3a92efe3
AHJ
897 fprintf(stderr, "%s: mirror-read: Resuming at object %016jx\n",
898 id, (uintmax_t)mirror.key_beg.obj_id);
9930da22 899 }
f254e677 900 }
3a92efe3 901
bb29b5d8
MD
902 hammer_key_end_init(&mirror.key_end);
903
904 mirror.tid_beg = glob_pfs.ondisk->sync_beg_tid;
905 mirror.tid_end = glob_pfs.ondisk->sync_end_tid;
906 mirror.head.flags |= HAMMER_IOC_MIRROR_NODATA; /* we want only keys */
907 mirror.ubuf = buf;
908 mirror.size = DEDUP_BUF;
909 mirror.pfs_id = glob_pfs.pfs_id;
910 mirror.shared_uuid = glob_pfs.ondisk->shared_uuid;
911
fbe1c665
MD
912 if (VerboseOpt && DedupCrcStart == 0) {
913 printf("%s %s: objspace %016jx:%04x %016jx:%04x\n",
914 id, filesystem,
915 (uintmax_t)mirror.key_beg.obj_id,
916 mirror.key_beg.localization,
917 (uintmax_t)mirror.key_end.obj_id,
918 mirror.key_end.localization);
919 printf("%s %s: pfs_id %d\n",
920 id, filesystem, glob_pfs.pfs_id);
921 }
bb29b5d8 922 fflush(stdout);
fbe1c665 923 fflush(stderr);
bb29b5d8
MD
924
925 do {
926 mirror.count = 0;
927 mirror.pfs_id = glob_pfs.pfs_id;
928 mirror.shared_uuid = glob_pfs.ondisk->shared_uuid;
052fd72b 929 if (ioctl(glob_fd, HAMMERIOC_MIRROR_READ, &mirror) < 0) {
02318f07 930 err(1, "Mirror-read %s failed", filesystem);
052fd72b
TK
931 /* not reached */
932 }
933 if (mirror.head.flags & HAMMER_IOC_HEAD_ERROR) {
02318f07 934 errx(1, "Mirror-read %s fatal error %d",
bb29b5d8 935 filesystem, mirror.head.error);
052fd72b
TK
936 /* not reached */
937 }
bb29b5d8
MD
938 if (mirror.count) {
939 offset = 0;
940 while (offset < mirror.count) {
941 mrec = (void *)((char *)buf + offset);
942 bytes = HAMMER_HEAD_DOALIGN(mrec->head.rec_size);
73cca638 943 if (offset + bytes > mirror.count) {
02318f07 944 errx(1, "Misaligned record");
73cca638
TK
945 /* not reached */
946 }
bb29b5d8 947 assert((mrec->head.type &
fbe1c665
MD
948 HAMMER_MRECF_TYPE_MASK) ==
949 HAMMER_MREC_TYPE_REC);
950 offset += bytes;
bb29b5d8 951 elm = mrec->rec.leaf;
fbe1c665
MD
952 if (elm.base.btype != HAMMER_BTREE_TYPE_RECORD)
953 continue;
954 if (elm.base.rec_type != HAMMER_RECTYPE_DATA)
955 continue;
956 ++DedupCurrentRecords;
957 if (DedupCrcStart != DedupCrcEnd) {
958 if (elm.data_crc < DedupCrcStart)
959 continue;
960 if (DedupCrcEnd &&
f254e677 961 elm.data_crc >= DedupCrcEnd) {
fbe1c665 962 continue;
f254e677 963 }
bb29b5d8 964 }
fbe1c665 965 func(&elm, 0);
bb29b5d8
MD
966 }
967 }
968 mirror.key_beg = mirror.key_cur;
3a92efe3 969 if (DidInterrupt || SigAlrmFlag) {
9930da22 970 if (VerboseOpt) {
3a92efe3
AHJ
971 fprintf(stderr, "%s\n",
972 (DidInterrupt ? "Interrupted" : "Timeout"));
9930da22 973 }
3a92efe3 974 hammer_set_cycle(&mirror.key_cur, mirror.tid_beg);
9930da22 975 if (VerboseOpt) {
3a92efe3
AHJ
976 fprintf(stderr, "Cyclefile %s updated for "
977 "continuation\n", CyclePath);
9930da22 978 }
e9c8ad3d
MD
979 exit(1);
980 }
981 if (SigInfoFlag) {
982 if (DedupTotalRecords) {
fbe1c665
MD
983 humanize_unsigned(buf1, sizeof(buf1),
984 DedupDataReads,
985 "B", 1024);
986 humanize_unsigned(buf2, sizeof(buf2),
987 dedup_successes_bytes,
988 "B", 1024);
17882027 989 fprintf(stderr, "%s count %7jd/%jd "
e9c8ad3d 990 "(%02d.%02d%%) "
fbe1c665 991 "ioread %s newddup %s\n",
e9c8ad3d
MD
992 id,
993 (intmax_t)DedupCurrentRecords,
994 (intmax_t)DedupTotalRecords,
995 (int)(DedupCurrentRecords * 100 /
996 DedupTotalRecords),
997 (int)(DedupCurrentRecords * 10000 /
998 DedupTotalRecords % 100),
fbe1c665 999 buf1, buf2);
e9c8ad3d 1000 } else {
17882027 1001 fprintf(stderr, "%s count %-7jd\n",
e9c8ad3d
MD
1002 id,
1003 (intmax_t)DedupCurrentRecords);
1004 }
1005 SigInfoFlag = 0;
1006 }
bb29b5d8
MD
1007 } while (mirror.count != 0);
1008
e9c8ad3d 1009 signal(SIGINFO, SIG_IGN);
3a92efe3 1010 signal(SIGALRM, SIG_IGN);
e9c8ad3d 1011
bb29b5d8
MD
1012 free(buf);
1013}
1014
005a4da7
TK
1015static
1016void
bb29b5d8
MD
1017dump_simulated_dedup(void)
1018{
1019 struct sim_dedup_entry *sim_de;
1020
1021 printf("=== Dumping simulated dedup entries:\n");
f254e677 1022 RB_FOREACH(sim_de, sim_dedup_entry_rb_tree, &sim_dedup_tree) {
1477c82a
MD
1023 printf("\tcrc=%08x cnt=%ju size=%ju\n",
1024 sim_de->crc,
1f5e0b99
SW
1025 (uintmax_t)sim_de->ref_blks,
1026 (uintmax_t)sim_de->ref_size);
f254e677 1027 }
bb29b5d8
MD
1028 printf("end of dump ===\n");
1029}
1030
005a4da7
TK
1031static
1032void
bb29b5d8
MD
1033dump_real_dedup(void)
1034{
1035 struct dedup_entry *de;
1036 struct sha_dedup_entry *sha_de;
1037 int i;
1038
1039 printf("=== Dumping dedup entries:\n");
1040 RB_FOREACH(de, dedup_entry_rb_tree, &dedup_tree) {
1041 if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
e8b22b55 1042 printf("\tcrc=%08x fictitious\n", de->leaf.data_crc);
bb29b5d8
MD
1043
1044 RB_FOREACH(sha_de, sha_dedup_entry_rb_tree, &de->u.fict_root) {
1477c82a
MD
1045 printf("\t\tcrc=%08x cnt=%ju size=%ju\n\t"
1046 "\t\tsha=",
1047 sha_de->leaf.data_crc,
1f5e0b99
SW
1048 (uintmax_t)sha_de->ref_blks,
1049 (uintmax_t)sha_de->ref_size);
bb29b5d8
MD
1050 for (i = 0; i < SHA256_DIGEST_LENGTH; ++i)
1051 printf("%02x", sha_de->sha_hash[i]);
1052 printf("\n");
1053 }
1054 } else {
1477c82a
MD
1055 printf("\tcrc=%08x cnt=%ju size=%ju\n",
1056 de->leaf.data_crc,
1f5e0b99
SW
1057 (uintmax_t)de->u.de.ref_blks,
1058 (uintmax_t)de->u.de.ref_size);
bb29b5d8
MD
1059 }
1060 }
1061 printf("end of dump ===\n");
1062}
1063
005a4da7
TK
1064static
1065void
bb29b5d8
MD
1066dedup_usage(int code)
1067{
1068 fprintf(stderr,
1069 "hammer dedup-simulate <filesystem>\n"
1070 "hammer dedup <filesystem>\n"
1071 );
1072 exit(code);
1073}