* Implements all the logic required to dedup a HAMMER filesystem.
* There is one remaining issue and that is the reblocker's propensity to
undo de-dup's hard work in certain cases.
* Code bounty for hammer_dedup
Submitted-by: Ilya Dryomov <idryomov@gmail.com>
cmd_synctid.c cmd_stats.c \
cmd_pseudofs.c cmd_snapshot.c cmd_mirror.c cmd_status.c \
cmd_cleanup.c cmd_info.c cmd_version.c cmd_volume.c \
- cmd_config.c cmd_recover.c
+ cmd_config.c cmd_recover.c cmd_dedup.c
MAN= hammer.8
CFLAGS+= -I${.CURDIR}/../../sys -DALIST_NO_DEBUG
-LDADD= -lm -lutil
+LDADD= -lm -lutil -lmd
.PATH: ${.CURDIR}/../../sys/libkern
SRCS+= crc32.c
* snapshots 1d 60d (0d 0d for /tmp, /var/tmp, /usr/obj)
* prune 1d 5m
* rebalance 1d 5m
+ * #dedup 1d 5m (not enabled by default)
* reblock 1d 5m
* recopy 30d 10m
*
int arg1, int arg2);
static int cleanup_recopy(const char *path, const char *snapshots_path,
int arg1, int arg2);
+static int cleanup_dedup(const char *path, const char *snapshots_path,
+ int arg1, int arg2);
static void runcmd(int *resp, const char *ctl, ...);
} else {
printf("skip\n");
}
+ } else if (strcmp(cmd, "dedup") == 0) {
+ if (check_period(snapshots_path, cmd, arg1, &savet)) {
+ printf("run");
+ fflush(stdout);
+ if (VerboseOpt)
+ printf("\n");
+ r = cleanup_dedup(path, snapshots_path,
+ arg1, arg2);
+ } else {
+ printf("skip\n");
+ }
} else {
printf("unknown directive\n");
r = 1;
snapshots,
"prune 1d 5m\n"
"rebalance 1d 5m\n"
+ "#dedup 1d 5m\n"
"reblock 1d 5m\n"
"recopy 30d 10m\n");
}
return(0);
}
+static int
+cleanup_dedup(const char *path, const char *snapshots_path __unused,
+ int arg1 __unused, int arg2)
+{
+ if (VerboseOpt == 0) {
+ printf(".");
+ fflush(stdout);
+ }
+
+ runcmd(NULL, "hammer -t %d dedup %s", arg2, path);
+ if (VerboseOpt == 0) {
+ printf(".");
+ fflush(stdout);
+ }
+ if (VerboseOpt == 0)
+ printf("\n");
+ return(0);
+}
+
static
void
runcmd(int *resp, const char *ctl, ...)
"#snapshots 1d 60d\n"
"#prune 1d 5m\n"
"#rebalance 1d 5m\n"
+ "#dedup 1d 5m\n"
"#reblock 1d 5m\n"
"#recopy 30d 10m\n");
config.head.error = 0;
--- /dev/null
+/*
+ * Copyright (c) 2010 The DragonFly Project. All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Ilya Dryomov <idryomov@gmail.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "hammer.h"
+#include <libutil.h>
+#include <crypto/sha2/sha2.h>
+
+#define DEDUP_BUF (64 * 1024)
+
+/* Sorted list of block CRCs - light version for dedup-simulate */
+struct sim_dedup_entry_rb_tree;
+RB_HEAD(sim_dedup_entry_rb_tree, sim_dedup_entry) sim_dedup_tree =
+ RB_INITIALIZER(&sim_dedup_tree);
+RB_PROTOTYPE2(sim_dedup_entry_rb_tree, sim_dedup_entry, rb_entry,
+ rb_sim_dedup_entry_compare, hammer_crc_t);
+
+struct sim_dedup_entry {
+ hammer_crc_t crc;
+ u_int64_t ref_blks; /* number of blocks referenced */
+ u_int64_t ref_size; /* size of data referenced */
+ RB_ENTRY(sim_dedup_entry) rb_entry;
+};
+
+/* Sorted list of HAMMER B-Tree keys */
+struct dedup_entry_rb_tree;
+struct sha_dedup_entry_rb_tree;
+
+RB_HEAD(dedup_entry_rb_tree, dedup_entry) dedup_tree =
+ RB_INITIALIZER(&dedup_tree);
+RB_PROTOTYPE2(dedup_entry_rb_tree, dedup_entry, rb_entry,
+ rb_dedup_entry_compare, hammer_crc_t);
+
+RB_PROTOTYPE(sha_dedup_entry_rb_tree, sha_dedup_entry, fict_entry,
+ rb_sha_dedup_entry_compare);
+
+struct dedup_entry {
+ struct hammer_btree_leaf_elm leaf;
+ union {
+ struct {
+ u_int64_t ref_blks;
+ u_int64_t ref_size;
+ } de;
+ RB_HEAD(sha_dedup_entry_rb_tree, sha_dedup_entry) fict_root;
+ } u;
+ u_int8_t flags;
+ RB_ENTRY(dedup_entry) rb_entry;
+};
+
+#define HAMMER_DEDUP_ENTRY_FICTITIOUS 0x0001
+
+struct sha_dedup_entry {
+ struct hammer_btree_leaf_elm leaf;
+ u_int64_t ref_blks;
+ u_int64_t ref_size;
+ u_int8_t sha_hash[SHA256_DIGEST_LENGTH];
+ RB_ENTRY(sha_dedup_entry) fict_entry;
+};
+
+/*
+ * Pass2 list - contains entries that were not dedup'ed because ioctl failed
+ */
+STAILQ_HEAD(, pass2_dedup_entry) pass2_dedup_queue =
+ STAILQ_HEAD_INITIALIZER(pass2_dedup_queue);
+
+struct pass2_dedup_entry {
+ struct hammer_btree_leaf_elm leaf;
+ STAILQ_ENTRY(pass2_dedup_entry) sq_entry;
+};
+
+#define DEDUP_PASS2 0x0001 /* process_btree_elm() mode */
+
+/* PFS global ids - we deal with just one PFS at a run */
+int glob_fd;
+struct hammer_ioc_pseudofs_rw glob_pfs;
+
+/*
+ * Global accounting variables
+ *
+ * Last three don't have to be 64-bit, just to be safe..
+ */
+u_int64_t dedup_alloc_size = 0;
+u_int64_t dedup_ref_size = 0;
+u_int64_t dedup_skipped_size = 0;
+u_int64_t dedup_crc_failures = 0;
+u_int64_t dedup_sha_failures = 0;
+u_int64_t dedup_underflows = 0;
+
+static int rb_sim_dedup_entry_compare(struct sim_dedup_entry *sim_de1,
+ struct sim_dedup_entry *sim_de2);
+static int rb_dedup_entry_compare(struct dedup_entry *de1,
+ struct dedup_entry *de2);
+static int rb_sha_dedup_entry_compare(struct sha_dedup_entry *sha_de1,
+ struct sha_dedup_entry *sha_de2);
+typedef int (*scan_pfs_cb_t)(hammer_btree_leaf_elm_t scan_leaf, int flags);
+static void scan_pfs(char *filesystem, scan_pfs_cb_t func);
+static int collect_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags);
+static int process_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags);
+static int upgrade_chksum(hammer_btree_leaf_elm_t leaf, u_int8_t *sha_hash);
+static void dump_simulated_dedup(void);
+static void dump_real_dedup(void);
+static void dedup_usage(int code);
+
+RB_GENERATE2(sim_dedup_entry_rb_tree, sim_dedup_entry, rb_entry,
+ rb_sim_dedup_entry_compare, hammer_crc_t, crc);
+RB_GENERATE2(dedup_entry_rb_tree, dedup_entry, rb_entry,
+ rb_dedup_entry_compare, hammer_crc_t, leaf.data_crc);
+RB_GENERATE(sha_dedup_entry_rb_tree, sha_dedup_entry, fict_entry,
+ rb_sha_dedup_entry_compare);
+
+static int
+rb_sim_dedup_entry_compare(struct sim_dedup_entry *sim_de1,
+ struct sim_dedup_entry *sim_de2)
+{
+ if (sim_de1->crc < sim_de2->crc)
+ return (-1);
+ if (sim_de1->crc > sim_de2->crc)
+ return (1);
+
+ return (0);
+}
+
+static int
+rb_dedup_entry_compare(struct dedup_entry *de1, struct dedup_entry *de2)
+{
+ if (de1->leaf.data_crc < de2->leaf.data_crc)
+ return (-1);
+ if (de1->leaf.data_crc > de2->leaf.data_crc)
+ return (1);
+
+ return (0);
+}
+
+static int
+rb_sha_dedup_entry_compare(struct sha_dedup_entry *sha_de1,
+ struct sha_dedup_entry *sha_de2)
+{
+ unsigned long *h1 = (unsigned long *)&sha_de1->sha_hash;
+ unsigned long *h2 = (unsigned long *)&sha_de2->sha_hash;
+ int i;
+
+ for (i = 0; i < SHA256_DIGEST_LENGTH / (int)sizeof(unsigned long); ++i) {
+ if (h1[i] < h2[i])
+ return (-1);
+ if (h1[i] > h2[i])
+ return (1);
+ }
+
+ return (0);
+}
+
+/*
+ * dedup-simulate <filesystem>
+ */
+void
+hammer_cmd_dedup_simulate(char **av, int ac)
+{
+ struct sim_dedup_entry *sim_de;
+
+ if (ac != 1)
+ dedup_usage(1);
+
+ glob_fd = getpfs(&glob_pfs, av[0]);
+
+ printf("Dedup-simulate ");
+ scan_pfs(av[0], &collect_btree_elm);
+ printf("Dedup-simulate %s succeeded\n", av[0]);
+
+ relpfs(glob_fd, &glob_pfs);
+
+ if (VerboseOpt >= 2)
+ dump_simulated_dedup();
+
+ /*
+ * Calculate simulated dedup ratio and get rid of the tree
+ */
+ while ((sim_de = RB_ROOT(&sim_dedup_tree)) != NULL) {
+ assert(sim_de->ref_blks != 0);
+ dedup_ref_size += sim_de->ref_size;
+ dedup_alloc_size += sim_de->ref_size / sim_de->ref_blks;
+
+ RB_REMOVE(sim_dedup_entry_rb_tree, &sim_dedup_tree, sim_de);
+ free(sim_de);
+ }
+
+ printf("Simulated dedup ratio = %.2f\n",
+ (double)dedup_ref_size / dedup_alloc_size);
+}
+
+/*
+ * dedup <filesystem>
+ */
+void
+hammer_cmd_dedup(char **av, int ac)
+{
+ struct dedup_entry *de;
+ struct sha_dedup_entry *sha_de;
+ struct pass2_dedup_entry *pass2_de;
+ char buf[8];
+
+ if (TimeoutOpt > 0)
+ alarm(TimeoutOpt);
+
+ if (ac != 1)
+ dedup_usage(1);
+
+ STAILQ_INIT(&pass2_dedup_queue);
+
+ glob_fd = getpfs(&glob_pfs, av[0]);
+
+ printf("Dedup ");
+ scan_pfs(av[0], &process_btree_elm);
+
+ while ((pass2_de = STAILQ_FIRST(&pass2_dedup_queue)) != NULL) {
+ if (process_btree_elm(&pass2_de->leaf, DEDUP_PASS2))
+ dedup_skipped_size -= pass2_de->leaf.data_len;
+
+ STAILQ_REMOVE_HEAD(&pass2_dedup_queue, sq_entry);
+ free(pass2_de);
+ }
+ assert(STAILQ_EMPTY(&pass2_dedup_queue));
+
+ printf("Dedup %s succeeded\n", av[0]);
+
+ relpfs(glob_fd, &glob_pfs);
+
+ if (VerboseOpt >= 2)
+ dump_real_dedup();
+
+ /*
+ * Calculate dedup ratio and get rid of the trees
+ */
+ while ((de = RB_ROOT(&dedup_tree)) != NULL) {
+ if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
+ while ((sha_de = RB_ROOT(&de->u.fict_root)) != NULL) {
+ assert(sha_de->ref_blks != 0);
+ dedup_ref_size += sha_de->ref_size;
+ dedup_alloc_size += sha_de->ref_size / sha_de->ref_blks;
+
+ RB_REMOVE(sha_dedup_entry_rb_tree,
+ &de->u.fict_root, sha_de);
+ free(sha_de);
+ }
+ assert(RB_EMPTY(&de->u.fict_root));
+ } else {
+ assert(de->u.de.ref_blks != 0);
+ dedup_ref_size += de->u.de.ref_size;
+ dedup_alloc_size += de->u.de.ref_size / de->u.de.ref_blks;
+ }
+
+ RB_REMOVE(dedup_entry_rb_tree, &dedup_tree, de);
+ free(de);
+ }
+ assert(RB_EMPTY(&dedup_tree));
+
+ assert(dedup_alloc_size != 0);
+ humanize_unsigned(buf, sizeof(buf), dedup_ref_size, "B", 1024);
+ printf("Dedup ratio = %.2f\n"
+ " %8s referenced\n",
+ (double)dedup_ref_size / dedup_alloc_size,
+ buf
+ );
+ humanize_unsigned(buf, sizeof(buf), dedup_alloc_size, "B", 1024);
+ printf(" %8s allocated\n", buf);
+ humanize_unsigned(buf, sizeof(buf), dedup_skipped_size, "B", 1024);
+ printf(" %8s skipped\n", buf);
+ printf(" %8jd CRC collisions\n"
+ " %8jd SHA collisions\n"
+ " %8jd bigblock underflows\n",
+ (intmax_t)dedup_crc_failures,
+ (intmax_t)dedup_sha_failures,
+ (intmax_t)dedup_underflows
+ );
+}
+
+static int
+collect_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags __unused)
+{
+ struct sim_dedup_entry *sim_de;
+
+ sim_de = RB_LOOKUP(sim_dedup_entry_rb_tree, &sim_dedup_tree, scan_leaf->data_crc);
+
+ if (sim_de == NULL) {
+ sim_de = calloc(sizeof(*sim_de), 1);
+ sim_de->crc = scan_leaf->data_crc;
+ RB_INSERT(sim_dedup_entry_rb_tree, &sim_dedup_tree, sim_de);
+ }
+
+ sim_de->ref_blks += 1;
+ sim_de->ref_size += scan_leaf->data_len;
+ return (1);
+}
+
+static __inline int
+validate_dedup_pair(hammer_btree_leaf_elm_t p, hammer_btree_leaf_elm_t q)
+{
+ if ((p->data_offset & HAMMER_OFF_ZONE_MASK) !=
+ (q->data_offset & HAMMER_OFF_ZONE_MASK)) {
+ return (1);
+ }
+ if (p->data_len != q->data_len) {
+ return (1);
+ }
+
+ return (0);
+}
+
+#define DEDUP_TECH_FAILURE 1
+#define DEDUP_CMP_FAILURE 2
+#define DEDUP_INVALID_ZONE 3
+#define DEDUP_UNDERFLOW 4
+
+static __inline int
+deduplicate(hammer_btree_leaf_elm_t p, hammer_btree_leaf_elm_t q)
+{
+ struct hammer_ioc_dedup dedup;
+
+ bzero(&dedup, sizeof(dedup));
+
+ /*
+ * If data_offset fields are the same there is no need to run ioctl,
+ * candidate is already dedup'ed.
+ */
+ if (p->data_offset == q->data_offset) {
+ return (0);
+ }
+
+ dedup.elm1 = p->base;
+ dedup.elm2 = q->base;
+ RunningIoctl = 1;
+ if (ioctl(glob_fd, HAMMERIOC_DEDUP, &dedup) < 0) {
+ /* Technical failure - locking or w/e */
+ return (DEDUP_TECH_FAILURE);
+ }
+ if (dedup.head.flags & HAMMER_IOC_DEDUP_CMP_FAILURE) {
+ return (DEDUP_CMP_FAILURE);
+ }
+ if (dedup.head.flags & HAMMER_IOC_DEDUP_INVALID_ZONE) {
+ return (DEDUP_INVALID_ZONE);
+ }
+ if (dedup.head.flags & HAMMER_IOC_DEDUP_UNDERFLOW) {
+ return (DEDUP_UNDERFLOW);
+ }
+ RunningIoctl = 0;
+
+ return (0);
+}
+
+static int
+process_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags)
+{
+ struct dedup_entry *de;
+ struct sha_dedup_entry *sha_de, temp;
+ struct pass2_dedup_entry *pass2_de;
+ int error;
+
+ de = RB_LOOKUP(dedup_entry_rb_tree, &dedup_tree, scan_leaf->data_crc);
+ if (de == NULL) {
+ /*
+ * Insert new entry into CRC tree
+ */
+ de = calloc(sizeof(*de), 1);
+ de->leaf = *scan_leaf;
+ RB_INSERT(dedup_entry_rb_tree, &dedup_tree, de);
+ goto upgrade_stats;
+ }
+
+ /*
+ * Found entry in CRC tree
+ */
+ if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
+ /*
+ * Entry in CRC tree is fictious, so we already had problems
+ * with this CRC. Upgrade (compute SHA) the candidate and
+ * dive into SHA subtree. If upgrade fails insert the candidate
+ * into Pass2 list (it will be processed later).
+ */
+ if (upgrade_chksum(scan_leaf, temp.sha_hash))
+ goto pass2_insert;
+
+ sha_de = RB_FIND(sha_dedup_entry_rb_tree, &de->u.fict_root, &temp);
+ if (sha_de == NULL) {
+ /*
+ * Nothing in SHA subtree so far, so this is a new
+ * 'dataset'. Insert new entry into SHA subtree.
+ */
+ sha_de = calloc(sizeof(*sha_de), 1);
+ sha_de->leaf = *scan_leaf;
+ memcpy(sha_de->sha_hash, temp.sha_hash, SHA256_DIGEST_LENGTH);
+ RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, sha_de);
+ goto upgrade_stats_sha;
+ }
+
+ /*
+ * Found entry in SHA subtree, it means we have a potential
+ * dedup pair. Validate it (zones have to match and data_len
+ * field have to be the same too. If validation fails, treat
+ * it as a SHA collision (jump to sha256_failure).
+ */
+ if (validate_dedup_pair(&sha_de->leaf, scan_leaf))
+ goto sha256_failure;
+
+ /*
+ * We have a valid dedup pair (SHA match, validated).
+ *
+ * In case of technical failure (dedup pair was good, but
+ * ioctl failed anyways) insert the candidate into Pass2 list
+ * (we will try to dedup it after we are done with the rest of
+ * the tree).
+ *
+ * If ioctl fails because either of blocks is in the non-dedup
+ * zone (we can dedup only in LARGE_DATA and SMALL_DATA) don't
+ * bother with the candidate and terminate early.
+ *
+ * If ioctl fails because of bigblock underflow replace the
+ * leaf node that found dedup entry represents with scan_leaf.
+ */
+ error = deduplicate(&sha_de->leaf, scan_leaf);
+ switch(error) {
+ case DEDUP_TECH_FAILURE:
+ goto pass2_insert;
+ case DEDUP_CMP_FAILURE:
+ goto sha256_failure;
+ case DEDUP_INVALID_ZONE:
+ goto terminate_early;
+ case DEDUP_UNDERFLOW:
+ ++dedup_underflows;
+ sha_de->leaf = *scan_leaf;
+ memcpy(sha_de->sha_hash, temp.sha_hash, SHA256_DIGEST_LENGTH);
+ goto upgrade_stats_sha;
+ default:
+ goto upgrade_stats_sha;
+ }
+
+ /*
+ * Ooh la la.. SHA-256 collision. Terminate early, there's
+ * nothing we can do here.
+ */
+sha256_failure:
+ ++dedup_sha_failures;
+ goto terminate_early;
+ } else {
+ /*
+ * Candidate CRC is good for now (we found an entry in CRC
+ * tree and it's not fictious). This means we have a
+ * potential dedup pair.
+ */
+ if (validate_dedup_pair(&de->leaf, scan_leaf))
+ goto crc_failure;
+
+ /*
+ * We have a valid dedup pair (CRC match, validated)
+ */
+ error = deduplicate(&de->leaf, scan_leaf);
+ switch(error) {
+ case DEDUP_TECH_FAILURE:
+ goto pass2_insert;
+ case DEDUP_CMP_FAILURE:
+ goto crc_failure;
+ case DEDUP_INVALID_ZONE:
+ goto terminate_early;
+ case DEDUP_UNDERFLOW:
+ ++dedup_underflows;
+ de->leaf = *scan_leaf;
+ goto upgrade_stats;
+ default:
+ goto upgrade_stats;
+ }
+
+crc_failure:
+ /*
+ * We got a CRC collision - either ioctl failed because of
+ * the comparison failure or validation of the potential
+ * dedup pair went bad. In all cases insert both blocks
+ * into SHA subtree (this requires checksum upgrade) and mark
+ * entry that corresponds to this CRC in the CRC tree
+ * fictious, so that all futher operations with this CRC go
+ * through SHA subtree.
+ */
+ ++dedup_crc_failures;
+ /*
+ * Insert block that was represented by now fictious dedup
+ * entry (create a new SHA entry and preserve stats of the
+ * old CRC one). If checksum upgrade fails insert the
+ * candidate into Pass2 list and return - keep both trees
+ * unmodified.
+ */
+ sha_de = calloc(sizeof(*sha_de), 1);
+ sha_de->leaf = de->leaf;
+ sha_de->ref_blks = de->u.de.ref_blks;
+ sha_de->ref_size = de->u.de.ref_size;
+ if (upgrade_chksum(&sha_de->leaf, sha_de->sha_hash)) {
+ free(sha_de);
+ goto pass2_insert;
+ }
+
+ RB_INIT(&de->u.fict_root);
+ /*
+ * Here we can insert without prior checking because the tree
+ * is empty at this point
+ */
+ RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, sha_de);
+
+ /*
+ * Mark entry in CRC tree fictious
+ */
+ de->flags |= HAMMER_DEDUP_ENTRY_FICTITIOUS;
+
+ /*
+ * Upgrade checksum of the candidate and insert it into
+ * SHA subtree. If upgrade fails insert the candidate into
+ * Pass2 list.
+ */
+ if (upgrade_chksum(scan_leaf, temp.sha_hash)) {
+ goto pass2_insert;
+ }
+ sha_de = RB_FIND(sha_dedup_entry_rb_tree, &de->u.fict_root, &temp);
+ if (sha_de != NULL)
+ /* There is an entry with this SHA already, but the only
+ * RB-tree element at this point is that entry we just
+ * added. We know for sure these blocks are different
+ * (this is crc_failure branch) so treat it as SHA
+ * collision.
+ */
+ goto sha256_failure;
+
+ sha_de = calloc(sizeof(*sha_de), 1);
+ sha_de->leaf = *scan_leaf;
+ memcpy(sha_de->sha_hash, temp.sha_hash, SHA256_DIGEST_LENGTH);
+ RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, sha_de);
+ goto upgrade_stats_sha;
+ }
+
+upgrade_stats:
+ de->u.de.ref_blks += 1;
+ de->u.de.ref_size += scan_leaf->data_len;
+ return (1);
+
+upgrade_stats_sha:
+ sha_de->ref_blks += 1;
+ sha_de->ref_size += scan_leaf->data_len;
+ return (1);
+
+pass2_insert:
+ /*
+ * If in pass2 mode don't insert anything, fall through to
+ * terminate_early
+ */
+ if ((flags & DEDUP_PASS2) == 0) {
+ pass2_de = calloc(sizeof(*pass2_de), 1);
+ pass2_de->leaf = *scan_leaf;
+ STAILQ_INSERT_TAIL(&pass2_dedup_queue, pass2_de, sq_entry);
+ dedup_skipped_size += scan_leaf->data_len;
+ return (1);
+ }
+
+terminate_early:
+ /*
+ * Early termination path. Fixup stats.
+ */
+ dedup_alloc_size += scan_leaf->data_len;
+ dedup_ref_size += scan_leaf->data_len;
+ return (0);
+}
+
+static int
+upgrade_chksum(hammer_btree_leaf_elm_t leaf, u_int8_t *sha_hash)
+{
+ struct hammer_ioc_data data;
+ char *buf = malloc(DEDUP_BUF);
+ SHA256_CTX ctx;
+ int error;
+
+ bzero(&data, sizeof(data));
+ data.elm = leaf->base;
+ data.ubuf = buf;
+ data.size = DEDUP_BUF;
+
+ error = 0;
+ if (ioctl(glob_fd, HAMMERIOC_GET_DATA, &data) < 0) {
+ fprintf(stderr, "Get-data failed: %s\n", strerror(errno));
+ error = 1;
+ goto done;
+ }
+
+ if (data.leaf.data_len != leaf->data_len) {
+ error = 1;
+ goto done;
+ }
+
+ if (data.leaf.base.btype == HAMMER_BTREE_TYPE_RECORD &&
+ data.leaf.base.rec_type == HAMMER_RECTYPE_DATA) {
+ SHA256_Init(&ctx);
+ SHA256_Update(&ctx, (void *)buf, data.leaf.data_len);
+ SHA256_Final(sha_hash, &ctx);
+ }
+
+done:
+ free(buf);
+ return (error);
+}
+
+static void
+scan_pfs(char *filesystem, scan_pfs_cb_t func)
+{
+ struct hammer_ioc_mirror_rw mirror;
+ hammer_ioc_mrecord_any_t mrec;
+ struct hammer_btree_leaf_elm elm;
+ char *buf = malloc(DEDUP_BUF);
+ int offset, bytes;
+
+ bzero(&mirror, sizeof(mirror));
+ hammer_key_beg_init(&mirror.key_beg);
+ hammer_key_end_init(&mirror.key_end);
+
+ mirror.tid_beg = glob_pfs.ondisk->sync_beg_tid;
+ mirror.tid_end = glob_pfs.ondisk->sync_end_tid;
+ mirror.head.flags |= HAMMER_IOC_MIRROR_NODATA; /* we want only keys */
+ mirror.ubuf = buf;
+ mirror.size = DEDUP_BUF;
+ mirror.pfs_id = glob_pfs.pfs_id;
+ mirror.shared_uuid = glob_pfs.ondisk->shared_uuid;
+
+ printf("%s: objspace %016jx:%04x %016jx:%04x pfs_id %d\n",
+ filesystem,
+ (uintmax_t)mirror.key_beg.obj_id,
+ mirror.key_beg.localization,
+ (uintmax_t)mirror.key_end.obj_id,
+ mirror.key_end.localization,
+ glob_pfs.pfs_id);
+ fflush(stdout);
+
+ do {
+ mirror.count = 0;
+ mirror.pfs_id = glob_pfs.pfs_id;
+ mirror.shared_uuid = glob_pfs.ondisk->shared_uuid;
+ if (ioctl(glob_fd, HAMMERIOC_MIRROR_READ, &mirror) < 0) {
+ fprintf(stderr, "Mirror-read %s failed: %s\n",
+ filesystem, strerror(errno));
+ exit(1);
+ }
+ if (mirror.head.flags & HAMMER_IOC_HEAD_ERROR) {
+ fprintf(stderr, "Mirror-read %s fatal error %d\n",
+ filesystem, mirror.head.error);
+ exit(1);
+ }
+ if (mirror.count) {
+ offset = 0;
+ while (offset < mirror.count) {
+ mrec = (void *)((char *)buf + offset);
+ bytes = HAMMER_HEAD_DOALIGN(mrec->head.rec_size);
+ if (offset + bytes > mirror.count) {
+ fprintf(stderr, "Misaligned record\n");
+ exit(1);
+ }
+ assert((mrec->head.type &
+ HAMMER_MRECF_TYPE_MASK) == HAMMER_MREC_TYPE_REC);
+
+ elm = mrec->rec.leaf;
+ if (elm.base.btype == HAMMER_BTREE_TYPE_RECORD &&
+ elm.base.rec_type == HAMMER_RECTYPE_DATA) {
+ func(&elm, 0);
+ }
+ offset += bytes;
+ }
+ }
+ mirror.key_beg = mirror.key_cur;
+ } while (mirror.count != 0);
+
+ free(buf);
+}
+
+static void
+dump_simulated_dedup(void)
+{
+ struct sim_dedup_entry *sim_de;
+
+ printf("=== Dumping simulated dedup entries:\n");
+ RB_FOREACH(sim_de, sim_dedup_entry_rb_tree, &sim_dedup_tree) {
+ printf("\tcrc=%08x cnt=%llu size=%llu\n", sim_de->crc,
+ sim_de->ref_blks, sim_de->ref_size);
+ }
+ printf("end of dump ===\n");
+}
+
+static void
+dump_real_dedup(void)
+{
+ struct dedup_entry *de;
+ struct sha_dedup_entry *sha_de;
+ int i;
+
+ printf("=== Dumping dedup entries:\n");
+ RB_FOREACH(de, dedup_entry_rb_tree, &dedup_tree) {
+ if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
+ printf("\tcrc=%08x fictious\n", de->leaf.data_crc);
+
+ RB_FOREACH(sha_de, sha_dedup_entry_rb_tree, &de->u.fict_root) {
+ printf("\t\tcrc=%08x cnt=%llu size=%llu\n\t\t\tsha=",
+ sha_de->leaf.data_crc,
+ sha_de->ref_blks,
+ sha_de->ref_size);
+ for (i = 0; i < SHA256_DIGEST_LENGTH; ++i)
+ printf("%02x", sha_de->sha_hash[i]);
+ printf("\n");
+ }
+ } else {
+ printf("\tcrc=%08x cnt=%llu size=%llu\n",
+ de->leaf.data_crc,
+ de->u.de.ref_blks,
+ de->u.de.ref_size);
+ }
+ }
+ printf("end of dump ===\n");
+}
+
+static void
+dedup_usage(int code)
+{
+ fprintf(stderr,
+ "hammer dedup-simulate <filesystem>\n"
+ "hammer dedup <filesystem>\n"
+ );
+ exit(code);
+}
.\"
.\" $DragonFly: src/sbin/hammer/hammer.8,v 1.58 2008/11/13 02:04:27 dillon Exp $
.\"
-.Dd February 12, 2010
+.Dd November 3, 2010
.Dt HAMMER 8
.Os
.Sh NAME
it can be reused.
.Pp
Data blocks can be shared by deducting the space used from the free byte
-count for each shared references, though
-.Nm HAMMER
-does not yet make use of this feature.
+count for each shared references.
This means the free byte count can legally go negative.
.Pp
This command needs the
.El
.\" ==== cleanup ====
.It Cm cleanup Op Ar filesystem ...
-This is a meta-command which executes snapshot, prune, rebalance and reblock
-commands on the specified
+This is a meta-command which executes snapshot, prune, rebalance, dedup
+and reblock commands on the specified
.Nm HAMMER
file systems.
If no
snapshots <period> <retention-time> [any]
prune <period> <max-runtime>
rebalance <period> <max-runtime>
+dedup <period> <max-runtime>
reblock <period> <max-runtime>
recopy <period> <max-runtime>
.Ed
snapshots 1d 60d # 0d 0d for PFS /tmp, /var/tmp, /usr/obj
prune 1d 5m
rebalance 1d 5m
+dedup 1d 5m
reblock 1d 5m
recopy 30d 10m
.Ed
.Xr periodic 8 .
.Pp
The default configuration file will create a daily snapshot, do a daily
-pruning, rebalancing and reblocking run and a monthly recopy run.
+pruning, rebalancing, deduping and reblocking run and a monthly recopy run.
Reblocking is defragmentation with a level of 95%,
and recopy is full defragmentation.
.Pp
By default prune and rebalance operations are time limited to 5 minutes,
-reblock operations to a bit over 5 minutes,
+dedup and reblock operations to a bit over 5 minutes,
and recopy operations to a bit over 10 minutes.
Reblocking and recopy runs are each broken down into four separate functions:
btree, inodes, dirs and data.
Rebalancing is a per PFS operation, so a
.Nm HAMMER
file system and each PFS in it have to be rebalanced separately.
+.\" ==== dedup ====
+.It Cm dedup Ar filesystem
+.Nm ( HAMMER
+VERSION 5+)
+Perform offline (post-process) deduplication. Deduplication occurs at
+the block level, currently only data blocks of the same size can be
+deduped, metadata blocks can not. The hash function used for comparing
+data blocks is CRC-32 (CRCs are computed anyways as part of
+.Nm HAMMER
+data integrity features, so there's no additional overhead). Since CRC
+is a weak hash function a byte-by-byte comparison is done before actual
+deduping. In case of a CRC collision (two data blocks have the same CRC
+but different contents) the checksum is upgraded to SHA-256.
+.Pp
+Currently
+.Nm HAMMER
+reblocker may partially blow up (re-expand) dedup (reblocker's normal
+operation is to reallocate every record, so it's possible for deduped
+blocks to be re-expanded back).
+.Pp
+Deduplication is a per PFS operation, so a
+.Nm HAMMER
+file system and each PFS in it have to be deduped separately. This also
+means that if you have duplicated data in two different PFSs that data
+won't be deduped, however the addition of such feature is planned.
+.\" ==== dedup-simulate ====
+.It Cm dedup-simulate Ar filesystem
+.Nm ( HAMMER
+VERSION 5+)
+Shows potential space savings (simulated dedup ratio) one can get after
+running
+.Cm dedup
+command. If the estimated dedup ratio is greater than 1.00 you will see
+dedup space savings. Remember that this is an estimated number, in
+practice real dedup ratio will be slightly smaller because of
+.Nm HAMMER
+bigblock underflows, B-Tree locking issues and other factors.
.\" ==== reblock* ====
.It Cm reblock Ar filesystem Op Ar fill_percentage
.It Cm reblock-btree Ar filesystem Op Ar fill_percentage
usage(1);
exit(0);
}
+ if (strcmp(av[0], "dedup-simulate") == 0) {
+ hammer_cmd_dedup_simulate(av + 1, ac - 1);
+ exit(0);
+ }
+ if (strcmp(av[0], "dedup") == 0) {
+ hammer_cmd_dedup(av + 1, ac - 1);
+ exit(0);
+ }
if (strcmp(av[0], "version") == 0) {
hammer_cmd_get_version(av + 1, ac - 1);
exit(0);
"hammer -f blkdevs recover <target_dir>\n"
);
+ fprintf(stderr, "\nHAMMER utility version 5+ commands:\n");
+
+ fprintf(stderr,
+ "hammer dedup-simulate <filesystem>\n"
+ "hammer dedup <filesystem>\n"
+ );
+
exit(exit_code);
}
void hammer_cmd_set_version(char **av, int ac);
void hammer_cmd_volume_add(char **av, int ac);
void hammer_cmd_volume_del(char **av, int ac);
+void hammer_cmd_dedup_simulate(char **av, int ac);
+void hammer_cmd_dedup(char **av, int ac);
void hammer_get_cycle(hammer_base_elm_t base, hammer_tid_t *tidp);
void hammer_set_cycle(hammer_base_elm_t base, hammer_tid_t tid);
int getpfs(struct hammer_ioc_pseudofs_rw *pfs, const char *path);
void relpfs(int fd, struct hammer_ioc_pseudofs_rw *pfs);
-
vfs/hammer/hammer_redo.c optional hammer
vfs/hammer/hammer_vfsops.c optional hammer
vfs/hammer/hammer_vnops.c optional hammer
+vfs/hammer/hammer_dedup.c optional hammer
#tmpfs static filesystem for debugging should be marked as optional tmpfs
vfs/tmpfs/tmpfs_fifoops.c standard
vfs/tmpfs/tmpfs_subr.c standard
hammer_redo.c \
hammer_reblock.c hammer_rebalance.c \
hammer_flusher.c hammer_mirror.c \
- hammer_pfs.c hammer_prune.c hammer_volume.c
+ hammer_pfs.c hammer_prune.c hammer_volume.c \
+ hammer_dedup.c
SRCS+= opt_ktr.h
.include <bsd.kmod.mk>
int hammer_cursor_upgrade(hammer_cursor_t cursor);
int hammer_cursor_upgrade_node(hammer_cursor_t cursor);
void hammer_cursor_downgrade(hammer_cursor_t cursor);
+int hammer_cursor_upgrade2(hammer_cursor_t c1, hammer_cursor_t c2);
+void hammer_cursor_downgrade2(hammer_cursor_t c1, hammer_cursor_t c2);
int hammer_cursor_seek(hammer_cursor_t cursor, hammer_node_t node,
int index);
void hammer_lock_ex_ident(struct hammer_lock *lock, const char *ident);
int hammer_lock_ex_try(struct hammer_lock *lock);
void hammer_lock_sh(struct hammer_lock *lock);
int hammer_lock_sh_try(struct hammer_lock *lock);
-int hammer_lock_upgrade(struct hammer_lock *lock);
-void hammer_lock_downgrade(struct hammer_lock *lock);
+int hammer_lock_upgrade(struct hammer_lock *lock, int shcount);
+void hammer_lock_downgrade(struct hammer_lock *lock, int shcount);
int hammer_lock_status(struct hammer_lock *lock);
void hammer_unlock(struct hammer_lock *lock);
void hammer_ref(struct hammer_lock *lock);
void hammer_reserve_clrdelay(hammer_mount_t hmp, hammer_reserve_t resv);
void hammer_blockmap_free(hammer_transaction_t trans,
hammer_off_t bmap_off, int bytes);
+int hammer_blockmap_dedup(hammer_transaction_t trans,
+ hammer_off_t bmap_off, int bytes);
int hammer_blockmap_finalize(hammer_transaction_t trans,
hammer_reserve_t resv,
hammer_off_t bmap_off, int bytes);
struct hammer_ioc_volume *ioc);
int hammer_ioc_volume_del(hammer_transaction_t trans, hammer_inode_t ip,
struct hammer_ioc_volume *ioc);
+int hammer_ioc_dedup(hammer_transaction_t trans, hammer_inode_t ip,
+ struct hammer_ioc_dedup *dedup);
int hammer_signal_check(hammer_mount_t hmp);
hammer_rel_buffer(buffer2, 0);
}
+int
+hammer_blockmap_dedup(hammer_transaction_t trans,
+ hammer_off_t zone_offset, int bytes)
+{
+ hammer_mount_t hmp;
+ hammer_volume_t root_volume;
+ hammer_blockmap_t blockmap;
+ hammer_blockmap_t freemap;
+ struct hammer_blockmap_layer1 *layer1;
+ struct hammer_blockmap_layer2 *layer2;
+ hammer_buffer_t buffer1 = NULL;
+ hammer_buffer_t buffer2 = NULL;
+ hammer_off_t layer1_offset;
+ hammer_off_t layer2_offset;
+ int32_t temp;
+ int error;
+ int zone;
+
+ if (bytes == 0)
+ return (0);
+ hmp = trans->hmp;
+
+ /*
+ * Alignment
+ */
+ bytes = (bytes + 15) & ~15;
+ KKASSERT(bytes <= HAMMER_LARGEBLOCK_SIZE);
+ KKASSERT(((zone_offset ^ (zone_offset + (bytes - 1))) &
+ ~HAMMER_LARGEBLOCK_MASK64) == 0);
+
+ /*
+ * Basic zone validation & locking
+ */
+ zone = HAMMER_ZONE_DECODE(zone_offset);
+ KKASSERT(zone >= HAMMER_ZONE_BTREE_INDEX && zone < HAMMER_MAX_ZONES);
+ root_volume = trans->rootvol;
+ error = 0;
+
+ blockmap = &hmp->blockmap[zone];
+ freemap = &hmp->blockmap[HAMMER_ZONE_FREEMAP_INDEX];
+
+ /*
+ * Dive layer 1.
+ */
+ layer1_offset = freemap->phys_offset +
+ HAMMER_BLOCKMAP_LAYER1_OFFSET(zone_offset);
+ layer1 = hammer_bread(hmp, layer1_offset, &error, &buffer1);
+ if (error)
+ goto failed;
+ KKASSERT(layer1->phys_offset &&
+ layer1->phys_offset != HAMMER_BLOCKMAP_UNAVAIL);
+ if (layer1->layer1_crc != crc32(layer1, HAMMER_LAYER1_CRCSIZE)) {
+ hammer_lock_ex(&hmp->blkmap_lock);
+ if (layer1->layer1_crc != crc32(layer1, HAMMER_LAYER1_CRCSIZE))
+ panic("CRC FAILED: LAYER1");
+ hammer_unlock(&hmp->blkmap_lock);
+ }
+
+ /*
+ * Dive layer 2, each entry represents a large-block.
+ */
+ layer2_offset = layer1->phys_offset +
+ HAMMER_BLOCKMAP_LAYER2_OFFSET(zone_offset);
+ layer2 = hammer_bread(hmp, layer2_offset, &error, &buffer2);
+ if (error)
+ goto failed;
+ if (layer2->entry_crc != crc32(layer2, HAMMER_LAYER2_CRCSIZE)) {
+ hammer_lock_ex(&hmp->blkmap_lock);
+ if (layer2->entry_crc != crc32(layer2, HAMMER_LAYER2_CRCSIZE))
+ panic("CRC FAILED: LAYER2");
+ hammer_unlock(&hmp->blkmap_lock);
+ }
+
+ hammer_lock_ex(&hmp->blkmap_lock);
+
+ hammer_modify_buffer(trans, buffer2, layer2, sizeof(*layer2));
+
+ /*
+ * Free space previously allocated via blockmap_alloc().
+ *
+ * NOTE: bytes_free can be and remain negative due to de-dup ops
+ * but can never become larger than HAMMER_LARGEBLOCK_SIZE.
+ */
+ KKASSERT(layer2->zone == zone);
+ temp = layer2->bytes_free - HAMMER_LARGEBLOCK_SIZE * 2;
+ cpu_ccfence(); /* prevent gcc from optimizing temp out */
+ if (temp > layer2->bytes_free) {
+ error = ERANGE;
+ goto underflow;
+ }
+ layer2->bytes_free -= bytes;
+
+ KKASSERT(layer2->bytes_free <= HAMMER_LARGEBLOCK_SIZE);
+
+ layer2->entry_crc = crc32(layer2, HAMMER_LAYER2_CRCSIZE);
+underflow:
+ hammer_modify_buffer_done(buffer2);
+ hammer_unlock(&hmp->blkmap_lock);
+
+failed:
+ if (buffer1)
+ hammer_rel_buffer(buffer1, 0);
+ if (buffer2)
+ hammer_rel_buffer(buffer2, 0);
+ return (error);
+}
+
/*
* Backend function - finalize (offset, bytes) in a zone.
*
{
int error;
- error = hammer_lock_upgrade(&cursor->node->lock);
+ error = hammer_lock_upgrade(&cursor->node->lock, 1);
if (error && cursor->deadlk_node == NULL) {
cursor->deadlk_node = cursor->node;
hammer_ref_node(cursor->deadlk_node);
} else if (error == 0 && cursor->parent) {
- error = hammer_lock_upgrade(&cursor->parent->lock);
+ error = hammer_lock_upgrade(&cursor->parent->lock, 1);
if (error && cursor->deadlk_node == NULL) {
cursor->deadlk_node = cursor->parent;
hammer_ref_node(cursor->deadlk_node);
{
int error;
- error = hammer_lock_upgrade(&cursor->node->lock);
+ error = hammer_lock_upgrade(&cursor->node->lock, 1);
if (error && cursor->deadlk_node == NULL) {
cursor->deadlk_node = cursor->node;
hammer_ref_node(cursor->deadlk_node);
hammer_cursor_downgrade(hammer_cursor_t cursor)
{
if (hammer_lock_excl_owned(&cursor->node->lock, curthread))
- hammer_lock_downgrade(&cursor->node->lock);
+ hammer_lock_downgrade(&cursor->node->lock, 1);
if (cursor->parent &&
hammer_lock_excl_owned(&cursor->parent->lock, curthread)) {
- hammer_lock_downgrade(&cursor->parent->lock);
+ hammer_lock_downgrade(&cursor->parent->lock, 1);
}
}
/*
+ * Upgrade and downgrade pairs of cursors. This is used by the dedup
+ * code which must deal with two cursors. A special function is needed
+ * because some of the nodes may be shared between the two cursors,
+ * resulting in share counts > 1 which will normally cause an upgrade
+ * to fail.
+ */
+static __noinline
+int
+collect_node(hammer_node_t *array, int *counts, int n, hammer_node_t node)
+{
+ int i;
+
+ for (i = 0; i < n; ++i) {
+ if (array[i] == node)
+ break;
+ }
+ if (i == n) {
+ array[i] = node;
+ counts[i] = 1;
+ ++i;
+ } else {
+ ++counts[i];
+ }
+ return(i);
+}
+
+int
+hammer_cursor_upgrade2(hammer_cursor_t cursor1, hammer_cursor_t cursor2)
+{
+ hammer_node_t nodes[4];
+ int counts[4];
+ int error;
+ int i;
+ int n;
+
+ n = collect_node(nodes, counts, 0, cursor1->node);
+ if (cursor1->parent)
+ n = collect_node(nodes, counts, n, cursor1->parent);
+ n = collect_node(nodes, counts, n, cursor2->node);
+ if (cursor2->parent)
+ n = collect_node(nodes, counts, n, cursor2->parent);
+
+ error = 0;
+ for (i = 0; i < n; ++i) {
+ error = hammer_lock_upgrade(&nodes[i]->lock, counts[i]);
+ if (error)
+ break;
+ }
+ if (error) {
+ while (--i >= 0)
+ hammer_lock_downgrade(&nodes[i]->lock, counts[i]);
+ }
+ return (error);
+}
+
+void
+hammer_cursor_downgrade2(hammer_cursor_t cursor1, hammer_cursor_t cursor2)
+{
+ hammer_node_t nodes[4];
+ int counts[4];
+ int i;
+ int n;
+
+ n = collect_node(nodes, counts, 0, cursor1->node);
+ if (cursor1->parent)
+ n = collect_node(nodes, counts, n, cursor1->parent);
+ n = collect_node(nodes, counts, n, cursor2->node);
+ if (cursor2->parent)
+ n = collect_node(nodes, counts, n, cursor2->parent);
+
+ for (i = 0; i < n; ++i)
+ hammer_lock_downgrade(&nodes[i]->lock, counts[i]);
+}
+
+/*
* Seek the cursor to the specified node and index.
*
* The caller must ref the node prior to calling this routine and release
--- /dev/null
+/*
+ * Copyright (c) 2010 The DragonFly Project. All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Ilya Dryomov <idryomov@gmail.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "hammer.h"
+
+static __inline int validate_zone(hammer_off_t data_offset);
+
+int
+hammer_ioc_dedup(hammer_transaction_t trans, hammer_inode_t ip,
+ struct hammer_ioc_dedup *dedup)
+{
+ struct hammer_cursor cursor1, cursor2;
+ int error;
+
+ /*
+ * Cursor1, return an error -> candidate goes to pass2 list
+ */
+ error = hammer_init_cursor(trans, &cursor1, NULL, NULL);
+ if (error)
+ goto done_cursor;
+ cursor1.key_beg = dedup->elm1;
+ cursor1.flags |= HAMMER_CURSOR_BACKEND;
+
+ error = hammer_btree_lookup(&cursor1);
+ if (error)
+ goto done_cursor;
+ error = hammer_btree_extract(&cursor1, HAMMER_CURSOR_GET_LEAF |
+ HAMMER_CURSOR_GET_DATA);
+ if (error)
+ goto done_cursor;
+
+ /*
+ * Cursor2, return an error -> candidate goes to pass2 list
+ */
+ error = hammer_init_cursor(trans, &cursor2, NULL, NULL);
+ if (error)
+ goto done_cursors;
+ cursor2.key_beg = dedup->elm2;
+ cursor2.flags |= HAMMER_CURSOR_BACKEND;
+
+ error = hammer_btree_lookup(&cursor2);
+ if (error)
+ goto done_cursors;
+ error = hammer_btree_extract(&cursor2, HAMMER_CURSOR_GET_LEAF |
+ HAMMER_CURSOR_GET_DATA);
+ if (error)
+ goto done_cursors;
+
+ /*
+ * Zone validation. We can't de-dup any of the other zones
+ * (BTREE or META) or bad things will happen.
+ *
+ * Return with error = 0, but set an INVALID_ZONE flag.
+ */
+ error = validate_zone(cursor1.leaf->data_offset) +
+ validate_zone(cursor2.leaf->data_offset);
+ if (error) {
+ dedup->head.flags |= HAMMER_IOC_DEDUP_INVALID_ZONE;
+ error = 0;
+ goto done_cursors;
+ }
+
+ /*
+ * Comparison checks
+ *
+ * If zones don't match or data_len fields aren't the same
+ * we consider it to be a comparison failure.
+ *
+ * Return with error = 0, but set a CMP_FAILURE flag.
+ */
+ if ((cursor1.leaf->data_offset & HAMMER_OFF_ZONE_MASK) !=
+ (cursor2.leaf->data_offset & HAMMER_OFF_ZONE_MASK)) {
+ dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE;
+ goto done_cursors;
+ }
+ if (cursor1.leaf->data_len != cursor2.leaf->data_len) {
+ dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE;
+ goto done_cursors;
+ }
+
+ /* byte-by-byte comparison to be sure */
+ if (bcmp(cursor1.data, cursor2.data, cursor1.leaf->data_len)) {
+ dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE;
+ goto done_cursors;
+ }
+
+ /*
+ * Upgrade both cursors together to an exclusive lock
+ *
+ * Return an error -> candidate goes to pass2 list
+ */
+ hammer_sync_lock_sh(trans);
+ error = hammer_cursor_upgrade2(&cursor1, &cursor2);
+ if (error) {
+ hammer_sync_unlock(trans);
+ goto done_cursors;
+ }
+
+ error = hammer_blockmap_dedup(cursor1.trans,
+ cursor1.leaf->data_offset, cursor1.leaf->data_len);
+ if (error) {
+ if (error == ERANGE) {
+ /*
+ * Return with error = 0, but set an UNDERFLOW flag
+ */
+ dedup->head.flags |= HAMMER_IOC_DEDUP_UNDERFLOW;
+ error = 0;
+ goto downgrade_cursors;
+ } else {
+ /*
+ * Return an error -> block goes to pass2 list
+ */
+ goto downgrade_cursors;
+ }
+ }
+
+ /*
+ * The cursor2's cache must be invalidated before calling
+ * hammer_blockmap_free(), otherwise it will not be able to
+ * invalidate the underlying data buffer.
+ */
+ hammer_cursor_invalidate_cache(&cursor2);
+ hammer_blockmap_free(cursor2.trans,
+ cursor2.leaf->data_offset, cursor2.leaf->data_len);
+
+ hammer_modify_node(cursor2.trans, cursor2.node,
+ &cursor2.leaf->data_offset, sizeof(hammer_off_t));
+ cursor2.leaf->data_offset = cursor1.leaf->data_offset;
+ hammer_modify_node_done(cursor2.node);
+
+downgrade_cursors:
+ hammer_cursor_downgrade2(&cursor1, &cursor2);
+ hammer_sync_unlock(trans);
+done_cursors:
+ hammer_done_cursor(&cursor2);
+done_cursor:
+ hammer_done_cursor(&cursor1);
+ return (error);
+}
+
+static __inline int
+validate_zone(hammer_off_t data_offset)
+{
+ switch(data_offset & HAMMER_OFF_ZONE_MASK) {
+ case HAMMER_ZONE_LARGE_DATA:
+ case HAMMER_ZONE_SMALL_DATA:
+ return (0);
+ default:
+ return (1);
+ }
+}
struct hammer_ioc_config *snap);
static int hammer_ioc_set_config(hammer_transaction_t trans, hammer_inode_t ip,
struct hammer_ioc_config *snap);
+static int hammer_ioc_get_data(hammer_transaction_t trans, hammer_inode_t ip,
+ struct hammer_ioc_data *data);
int
hammer_ioctl(hammer_inode_t ip, u_long com, caddr_t data, int fflag,
&trans, ip, (struct hammer_ioc_config *)data);
}
break;
+ case HAMMERIOC_DEDUP:
+ if (error == 0) {
+ error = hammer_ioc_dedup(
+ &trans, ip, (struct hammer_ioc_dedup *)data);
+ }
+ break;
+ case HAMMERIOC_GET_DATA:
+ if (error == 0) {
+ error = hammer_ioc_get_data(
+ &trans, ip, (struct hammer_ioc_data *)data);
+ }
+ break;
default:
error = EOPNOTSUPP;
break;
hammer_done_cursor(&cursor);
return(0);
}
+
+static
+int
+hammer_ioc_get_data(hammer_transaction_t trans, hammer_inode_t ip,
+ struct hammer_ioc_data *data)
+{
+ struct hammer_cursor cursor;
+ int bytes;
+ int error;
+
+ /* XXX cached inode ? */
+ error = hammer_init_cursor(trans, &cursor, NULL, NULL);
+ if (error)
+ goto failed;
+
+ cursor.key_beg = data->elm;
+ cursor.flags |= HAMMER_CURSOR_BACKEND;
+
+ error = hammer_btree_lookup(&cursor);
+ if (error == 0) {
+ error = hammer_btree_extract(&cursor, HAMMER_CURSOR_GET_LEAF |
+ HAMMER_CURSOR_GET_DATA);
+ if (error == 0) {
+ data->leaf = *cursor.leaf;
+ bytes = cursor.leaf->data_len;
+ if (bytes > data->size)
+ bytes = data->size;
+ error = copyout(cursor.data, data->ubuf, bytes);
+ }
+ }
+
+failed:
+ hammer_done_cursor(&cursor);
+ return (error);
+}
};
/*
+ * HAMMERIOC_DEDUP
+ */
+struct hammer_ioc_dedup {
+ struct hammer_ioc_head head;
+ struct hammer_base_elm elm1;
+ struct hammer_base_elm elm2; /* candidate for dedup */
+};
+
+#define HAMMER_IOC_DEDUP_CMP_FAILURE 0x0001 /* verification failed */
+#define HAMMER_IOC_DEDUP_UNDERFLOW 0x0002 /* bigblock underflow */
+#define HAMMER_IOC_DEDUP_INVALID_ZONE 0x0004 /* we can't dedup all zones */
+
+/*
+ * HAMMERIOC_GET_DATA
+ */
+struct hammer_ioc_data {
+ struct hammer_ioc_head head;
+ struct hammer_base_elm elm; /* btree key to lookup */
+ struct hammer_btree_leaf_elm leaf;
+ void *ubuf; /* user buffer */
+ int size; /* max size */
+};
+
+/*
* Ioctl cmd ids
*/
#define HAMMERIOC_PRUNE _IOWR('h',1,struct hammer_ioc_prune)
#define HAMMERIOC_GET_CONFIG _IOWR('h',21,struct hammer_ioc_config)
#define HAMMERIOC_SET_CONFIG _IOWR('h',22,struct hammer_ioc_config)
#define HAMMERIOC_DEL_VOLUME _IOWR('h',24,struct hammer_ioc_volume)
+#define HAMMERIOC_DEDUP _IOWR('h',25,struct hammer_ioc_dedup)
+#define HAMMERIOC_GET_DATA _IOWR('h',26,struct hammer_ioc_data)
#endif
* by someone else, this function will panic.
*/
int
-hammer_lock_upgrade(struct hammer_lock *lock)
+hammer_lock_upgrade(struct hammer_lock *lock, int shcount)
{
thread_t td = curthread;
u_int lv;
for (;;) {
lv = lock->lockval;
- if ((lv & ~HAMMER_LOCKF_WANTED) == 1) {
+ if ((lv & ~HAMMER_LOCKF_WANTED) == shcount) {
nlv = lv | HAMMER_LOCKF_EXCLUSIVE;
if (atomic_cmpset_int(&lock->lockval, lv, nlv)) {
lock->lowner = td;
* Downgrade an exclusively held lock to a shared lock.
*/
void
-hammer_lock_downgrade(struct hammer_lock *lock)
+hammer_lock_downgrade(struct hammer_lock *lock, int shcount)
{
thread_t td __debugvar = curthread;
u_int lv;
u_int nlv;
KKASSERT((lock->lockval & ~HAMMER_LOCKF_WANTED) ==
- (HAMMER_LOCKF_EXCLUSIVE | 1));
+ (HAMMER_LOCKF_EXCLUSIVE | shcount));
KKASSERT(lock->lowner == td);
/*