HAMMER - Add hammer dedup directive and support
authorMatthew Dillon <dillon@apollo.backplane.com>
Sun, 7 Nov 2010 17:29:39 +0000 (09:29 -0800)
committerMatthew Dillon <dillon@apollo.backplane.com>
Thu, 18 Nov 2010 17:42:30 +0000 (09:42 -0800)
* Implements all the logic required to dedup a HAMMER filesystem.

* There is one remaining issue and that is the reblocker's propensity to
  undo de-dup's hard work in certain cases.

* Code bounty for hammer_dedup

Submitted-by: Ilya Dryomov <idryomov@gmail.com>
16 files changed:
sbin/hammer/Makefile
sbin/hammer/cmd_cleanup.c
sbin/hammer/cmd_config.c
sbin/hammer/cmd_dedup.c [new file with mode: 0644]
sbin/hammer/hammer.8
sbin/hammer/hammer.c
sbin/hammer/hammer.h
sys/conf/files
sys/vfs/hammer/Makefile
sys/vfs/hammer/hammer.h
sys/vfs/hammer/hammer_blockmap.c
sys/vfs/hammer/hammer_cursor.c
sys/vfs/hammer/hammer_dedup.c [new file with mode: 0644]
sys/vfs/hammer/hammer_ioctl.c
sys/vfs/hammer/hammer_ioctl.h
sys/vfs/hammer/hammer_subs.c

index 7ba6a4d..9c4df8e 100644 (file)
@@ -8,11 +8,11 @@ SRCS= hammer.c ondisk.c blockmap.c cache.c misc.c cycle.c \
        cmd_synctid.c cmd_stats.c \
        cmd_pseudofs.c cmd_snapshot.c cmd_mirror.c cmd_status.c \
        cmd_cleanup.c cmd_info.c cmd_version.c cmd_volume.c \
-       cmd_config.c cmd_recover.c
+       cmd_config.c cmd_recover.c cmd_dedup.c
 MAN=   hammer.8
 
 CFLAGS+= -I${.CURDIR}/../../sys -DALIST_NO_DEBUG
-LDADD= -lm -lutil
+LDADD= -lm -lutil -lmd
 
 .PATH: ${.CURDIR}/../../sys/libkern
 SRCS+= crc32.c
index 8342684..d0f3a0e 100644 (file)
@@ -46,6 +46,7 @@
  *     snapshots 1d 60d        (0d 0d for /tmp, /var/tmp, /usr/obj)
  *     prune     1d 5m
  *     rebalance 1d 5m
+ *     #dedup    1d 5m         (not enabled by default)
  *     reblock   1d 5m
  *     recopy    30d 10m
  *
@@ -93,6 +94,8 @@ static int cleanup_reblock(const char *path, const char *snapshots_path,
                        int arg1, int arg2);
 static int cleanup_recopy(const char *path, const char *snapshots_path,
                        int arg1, int arg2);
+static int cleanup_dedup(const char *path, const char *snapshots_path,
+                       int arg1, int arg2);
 
 static void runcmd(int *resp, const char *ctl, ...);
 
@@ -494,6 +497,17 @@ do_cleanup(const char *path)
                        } else {
                                printf("skip\n");
                        }
+               } else if (strcmp(cmd, "dedup") == 0) {
+                       if (check_period(snapshots_path, cmd, arg1, &savet)) {
+                               printf("run");
+                               fflush(stdout);
+                               if (VerboseOpt)
+                                       printf("\n");
+                               r = cleanup_dedup(path, snapshots_path,
+                                               arg1, arg2);
+                       } else {
+                               printf("skip\n");
+                       }
                } else {
                        printf("unknown directive\n");
                        r = 1;
@@ -541,6 +555,7 @@ config_init(const char *path, struct hammer_ioc_config *config)
                snapshots,
                "prune     1d 5m\n"
                "rebalance 1d 5m\n"
+               "#dedup    1d 5m\n"
                "reblock   1d 5m\n"
                "recopy    30d 10m\n");
 }
@@ -1168,6 +1183,25 @@ cleanup_recopy(const char *path, const char *snapshots_path,
        return(0);
 }
 
+static int
+cleanup_dedup(const char *path, const char *snapshots_path __unused,
+                 int arg1 __unused, int arg2)
+{
+       if (VerboseOpt == 0) {
+               printf(".");
+               fflush(stdout);
+       }
+
+       runcmd(NULL, "hammer -t %d dedup %s", arg2, path);
+       if (VerboseOpt == 0) {
+               printf(".");
+               fflush(stdout);
+       }
+       if (VerboseOpt == 0)
+               printf("\n");
+       return(0);
+}
+
 static
 void
 runcmd(int *resp, const char *ctl, ...)
index 9a60146..5e68e60 100644 (file)
@@ -131,6 +131,7 @@ hammer_cmd_viconfig(char **av, int ac)
                         "#snapshots 1d 60d\n"
                         "#prune     1d 5m\n"
                         "#rebalance 1d 5m\n"
+                        "#dedup     1d 5m\n"
                         "#reblock   1d 5m\n"
                         "#recopy    30d 10m\n");
                config.head.error = 0;
diff --git a/sbin/hammer/cmd_dedup.c b/sbin/hammer/cmd_dedup.c
new file mode 100644 (file)
index 0000000..8dbdf85
--- /dev/null
@@ -0,0 +1,755 @@
+/*
+ * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Ilya Dryomov <idryomov@gmail.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "hammer.h"
+#include <libutil.h>
+#include <crypto/sha2/sha2.h>
+
+#define DEDUP_BUF (64 * 1024)
+
+/* Sorted list of block CRCs - light version for dedup-simulate */
+struct sim_dedup_entry_rb_tree;
+RB_HEAD(sim_dedup_entry_rb_tree, sim_dedup_entry) sim_dedup_tree =
+                                       RB_INITIALIZER(&sim_dedup_tree);
+RB_PROTOTYPE2(sim_dedup_entry_rb_tree, sim_dedup_entry, rb_entry,
+               rb_sim_dedup_entry_compare, hammer_crc_t);
+
+/* Per-CRC accounting node used by dedup-simulate (no SHA upgrade path) */
+struct sim_dedup_entry {
+       hammer_crc_t    crc;      /* data CRC this entry tracks (tree key) */
+       u_int64_t       ref_blks; /* number of blocks referenced */
+       u_int64_t       ref_size; /* size of data referenced */
+       RB_ENTRY(sim_dedup_entry) rb_entry;
+};
+
+/* Sorted list of HAMMER B-Tree keys */
+struct dedup_entry_rb_tree;
+struct sha_dedup_entry_rb_tree;
+
+RB_HEAD(dedup_entry_rb_tree, dedup_entry) dedup_tree =
+                                       RB_INITIALIZER(&dedup_tree);
+RB_PROTOTYPE2(dedup_entry_rb_tree, dedup_entry, rb_entry,
+               rb_dedup_entry_compare, hammer_crc_t);
+
+RB_PROTOTYPE(sha_dedup_entry_rb_tree, sha_dedup_entry, fict_entry,
+               rb_sha_dedup_entry_compare);
+
+/*
+ * Per-CRC node of the real dedup run.  Starts out holding plain
+ * reference stats; after a CRC collision it is marked FICTITIOUS and
+ * the union flips to an RB tree of SHA-256 entries instead.
+ */
+struct dedup_entry {
+       struct hammer_btree_leaf_elm leaf;
+       union {
+               struct {
+                       u_int64_t ref_blks;     /* blocks referencing this CRC */
+                       u_int64_t ref_size;     /* bytes referencing this CRC */
+               } de;
+               /* SHA-256 subtree, valid once FICTITIOUS is set */
+               RB_HEAD(sha_dedup_entry_rb_tree, sha_dedup_entry) fict_root;
+       } u;
+       u_int8_t flags;
+       RB_ENTRY(dedup_entry) rb_entry;
+};
+
+/* CRC entry demoted to a SHA-256 subtree after a collision */
+#define HAMMER_DEDUP_ENTRY_FICTITIOUS  0x0001
+
+/* SHA-256-keyed node living under a fictitious CRC entry */
+struct sha_dedup_entry {
+       struct hammer_btree_leaf_elm    leaf;
+       u_int64_t                       ref_blks;
+       u_int64_t                       ref_size;
+       u_int8_t                        sha_hash[SHA256_DIGEST_LENGTH];
+       RB_ENTRY(sha_dedup_entry)       fict_entry;
+};
+
+/*
+ * Pass2 list - contains entries that were not dedup'ed because ioctl failed
+ */
+STAILQ_HEAD(, pass2_dedup_entry) pass2_dedup_queue =
+                               STAILQ_HEAD_INITIALIZER(pass2_dedup_queue);
+
+struct pass2_dedup_entry {
+       struct hammer_btree_leaf_elm    leaf;
+       STAILQ_ENTRY(pass2_dedup_entry) sq_entry;
+};
+
+#define DEDUP_PASS2    0x0001 /* process_btree_elm() mode */
+
+/* PFS global ids - we deal with just one PFS at a run */
+int glob_fd;
+struct hammer_ioc_pseudofs_rw glob_pfs;
+
+/*
+ * Global accounting variables
+ *
+ * Last three don't have to be 64-bit, just to be safe..
+ */
+u_int64_t dedup_alloc_size = 0;
+u_int64_t dedup_ref_size = 0;
+u_int64_t dedup_skipped_size = 0;
+u_int64_t dedup_crc_failures = 0;
+u_int64_t dedup_sha_failures = 0;
+u_int64_t dedup_underflows = 0;
+
+static int rb_sim_dedup_entry_compare(struct sim_dedup_entry *sim_de1,
+                               struct sim_dedup_entry *sim_de2);
+static int rb_dedup_entry_compare(struct dedup_entry *de1,
+                               struct dedup_entry *de2);
+static int rb_sha_dedup_entry_compare(struct sha_dedup_entry *sha_de1,
+                               struct sha_dedup_entry *sha_de2);
+typedef int (*scan_pfs_cb_t)(hammer_btree_leaf_elm_t scan_leaf, int flags);
+static void scan_pfs(char *filesystem, scan_pfs_cb_t func);
+static int collect_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags);
+static int process_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags);
+static int upgrade_chksum(hammer_btree_leaf_elm_t leaf, u_int8_t *sha_hash);
+static void dump_simulated_dedup(void);
+static void dump_real_dedup(void);
+static void dedup_usage(int code);
+
+RB_GENERATE2(sim_dedup_entry_rb_tree, sim_dedup_entry, rb_entry,
+               rb_sim_dedup_entry_compare, hammer_crc_t, crc);
+RB_GENERATE2(dedup_entry_rb_tree, dedup_entry, rb_entry,
+               rb_dedup_entry_compare, hammer_crc_t, leaf.data_crc);
+RB_GENERATE(sha_dedup_entry_rb_tree, sha_dedup_entry, fict_entry,
+               rb_sha_dedup_entry_compare);
+
+/*
+ * RB-tree comparator: order simulated dedup entries by data CRC.
+ */
+static int
+rb_sim_dedup_entry_compare(struct sim_dedup_entry *sim_de1,
+                       struct sim_dedup_entry *sim_de2)
+{
+       if (sim_de1->crc > sim_de2->crc)
+               return (1);
+       else if (sim_de1->crc < sim_de2->crc)
+               return (-1);
+       else
+               return (0);
+}
+
+/*
+ * RB-tree comparator: order dedup entries by the data CRC of their leaf.
+ */
+static int
+rb_dedup_entry_compare(struct dedup_entry *de1, struct dedup_entry *de2)
+{
+       hammer_crc_t c1 = de1->leaf.data_crc;
+       hammer_crc_t c2 = de2->leaf.data_crc;
+
+       if (c1 > c2)
+               return (1);
+       else if (c1 < c2)
+               return (-1);
+       else
+               return (0);
+}
+
+/*
+ * RB-tree comparator for the SHA-256 subtree.
+ *
+ * Fix: the original compared the digests word-at-a-time through an
+ * 'unsigned long *' cast, which yields an endian-dependent ordering and
+ * relies on the u_int8_t array being suitably aligned/aliasable.
+ * memcmp() compares as unsigned bytes - equality is preserved exactly,
+ * the ordering is well-defined on every platform, and no cast is needed.
+ * (memcpy/memcmp are already in scope in this file.)
+ */
+static int
+rb_sha_dedup_entry_compare(struct sha_dedup_entry *sha_de1,
+                       struct sha_dedup_entry *sha_de2)
+{
+       return (memcmp(sha_de1->sha_hash, sha_de2->sha_hash,
+                       SHA256_DIGEST_LENGTH));
+}
+
+/*
+ * dedup-simulate <filesystem>
+ *
+ * Scan the PFS and report the dedup ratio an actual dedup run would
+ * achieve, without modifying anything on-disk.
+ */
+void
+hammer_cmd_dedup_simulate(char **av, int ac)
+{
+       struct sim_dedup_entry *sim_de;
+
+       if (ac != 1)
+               dedup_usage(1);
+
+       glob_fd = getpfs(&glob_pfs, av[0]);
+
+       printf("Dedup-simulate ");
+       scan_pfs(av[0], &collect_btree_elm);
+       printf("Dedup-simulate %s succeeded\n", av[0]);
+
+       relpfs(glob_fd, &glob_pfs);
+
+       if (VerboseOpt >= 2)
+               dump_simulated_dedup();
+
+       /*
+        * Calculate simulated dedup ratio and get rid of the tree
+        */
+       while ((sim_de = RB_ROOT(&sim_dedup_tree)) != NULL) {
+               assert(sim_de->ref_blks != 0);
+               dedup_ref_size += sim_de->ref_size;
+               dedup_alloc_size += sim_de->ref_size / sim_de->ref_blks;
+
+               RB_REMOVE(sim_dedup_entry_rb_tree, &sim_dedup_tree, sim_de);
+               free(sim_de);
+       }
+
+       /*
+        * Fix: guard against a division by zero when the PFS contained
+        * no data records at all (empty tree leaves dedup_alloc_size 0).
+        */
+       if (dedup_alloc_size == 0) {
+               printf("Simulated dedup ratio = N/A (no data blocks)\n");
+               return;
+       }
+
+       printf("Simulated dedup ratio = %.2f\n",
+           (double)dedup_ref_size / dedup_alloc_size);
+}
+
+/*
+ * dedup <filesystem>
+ *
+ * Actual dedup run: scan the PFS and dedup matching data blocks via
+ * the HAMMERIOC_DEDUP ioctl, retry ioctl-failed candidates in a second
+ * pass, then report the achieved dedup ratio and collision statistics.
+ */
+void
+hammer_cmd_dedup(char **av, int ac)
+{
+       struct dedup_entry *de;
+       struct sha_dedup_entry *sha_de;
+       struct pass2_dedup_entry *pass2_de;
+       char buf[8];
+
+       /* Honor hammer -t <seconds>: SIGALRM time-limits the run */
+       if (TimeoutOpt > 0)
+               alarm(TimeoutOpt);
+
+       if (ac != 1)
+               dedup_usage(1);
+
+       STAILQ_INIT(&pass2_dedup_queue);
+
+       glob_fd = getpfs(&glob_pfs, av[0]);
+
+       printf("Dedup ");
+       scan_pfs(av[0], &process_btree_elm);
+
+       /*
+        * Pass 2: retry candidates whose ioctl failed for technical
+        * reasons during the main scan.  A successful retry removes the
+        * block from the skipped-size accounting.
+        */
+       while ((pass2_de = STAILQ_FIRST(&pass2_dedup_queue)) != NULL) {
+               if (process_btree_elm(&pass2_de->leaf, DEDUP_PASS2))
+                       dedup_skipped_size -= pass2_de->leaf.data_len;
+
+               STAILQ_REMOVE_HEAD(&pass2_dedup_queue, sq_entry);
+               free(pass2_de);
+       }
+       assert(STAILQ_EMPTY(&pass2_dedup_queue));
+
+       printf("Dedup %s succeeded\n", av[0]);
+
+       relpfs(glob_fd, &glob_pfs);
+
+       if (VerboseOpt >= 2)
+               dump_real_dedup();
+
+       /*
+        * Calculate dedup ratio and get rid of the trees
+        *
+        * Each entry contributes its full referenced size to ref_size and
+        * one block's worth (ref_size / ref_blks) to alloc_size.
+        */
+       while ((de = RB_ROOT(&dedup_tree)) != NULL) {
+               if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
+                       /* CRC collided at some point: stats live in the SHA subtree */
+                       while ((sha_de = RB_ROOT(&de->u.fict_root)) != NULL) {
+                               assert(sha_de->ref_blks != 0);
+                               dedup_ref_size += sha_de->ref_size;
+                               dedup_alloc_size += sha_de->ref_size / sha_de->ref_blks;
+
+                               RB_REMOVE(sha_dedup_entry_rb_tree,
+                                               &de->u.fict_root, sha_de);
+                               free(sha_de);
+                       }
+                       assert(RB_EMPTY(&de->u.fict_root));
+               } else {
+                       assert(de->u.de.ref_blks != 0);
+                       dedup_ref_size += de->u.de.ref_size;
+                       dedup_alloc_size += de->u.de.ref_size / de->u.de.ref_blks;
+               }
+
+               RB_REMOVE(dedup_entry_rb_tree, &dedup_tree, de);
+               free(de);
+       }
+       assert(RB_EMPTY(&dedup_tree));
+
+       assert(dedup_alloc_size != 0);
+       humanize_unsigned(buf, sizeof(buf), dedup_ref_size, "B", 1024);
+       printf("Dedup ratio = %.2f\n"
+              "    %8s referenced\n",
+              (double)dedup_ref_size / dedup_alloc_size,
+              buf
+       );
+       humanize_unsigned(buf, sizeof(buf), dedup_alloc_size, "B", 1024);
+       printf("    %8s allocated\n", buf);
+       humanize_unsigned(buf, sizeof(buf), dedup_skipped_size, "B", 1024);
+       printf("    %8s skipped\n", buf);
+       printf("    %8jd CRC collisions\n"
+              "    %8jd SHA collisions\n"
+              "    %8jd bigblock underflows\n",
+              (intmax_t)dedup_crc_failures,
+              (intmax_t)dedup_sha_failures,
+               (intmax_t)dedup_underflows
+       );
+}
+
+/*
+ * Simulated-dedup callback: account each DATA record against the sim
+ * entry for its CRC, creating the entry on first sight.  Always
+ * returns 1 (the record was accounted for).
+ */
+static int
+collect_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags __unused)
+{
+       struct sim_dedup_entry *sim_de;
+
+       sim_de = RB_LOOKUP(sim_dedup_entry_rb_tree, &sim_dedup_tree,
+                       scan_leaf->data_crc);
+       if (sim_de == NULL) {
+               /* calloc(nmemb, size): argument order fixed (was swapped) */
+               sim_de = calloc(1, sizeof(*sim_de));
+               sim_de->crc = scan_leaf->data_crc;
+               RB_INSERT(sim_dedup_entry_rb_tree, &sim_dedup_tree, sim_de);
+       }
+
+       sim_de->ref_blks += 1;
+       sim_de->ref_size += scan_leaf->data_len;
+       return (1);
+}
+
+/*
+ * A candidate pair is only dedupable when both records live in the
+ * same blockmap zone and describe the same amount of data.
+ *
+ * Returns 0 if the pair is valid, 1 otherwise.
+ */
+static __inline int
+validate_dedup_pair(hammer_btree_leaf_elm_t p, hammer_btree_leaf_elm_t q)
+{
+       if ((p->data_offset & HAMMER_OFF_ZONE_MASK) ==
+           (q->data_offset & HAMMER_OFF_ZONE_MASK) &&
+           p->data_len == q->data_len) {
+               return (0);
+       }
+       return (1);
+}
+
+#define DEDUP_TECH_FAILURE     1
+#define DEDUP_CMP_FAILURE      2
+#define DEDUP_INVALID_ZONE     3
+#define DEDUP_UNDERFLOW                4
+
+/*
+ * Attempt to dedup the pair (p, q) via the HAMMERIOC_DEDUP ioctl.
+ *
+ * Returns 0 on success (or if the pair is already dedup'ed) and one of
+ * the DEDUP_* codes above on failure.
+ *
+ * Bug fix: RunningIoctl is now cleared on *every* return path.  The
+ * original cleared it only after all the flag checks succeeded, so any
+ * failing ioctl or failure flag left the global stuck at 1.
+ */
+static __inline int
+deduplicate(hammer_btree_leaf_elm_t p, hammer_btree_leaf_elm_t q)
+{
+       struct hammer_ioc_dedup dedup;
+       int ret;
+
+       bzero(&dedup, sizeof(dedup));
+
+       /*
+        * If data_offset fields are the same there is no need to run ioctl,
+        * candidate is already dedup'ed.
+        */
+       if (p->data_offset == q->data_offset) {
+               return (0);
+       }
+
+       dedup.elm1 = p->base;
+       dedup.elm2 = q->base;
+       RunningIoctl = 1;
+       ret = ioctl(glob_fd, HAMMERIOC_DEDUP, &dedup);
+       RunningIoctl = 0;
+
+       if (ret < 0) {
+               /* Technical failure - locking or w/e */
+               return (DEDUP_TECH_FAILURE);
+       }
+       if (dedup.head.flags & HAMMER_IOC_DEDUP_CMP_FAILURE) {
+               return (DEDUP_CMP_FAILURE);
+       }
+       if (dedup.head.flags & HAMMER_IOC_DEDUP_INVALID_ZONE) {
+               return (DEDUP_INVALID_ZONE);
+       }
+       if (dedup.head.flags & HAMMER_IOC_DEDUP_UNDERFLOW) {
+               return (DEDUP_UNDERFLOW);
+       }
+
+       return (0);
+}
+
+/*
+ * Process a single data record during an actual dedup run: look it up
+ * by CRC (and, after a CRC collision, by SHA-256), attempt the dedup
+ * ioctl against the recorded twin, and maintain per-entry reference
+ * stats.  Returns 1 if the record was accounted for, 0 on early
+ * termination (the block then counts as its own allocation).
+ */
+static int
+process_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags)
+{
+       struct dedup_entry *de;
+       struct sha_dedup_entry *sha_de, temp;
+       struct pass2_dedup_entry *pass2_de;
+       int error;
+
+       de = RB_LOOKUP(dedup_entry_rb_tree, &dedup_tree, scan_leaf->data_crc);
+       if (de == NULL) {
+               /*
+                * Insert new entry into CRC tree
+                */
+               de = calloc(sizeof(*de), 1);
+               de->leaf = *scan_leaf;
+               RB_INSERT(dedup_entry_rb_tree, &dedup_tree, de);
+               goto upgrade_stats;
+       }
+
+       /*
+        * Found entry in CRC tree
+        */
+       if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
+               /*
+                * Entry in CRC tree is fictitious, so we already had problems
+                * with this CRC. Upgrade (compute SHA) the candidate and
+                * dive into SHA subtree. If upgrade fails insert the candidate
+                * into Pass2 list (it will be processed later).
+                */
+               if (upgrade_chksum(scan_leaf, temp.sha_hash))
+                       goto pass2_insert;
+
+               sha_de = RB_FIND(sha_dedup_entry_rb_tree, &de->u.fict_root, &temp);
+               if (sha_de == NULL) {
+                       /*
+                        * Nothing in SHA subtree so far, so this is a new
+                        * 'dataset'. Insert new entry into SHA subtree.
+                        */
+                       sha_de = calloc(sizeof(*sha_de), 1);
+                       sha_de->leaf = *scan_leaf;
+                       memcpy(sha_de->sha_hash, temp.sha_hash, SHA256_DIGEST_LENGTH);
+                       RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, sha_de);
+                       goto upgrade_stats_sha;
+               }
+
+               /*
+                * Found entry in SHA subtree, it means we have a potential
+                * dedup pair. Validate it (zones have to match and data_len
+                * fields have to be the same too). If validation fails, treat
+                * it as a SHA collision (jump to sha256_failure).
+                */
+               if (validate_dedup_pair(&sha_de->leaf, scan_leaf))
+                       goto sha256_failure;
+
+               /*
+                * We have a valid dedup pair (SHA match, validated).
+                *
+                * In case of technical failure (dedup pair was good, but
+                * ioctl failed anyways) insert the candidate into Pass2 list
+                * (we will try to dedup it after we are done with the rest of
+                * the tree).
+                *
+                * If ioctl fails because either of blocks is in the non-dedup
+                * zone (we can dedup only in LARGE_DATA and SMALL_DATA) don't
+                * bother with the candidate and terminate early.
+                *
+                * If ioctl fails because of bigblock underflow replace the
+                * leaf node that found dedup entry represents with scan_leaf.
+                */
+               error = deduplicate(&sha_de->leaf, scan_leaf);
+               switch(error) {
+               case DEDUP_TECH_FAILURE:
+                       goto pass2_insert;
+               case DEDUP_CMP_FAILURE:
+                       goto sha256_failure;
+               case DEDUP_INVALID_ZONE:
+                       goto terminate_early;
+               case DEDUP_UNDERFLOW:
+                       ++dedup_underflows;
+                       sha_de->leaf = *scan_leaf;
+                       memcpy(sha_de->sha_hash, temp.sha_hash, SHA256_DIGEST_LENGTH);
+                       goto upgrade_stats_sha;
+               default:
+                       goto upgrade_stats_sha;
+               }
+
+               /*
+                * Ooh la la.. SHA-256 collision. Terminate early, there's
+                * nothing we can do here.
+                */
+sha256_failure:
+               ++dedup_sha_failures;
+               goto terminate_early;
+       } else {
+               /*
+                * Candidate CRC is good for now (we found an entry in CRC
+                * tree and it's not fictitious). This means we have a
+                * potential dedup pair.
+                */
+               if (validate_dedup_pair(&de->leaf, scan_leaf))
+                       goto crc_failure;
+
+               /*
+                * We have a valid dedup pair (CRC match, validated)
+                */
+               error = deduplicate(&de->leaf, scan_leaf);
+               switch(error) {
+               case DEDUP_TECH_FAILURE:
+                       goto pass2_insert;
+               case DEDUP_CMP_FAILURE:
+                       goto crc_failure;
+               case DEDUP_INVALID_ZONE:
+                       goto terminate_early;
+               case DEDUP_UNDERFLOW:
+                       ++dedup_underflows;
+                       de->leaf = *scan_leaf;
+                       goto upgrade_stats;
+               default:
+                       goto upgrade_stats;
+               }
+
+crc_failure:
+               /*
+                * We got a CRC collision - either ioctl failed because of
+                * the comparison failure or validation of the potential
+                * dedup pair went bad. In all cases insert both blocks
+                * into SHA subtree (this requires checksum upgrade) and mark
+                * entry that corresponds to this CRC in the CRC tree
+                * fictitious, so that all further operations with this CRC go
+                * through SHA subtree.
+                */
+               ++dedup_crc_failures;
+               /*
+                * Insert block that was represented by now fictitious dedup
+                * entry (create a new SHA entry and preserve stats of the
+                * old CRC one). If checksum upgrade fails insert the
+                * candidate into Pass2 list and return - keep both trees
+                * unmodified.
+                */
+               sha_de = calloc(sizeof(*sha_de), 1);
+               sha_de->leaf = de->leaf;
+               sha_de->ref_blks = de->u.de.ref_blks;
+               sha_de->ref_size = de->u.de.ref_size;
+               if (upgrade_chksum(&sha_de->leaf, sha_de->sha_hash)) {
+                       free(sha_de);
+                       goto pass2_insert;
+               }
+
+               RB_INIT(&de->u.fict_root);
+               /*
+                * Here we can insert without prior checking because the tree
+                * is empty at this point
+                */
+               RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, sha_de);
+
+               /*
+                * Mark entry in CRC tree fictitious
+                */
+               de->flags |= HAMMER_DEDUP_ENTRY_FICTITIOUS;
+
+               /*
+                * Upgrade checksum of the candidate and insert it into
+                * SHA subtree. If upgrade fails insert the candidate into
+                * Pass2 list.
+                */
+               if (upgrade_chksum(scan_leaf, temp.sha_hash)) {
+                       goto pass2_insert;
+               }
+               sha_de = RB_FIND(sha_dedup_entry_rb_tree, &de->u.fict_root, &temp);
+               if (sha_de != NULL)
+                       /* There is an entry with this SHA already, but the only
+                        * RB-tree element at this point is that entry we just
+                        * added. We know for sure these blocks are different
+                        * (this is crc_failure branch) so treat it as SHA
+                        * collision.
+                        */
+                       goto sha256_failure;
+
+               sha_de = calloc(sizeof(*sha_de), 1);
+               sha_de->leaf = *scan_leaf;
+               memcpy(sha_de->sha_hash, temp.sha_hash, SHA256_DIGEST_LENGTH);
+               RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, sha_de);
+               goto upgrade_stats_sha;
+       }
+
+upgrade_stats:
+       de->u.de.ref_blks += 1;
+       de->u.de.ref_size += scan_leaf->data_len;
+       return (1);
+
+upgrade_stats_sha:
+       sha_de->ref_blks += 1;
+       sha_de->ref_size += scan_leaf->data_len;
+       return (1);
+
+pass2_insert:
+       /*
+        * If in pass2 mode don't insert anything, fall through to
+        * terminate_early
+        */
+       if ((flags & DEDUP_PASS2) == 0) {
+               pass2_de = calloc(sizeof(*pass2_de), 1);
+               pass2_de->leaf = *scan_leaf;
+               STAILQ_INSERT_TAIL(&pass2_dedup_queue, pass2_de, sq_entry);
+               dedup_skipped_size += scan_leaf->data_len;
+               return (1);
+       }
+
+terminate_early:
+       /*
+        * Early termination path. Fixup stats.
+        */
+       dedup_alloc_size += scan_leaf->data_len;
+       dedup_ref_size += scan_leaf->data_len;
+       return (0);
+}
+
+/*
+ * "Upgrade" a record's checksum: fetch its data via HAMMERIOC_GET_DATA
+ * and compute the SHA-256 into *sha_hash.  Returns 0 on success, 1 on
+ * any failure (caller then falls back to the Pass2 list).
+ *
+ * Bug fixes:
+ *  - malloc() result was unchecked; a failed allocation would have been
+ *    handed to the ioctl and dereferenced.
+ *  - if the fetched record was not a RECORD/DATA leaf the function
+ *    returned success while leaving *sha_hash uninitialized, letting the
+ *    caller compare garbage; that case is now reported as a failure.
+ */
+static int
+upgrade_chksum(hammer_btree_leaf_elm_t leaf, u_int8_t *sha_hash)
+{
+       struct hammer_ioc_data data;
+       char *buf = malloc(DEDUP_BUF);
+       SHA256_CTX ctx;
+       int error;
+
+       if (buf == NULL)
+               return (1);
+
+       bzero(&data, sizeof(data));
+       data.elm = leaf->base;
+       data.ubuf = buf;
+       data.size = DEDUP_BUF;
+
+       error = 0;
+       if (ioctl(glob_fd, HAMMERIOC_GET_DATA, &data) < 0) {
+               fprintf(stderr, "Get-data failed: %s\n", strerror(errno));
+               error = 1;
+               goto done;
+       }
+
+       /* Record must still describe the same amount of data */
+       if (data.leaf.data_len != leaf->data_len) {
+               error = 1;
+               goto done;
+       }
+
+       if (data.leaf.base.btype == HAMMER_BTREE_TYPE_RECORD &&
+           data.leaf.base.rec_type == HAMMER_RECTYPE_DATA) {
+               SHA256_Init(&ctx);
+               SHA256_Update(&ctx, (void *)buf, data.leaf.data_len);
+               SHA256_Final(sha_hash, &ctx);
+       } else {
+               /* Not a data record - no hash was computed */
+               error = 1;
+       }
+
+done:
+       free(buf);
+       return (error);
+}
+
+/*
+ * Walk every B-Tree record of the PFS using HAMMERIOC_MIRROR_READ in
+ * NODATA mode (keys only) and invoke 'func' on each DATA record leaf.
+ * Exits the program on any mirror-read error.
+ */
+static void
+scan_pfs(char *filesystem, scan_pfs_cb_t func)
+{
+       struct hammer_ioc_mirror_rw mirror;
+       hammer_ioc_mrecord_any_t mrec;
+       struct hammer_btree_leaf_elm elm;
+       char *buf = malloc(DEDUP_BUF); /* NOTE(review): result unchecked - crashes on OOM */
+       int offset, bytes;
+
+       bzero(&mirror, sizeof(mirror));
+       hammer_key_beg_init(&mirror.key_beg);
+       hammer_key_end_init(&mirror.key_end);
+
+       mirror.tid_beg = glob_pfs.ondisk->sync_beg_tid;
+       mirror.tid_end = glob_pfs.ondisk->sync_end_tid;
+       mirror.head.flags |= HAMMER_IOC_MIRROR_NODATA; /* we want only keys */
+       mirror.ubuf = buf;
+       mirror.size = DEDUP_BUF;
+       mirror.pfs_id = glob_pfs.pfs_id;
+       mirror.shared_uuid = glob_pfs.ondisk->shared_uuid;
+
+       printf("%s: objspace %016jx:%04x %016jx:%04x pfs_id %d\n",
+               filesystem,
+               (uintmax_t)mirror.key_beg.obj_id,
+               mirror.key_beg.localization,
+               (uintmax_t)mirror.key_end.obj_id,
+               mirror.key_end.localization,
+               glob_pfs.pfs_id);
+       fflush(stdout);
+
+       /* Repeat the ioctl until the whole key space has been covered */
+       do {
+               mirror.count = 0;
+               mirror.pfs_id = glob_pfs.pfs_id;
+               mirror.shared_uuid = glob_pfs.ondisk->shared_uuid;
+               if (ioctl(glob_fd, HAMMERIOC_MIRROR_READ, &mirror) < 0) {
+                       fprintf(stderr, "Mirror-read %s failed: %s\n",
+                               filesystem, strerror(errno));
+                       exit(1);
+               }
+               if (mirror.head.flags & HAMMER_IOC_HEAD_ERROR) {
+                       fprintf(stderr, "Mirror-read %s fatal error %d\n",
+                               filesystem, mirror.head.error);
+                       exit(1);
+               }
+               if (mirror.count) {
+                       /* Walk the packed mrecords returned in buf */
+                       offset = 0;
+                       while (offset < mirror.count) {
+                               mrec = (void *)((char *)buf + offset);
+                               bytes = HAMMER_HEAD_DOALIGN(mrec->head.rec_size);
+                               if (offset + bytes > mirror.count) {
+                                       fprintf(stderr, "Misaligned record\n");
+                                       exit(1);
+                               }
+                               assert((mrec->head.type &
+                                   HAMMER_MRECF_TYPE_MASK) == HAMMER_MREC_TYPE_REC);
+
+                               /* Only hand DATA records to the callback */
+                               elm = mrec->rec.leaf;
+                               if (elm.base.btype == HAMMER_BTREE_TYPE_RECORD &&
+                                   elm.base.rec_type == HAMMER_RECTYPE_DATA) {
+                                       func(&elm, 0);
+                               }
+                               offset += bytes;
+                       }
+               }
+               /* Resume the next ioctl where this one stopped */
+               mirror.key_beg = mirror.key_cur;
+       } while (mirror.count != 0);
+
+       free(buf);
+}
+
+/*
+ * Debug dump (-vv) of the simulated dedup tree.
+ *
+ * Bug fix: u_int64_t was printed with %llu, which mismatches on LP64
+ * platforms where u_int64_t is unsigned long.  Cast to uintmax_t and
+ * use %ju, matching the %jd/%jx usage elsewhere in this file.
+ */
+static void
+dump_simulated_dedup(void)
+{
+       struct sim_dedup_entry *sim_de;
+
+       printf("=== Dumping simulated dedup entries:\n");
+       RB_FOREACH(sim_de, sim_dedup_entry_rb_tree, &sim_dedup_tree) {
+               printf("\tcrc=%08x cnt=%ju size=%ju\n",
+                   sim_de->crc,
+                   (uintmax_t)sim_de->ref_blks,
+                   (uintmax_t)sim_de->ref_size);
+       }
+       printf("end of dump ===\n");
+}
+
+/*
+ * Debug dump (-vv) of the dedup CRC tree and its SHA-256 subtrees.
+ *
+ * Bug fixes: u_int64_t counters were printed with %llu (wrong on LP64
+ * where u_int64_t is unsigned long) - cast to uintmax_t and use %ju;
+ * also corrected the "fictious" typo in the output.
+ */
+static void
+dump_real_dedup(void)
+{
+       struct dedup_entry *de;
+       struct sha_dedup_entry *sha_de;
+       int i;
+
+       printf("=== Dumping dedup entries:\n");
+       RB_FOREACH(de, dedup_entry_rb_tree, &dedup_tree) {
+               if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
+                       printf("\tcrc=%08x fictitious\n", de->leaf.data_crc);
+
+                       RB_FOREACH(sha_de, sha_dedup_entry_rb_tree, &de->u.fict_root) {
+                               printf("\t\tcrc=%08x cnt=%ju size=%ju\n\t\t\tsha=",
+                                       sha_de->leaf.data_crc,
+                                       (uintmax_t)sha_de->ref_blks,
+                                       (uintmax_t)sha_de->ref_size);
+                               for (i = 0; i < SHA256_DIGEST_LENGTH; ++i)
+                                       printf("%02x", sha_de->sha_hash[i]);
+                               printf("\n");
+                       }
+               } else {
+                       printf("\tcrc=%08x cnt=%ju size=%ju\n",
+                               de->leaf.data_crc,
+                               (uintmax_t)de->u.de.ref_blks,
+                               (uintmax_t)de->u.de.ref_size);
+               }
+       }
+       printf("end of dump ===\n");
+}
+
+/*
+ * Print usage for the dedup commands and exit with the given code.
+ */
+static void
+dedup_usage(int code)
+{
+       fputs("hammer dedup-simulate <filesystem>\n"
+             "hammer dedup <filesystem>\n", stderr);
+       exit(code);
+}
index 9685667..ed007b2 100644 (file)
@@ -32,7 +32,7 @@
 .\"
 .\" $DragonFly: src/sbin/hammer/hammer.8,v 1.58 2008/11/13 02:04:27 dillon Exp $
 .\"
-.Dd February 12, 2010
+.Dd November 3, 2010
 .Dt HAMMER 8
 .Os
 .Sh NAME
@@ -290,9 +290,7 @@ through normal file system operations or through reblocking, before
 it can be reused.
 .Pp
 Data blocks can be shared by deducting the space used from the free byte
-count for each shared references, though
-.Nm HAMMER
-does not yet make use of this feature.
+count for each shared reference.
 This means the free byte count can legally go negative.
 .Pp
 This command needs the
@@ -427,8 +425,8 @@ displays the mount point of the PFS is currently mounted on (if any).
 .El
 .\" ==== cleanup ====
 .It Cm cleanup Op Ar filesystem ...
-This is a meta-command which executes snapshot, prune, rebalance and reblock
-commands on the specified
+This is a meta-command which executes snapshot, prune, rebalance, dedup
+and reblock commands on the specified
 .Nm HAMMER
 file systems.
 If no
@@ -467,6 +465,7 @@ The format of the configuration file is:
 snapshots  <period> <retention-time> [any]
 prune      <period> <max-runtime>
 rebalance  <period> <max-runtime>
+dedup      <period> <max-runtime>
 reblock    <period> <max-runtime>
 recopy     <period> <max-runtime>
 .Ed
@@ -476,6 +475,7 @@ Defaults are:
 snapshots  1d 60d  # 0d 0d  for PFS /tmp, /var/tmp, /usr/obj
 prune      1d 5m
 rebalance  1d 5m
+dedup      1d 5m
 reblock    1d 5m
 recopy     30d 10m
 .Ed
@@ -527,12 +527,12 @@ nightly via
 .Xr periodic 8 .
 .Pp
 The default configuration file will create a daily snapshot, do a daily
-pruning, rebalancing and reblocking run and a monthly recopy run.
+pruning, rebalancing, deduping and reblocking run and a monthly recopy run.
 Reblocking is defragmentation with a level of 95%,
 and recopy is full defragmentation.
 .Pp
 By default prune and rebalance operations are time limited to 5 minutes,
-reblock operations to a bit over 5 minutes,
+dedup and reblock operations to a bit over 5 minutes,
 and recopy operations to a bit over 10 minutes.
 Reblocking and recopy runs are each broken down into four separate functions:
 btree, inodes, dirs and data.
@@ -854,6 +854,43 @@ suffix is not needed).
 Rebalancing is a per PFS operation, so a
 .Nm HAMMER
 file system and each PFS in it have to be rebalanced separately.
+.\" ==== dedup ====
+.It Cm dedup Ar filesystem
+.Nm ( HAMMER
+VERSION 5+)
+Perform offline (post-process) deduplication.  Deduplication occurs at
+the block level, currently only data blocks of the same size can be
+deduped, metadata blocks cannot.  The hash function used for comparing
+data blocks is CRC-32 (CRCs are computed anyway as part of
+.Nm HAMMER
+data integrity features, so there's no additional overhead).  Since CRC
+is a weak hash function a byte-by-byte comparison is done before actual
+deduping.  In case of a CRC collision (two data blocks have the same CRC
+but different contents) the checksum is upgraded to SHA-256.
+.Pp
+Currently the
+.Nm HAMMER
+reblocker may partially blow up (re-expand) dedup (reblocker's normal
+operation is to reallocate every record, so it's possible for deduped
+blocks to be re-expanded back).
+.Pp
+Deduplication is a per PFS operation, so a
+.Nm HAMMER
+file system and each PFS in it have to be deduped separately.  This also
+means that if you have duplicated data in two different PFSs that data
+won't be deduped, however the addition of such feature is planned.
+.\" ==== dedup-simulate ====
+.It Cm dedup-simulate Ar filesystem
+.Nm ( HAMMER
+VERSION 5+)
+Shows potential space savings (simulated dedup ratio) one can get after
+running
+.Cm dedup
+command.  If the estimated dedup ratio is greater than 1.00 you will see
+dedup space savings.  Remember that this is an estimated number, in
+practice the real dedup ratio will be slightly smaller because of
+.Nm HAMMER
+bigblock underflows, B-Tree locking issues and other factors.
 .\" ==== reblock* ====
 .It Cm reblock Ar filesystem Op Ar fill_percentage
 .It Cm reblock-btree Ar filesystem Op Ar fill_percentage
index 80eedb0..b5c7f37 100644 (file)
@@ -416,6 +416,14 @@ main(int ac, char **av)
                        usage(1);
                exit(0);
        }
+       if (strcmp(av[0], "dedup-simulate") == 0) {
+               hammer_cmd_dedup_simulate(av + 1, ac - 1);
+               exit(0);
+       }
+       if (strcmp(av[0], "dedup") == 0) {
+               hammer_cmd_dedup(av + 1, ac - 1);
+               exit(0);
+       }
        if (strcmp(av[0], "version") == 0) {
                hammer_cmd_get_version(av + 1, ac - 1);
                exit(0);
@@ -595,6 +603,13 @@ usage(int exit_code)
                "hammer -f blkdevs recover <target_dir>\n"
        );
 
+       fprintf(stderr, "\nHAMMER utility version 5+ commands:\n");
+
+       fprintf(stderr,
+               "hammer dedup-simulate <filesystem>\n"
+               "hammer dedup <filesystem>\n"
+       );
+
        exit(exit_code);
 }
 
index 30edf2c..91ca70e 100644 (file)
@@ -116,6 +116,8 @@ void hammer_cmd_get_version(char **av, int ac);
 void hammer_cmd_set_version(char **av, int ac);
 void hammer_cmd_volume_add(char **av, int ac);
 void hammer_cmd_volume_del(char **av, int ac);
+void hammer_cmd_dedup_simulate(char **av, int ac);
+void hammer_cmd_dedup(char **av, int ac);
 
 void hammer_get_cycle(hammer_base_elm_t base, hammer_tid_t *tidp);
 void hammer_set_cycle(hammer_base_elm_t base, hammer_tid_t tid);
@@ -123,4 +125,3 @@ void hammer_reset_cycle(void);
 
 int getpfs(struct hammer_ioc_pseudofs_rw *pfs, const char *path);
 void relpfs(int fd, struct hammer_ioc_pseudofs_rw *pfs);
-
index 967c589..9e93768 100644 (file)
@@ -1703,6 +1703,7 @@ vfs/hammer/hammer_undo.c  optional hammer
 vfs/hammer/hammer_redo.c       optional hammer
 vfs/hammer/hammer_vfsops.c     optional hammer
 vfs/hammer/hammer_vnops.c      optional hammer
+vfs/hammer/hammer_dedup.c      optional hammer
 #tmpfs static filesystem for debugging should be marked as optional tmpfs
 vfs/tmpfs/tmpfs_fifoops.c      standard
 vfs/tmpfs/tmpfs_subr.c         standard
index d548fef..4d33e40 100644 (file)
@@ -10,7 +10,8 @@ SRCS= hammer_vfsops.c hammer_vnops.c hammer_inode.c \
        hammer_redo.c \
        hammer_reblock.c hammer_rebalance.c \
        hammer_flusher.c hammer_mirror.c \
-       hammer_pfs.c hammer_prune.c hammer_volume.c
+       hammer_pfs.c hammer_prune.c hammer_volume.c \
+       hammer_dedup.c
 SRCS+= opt_ktr.h
 
 .include <bsd.kmod.mk>
index 6fab020..3f8c5a6 100644 (file)
@@ -1081,14 +1081,16 @@ int     hammer_cursor_down(hammer_cursor_t cursor);
 int    hammer_cursor_upgrade(hammer_cursor_t cursor);
 int    hammer_cursor_upgrade_node(hammer_cursor_t cursor);
 void   hammer_cursor_downgrade(hammer_cursor_t cursor);
+int    hammer_cursor_upgrade2(hammer_cursor_t c1, hammer_cursor_t c2);
+void   hammer_cursor_downgrade2(hammer_cursor_t c1, hammer_cursor_t c2);
 int    hammer_cursor_seek(hammer_cursor_t cursor, hammer_node_t node,
                        int index);
 void   hammer_lock_ex_ident(struct hammer_lock *lock, const char *ident);
 int    hammer_lock_ex_try(struct hammer_lock *lock);
 void   hammer_lock_sh(struct hammer_lock *lock);
 int    hammer_lock_sh_try(struct hammer_lock *lock);
-int    hammer_lock_upgrade(struct hammer_lock *lock);
-void   hammer_lock_downgrade(struct hammer_lock *lock);
+int    hammer_lock_upgrade(struct hammer_lock *lock, int shcount);
+void   hammer_lock_downgrade(struct hammer_lock *lock, int shcount);
 int    hammer_lock_status(struct hammer_lock *lock);
 void   hammer_unlock(struct hammer_lock *lock);
 void   hammer_ref(struct hammer_lock *lock);
@@ -1276,6 +1278,8 @@ void hammer_blockmap_reserve_complete(hammer_mount_t hmp,
 void hammer_reserve_clrdelay(hammer_mount_t hmp, hammer_reserve_t resv);
 void hammer_blockmap_free(hammer_transaction_t trans,
                        hammer_off_t bmap_off, int bytes);
+int hammer_blockmap_dedup(hammer_transaction_t trans,
+                       hammer_off_t bmap_off, int bytes);
 int hammer_blockmap_finalize(hammer_transaction_t trans,
                        hammer_reserve_t resv,
                        hammer_off_t bmap_off, int bytes);
@@ -1410,6 +1414,8 @@ int hammer_ioc_volume_add(hammer_transaction_t trans, hammer_inode_t ip,
                         struct hammer_ioc_volume *ioc);
 int hammer_ioc_volume_del(hammer_transaction_t trans, hammer_inode_t ip,
                         struct hammer_ioc_volume *ioc);
+int hammer_ioc_dedup(hammer_transaction_t trans, hammer_inode_t ip,
+                       struct hammer_ioc_dedup *dedup);
 
 int hammer_signal_check(hammer_mount_t hmp);
 
index b308eab..35a0ccd 100644 (file)
@@ -898,6 +898,113 @@ failed:
                hammer_rel_buffer(buffer2, 0);
 }
 
+int
+hammer_blockmap_dedup(hammer_transaction_t trans,
+                    hammer_off_t zone_offset, int bytes)
+{
+       hammer_mount_t hmp;
+       hammer_volume_t root_volume;
+       hammer_blockmap_t blockmap;
+       hammer_blockmap_t freemap;
+       struct hammer_blockmap_layer1 *layer1;
+       struct hammer_blockmap_layer2 *layer2;
+       hammer_buffer_t buffer1 = NULL;
+       hammer_buffer_t buffer2 = NULL;
+       hammer_off_t layer1_offset;
+       hammer_off_t layer2_offset;
+       int32_t temp;
+       int error;
+       int zone;
+
+       if (bytes == 0)
+               return (0);
+       hmp = trans->hmp;
+
+       /*
+        * Alignment
+        */
+       bytes = (bytes + 15) & ~15;
+       KKASSERT(bytes <= HAMMER_LARGEBLOCK_SIZE);
+       KKASSERT(((zone_offset ^ (zone_offset + (bytes - 1))) &
+                 ~HAMMER_LARGEBLOCK_MASK64) == 0);
+
+       /*
+        * Basic zone validation & locking
+        */
+       zone = HAMMER_ZONE_DECODE(zone_offset);
+       KKASSERT(zone >= HAMMER_ZONE_BTREE_INDEX && zone < HAMMER_MAX_ZONES);
+       root_volume = trans->rootvol;
+       error = 0;
+
+       blockmap = &hmp->blockmap[zone];
+       freemap = &hmp->blockmap[HAMMER_ZONE_FREEMAP_INDEX];
+
+       /*
+        * Dive layer 1.
+        */
+       layer1_offset = freemap->phys_offset +
+                       HAMMER_BLOCKMAP_LAYER1_OFFSET(zone_offset);
+       layer1 = hammer_bread(hmp, layer1_offset, &error, &buffer1);
+       if (error)
+               goto failed;
+       KKASSERT(layer1->phys_offset &&
+                layer1->phys_offset != HAMMER_BLOCKMAP_UNAVAIL);
+       if (layer1->layer1_crc != crc32(layer1, HAMMER_LAYER1_CRCSIZE)) {
+               hammer_lock_ex(&hmp->blkmap_lock);
+               if (layer1->layer1_crc != crc32(layer1, HAMMER_LAYER1_CRCSIZE))
+                       panic("CRC FAILED: LAYER1");
+               hammer_unlock(&hmp->blkmap_lock);
+       }
+
+       /*
+        * Dive layer 2, each entry represents a large-block.
+        */
+       layer2_offset = layer1->phys_offset +
+                       HAMMER_BLOCKMAP_LAYER2_OFFSET(zone_offset);
+       layer2 = hammer_bread(hmp, layer2_offset, &error, &buffer2);
+       if (error)
+               goto failed;
+       if (layer2->entry_crc != crc32(layer2, HAMMER_LAYER2_CRCSIZE)) {
+               hammer_lock_ex(&hmp->blkmap_lock);
+               if (layer2->entry_crc != crc32(layer2, HAMMER_LAYER2_CRCSIZE))
+                       panic("CRC FAILED: LAYER2");
+               hammer_unlock(&hmp->blkmap_lock);
+       }
+
+       hammer_lock_ex(&hmp->blkmap_lock);
+
+       hammer_modify_buffer(trans, buffer2, layer2, sizeof(*layer2));
+
+       /*
+        * Free space previously allocated via blockmap_alloc().
+        *
+        * NOTE: bytes_free can be and remain negative due to de-dup ops
+        *       but can never become larger than HAMMER_LARGEBLOCK_SIZE.
+        */
+       KKASSERT(layer2->zone == zone);
+       temp = layer2->bytes_free - HAMMER_LARGEBLOCK_SIZE * 2;
+       cpu_ccfence(); /* prevent gcc from optimizing temp out */
+       if (temp > layer2->bytes_free) {
+               error = ERANGE;
+               goto underflow;
+       }
+       layer2->bytes_free -= bytes;
+
+       KKASSERT(layer2->bytes_free <= HAMMER_LARGEBLOCK_SIZE);
+
+       layer2->entry_crc = crc32(layer2, HAMMER_LAYER2_CRCSIZE);
+underflow:
+       hammer_modify_buffer_done(buffer2);
+       hammer_unlock(&hmp->blkmap_lock);
+
+failed:
+       if (buffer1)
+               hammer_rel_buffer(buffer1, 0);
+       if (buffer2)
+               hammer_rel_buffer(buffer2, 0);
+       return (error);
+}
+
 /*
  * Backend function - finalize (offset, bytes) in a zone.
  *
index f6de3fa..a88f5e1 100644 (file)
@@ -220,12 +220,12 @@ hammer_cursor_upgrade(hammer_cursor_t cursor)
 {
        int error;
 
-       error = hammer_lock_upgrade(&cursor->node->lock);
+       error = hammer_lock_upgrade(&cursor->node->lock, 1);
        if (error && cursor->deadlk_node == NULL) {
                cursor->deadlk_node = cursor->node;
                hammer_ref_node(cursor->deadlk_node);
        } else if (error == 0 && cursor->parent) {
-               error = hammer_lock_upgrade(&cursor->parent->lock);
+               error = hammer_lock_upgrade(&cursor->parent->lock, 1);
                if (error && cursor->deadlk_node == NULL) {
                        cursor->deadlk_node = cursor->parent;
                        hammer_ref_node(cursor->deadlk_node);
@@ -239,7 +239,7 @@ hammer_cursor_upgrade_node(hammer_cursor_t cursor)
 {
        int error;
 
-       error = hammer_lock_upgrade(&cursor->node->lock);
+       error = hammer_lock_upgrade(&cursor->node->lock, 1);
        if (error && cursor->deadlk_node == NULL) {
                cursor->deadlk_node = cursor->node;
                hammer_ref_node(cursor->deadlk_node);
@@ -255,14 +255,89 @@ void
 hammer_cursor_downgrade(hammer_cursor_t cursor)
 {
        if (hammer_lock_excl_owned(&cursor->node->lock, curthread))
-               hammer_lock_downgrade(&cursor->node->lock);
+               hammer_lock_downgrade(&cursor->node->lock, 1);
        if (cursor->parent &&
            hammer_lock_excl_owned(&cursor->parent->lock, curthread)) {
-               hammer_lock_downgrade(&cursor->parent->lock);
+               hammer_lock_downgrade(&cursor->parent->lock, 1);
        }
 }
 
 /*
+ * Upgrade and downgrade pairs of cursors.  This is used by the dedup
+ * code which must deal with two cursors.  A special function is needed
+ * because some of the nodes may be shared between the two cursors,
+ * resulting in share counts > 1 which will normally cause an upgrade
+ * to fail.
+ */
+static __noinline
+int
+collect_node(hammer_node_t *array, int *counts, int n, hammer_node_t node)
+{
+       int i;
+
+       for (i = 0; i < n; ++i) {
+               if (array[i] == node)
+                       break;
+       }
+       if (i == n) {
+               array[i] = node;
+               counts[i] = 1;
+               ++i;
+       } else {
+               ++counts[i];
+       }
+       return(i);
+}
+
+int
+hammer_cursor_upgrade2(hammer_cursor_t cursor1, hammer_cursor_t cursor2)
+{
+       hammer_node_t nodes[4];
+       int counts[4];
+       int error;
+       int i;
+       int n;
+
+       n = collect_node(nodes, counts, 0, cursor1->node);
+       if (cursor1->parent)
+               n = collect_node(nodes, counts, n, cursor1->parent);
+       n = collect_node(nodes, counts, n, cursor2->node);
+       if (cursor2->parent)
+               n = collect_node(nodes, counts, n, cursor2->parent);
+
+       error = 0;
+       for (i = 0; i < n; ++i) {
+               error = hammer_lock_upgrade(&nodes[i]->lock, counts[i]);
+               if (error)
+                       break;
+       }
+       if (error) {
+               while (--i >= 0)
+                       hammer_lock_downgrade(&nodes[i]->lock, counts[i]);
+       }
+       return (error);
+}
+
+void
+hammer_cursor_downgrade2(hammer_cursor_t cursor1, hammer_cursor_t cursor2)
+{
+       hammer_node_t nodes[4];
+       int counts[4];
+       int i;
+       int n;
+
+       n = collect_node(nodes, counts, 0, cursor1->node);
+       if (cursor1->parent)
+               n = collect_node(nodes, counts, n, cursor1->parent);
+       n = collect_node(nodes, counts, n, cursor2->node);
+       if (cursor2->parent)
+               n = collect_node(nodes, counts, n, cursor2->parent);
+
+       for (i = 0; i < n; ++i)
+               hammer_lock_downgrade(&nodes[i]->lock, counts[i]);
+}
+
+/*
  * Seek the cursor to the specified node and index.
  *
  * The caller must ref the node prior to calling this routine and release
diff --git a/sys/vfs/hammer/hammer_dedup.c b/sys/vfs/hammer/hammer_dedup.c
new file mode 100644 (file)
index 0000000..7c724cd
--- /dev/null
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Ilya Dryomov <idryomov@gmail.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "hammer.h"
+
+static __inline int validate_zone(hammer_off_t data_offset);
+
+int
+hammer_ioc_dedup(hammer_transaction_t trans, hammer_inode_t ip,
+                struct hammer_ioc_dedup *dedup)
+{
+       struct hammer_cursor cursor1, cursor2;
+       int error;
+
+       /*
+        * Cursor1, return an error -> candidate goes to pass2 list
+        */
+       error = hammer_init_cursor(trans, &cursor1, NULL, NULL);
+       if (error)
+               goto done_cursor;
+       cursor1.key_beg = dedup->elm1;
+       cursor1.flags |= HAMMER_CURSOR_BACKEND;
+
+       error = hammer_btree_lookup(&cursor1);
+       if (error)
+               goto done_cursor;
+       error = hammer_btree_extract(&cursor1, HAMMER_CURSOR_GET_LEAF |
+                                               HAMMER_CURSOR_GET_DATA);
+       if (error)
+               goto done_cursor;
+
+       /*
+        * Cursor2, return an error -> candidate goes to pass2 list
+        */
+       error = hammer_init_cursor(trans, &cursor2, NULL, NULL);
+       if (error)
+               goto done_cursors;
+       cursor2.key_beg = dedup->elm2;
+       cursor2.flags |= HAMMER_CURSOR_BACKEND;
+
+       error = hammer_btree_lookup(&cursor2);
+       if (error)
+               goto done_cursors;
+       error = hammer_btree_extract(&cursor2, HAMMER_CURSOR_GET_LEAF |
+                                               HAMMER_CURSOR_GET_DATA);
+       if (error)
+               goto done_cursors;
+
+       /*
+        * Zone validation. We can't de-dup any of the other zones
+        * (BTREE or META) or bad things will happen.
+        *
+        * Return with error = 0, but set an INVALID_ZONE flag.
+        */
+       error = validate_zone(cursor1.leaf->data_offset) +
+                           validate_zone(cursor2.leaf->data_offset);
+       if (error) {
+               dedup->head.flags |= HAMMER_IOC_DEDUP_INVALID_ZONE;
+               error = 0;
+               goto done_cursors;
+       }
+
+       /*
+        * Comparison checks
+        *
+        * If zones don't match or data_len fields aren't the same
+        * we consider it to be a comparison failure.
+        *
+        * Return with error = 0, but set a CMP_FAILURE flag.
+        */
+       if ((cursor1.leaf->data_offset & HAMMER_OFF_ZONE_MASK) !=
+           (cursor2.leaf->data_offset & HAMMER_OFF_ZONE_MASK)) {
+               dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE;
+               goto done_cursors;
+       }
+       if (cursor1.leaf->data_len != cursor2.leaf->data_len) {
+               dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE;
+               goto done_cursors;
+       }
+
+       /* byte-by-byte comparison to be sure */
+       if (bcmp(cursor1.data, cursor2.data, cursor1.leaf->data_len)) {
+               dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE;
+               goto done_cursors;
+       }
+
+       /*
+        * Upgrade both cursors together to an exclusive lock
+        *
+        * Return an error -> candidate goes to pass2 list
+        */
+       hammer_sync_lock_sh(trans);
+       error = hammer_cursor_upgrade2(&cursor1, &cursor2);
+       if (error) {
+               hammer_sync_unlock(trans);
+               goto done_cursors;
+       }
+
+       error = hammer_blockmap_dedup(cursor1.trans,
+                       cursor1.leaf->data_offset, cursor1.leaf->data_len);
+       if (error) {
+               if (error == ERANGE) {
+                       /*
+                        * Return with error = 0, but set an UNDERFLOW flag
+                        */
+                       dedup->head.flags |= HAMMER_IOC_DEDUP_UNDERFLOW;
+                       error = 0;
+                       goto downgrade_cursors;
+               } else {
+                       /*
+                        * Return an error -> block goes to pass2 list
+                        */
+                       goto downgrade_cursors;
+               }
+       }
+
+       /*
+        * The cursor2's cache must be invalidated before calling
+        * hammer_blockmap_free(), otherwise it will not be able to
+        * invalidate the underlying data buffer.
+        */
+       hammer_cursor_invalidate_cache(&cursor2);
+       hammer_blockmap_free(cursor2.trans,
+                       cursor2.leaf->data_offset, cursor2.leaf->data_len);
+
+       hammer_modify_node(cursor2.trans, cursor2.node,
+                       &cursor2.leaf->data_offset, sizeof(hammer_off_t));
+       cursor2.leaf->data_offset = cursor1.leaf->data_offset;
+       hammer_modify_node_done(cursor2.node);
+
+downgrade_cursors:
+       hammer_cursor_downgrade2(&cursor1, &cursor2);
+       hammer_sync_unlock(trans);
+done_cursors:
+       hammer_done_cursor(&cursor2);
+done_cursor:
+       hammer_done_cursor(&cursor1);
+       return (error);
+}
+
+static __inline int
+validate_zone(hammer_off_t data_offset)
+{
+       switch(data_offset & HAMMER_OFF_ZONE_MASK) {
+       case HAMMER_ZONE_LARGE_DATA:
+       case HAMMER_ZONE_SMALL_DATA:
+               return (0);
+       default:
+               return (1);
+       }
+}
index f8fe994..7d8653d 100644 (file)
@@ -58,6 +58,8 @@ static int hammer_ioc_get_config(hammer_transaction_t trans, hammer_inode_t ip,
                                struct hammer_ioc_config *snap);
 static int hammer_ioc_set_config(hammer_transaction_t trans, hammer_inode_t ip,
                                struct hammer_ioc_config *snap);
+static int hammer_ioc_get_data(hammer_transaction_t trans, hammer_inode_t ip,
+                               struct hammer_ioc_data *data);
 
 int
 hammer_ioctl(hammer_inode_t ip, u_long com, caddr_t data, int fflag,
@@ -210,6 +212,18 @@ hammer_ioctl(hammer_inode_t ip, u_long com, caddr_t data, int fflag,
                                        &trans, ip, (struct hammer_ioc_config *)data);
                }
                break;
+       case HAMMERIOC_DEDUP:
+               if (error == 0) {
+                       error = hammer_ioc_dedup(
+                                       &trans, ip, (struct hammer_ioc_dedup *)data);
+               }
+               break;
+       case HAMMERIOC_GET_DATA:
+               if (error == 0) {
+                       error = hammer_ioc_get_data(
+                                       &trans, ip, (struct hammer_ioc_data *)data);
+               }
+               break;
        default:
                error = EOPNOTSUPP;
                break;
@@ -991,3 +1005,38 @@ again:
        hammer_done_cursor(&cursor);
        return(0);
 }
+
+static
+int
+hammer_ioc_get_data(hammer_transaction_t trans, hammer_inode_t ip,
+                       struct hammer_ioc_data *data)
+{
+       struct hammer_cursor cursor;
+       int bytes;
+       int error;
+
+       /* XXX cached inode ? */
+       error = hammer_init_cursor(trans, &cursor, NULL, NULL);
+       if (error)
+               goto failed;
+
+       cursor.key_beg = data->elm;
+       cursor.flags |= HAMMER_CURSOR_BACKEND;
+
+       error = hammer_btree_lookup(&cursor);
+       if (error == 0) {
+               error = hammer_btree_extract(&cursor, HAMMER_CURSOR_GET_LEAF |
+                                                     HAMMER_CURSOR_GET_DATA);
+               if (error == 0) {
+                       data->leaf = *cursor.leaf;
+                       bytes = cursor.leaf->data_len;
+                       if (bytes > data->size)
+                               bytes = data->size;
+                       error = copyout(cursor.data, data->ubuf, bytes);
+               }
+       }
+
+failed:
+       hammer_done_cursor(&cursor);
+       return (error);
+}
index f0778ed..2c8e979 100644 (file)
@@ -438,6 +438,30 @@ struct hammer_ioc_config {
 };
 
 /*
+ * HAMMERIOC_DEDUP
+ */
+struct hammer_ioc_dedup {
+       struct hammer_ioc_head  head;
+       struct hammer_base_elm  elm1;
+       struct hammer_base_elm  elm2; /* candidate for dedup */
+};
+
+#define HAMMER_IOC_DEDUP_CMP_FAILURE   0x0001 /* verification failed */
+#define HAMMER_IOC_DEDUP_UNDERFLOW     0x0002 /* bigblock underflow */
+#define HAMMER_IOC_DEDUP_INVALID_ZONE  0x0004 /* we can't dedup all zones */
+
+/*
+ * HAMMERIOC_GET_DATA
+ */
+struct hammer_ioc_data {
+       struct hammer_ioc_head          head;
+       struct hammer_base_elm          elm;    /* btree key to lookup */
+       struct hammer_btree_leaf_elm    leaf;
+       void                            *ubuf;  /* user buffer */
+       int                             size;   /* max size */
+};
+
+/*
  * Ioctl cmd ids
  */
 #define HAMMERIOC_PRUNE                _IOWR('h',1,struct hammer_ioc_prune)
@@ -463,6 +487,8 @@ struct hammer_ioc_config {
 #define HAMMERIOC_GET_CONFIG   _IOWR('h',21,struct hammer_ioc_config)
 #define HAMMERIOC_SET_CONFIG   _IOWR('h',22,struct hammer_ioc_config)
 #define HAMMERIOC_DEL_VOLUME   _IOWR('h',24,struct hammer_ioc_volume)
+#define HAMMERIOC_DEDUP                _IOWR('h',25,struct hammer_ioc_dedup)
+#define HAMMERIOC_GET_DATA     _IOWR('h',26,struct hammer_ioc_data)
 
 #endif
 
index 39f1a59..f38be11 100644 (file)
@@ -206,7 +206,7 @@ hammer_lock_sh_try(struct hammer_lock *lock)
  * by someone else, this function will panic.
  */
 int
-hammer_lock_upgrade(struct hammer_lock *lock)
+hammer_lock_upgrade(struct hammer_lock *lock, int shcount)
 {
        thread_t td = curthread;
        u_int lv;
@@ -216,7 +216,7 @@ hammer_lock_upgrade(struct hammer_lock *lock)
        for (;;) {
                lv = lock->lockval;
 
-               if ((lv & ~HAMMER_LOCKF_WANTED) == 1) {
+               if ((lv & ~HAMMER_LOCKF_WANTED) == shcount) {
                        nlv = lv | HAMMER_LOCKF_EXCLUSIVE;
                        if (atomic_cmpset_int(&lock->lockval, lv, nlv)) {
                                lock->lowner = td;
@@ -245,14 +245,14 @@ hammer_lock_upgrade(struct hammer_lock *lock)
  * Downgrade an exclusively held lock to a shared lock.
  */
 void
-hammer_lock_downgrade(struct hammer_lock *lock)
+hammer_lock_downgrade(struct hammer_lock *lock, int shcount)
 {
        thread_t td __debugvar = curthread;
        u_int lv;
        u_int nlv;
 
        KKASSERT((lock->lockval & ~HAMMER_LOCKF_WANTED) ==
-                (HAMMER_LOCKF_EXCLUSIVE | 1));
+                (HAMMER_LOCKF_EXCLUSIVE | shcount));
        KKASSERT(lock->lowner == td);
 
        /*