Merge branches 'hammer2' and 'master' of ssh://crater.dragonflybsd.org/repository...
authorMatthew Dillon <dillon@apollo.backplane.com>
Sat, 19 May 2012 19:07:40 +0000 (12:07 -0700)
committerMatthew Dillon <dillon@apollo.backplane.com>
Sat, 19 May 2012 19:07:40 +0000 (12:07 -0700)
44 files changed:
lib/libstand/hammer2.c [new file with mode: 0644]
sbin/hammer2/Makefile [new file with mode: 0644]
sbin/hammer2/cmd_debug.c [new file with mode: 0644]
sbin/hammer2/cmd_leaf.c [new file with mode: 0644]
sbin/hammer2/cmd_pfs.c [new file with mode: 0644]
sbin/hammer2/cmd_remote.c [new file with mode: 0644]
sbin/hammer2/cmd_rsa.c [new file with mode: 0644]
sbin/hammer2/cmd_service.c [new file with mode: 0644]
sbin/hammer2/cmd_snapshot.c [new file with mode: 0644]
sbin/hammer2/crypto.c [new file with mode: 0644]
sbin/hammer2/hammer2.h [new file with mode: 0644]
sbin/hammer2/icrc.c [new file with mode: 0644]
sbin/hammer2/main.c [new file with mode: 0644]
sbin/hammer2/msg.c [new file with mode: 0644]
sbin/hammer2/network.h [new file with mode: 0644]
sbin/hammer2/subs.c [new file with mode: 0644]
sbin/mount_hammer2/Makefile [new file with mode: 0644]
sbin/mount_hammer2/mount_hammer2.c [new file with mode: 0644]
sbin/newfs_hammer2/Makefile [new file with mode: 0644]
sbin/newfs_hammer2/newfs_hammer2.8 [new file with mode: 0644]
sbin/newfs_hammer2/newfs_hammer2.c [new file with mode: 0644]
sys/vfs/hammer2/CHANGES [new file with mode: 0644]
sys/vfs/hammer2/DESIGN [new file with mode: 0644]
sys/vfs/hammer2/Makefile [new file with mode: 0644]
sys/vfs/hammer2/TODO [new file with mode: 0644]
sys/vfs/hammer2/donew [new file with mode: 0755]
sys/vfs/hammer2/donew2 [new file with mode: 0755]
sys/vfs/hammer2/dossd [new file with mode: 0755]
sys/vfs/hammer2/dossd2 [new file with mode: 0755]
sys/vfs/hammer2/dotest [new file with mode: 0755]
sys/vfs/hammer2/hammer2.h [new file with mode: 0644]
sys/vfs/hammer2/hammer2_chain.c [new file with mode: 0644]
sys/vfs/hammer2/hammer2_disk.h [new file with mode: 0644]
sys/vfs/hammer2/hammer2_freemap.c [new file with mode: 0644]
sys/vfs/hammer2/hammer2_icrc.c [new file with mode: 0644]
sys/vfs/hammer2/hammer2_inode.c [new file with mode: 0644]
sys/vfs/hammer2/hammer2_ioctl.c [new file with mode: 0644]
sys/vfs/hammer2/hammer2_ioctl.h [new file with mode: 0644]
sys/vfs/hammer2/hammer2_mount.h [new file with mode: 0644]
sys/vfs/hammer2/hammer2_network.h [new file with mode: 0644]
sys/vfs/hammer2/hammer2_subr.c [new file with mode: 0644]
sys/vfs/hammer2/hammer2_vfsops.c [new file with mode: 0644]
sys/vfs/hammer2/hammer2_vnops.c [new file with mode: 0644]
sys/vfs/hammer2/mkvntest [new file with mode: 0755]

diff --git a/lib/libstand/hammer2.c b/lib/libstand/hammer2.c
new file mode 100644 (file)
index 0000000..0d4cf4e
--- /dev/null
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/uuid.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <string.h>
+#include <strings.h>
+#include <errno.h>
+
+#include <hammer2/hammer2_disk.h>
+
+struct hammer2 {                       /* state for one opened HAMMER2 device */
+       int                             fd;     /* Device fd, as returned by open() in main() */
+       struct hammer2_blockref         sroot;  /* Superroot blockref, copied from the volume header by hinit() */
+};
+
+struct inode {                         /* in-memory copy of one inode record */
+       struct hammer2_inode_data       dat;    /* raw inode data, filled via shread() in hlookup1() */
+       off_t                           doff;   /* disk inode offset; only ever zeroed by the code in this file */
+};
+
+off_t blockoff(ref)            /* stub; presumably maps a blockref to a device offset -- TODO confirm */
+       struct hammer2_blockref ref;
+{
+       /* NOTE(review): unimplemented; control falls off the end without returning an off_t. */
+}
+
+/*
+ * Read the volume header from offset 0 of the device and validate it.
+ *
+ * hfs->fd must already be an open descriptor.  On success the super-root
+ * blockref found in the volume header is cached in hfs->sroot.
+ *
+ * Returns 0 on success, -1 if the header cannot be read in full or if its
+ * magic number is not HAMMER2_VOLUME_ID_HBO.
+ *
+ * NOTE(review): assumes sizeof(struct hammer2_volume_data) is at least
+ * HAMMER2_VOLUME_SIZE -- confirm against hammer2_disk.h.
+ */
+int
+hinit(struct hammer2 *hfs)
+{
+       struct hammer2_volume_data volhdr;
+       ssize_t rc;
+
+       /*
+        * A failed or short read leaves volhdr (partly) uninitialized, so
+        * it must not be inspected in that case.
+        */
+       rc = pread(hfs->fd, &volhdr, HAMMER2_VOLUME_SIZE, 0);
+       if (rc < 0 || (size_t)rc != HAMMER2_VOLUME_SIZE)
+               return (-1);
+
+       if (volhdr.magic != HAMMER2_VOLUME_ID_HBO)
+               return (-1);
+       printf("Valid HAMMER2 filesystem\n");
+
+       hfs->sroot = volhdr.sroot_blockref;
+       return (0);
+}
+
+shread(hfs, ino, buf, off, len)        /* stub; no return type is declared, so it defaults to int */
+       struct hammer2 *hfs;
+       struct inode *ino;
+       char *buf;
+       off_t off;
+       size_t len;
+{
+       /*
+        * Intended to read [off, off+len) of inode ino into buf, decoding
+        * blockrefs/indirs/... rather than using raw disk offsets.
+        * NOTE(review): unimplemented and returns nothing, although
+        * hlookup1() compares the result against the requested length.
+        */
+}
+
+/*
+ * Look up a single name among the records stored in inode ino.
+ *
+ * The contents of ino are scanned in steps of
+ * sizeof(struct hammer2_inode_data) using shread(); the filename of each
+ * fully-read record is compared against name.
+ *
+ * Returns a pointer to a function-static struct inode holding the matching
+ * record, or NULL if nothing matched.  The static buffer is overwritten by
+ * every call, so the result must be consumed before calling again.
+ */
+struct inode *
+hlookup1(struct hammer2 *hfs, struct inode *ino, char *name)
+{
+       static struct inode filino;
+       off_t off;
+       int rc;
+
+       bzero(&filino, sizeof(struct inode));
+
+       for (off = 0;
+            off < ino->dat.size;
+            off += sizeof(struct hammer2_inode_data))
+       {
+               /* shread() takes a char buffer, so convert explicitly */
+               rc = shread(hfs, ino, (char *)&filino.dat, off,
+                           sizeof(struct hammer2_inode_data));
+               /* skip records that could not be read in full */
+               if (rc != sizeof(struct hammer2_inode_data))
+                       continue;
+               /*
+                * Pass the filename array itself (it decays to a pointer to
+                * its first element) instead of its address, which has an
+                * incompatible pointer-to-array type.
+                * NOTE(review): assumes the on-media filename is
+                * NUL-terminated -- confirm, otherwise bound the compare.
+                */
+               if (strcmp(name, (const char *)filino.dat.filename) == 0)
+                       return (&filino);
+       }
+
+       return (NULL);
+}
+
+struct inode *hlookup(hfs, name)       /* stub; full-path lookup, not implemented yet */
+       struct hammer2 *hfs;
+       char *name;
+{
+       /* Name is of form /SUPERROOT/a/b/c/file */
+       /* NOTE(review): falls off the end without returning, yet main() tests the result against NULL. */
+}
+
+void hstat(hfs, ino, sb)               /* stub; presumably meant to fill *sb from ino -- TODO confirm */
+       struct hammer2 *hfs;
+       struct inode *ino;
+       struct stat *sb;
+{
+       /* NOTE(review): unimplemented; *sb is left untouched, yet main() prints sb.st_size afterwards. */
+}
+
+/*
+ * Test driver: open the device named by argv[1], validate it with hinit(),
+ * then look up every remaining argument with hlookup() and print the size
+ * reported for it by hstat().
+ *
+ * Exits with status 1 on a usage error, when the device cannot be opened,
+ * or when hinit() rejects it; returns 0 otherwise.
+ */
+int
+main(int argc, char *argv[])
+{
+       struct hammer2 hammer2;
+       struct inode *ino;
+       struct stat sb;
+       int i;
+
+       if (argc < 2) {
+               fprintf(stderr, "usage: hammer2 <dev>\n");
+               exit(1);
+       }
+
+       hammer2.fd = open(argv[1], O_RDONLY);
+       if (hammer2.fd < 0) {
+               fprintf(stderr, "unable to open %s\n", argv[1]);
+               exit(1);
+       }
+
+       if (hinit(&hammer2)) {
+               fprintf(stderr, "invalid fs\n");
+               close(hammer2.fd);
+               exit(1);
+       }
+
+       for (i = 2; i < argc; i++) {
+               ino = hlookup(&hammer2, argv[i]);
+               if (ino == NULL) {
+                       fprintf(stderr, "hlookup %s\n", argv[i]);
+                       continue;
+               }
+               /* start from a zeroed stat so no field is read uninitialized */
+               bzero(&sb, sizeof(sb));
+               hstat(&hammer2, ino, &sb);
+
+               /*
+                * st_size is an off_t; cast it so the argument matches
+                * %lld.  Each record ends with a newline so consecutive
+                * entries do not run together.
+                */
+               printf("%s %lld\n", argv[i], (long long)sb.st_size);
+       }
+
+       close(hammer2.fd);
+       return (0);
+}
diff --git a/sbin/hammer2/Makefile b/sbin/hammer2/Makefile
new file mode 100644 (file)
index 0000000..35981b1
--- /dev/null
@@ -0,0 +1,18 @@
+PROG=  hammer2
+SRCS=  main.c subs.c icrc.c msg.c crypto.c
+SRCS+= cmd_remote.c cmd_snapshot.c cmd_pfs.c
+SRCS+= cmd_service.c cmd_leaf.c cmd_debug.c
+SRCS+= cmd_rsa.c
+#MAN=  hammer2.8
+NOMAN= TRUE
+DEBUG_FLAGS=-g
+
+CFLAGS+= -I${.CURDIR}/../../sys
+CFLAGS+= -pthread
+LDADD= -lm -lutil -lmd -lcrypto
+DPADD= ${LIBM} ${LIBUTIL} ${LIBMD} ${LIBCRYPTO}
+
+#.PATH: ${.CURDIR}/../../sys/libkern
+#SRCS+= crc32.c
+
+.include <bsd.prog.mk>
diff --git a/sbin/hammer2/cmd_debug.c b/sbin/hammer2/cmd_debug.c
new file mode 100644 (file)
index 0000000..f648ab1
--- /dev/null
@@ -0,0 +1,489 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "hammer2.h"
+
+#define SHOW_TAB       2
+
+static void shell_recv(hammer2_iocom_t *iocom);
+static void shell_send(hammer2_iocom_t *iocom);
+static void shell_tty(hammer2_iocom_t *iocom);
+static void hammer2_shell_parse(hammer2_msg_t *msg, char *cmdbuf);
+
+/************************************************************************
+ *                                 SHELL                               *
+ ************************************************************************/
+
+/*
+ * Connect to the HAMMER2 listen port on hostname and run an interactive
+ * debug shell session over the connection.  When hostname is NULL the
+ * destination address is left zeroed.
+ *
+ * Returns 0 once the session has ended.  Returns 1 if the socket cannot
+ * be created, the host cannot be resolved, or the connection fails; the
+ * socket is closed on every path after it has been created.
+ */
+int
+cmd_shell(const char *hostname)
+{
+       struct sockaddr_in lsin;
+       struct hammer2_iocom iocom;
+       hammer2_msg_t *msg;
+       struct hostent *hen;
+       int fd;
+       int saved_errno;
+
+       /*
+        * Acquire socket
+        */
+       if ((fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
+               fprintf(stderr, "cmd_debug: socket(): %s\n",
+                       strerror(errno));
+               return 1;
+       }
+
+       /*
+        * Connect to the target
+        */
+       bzero(&lsin, sizeof(lsin));
+       lsin.sin_family = AF_INET;
+       lsin.sin_addr.s_addr = 0;
+       lsin.sin_port = htons(HAMMER2_LISTEN_PORT);
+
+       if (hostname) {
+               hen = gethostbyname2(hostname, AF_INET);
+               if (hen == NULL) {
+                       /* not a resolvable name; try a numeric address */
+                       if (inet_pton(AF_INET, hostname, &lsin.sin_addr) != 1) {
+                               fprintf(stderr,
+                                       "Cannot resolve %s\n", hostname);
+                               close(fd);      /* do not leak the socket */
+                               return 1;
+                       }
+               } else {
+                       bcopy(hen->h_addr, &lsin.sin_addr, hen->h_length);
+               }
+       }
+       if (connect(fd, (struct sockaddr *)&lsin, sizeof(lsin)) < 0) {
+               /* close() may overwrite errno, so capture it first */
+               saved_errno = errno;
+               close(fd);
+               fprintf(stderr, "debug: connect failed: %s\n",
+                       strerror(saved_errno));
+               return 1;
+       }
+
+       /*
+        * Run the session.  The remote end transmits our prompt.
+        */
+       hammer2_iocom_init(&iocom, fd, 0);
+       printf("debug: connected\n");
+
+       msg = hammer2_allocmsg(&iocom, HAMMER2_DBG_SHELL, 0);
+       hammer2_ioq_write(msg);
+
+       hammer2_iocom_core(&iocom, shell_recv, shell_send, shell_tty);
+       fprintf(stderr, "debug: disconnected\n");
+       close(fd);
+       return 0;
+}
+
+/*
+ * Callback from hammer2_iocom_core() when messages might be present
+ * on the socket.
+ *
+ * Drains messages with hammer2_ioq_read() until none are left or the
+ * iocom is flagged EOF, dispatching on the command word.  Any receive
+ * error recorded in iocom->ioq_rx.error is reported on stderr afterwards.
+ */
+static
+void
+shell_recv(hammer2_iocom_t *iocom)
+{
+       hammer2_msg_t *msg;
+
+       while ((iocom->flags & HAMMER2_IOCOMF_EOF) == 0 &&
+              (msg = hammer2_ioq_read(iocom)) != NULL) {
+
+               switch(msg->any.head.cmd & HAMMER2_MSGF_CMDSWMASK) {
+               case HAMMER2_LNK_ERROR:
+                       fprintf(stderr, "Link Error: %d\n",
+                               msg->any.head.error);
+                       break;
+               case HAMMER2_DBG_SHELL:
+                       /*
+                        * We send the commands, not accept them.
+                        */
+                       hammer2_replymsg(msg, HAMMER2_MSG_ERR_UNKNOWN);
+                       hammer2_freemsg(msg);
+                       break;
+               case HAMMER2_DBG_SHELL | HAMMER2_MSGF_REPLY:
+                       /*
+                        * A reply from the remote is data we copy to stdout.
+                        * The last aux byte is forced to NUL so strlen()
+                        * cannot run past the buffer.  An empty reply
+                        * produces the prompt instead.
+                        */
+                       if (msg->aux_size) {
+                               msg->aux_data[msg->aux_size - 1] = 0;
+                               write(1, msg->aux_data, strlen(msg->aux_data));
+                       } else {
+                               write(1, "debug> ", 7);
+                       }
+                       hammer2_freemsg(msg);
+                       break;
+               default:
+                       /* only unsolicited commands are expected here */
+                       assert((msg->any.head.cmd & HAMMER2_MSGF_REPLY) == 0);
+                       fprintf(stderr, "Unknown message: %08x\n",
+                               msg->any.head.cmd);
+                       hammer2_replymsg(msg, HAMMER2_MSG_ERR_UNKNOWN);
+                       break;
+               }
+       }
+       if (iocom->ioq_rx.error) {
+               /* name this function, not the one the text was copied from */
+               fprintf(stderr, "shell_recv: comm error %d\n",
+                       iocom->ioq_rx.error);
+       }
+}
+
+/*
+ * Callback from hammer2_iocom_core() when messages might be transmittable
+ * to the socket.  It only hands the iocom to hammer2_iocom_flush().
+ */
+static
+void
+shell_send(hammer2_iocom_t *iocom)
+{
+       hammer2_iocom_flush(iocom);
+}
+
+/*
+ * Read one line from stdin and queue it to the remote as a
+ * HAMMER2_DBG_SHELL message.  When fgets() yields nothing the iocom is
+ * flagged EOF instead.
+ */
+static
+void
+shell_tty(hammer2_iocom_t *iocom)
+{
+       char line[256];
+       hammer2_msg_t *req;
+       size_t nbytes;
+
+       if (fgets(line, sizeof(line), stdin) == NULL) {
+               /*
+                * Normal EOF: raise the EOF flag only, no error code.
+                */
+               iocom->flags |= HAMMER2_IOCOMF_EOF;
+               return;
+       }
+
+       /* drop a trailing newline, then count the terminating NUL as well */
+       nbytes = strlen(line);
+       if (nbytes != 0 && line[nbytes - 1] == '\n')
+               line[--nbytes] = 0;
+       nbytes += 1;
+
+       req = hammer2_allocmsg(iocom, HAMMER2_DBG_SHELL, nbytes);
+       bcopy(line, req->aux_data, nbytes);
+       hammer2_ioq_write(req);
+}
+
+/*
+ * This is called from the master node to process a received debug
+ * shell command.  We process the command, outputting the results,
+ * then finish up by outputting another prompt.
+ *
+ * The aux buffer is only used when it is both present and non-empty;
+ * otherwise the message is treated as carrying no text.
+ */
+void
+hammer2_shell_remote(hammer2_msg_t *msg)
+{
+       /* hammer2_iocom_t *iocom = msg->iocom; */
+       int have_text;
+
+       /*
+        * With aux_size == 0 the index aux_size - 1 would lie outside the
+        * buffer, so require a non-zero size before forcing termination.
+        */
+       have_text = (msg->aux_data != NULL && msg->aux_size != 0);
+       if (have_text)
+               msg->aux_data[msg->aux_size - 1] = 0;
+       if (msg->any.head.cmd & HAMMER2_MSGF_REPLY) {
+               /*
+                * A reply just prints out the string.  No newline is added
+                * (it is expected to be embedded if desired).
+                */
+               if (have_text)
+                       write(2, msg->aux_data, strlen(msg->aux_data));
+               hammer2_freemsg(msg);
+       } else {
+               /*
+                * Otherwise this is a command which we must process.
+                * When we are finished we generate a final reply.
+                * A NULL command buffer is handled by the parser as an
+                * empty command.
+                */
+               hammer2_shell_parse(msg, have_text ? msg->aux_data : NULL);
+               hammer2_replymsg(msg, 0);
+       }
+}
+
+/*
+ * Interpret one debug-shell command line.  The first word, delimited by
+ * space or tab, selects the command; output goes back through
+ * msg_printf().  A NULL or empty command produces no output.
+ */
+static void
+hammer2_shell_parse(hammer2_msg_t *msg, char *cmdbuf)
+{
+       char *word;
+
+       word = strsep(&cmdbuf, " \t");
+       if (word == NULL || word[0] == '\0')
+               return;
+
+       if (strcmp(word, "help") != 0 && strcmp(word, "?") != 0) {
+               msg_printf(msg, "Unrecognized command: %s\n", word);
+               return;
+       }
+       msg_printf(msg, "help        Command help\n");
+}
+
+/*
+ * Format printf-style text and queue it as a HAMMER2_DBG_SHELL reply to
+ * the originator of (msg).  The text is built in a 1024-byte buffer, so
+ * longer output is truncated.  Per the original note, (msg) itself is not
+ * modified and stays intact.
+ */
+void
+msg_printf(hammer2_msg_t *msg, const char *ctl, ...)
+{
+       char text[1024];
+       hammer2_msg_t *reply;
+       size_t nbytes;
+       va_list ap;
+
+       va_start(ap, ctl);
+       vsnprintf(text, sizeof(text), ctl, ap);
+       va_end(ap);
+
+       /* ship the terminating NUL along with the text */
+       nbytes = strlen(text) + 1;
+       reply = hammer2_allocreply(msg, HAMMER2_DBG_SHELL, nbytes);
+       bcopy(text, reply->aux_data, nbytes);
+       hammer2_ioq_write(reply);
+}
+
+/************************************************************************
+ *                                 SHOW                                *
+ ************************************************************************/
+
+static void show_bref(int fd, int tab, int bi, hammer2_blockref_t *bref);
+static void tabprintf(int tab, const char *ctl, ...);
+
+/*
+ * Dump the block tree of the device at devpath to stdout, starting from a
+ * synthetic volume blockref handed to show_bref().
+ *
+ * Returns 0 on success, 1 if the device cannot be opened.
+ */
+int
+cmd_show(const char *devpath)
+{
+       hammer2_blockref_t volref;
+       int devfd;
+
+       if ((devfd = open(devpath, O_RDONLY)) < 0) {
+               perror("open");
+               return 1;
+       }
+
+       /* everything but the type and data offset stays zero */
+       bzero(&volref, sizeof(volref));
+       volref.data_off = 0 | HAMMER2_PBUFRADIX;
+       volref.type = HAMMER2_BREF_TYPE_VOLUME;
+
+       show_bref(devfd, 0, 0, &volref);
+       close(devfd);
+
+       return 0;
+}
+
+/*
+ * Print one blockref and, recursively, everything reachable from it.
+ *
+ * fd   - descriptor of the device being dumped
+ * tab  - indentation (in columns) for this level
+ * bi   - index of bref within its parent's blockref array
+ * bref - the blockref to display
+ *
+ * The block size is taken from the radix bits of bref->data_off.  The
+ * block is read from the device unless it is a data block and VerboseOpt
+ * is below 1.  Inode, indirect and volume blocks are then scanned for
+ * non-empty blockrefs, each of which is shown one level deeper.
+ *
+ * NOTE(review): ipdata.filename is printed with %s and is therefore
+ * assumed to be NUL-terminated on media -- confirm.
+ */
+static void
+show_bref(int fd, int tab, int bi, hammer2_blockref_t *bref)
+{
+       hammer2_media_data_t media;
+       hammer2_blockref_t *bscan;
+       int bcount;
+       int i;
+       int didnl;
+       int obrace = 1;
+       size_t bytes;
+       off_t media_off;
+       const char *type_str;
+       char *str = NULL;
+
+       switch(bref->type) {
+       case HAMMER2_BREF_TYPE_EMPTY:
+               type_str = "empty";
+               break;
+       case HAMMER2_BREF_TYPE_INODE:
+               type_str = "inode";
+               break;
+       case HAMMER2_BREF_TYPE_INDIRECT:
+               type_str = "indblk";
+               break;
+       case HAMMER2_BREF_TYPE_DATA:
+               type_str = "data";
+               break;
+       case HAMMER2_BREF_TYPE_VOLUME:
+               type_str = "volume";
+               break;
+       default:
+               type_str = "unknown";
+               break;
+       }
+
+       /* %jx expects uintmax_t, so convert each 64-bit field explicitly */
+       tabprintf(tab, "%s.%-3d %016jx/%-2d mir=%016jx mod=%016jx ",
+                 type_str, bi,
+                 (uintmax_t)bref->key, bref->keybits,
+                 (uintmax_t)bref->mirror_tid,
+                 (uintmax_t)bref->modify_tid);
+       tab += SHOW_TAB;
+
+       /* the low radix bits of data_off encode log2 of the block size */
+       bytes = (size_t)1 << (bref->data_off & HAMMER2_OFF_MASK_RADIX);
+       if (bytes > sizeof(media)) {
+               printf("(bad block size %zu)\n", bytes);
+               return;
+       }
+       if (bref->type != HAMMER2_BREF_TYPE_DATA || VerboseOpt >= 1) {
+               /*
+                * pread() positions and reads in one call, so a failed
+                * seek can no longer lead to reading from a stale offset.
+                */
+               media_off = (off_t)(bref->data_off & ~HAMMER2_OFF_MASK_RADIX);
+               if (pread(fd, &media, bytes, media_off) != (ssize_t)bytes) {
+                       printf("(media read failed)\n");
+                       return;
+               }
+       }
+
+       bscan = NULL;
+       bcount = 0;
+       didnl = 0;
+
+       switch(bref->type) {
+       case HAMMER2_BREF_TYPE_EMPTY:
+               obrace = 0;
+               break;
+       case HAMMER2_BREF_TYPE_INODE:
+               printf("{\n");
+               if (media.ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
+                       /* no blockrefs */
+               } else {
+                       bscan = &media.ipdata.u.blockset.blockref[0];
+                       bcount = HAMMER2_SET_COUNT;
+               }
+               tabprintf(tab, "filename \"%s\"\n", media.ipdata.filename);
+               tabprintf(tab, "version  %d\n", media.ipdata.version);
+               tabprintf(tab, "uflags   0x%08x\n",
+                         media.ipdata.uflags);
+               if (media.ipdata.rmajor || media.ipdata.rminor) {
+                       tabprintf(tab, "rmajor   %d\n",
+                                 media.ipdata.rmajor);
+                       tabprintf(tab, "rminor   %d\n",
+                                 media.ipdata.rminor);
+               }
+               tabprintf(tab, "ctime    %s\n",
+                         hammer2_time64_to_str(media.ipdata.ctime, &str));
+               tabprintf(tab, "mtime    %s\n",
+                         hammer2_time64_to_str(media.ipdata.mtime, &str));
+               tabprintf(tab, "atime    %s\n",
+                         hammer2_time64_to_str(media.ipdata.atime, &str));
+               tabprintf(tab, "btime    %s\n",
+                         hammer2_time64_to_str(media.ipdata.btime, &str));
+               tabprintf(tab, "uid      %s\n",
+                         hammer2_uuid_to_str(&media.ipdata.uid, &str));
+               tabprintf(tab, "gid      %s\n",
+                         hammer2_uuid_to_str(&media.ipdata.gid, &str));
+               tabprintf(tab, "type     %s\n",
+                         hammer2_iptype_to_str(media.ipdata.type));
+               tabprintf(tab, "opflgs   0x%02x\n",
+                         media.ipdata.op_flags);
+               tabprintf(tab, "capflgs  0x%04x\n",
+                         media.ipdata.cap_flags);
+               tabprintf(tab, "mode     %-7o\n",
+                         media.ipdata.mode);
+               tabprintf(tab, "inum     0x%016jx\n",
+                         (uintmax_t)media.ipdata.inum);
+               tabprintf(tab, "size     %ju\n",
+                         (uintmax_t)media.ipdata.size);
+               tabprintf(tab, "nlinks   %ju\n",
+                         (uintmax_t)media.ipdata.nlinks);
+               tabprintf(tab, "iparent  0x%016jx\n",
+                         (uintmax_t)media.ipdata.iparent);
+               tabprintf(tab, "name_key 0x%016jx\n",
+                         (uintmax_t)media.ipdata.name_key);
+               tabprintf(tab, "name_len %u\n",
+                         media.ipdata.name_len);
+               tabprintf(tab, "ncopies  %u\n",
+                         media.ipdata.ncopies);
+               tabprintf(tab, "compalg  %u\n",
+                         media.ipdata.comp_algo);
+               if (media.ipdata.op_flags & HAMMER2_OPFLAG_PFSROOT) {
+                       tabprintf(tab, "pfs_type %u (%s)\n",
+                                 media.ipdata.pfs_type,
+                                 hammer2_pfstype_to_str(media.ipdata.pfs_type));
+                       tabprintf(tab, "pfs_inum 0x%016jx\n",
+                                 (uintmax_t)media.ipdata.pfs_inum);
+                       tabprintf(tab, "pfs_id   %s\n",
+                                 hammer2_uuid_to_str(&media.ipdata.pfs_id,
+                                                     &str));
+                       tabprintf(tab, "pfs_fsid %s\n",
+                                 hammer2_uuid_to_str(&media.ipdata.pfs_fsid,
+                                                     &str));
+               }
+               tabprintf(tab, "data_quota  %ju\n",
+                         (uintmax_t)media.ipdata.data_quota);
+               tabprintf(tab, "data_count  %ju\n",
+                         (uintmax_t)media.ipdata.data_count);
+               tabprintf(tab, "inode_quota %ju\n",
+                         (uintmax_t)media.ipdata.inode_quota);
+               tabprintf(tab, "inode_count %ju\n",
+                         (uintmax_t)media.ipdata.inode_count);
+               tabprintf(tab, "attr_tid    0x%016jx\n",
+                         (uintmax_t)media.ipdata.attr_tid);
+               if (media.ipdata.type == HAMMER2_OBJTYPE_DIRECTORY) {
+                       tabprintf(tab, "dirent_tid  %016jx\n",
+                                 (uintmax_t)media.ipdata.dirent_tid);
+               }
+               break;
+       case HAMMER2_BREF_TYPE_INDIRECT:
+               bscan = &media.npdata.blockref[0];
+               bcount = bytes / sizeof(hammer2_blockref_t);
+               didnl = 1;
+               printf("{\n");
+               break;
+       case HAMMER2_BREF_TYPE_DATA:
+               if (VerboseOpt >= 2) {
+                       printf("{\n");
+               } else {
+                       printf("\n");
+                       obrace = 0;
+               }
+               break;
+       case HAMMER2_BREF_TYPE_VOLUME:
+               bscan = &media.voldata.sroot_blockset.blockref[0];
+               bcount = HAMMER2_SET_COUNT;
+               printf("{\n");
+               break;
+       default:
+               break;
+       }
+       /* str is the scratch buffer handed to the *_to_str() helpers */
+       if (str)
+               free(str);
+       for (i = 0; i < bcount; ++i) {
+               if (bscan[i].type != HAMMER2_BREF_TYPE_EMPTY) {
+                       /* one separating blank line before the first child */
+                       if (didnl == 0) {
+                               printf("\n");
+                               didnl = 1;
+                       }
+                       show_bref(fd, tab, i, &bscan[i]);
+               }
+       }
+       tab -= SHOW_TAB;
+       if (obrace) {
+               if (bref->type == HAMMER2_BREF_TYPE_INODE)
+                       tabprintf(tab, "} (%s.%d, \"%s\")\n",
+                                 type_str, bi, media.ipdata.filename);
+               else
+                       tabprintf(tab, "} (%s.%d)\n", type_str,bi);
+       }
+}
+
+/*
+ * printf() variant that first writes `tab' columns of blank padding to
+ * stdout and then formats ctl with its arguments.
+ */
+static
+void
+tabprintf(int tab, const char *ctl, ...)
+{
+       va_list ap;
+
+       va_start(ap, ctl);
+       printf("%*.*s", tab, tab, "");
+       vprintf(ctl, ap);
+       va_end(ap);
+}
diff --git a/sbin/hammer2/cmd_leaf.c b/sbin/hammer2/cmd_leaf.c
new file mode 100644 (file)
index 0000000..a9026e2
--- /dev/null
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "hammer2.h"
+
+/*
+ * Start-up the leaf daemon for a PFS on this machine.
+ *
+ * One leaf daemon is run for each mounted PFS.  The daemon may multi-thread
+ * to improve performance if desired.  The daemon performs the following
+ * functions:
+ *
+ *     (1) Makes and maintains connections to all cluster nodes found for
+ *         the PFS, retrieved from the REMOTE configuration stored in
+ *         the HAMMER2 mount.  A localhost connection is always implied
+ *         (using the backbone), but also having more direct connections
+ *         can result in higher performance.
+ *
+ *         This also includes any required encryption or authentication.
+ *
+ *     (2) Runs the spanning tree protocol as a leaf, meaning that
+ *         the leaf daemon does not serve as a relay and the individual
+ *         connections made in (1) do not cross-connect.
+ *
+ *     (3) Obtains the PFS's registration and makes it available to the
+ *         cluster via the spanning tree protocol.
+ *
+ *     (4) Creates a communications pipe to the HAMMER2 VFS in the kernel
+ *         (installed via ioctl()) which the HAMMER2 VFS uses to accept and
+ *         communicate high-level requests.
+ *
+ *     (5) Performs all complex high-level messaging protocol operations,
+ *         such as quorum operations, maintains persistent cache state,
+ *         and so on and so forth.
+ *
+ * As you may have noted, the leaf daemon serves as an intermediary between
+ * the kernel and the rest of the cluster.  The kernel will issue high level
+ * protocol commands to the leaf which performs the protocol and sends a
+ * response.  The kernel does NOT have to deal with the quorum or other
+ * complex maintenance.
+ *
+ * Basically the kernel is simply another client from the point of view
+ * of the high-level protocols, requesting cache state locks and such from
+ * the leaf (in a degenerate situation one master lock is all that is needed).
+ * If the kernel PFS has local media storage that storage can be used for
+ * numerous purposes, such as caching, and in the degenerate non-clustered
+ * case simply represents the one-and-only master copy of the filesystem.
+ */
+int
+cmd_leaf(const char *sel_info)
+{
+       int handle;
+
+       /*
+        * Get an ioctl descriptor for the selected PFS; without one
+        * there is nothing to do and we report failure.
+        */
+       handle = hammer2_ioctl_handle(sel_info);
+       if (handle < 0)
+               return(1);
+
+       /*
+        * The daemon that would interconnect the in-kernel HAMMER2 PFS
+        * with the master-node daemon is not started yet; the call below
+        * is still disabled.
+        */
+/*     hammer2_demon(helper_pfs_interlink, (void *)(intptr_t)handle);*/
+       if (NormalExit)
+               close(handle);
+
+       return 0;
+}
+
+#if 0
+/*
+ * LEAF interconnect between PFS and the messaging core.  We create a
+ * socket connection to the messaging core, register the PFS with the
+ * core, and then pass the messaging descriptor to the kernel.
+ *
+ * The kernel takes over operation of the interconnect until the filesystem
+ * is unmounted or the descriptor is lost or explicitly terminated via
+ * a hammer2 command.
+ *
+ * This is essentially a localhost connection, so we don't have to worry
+ * about encryption.  Any encryption will be handled by the messaging
+ * core.
+ */
+static
+void *
+leaf_connect(void *data)
+{
+       int fd;
+       /* the descriptor arrives packed into the opaque pointer argument */
+       fd = (int)(intptr_t)data;
+       /* nothing is done with fd yet; the enclosing #if 0 keeps this disabled */
+       return (NULL);
+}
+#endif
diff --git a/sbin/hammer2/cmd_pfs.c b/sbin/hammer2/cmd_pfs.c
new file mode 100644 (file)
index 0000000..46df7cf
--- /dev/null
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "hammer2.h"
+
+/*
+ * List the PFSs visible through sel_path, one per line, as
+ * "<type> <pfs_id> <label>".
+ *
+ * Iteration is driven by the HAMMER2IOC_PFS_GET ioctl: each call is keyed
+ * by pfs.name_key, and the loop continues from pfs.name_next until that
+ * field reads (hammer2_key_t)-1.
+ *
+ * Returns 0 on success, or 1 if the ioctl handle could not be obtained,
+ * an ioctl failed, or a pfs_id could not be converted to a string.
+ */
+int
+cmd_pfs_list(const char *sel_path)
+{
+       hammer2_ioc_pfs_t pfs;
+       int ecode = 0;
+       int count = 0;
+       int fd;
+       uint32_t status;
+       char *pfs_id_str = NULL;
+
+       if ((fd = hammer2_ioctl_handle(sel_path)) < 0)
+               return(1);
+       bzero(&pfs, sizeof(pfs));
+
+       while ((pfs.name_key = pfs.name_next) != (hammer2_key_t)-1) {
+               if (ioctl(fd, HAMMER2IOC_PFS_GET, &pfs) < 0) {
+                       perror("ioctl");
+                       ecode = 1;
+                       break;
+               }
+
+               /* Emit the column header once, before the first row. */
+               if (count == 0) {
+                       printf("Type        "
+                              "Pfs_id                               "
+                              "Label\n");
+               }
+               switch(pfs.pfs_type) {
+               case HAMMER2_PFSTYPE_NONE:
+                       printf("NONE        ");
+                       break;
+               case HAMMER2_PFSTYPE_ADMIN:
+                       printf("ADMIN       ");
+                       break;
+               case HAMMER2_PFSTYPE_CACHE:
+                       printf("CACHE       ");
+                       break;
+               case HAMMER2_PFSTYPE_COPY:
+                       printf("COPY        ");
+                       break;
+               case HAMMER2_PFSTYPE_SLAVE:
+                       printf("SLAVE       ");
+                       break;
+               case HAMMER2_PFSTYPE_SOFT_SLAVE:
+                       printf("SOFT_SLAVE  ");
+                       break;
+               case HAMMER2_PFSTYPE_SOFT_MASTER:
+                       printf("SOFT_MASTER ");
+                       break;
+               case HAMMER2_PFSTYPE_MASTER:
+                       printf("MASTER      ");
+                       break;
+               default:
+                       /* Unknown type: show the raw value in hex. */
+                       printf("%02x          ", pfs.pfs_type);
+                       break;
+               }
+
+               /*
+                * uuid_to_string() allocates the string and can fail, in
+                * which case the pointer must not be handed to printf().
+                * Print a placeholder padded to the width of the Pfs_id
+                * column (36 characters) and remember the failure.
+                */
+               uuid_to_string(&pfs.pfs_id, &pfs_id_str, &status);
+               if (status != uuid_s_ok || pfs_id_str == NULL) {
+                       printf("%-36s ", "(unknown)");
+                       ecode = 1;
+               } else {
+                       printf("%s ", pfs_id_str);
+               }
+               free(pfs_id_str);
+               pfs_id_str = NULL;
+               printf("%s\n", pfs.name);
+               ++count;
+       }
+       close(fd);
+
+       return (ecode);
+}
+
+/*
+ * Create a PFS called `name` on the filesystem selected by sel_path.
+ *
+ * pfs_type  - type of the new PFS; HAMMER2_PFSTYPE_NONE selects MASTER.
+ * uuid_str  - textual pfs_id to use, or NULL to generate a fresh one.
+ *
+ * A new pfs_fsid is always generated.  Returns 0 on success and 1 on any
+ * failure (name too long, no ioctl handle, bad uuid, ioctl error).
+ */
+int
+cmd_pfs_create(const char *sel_path, const char *name,
+              uint8_t pfs_type, const char *uuid_str)
+{
+       hammer2_ioc_pfs_t pfs;
+       int ecode = 0;
+       int fd;
+       uint32_t status;
+
+       /*
+        * Default to MASTER
+        */
+       if (pfs_type == HAMMER2_PFSTYPE_NONE) {
+               pfs_type = HAMMER2_PFSTYPE_MASTER;
+       }
+
+       /*
+        * Refuse names that do not fit rather than letting snprintf()
+        * truncate them, which would create a PFS under a different
+        * name than the one requested.
+        */
+       if (strlen(name) >= sizeof(pfs.name)) {
+               fprintf(stderr, "hammer2: pfs_create: name too long\n");
+               return(1);
+       }
+
+       if ((fd = hammer2_ioctl_handle(sel_path)) < 0)
+               return(1);
+       bzero(&pfs, sizeof(pfs));
+       snprintf(pfs.name, sizeof(pfs.name), "%s", name);
+       pfs.pfs_type = pfs_type;
+
+       /* Either branch sets status, which gates the steps below. */
+       if (uuid_str) {
+               uuid_from_string(uuid_str, &pfs.pfs_id, &status);
+       } else {
+               uuid_create(&pfs.pfs_id, &status);
+       }
+       if (status == uuid_s_ok)
+               uuid_create(&pfs.pfs_fsid, &status);
+       if (status == uuid_s_ok) {
+               if (ioctl(fd, HAMMER2IOC_PFS_CREATE, &pfs) < 0) {
+                       perror("ioctl");
+                       ecode = 1;
+               }
+       } else {
+               fprintf(stderr, "hammer2: pfs_create: badly formed uuid\n");
+               ecode = 1;
+       }
+       close(fd);
+       return (ecode);
+}
+
+/*
+ * Delete the PFS called `name` from the filesystem selected by sel_path.
+ *
+ * Returns 0 on success and 1 on failure (name too long, no ioctl handle,
+ * or the ioctl itself failed, in which case the reason is printed).
+ */
+int
+cmd_pfs_delete(const char *sel_path, const char *name)
+{
+       hammer2_ioc_pfs_t pfs;
+       int ecode = 0;
+       int fd;
+
+       /*
+        * A silently truncated name could match a different PFS, so
+        * reject anything that does not fit in the request structure.
+        */
+       if (strlen(name) >= sizeof(pfs.name)) {
+               fprintf(stderr, "hammer2: pfs_delete: name too long\n");
+               return(1);
+       }
+
+       if ((fd = hammer2_ioctl_handle(sel_path)) < 0)
+               return(1);
+       bzero(&pfs, sizeof(pfs));
+       snprintf(pfs.name, sizeof(pfs.name), "%s", name);
+
+       /* Must be the DELETE request; CREATE here would add a PFS. */
+       if (ioctl(fd, HAMMER2IOC_PFS_DELETE, &pfs) < 0) {
+               fprintf(stderr, "hammer2: pfs_delete(%s): %s\n",
+                       name, strerror(errno));
+               ecode = 1;
+       }
+       close(fd);
+
+       return (ecode);
+}
diff --git a/sbin/hammer2/cmd_remote.c b/sbin/hammer2/cmd_remote.c
new file mode 100644 (file)
index 0000000..8300e09
--- /dev/null
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "hammer2.h"
+
+/*
+ * Ask the kernel to add a remote linkage for `url` on the filesystem
+ * selected by sel_path (HAMMER2IOC_REMOTE_ADD).
+ *
+ * Returns 0 on success and 1 on failure (no ioctl handle, url too long
+ * for the request structure, or the ioctl failed).
+ */
+int
+cmd_remote_connect(const char *sel_path, const char *url)
+{
+       hammer2_ioc_remote_t remote;
+       int ecode = 0;
+       int fd;
+
+       if ((fd = hammer2_ioctl_handle(sel_path)) < 0)
+               return(1);
+       bzero(&remote, sizeof(remote));
+       remote.copyid = -1;
+       remote.fd = -1;
+
+       /* Reject rather than truncate a url that does not fit. */
+       if (strlen(url) >= sizeof(remote.copy1.path)) {
+               fprintf(stderr, "hammer2: connect: Path too long\n");
+               close(fd);
+               return(1);
+       }
+       snprintf(remote.copy1.path, sizeof(remote.copy1.path), "%s", url);
+       if (ioctl(fd, HAMMER2IOC_REMOTE_ADD, &remote) < 0) {
+               perror("ioctl");
+               ecode = 1;
+       }
+       close(fd);
+
+       /* Propagate an ioctl failure instead of always reporting 0. */
+       return (ecode);
+}
+
+/*
+ * Ask the kernel to remove the remote linkage for `url` on the filesystem
+ * selected by sel_path (HAMMER2IOC_REMOTE_DEL).
+ *
+ * Returns 0 on success and 1 on failure (no ioctl handle, url too long
+ * for the request structure, or the ioctl failed).
+ */
+int
+cmd_remote_disconnect(const char *sel_path, const char *url)
+{
+       hammer2_ioc_remote_t remote;
+       int ecode = 0;
+       int fd;
+
+       if ((fd = hammer2_ioctl_handle(sel_path)) < 0)
+               return(1);
+       bzero(&remote, sizeof(remote));
+       remote.copyid = -1;
+       remote.fd = -1;
+
+       /* Reject rather than truncate a url that does not fit. */
+       if (strlen(url) >= sizeof(remote.copy1.path)) {
+               fprintf(stderr, "hammer2: disconnect: Path too long\n");
+               close(fd);
+               return(1);
+       }
+       snprintf(remote.copy1.path, sizeof(remote.copy1.path), "%s", url);
+       if (ioctl(fd, HAMMER2IOC_REMOTE_DEL, &remote) < 0) {
+               perror("ioctl");
+               ecode = 1;
+       }
+       close(fd);
+
+       /* Propagate an ioctl failure instead of always reporting 0. */
+       return (ecode);
+}
+
+/*
+ * Print a table of the remote linkages configured on the filesystem
+ * selected by sel_path.  all_opt is currently unused.
+ *
+ * Entries are fetched with HAMMER2IOC_REMOTE_GET, starting at copyid 0
+ * and following remote.nextid until it goes negative.  Entries whose
+ * copy1.copyid is 0 are skipped.
+ *
+ * Returns 0 on success and 1 if the ioctl handle could not be obtained
+ * or an ioctl failed.
+ */
+int
+cmd_remote_status(const char *sel_path, int all_opt __unused)
+{
+       hammer2_ioc_remote_t remote;
+       int ecode = 0;
+       int count = 0;
+       int fd;
+
+       if ((fd = hammer2_ioctl_handle(sel_path)) < 0)
+               return(1);
+       bzero(&remote, sizeof(remote));
+
+       while ((remote.copyid = remote.nextid) >= 0) {
+               if (ioctl(fd, HAMMER2IOC_REMOTE_GET, &remote) < 0) {
+                       perror("ioctl");
+                       ecode = 1;
+                       break;
+               }
+               if (remote.copy1.copyid == 0)
+                       continue;
+
+               /* Emit the column header once, before the first row. */
+               if (count == 0)
+                       printf("CPYID LABEL           STATUS PATH\n");
+
+               /* The three status flags are fixed '-' placeholders. */
+               printf("%5d %-15s %c%c%c.%02x %s\n",
+                       remote.copy1.copyid,
+                       remote.copy1.label,
+                       '-', '-', '-',
+                       remote.copy1.priority,
+                       remote.copy1.path);
+               ++count;
+       }
+       if (count == 0)
+               printf("No linkages found\n");
+
+       /* Release the ioctl descriptor; it was previously leaked. */
+       close(fd);
+       return (ecode);
+}
diff --git a/sbin/hammer2/cmd_rsa.c b/sbin/hammer2/cmd_rsa.c
new file mode 100644 (file)
index 0000000..75d3893
--- /dev/null
@@ -0,0 +1,379 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "hammer2.h"
+
+/*
+ * Should be run as root.  Creates <dir_path>/rsa.prv and
+ * <dir_path>/rsa.pub using openssl commands, creating dir_path (and any
+ * missing parents) first if it does not exist.
+ *
+ * An existing private key is reused; the public key is regenerated
+ * whenever a new private key is created or the public key is missing.
+ *
+ * Returns 0 on success and 1 on failure.
+ *
+ * NOTE(review): dir_path is interpolated unquoted into a shell command
+ * passed to system(); callers must only pass trusted paths.
+ */
+int
+cmd_rsainit(const char *dir_path)
+{
+       struct stat st;
+       int ecode;
+       int rc = 1;
+       char *str1 = NULL;
+       char *str2 = NULL;
+       char *cmd = NULL;
+       mode_t old_umask;
+
+       /*
+        * Create the directory if necessary.  Each '/' is temporarily
+        * replaced by a terminator so every leading component can be
+        * created in turn; mkdir() failures (e.g. EEXIST) are ignored.
+        */
+       if (stat(dir_path, &st) < 0) {
+               str1 = strdup(dir_path);
+               if (str1 == NULL) {
+                       fprintf(stderr, "hammer2 rsainit: out of memory\n");
+                       return 1;
+               }
+               for (str2 = strchr(str1, '/'); str2 != NULL;
+                    str2 = strchr(str2 + 1, '/')) {
+                       *str2 = 0;
+                       mkdir(str1, 0755);
+                       *str2 = '/';
+               }
+               mkdir(str1, 0700);
+               free(str1);
+               str1 = NULL;
+               str2 = NULL;
+       }
+       if (asprintf(&str1, "%s/rsa.prv", dir_path) < 0) {
+               str1 = NULL;
+               fprintf(stderr, "hammer2 rsainit: out of memory\n");
+               goto done;
+       }
+       if (asprintf(&str2, "%s/rsa.pub", dir_path) < 0) {
+               str2 = NULL;
+               fprintf(stderr, "hammer2 rsainit: out of memory\n");
+               goto done;
+       }
+
+       if (stat(str1, &st) < 0) {
+               if (asprintf(&cmd, "openssl genrsa -out %s 2048", str1) < 0) {
+                       fprintf(stderr, "hammer2 rsainit: out of memory\n");
+                       goto done;
+               }
+
+               /*
+                * The restrictive umask must be in force while openssl
+                * runs so the private key is never created readable by
+                * group or other.
+                */
+               old_umask = umask(077);
+               ecode = system(cmd);
+               umask(old_umask);
+               free(cmd);
+               chmod(str1, 0400);
+               if (ecode) {
+                       fprintf(stderr,
+                               "hammer2 rsainit: private key gen failed\n");
+                       goto done;
+               }
+               printf("hammer2 rsainit: created %s\n", str1);
+
+               /* A stale public key no longer matches; force a rebuild. */
+               remove(str2);
+       } else {
+               printf("hammer2 rsainit: Using existing private key in %s\n",
+                      str1);
+       }
+       if (stat(str2, &st) < 0) {
+               if (asprintf(&cmd, "openssl rsa -in %s -out %s -pubout",
+                            str1, str2) < 0) {
+                       fprintf(stderr, "hammer2 rsainit: out of memory\n");
+                       goto done;
+               }
+               ecode = system(cmd);
+               free(cmd);
+               if (ecode) {
+                       fprintf(stderr,
+                               "hammer2 rsainit: public key gen failed\n");
+                       goto done;
+               }
+               printf("hammer2 rsainit: created %s\n", str2);
+       } else {
+               printf("hammer2 rsainit: both keys already exist\n");
+       }
+       rc = 0;
+done:
+       free(str2);
+       free(str1);
+
+       return rc;
+}
+
+/*
+ * Run one blksize-byte block through every key in order.  The output of
+ * each stage is copied back into data_in to feed the next one, so the
+ * final result is left in data_out.
+ *
+ * Returns 0 on success, -1 if any RSA operation reports an error.
+ */
+static int
+rsaenc_chain(RSA **keys, const int *ispub, int nkeys, int blksize,
+            unsigned char *data_in, unsigned char *data_out)
+{
+       int i;
+       int r;
+
+       for (i = 0; i < nkeys; ++i) {
+               if (ispub[i])
+                       r = RSA_public_encrypt(blksize, data_in, data_out,
+                                              keys[i], RSA_NO_PADDING);
+               else
+                       r = RSA_private_encrypt(blksize, data_in, data_out,
+                                               keys[i], RSA_NO_PADDING);
+               if (r < 0) {
+                       fprintf(stderr, "hammer2 rsaenc: encryption failed "
+                                       "with key %d\n", i);
+                       return (-1);
+               }
+               if (i + 1 != nkeys)
+                       bcopy(data_out, data_in, blksize);
+       }
+       return (0);
+}
+
+/*
+ * Encrypt stdin to stdout through a chain of RSA keys.
+ *
+ * keyfiles  - nkeys paths; a ".pub" suffix selects a public key and a
+ *             ".prv" suffix a private key.  All keys must have the same
+ *             RSA_size().
+ *
+ * Input is processed in blocks of RSA_size() bytes with RSA_NO_PADDING;
+ * a short final block is zero-filled.  Returns 0 on success, 1 on error.
+ */
+int
+cmd_rsaenc(const char **keyfiles, int nkeys)
+{
+       RSA **keys = calloc(nkeys, sizeof(RSA *));
+       int *ispub = calloc(nkeys, sizeof(int));
+       int ecode = 0;
+       int blksize = 0;
+       int i;
+       int off;
+       int n;
+       unsigned char *data_in = NULL;
+       unsigned char *data_out = NULL;
+
+       /* calloc(0, ...) may legitimately return NULL, hence nkeys > 0. */
+       if (nkeys > 0 && (keys == NULL || ispub == NULL)) {
+               fprintf(stderr, "hammer2 rsaenc: out of memory\n");
+               free(keys);
+               free(ispub);
+               return (1);
+       }
+
+       for (i = 0; i < nkeys; ++i) {
+               FILE *fp;
+               const char *sfx;
+
+               sfx = strrchr(keyfiles[i], '.');
+               if (sfx && strcmp(sfx, ".pub") == 0) {
+                       fp = fopen(keyfiles[i], "r");
+                       if (fp == NULL) {
+                               fprintf(stderr, "hammer2 rsaenc: unable to "
+                                               "open %s\n", keyfiles[i]);
+                               ecode = 1;
+                               goto done;
+                       }
+                       keys[i] = PEM_read_RSA_PUBKEY(fp, NULL, NULL, NULL);
+                       ispub[i] = 1;
+                       fclose(fp);
+                       if (keys[i] == NULL) {
+                               fprintf(stderr, "hammer2 rsaenc: unable to "
+                                               "parse public key from %s\n",
+                                               keyfiles[i]);
+                               ecode = 1;
+                               goto done;
+                       }
+               } else if (sfx && strcmp(sfx, ".prv") == 0) {
+                       fp = fopen(keyfiles[i], "r");
+                       if (fp == NULL) {
+                               fprintf(stderr, "hammer2 rsaenc: unable to "
+                                               "open %s\n", keyfiles[i]);
+                               ecode = 1;
+                               goto done;
+                       }
+                       keys[i] = PEM_read_RSAPrivateKey(fp, NULL, NULL, NULL);
+                       fclose(fp);
+                       if (keys[i] == NULL) {
+                               fprintf(stderr, "hammer2 rsaenc: unable to "
+                                               "parse private key from %s\n",
+                                               keyfiles[i]);
+                               ecode = 1;
+                               goto done;
+                       }
+               } else {
+                       fprintf(stderr, "hammer2: rsaenc: key files must end "
+                                       "in .pub or .prv\n");
+                       ecode = 1;
+                       goto done;
+               }
+
+               /*
+                * Key sizes come from user-supplied files, so a mismatch
+                * is reported as an ordinary error, not an assertion.
+                */
+               if (i == 0) {
+                       blksize = RSA_size(keys[i]);
+               } else if (blksize != RSA_size(keys[i])) {
+                       fprintf(stderr, "hammer2 rsaenc: key size of %s "
+                                       "does not match the first key\n",
+                                       keyfiles[i]);
+                       ecode = 1;
+                       goto done;
+               }
+       }
+       fprintf(stderr, "blksize %d\n", blksize);
+
+       /*
+        * Accumulate stdin into full blocks, pushing each completed
+        * block through the key chain and out to stdout.
+        */
+       data_in = malloc(blksize);
+       data_out = malloc(blksize);
+       if (blksize > 0 && (data_in == NULL || data_out == NULL)) {
+               fprintf(stderr, "hammer2 rsaenc: out of memory\n");
+               ecode = 1;
+               goto done;
+       }
+       off = 0;
+       while ((n = read(0, data_in + off, blksize - off)) > 0) {
+               off += n;
+               if (off == blksize) {
+                       if (rsaenc_chain(keys, ispub, nkeys, blksize,
+                                        data_in, data_out) < 0) {
+                               ecode = 1;
+                               break;
+                       }
+                       if (write(1, data_out, blksize) != blksize) {
+                               perror("write");
+                               ecode = 1;
+                               break;
+                       }
+                       off = 0;
+               }
+       }
+
+       /* Report a read error before anything can overwrite errno. */
+       if (n < 0) {
+               perror("read");
+               ecode = 1;
+       }
+
+       /*
+        * Flush a short final block, zero-filled to a full block.  This
+        * is skipped after any error so nothing bogus is emitted.
+        */
+       if (off && ecode == 0) {
+               if (off < blksize)
+                       bzero(data_in + off, blksize - off);
+               if (rsaenc_chain(keys, ispub, nkeys, blksize,
+                                data_in, data_out) < 0) {
+                       ecode = 1;
+               } else if (write(1, data_out, blksize) != blksize) {
+                       perror("write");
+                       ecode = 1;
+               }
+       }
+done:
+       free(data_out);
+       free(data_in);
+       for (i = 0; i < nkeys; ++i) {
+               if (keys[i])
+                       RSA_free(keys[i]);
+       }
+       free(keys);
+       free(ispub);
+       return (ecode);
+}
+
+/*
+ * Run one blksize-byte block through every key in order.  The output of
+ * each stage is copied back into data_in to feed the next one, so the
+ * final result is left in data_out.
+ *
+ * Returns 0 on success, -1 if any RSA operation reports an error.
+ */
+static int
+rsadec_chain(RSA **keys, const int *ispub, int nkeys, int blksize,
+            unsigned char *data_in, unsigned char *data_out)
+{
+       int i;
+       int r;
+
+       for (i = 0; i < nkeys; ++i) {
+               if (ispub[i])
+                       r = RSA_public_decrypt(blksize, data_in, data_out,
+                                              keys[i], RSA_NO_PADDING);
+               else
+                       r = RSA_private_decrypt(blksize, data_in, data_out,
+                                               keys[i], RSA_NO_PADDING);
+               if (r < 0) {
+                       fprintf(stderr, "hammer2 rsadec: decryption failed "
+                                       "with key %d\n", i);
+                       return (-1);
+               }
+               if (i + 1 != nkeys)
+                       bcopy(data_out, data_in, blksize);
+       }
+       return (0);
+}
+
+/*
+ * Decrypt stdin to stdout through a chain of RSA keys.
+ *
+ * keyfiles  - nkeys paths; a ".pub" suffix selects a public key and a
+ *             ".prv" suffix a private key.  All keys must have the same
+ *             RSA_size().
+ *
+ * Input is processed in blocks of RSA_size() bytes with RSA_NO_PADDING;
+ * a short final block is zero-filled.  Returns 0 on success, 1 on error.
+ */
+int
+cmd_rsadec(const char **keyfiles, int nkeys)
+{
+       RSA **keys = calloc(nkeys, sizeof(RSA *));
+       int *ispub = calloc(nkeys, sizeof(int));
+       int ecode = 0;
+       int blksize = 0;
+       int i;
+       int off;
+       int n;
+       unsigned char *data_in = NULL;
+       unsigned char *data_out = NULL;
+
+       /* calloc(0, ...) may legitimately return NULL, hence nkeys > 0. */
+       if (nkeys > 0 && (keys == NULL || ispub == NULL)) {
+               fprintf(stderr, "hammer2 rsadec: out of memory\n");
+               free(keys);
+               free(ispub);
+               return (1);
+       }
+
+       for (i = 0; i < nkeys; ++i) {
+               FILE *fp;
+               const char *sfx;
+
+               sfx = strrchr(keyfiles[i], '.');
+               if (sfx && strcmp(sfx, ".pub") == 0) {
+                       fp = fopen(keyfiles[i], "r");
+                       if (fp == NULL) {
+                               fprintf(stderr, "hammer2 rsadec: unable to "
+                                               "open %s\n", keyfiles[i]);
+                               ecode = 1;
+                               goto done;
+                       }
+                       keys[i] = PEM_read_RSA_PUBKEY(fp, NULL, NULL, NULL);
+                       ispub[i] = 1;
+                       fclose(fp);
+                       if (keys[i] == NULL) {
+                               fprintf(stderr, "hammer2 rsadec: unable to "
+                                               "parse public key from %s\n",
+                                               keyfiles[i]);
+                               ecode = 1;
+                               goto done;
+                       }
+               } else if (sfx && strcmp(sfx, ".prv") == 0) {
+                       fp = fopen(keyfiles[i], "r");
+                       if (fp == NULL) {
+                               fprintf(stderr, "hammer2 rsadec: unable to "
+                                               "open %s\n", keyfiles[i]);
+                               ecode = 1;
+                               goto done;
+                       }
+                       keys[i] = PEM_read_RSAPrivateKey(fp, NULL, NULL, NULL);
+                       fclose(fp);
+                       if (keys[i] == NULL) {
+                               fprintf(stderr, "hammer2 rsadec: unable to "
+                                               "parse private key from %s\n",
+                                               keyfiles[i]);
+                               ecode = 1;
+                               goto done;
+                       }
+               } else {
+                       fprintf(stderr, "hammer2: rsadec: key files must end "
+                                       "in .pub or .prv\n");
+                       ecode = 1;
+                       goto done;
+               }
+
+               /*
+                * Key sizes come from user-supplied files, so a mismatch
+                * is reported as an ordinary error, not an assertion.
+                */
+               if (i == 0) {
+                       blksize = RSA_size(keys[i]);
+               } else if (blksize != RSA_size(keys[i])) {
+                       fprintf(stderr, "hammer2 rsadec: key size of %s "
+                                       "does not match the first key\n",
+                                       keyfiles[i]);
+                       ecode = 1;
+                       goto done;
+               }
+       }
+
+       /*
+        * Accumulate stdin into full blocks, pushing each completed
+        * block through the key chain and out to stdout.
+        */
+       data_in = malloc(blksize);
+       data_out = malloc(blksize);
+       if (blksize > 0 && (data_in == NULL || data_out == NULL)) {
+               fprintf(stderr, "hammer2 rsadec: out of memory\n");
+               ecode = 1;
+               goto done;
+       }
+       off = 0;
+       while ((n = read(0, data_in + off, blksize - off)) > 0) {
+               off += n;
+               if (off == blksize) {
+                       if (rsadec_chain(keys, ispub, nkeys, blksize,
+                                        data_in, data_out) < 0) {
+                               ecode = 1;
+                               break;
+                       }
+                       if (write(1, data_out, blksize) != blksize) {
+                               perror("write");
+                               ecode = 1;
+                               break;
+                       }
+                       off = 0;
+               }
+       }
+
+       /* Report a read error before anything can overwrite errno. */
+       if (n < 0) {
+               perror("read");
+               ecode = 1;
+       }
+
+       /*
+        * Flush a short final block, zero-filled to a full block.  The
+        * ecode test matters: after a failed write `off` still equals
+        * blksize, and the same block must not be processed again.
+        */
+       if (off && ecode == 0) {
+               if (off < blksize)
+                       bzero(data_in + off, blksize - off);
+               if (rsadec_chain(keys, ispub, nkeys, blksize,
+                                data_in, data_out) < 0) {
+                       ecode = 1;
+               } else if (write(1, data_out, blksize) != blksize) {
+                       perror("write");
+                       ecode = 1;
+               }
+       }
+done:
+       free(data_out);
+       free(data_in);
+       for (i = 0; i < nkeys; ++i) {
+               if (keys[i])
+                       RSA_free(keys[i]);
+       }
+       free(keys);
+       free(ispub);
+       return (ecode);
+}
diff --git a/sbin/hammer2/cmd_service.c b/sbin/hammer2/cmd_service.c
new file mode 100644 (file)
index 0000000..5b0ebc7
--- /dev/null
@@ -0,0 +1,241 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "hammer2.h"
+
+static void *master_accept(void *data);
+static void *master_service(void *data);
+static void master_auth_rx(hammer2_iocom_t *iocom);
+static void master_auth_tx(hammer2_iocom_t *iocom);
+static void master_link_rx(hammer2_iocom_t *iocom);
+static void master_link_tx(hammer2_iocom_t *iocom);
+
+/*
+ * Start-up the master listener daemon for the machine.
+ *
+ * The master listener serves as a rendezvous point in the cluster, accepting
+ * connections, performing registrations and authentications, maintaining
+ * the spanning tree, and keeping track of message state so disconnects can
+ * be handled properly.
+ *
+ * Once authenticated only low-level messaging protocols (which includes
+ * tracking persistent messages) are handled by this daemon.  This daemon
+ * does not run the higher level quorum or locking protocols.
+ *
+ * This daemon can also be told to maintain connections to other nodes,
+ * forming a messaging backbone, which in turn allows PFSs to simply
+ * connect to the master daemon via localhost if desired.
+ * Backbones are specified via /etc/hammer2.conf.
+ */
+int
+cmd_service(void)
+{
+       struct sockaddr_in listen_addr;
+       int reuse = 1;
+       int sock;
+
+       /*
+        * Create the TCP listening socket and allow quick re-binding of
+        * the address.
+        */
+       sock = socket(AF_INET, SOCK_STREAM, 0);
+       if (sock < 0) {
+               fprintf(stderr, "master_listen: socket(): %s\n",
+                       strerror(errno));
+               return 1;
+       }
+       setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof(reuse));
+
+       /*
+        * Bind to the HAMMER2 listen port on any local address.  A bind
+        * failure is taken to mean that a master listener is already
+        * running, which is reported but counts as success.
+        */
+       bzero(&listen_addr, sizeof(listen_addr));
+       listen_addr.sin_port = htons(HAMMER2_LISTEN_PORT);
+       listen_addr.sin_addr.s_addr = INADDR_ANY;
+       listen_addr.sin_family = AF_INET;
+       if (bind(sock, (struct sockaddr *)&listen_addr,
+                sizeof(listen_addr)) < 0) {
+               close(sock);
+               fprintf(stderr, "master listen: daemon already running\n");
+               return 0;
+       }
+       fprintf(stderr, "master listen: startup\n");
+       listen(sock, 50);
+
+       /*
+        * Hand the socket to hammer2_demon(), which is expected to fork,
+        * detach from the controlling terminal and run master_accept as
+        * a pthread, then return here in the original process.  In debug
+        * mode it reportedly creates the pthread without forking and
+        * clears NormalExit, in which case the socket must stay open.
+        */
+       hammer2_demon(master_accept, (void *)(intptr_t)sock);
+       if (NormalExit)
+               close(sock);
+       return 0;
+}
+
+/*
+ * Master listen/accept thread.  Accept connections on the master socket,
+ * starting a pthread (master_service) for each one.
+ *
+ * data carries the listening descriptor as an integer.  The loop ends
+ * when accept() fails for any reason other than EINTR.  Always returns
+ * NULL.
+ */
+static
+void *
+master_accept(void *data)
+{
+       struct sockaddr_in asin;
+       socklen_t alen;
+       pthread_t thread;
+       int lfd = (int)(intptr_t)data;
+       int fd;
+       int error;
+
+       /*
+        * Nobody waits for us
+        */
+       setproctitle("hammer2 master listen");
+       pthread_detach(pthread_self());
+
+       /*
+        * Accept connections and create pthreads to handle them.  The
+        * peer address is collected but not validated here.
+        */
+       for (;;) {
+               alen = sizeof(asin);
+               fd = accept(lfd, (struct sockaddr *)&asin, &alen);
+               if (fd < 0) {
+                       if (errno == EINTR)
+                               continue;
+                       break;
+               }
+               fprintf(stderr, "master_accept: accept fd %d\n", fd);
+
+               /*
+                * If no thread could be started nothing will ever
+                * service or close this connection, so drop it here
+                * instead of leaking the descriptor.
+                */
+               error = pthread_create(&thread, NULL,
+                                      master_service, (void *)(intptr_t)fd);
+               if (error) {
+                       fprintf(stderr,
+                               "master_accept: pthread_create(): %s\n",
+                               strerror(error));
+                       close(fd);
+               }
+       }
+       return (NULL);
+}
+
+/*
+ * Per-connection service thread.  Runs the iocom core on the accepted
+ * descriptor, starting with the authentication callbacks, then reports
+ * the final rx/tx error state and closes the descriptor.
+ */
+static
+void *
+master_service(void *data)
+{
+       hammer2_iocom_t iocom;
+       int sock = (int)(intptr_t)data;
+
+       hammer2_iocom_init(&iocom, sock, -1);
+       hammer2_iocom_core(&iocom, master_auth_rx, master_auth_tx, NULL);
+
+       /* The core has returned: log why and release the connection. */
+       fprintf(stderr,
+               "iocom on fd %d terminated error rx=%d, tx=%d\n",
+               sock, iocom.ioq_rx.error, iocom.ioq_tx.error);
+       close(sock);
+
+       return (NULL);
+}
+
+/************************************************************************
+ *                         AUTHENTICATION                              *
+ ************************************************************************
+ *
+ * Additional messaging-based authentication must occur before normal
+ * message operation.  The connection has already been encrypted at
+ * this point.
+ */
+static
+void
+master_auth_rx(hammer2_iocom_t *iocom)
+{
+       /*
+        * Receive-side authentication hook.  No authentication is done
+        * yet: the hook only logs and switches the iocom over to the
+        * normal link callbacks.  iocom is dereferenced, so it is not
+        * marked __unused (this also matches the prototype).
+        */
+       printf("AUTHRX\n");
+       iocom->recvmsg_callback = master_link_rx;
+       iocom->sendmsg_callback = master_link_tx;
+}
+
+/*
+ * Transmit-side authentication hook.  No authentication is done yet:
+ * the hook only logs and switches the iocom over to the normal link
+ * callbacks.  iocom is dereferenced, so it is not marked __unused (this
+ * also matches the prototype).
+ */
+static
+void
+master_auth_tx(hammer2_iocom_t *iocom)
+{
+       printf("AUTHTX\n");
+       iocom->recvmsg_callback = master_link_rx;
+       iocom->sendmsg_callback = master_link_tx;
+}
+
+/*
+ * Receive callback for an established link, invoked from
+ * hammer2_iocom_core() when messages might be present on the socket.
+ *
+ * Messages are pulled until EOF is flagged on the iocom or no further
+ * message is available; each one is logged and dispatched on its
+ * command code.  Any receive error left on the queue is reported last.
+ */
+static
+void
+master_link_rx(hammer2_iocom_t *iocom)
+{
+       hammer2_msg_t *msg;
+
+       for (;;) {
+               if (iocom->flags & HAMMER2_IOCOMF_EOF)
+                       break;
+               msg = hammer2_ioq_read(iocom);
+               if (msg == NULL)
+                       break;
+
+               fprintf(stderr, "MSG RECEIVED: %08x error %d\n",
+                       msg->any.head.cmd, msg->any.head.error);
+               switch(msg->any.head.cmd & HAMMER2_MSGF_CMDSWMASK) {
+               case HAMMER2_DBG_SHELL:
+               case HAMMER2_DBG_SHELL | HAMMER2_MSGF_REPLY:
+                       /* Debug shell traffic, in either direction. */
+                       hammer2_shell_remote(msg);
+                       break;
+               case HAMMER2_LNK_ERROR:
+                       /* Nothing is done for link errors here. */
+                       break;
+               default:
+                       /* Unrecognized command: answer with an error. */
+                       hammer2_replymsg(msg, HAMMER2_MSG_ERR_UNKNOWN);
+                       break;
+               }
+       }
+       if (iocom->ioq_rx.error != 0) {
+               fprintf(stderr,
+                       "master_recv: comm error %d\n",
+                       iocom->ioq_rx.error);
+       }
+}
+
+/*
+ * Transmit callback for an established link, invoked from
+ * hammer2_iocom_core() when messages might be transmittable to the
+ * socket.  All of the work is delegated to hammer2_iocom_flush().
+ */
+static
+void
+master_link_tx(hammer2_iocom_t *iocom)
+{
+       hammer2_iocom_flush(iocom);
+}
diff --git a/sbin/hammer2/cmd_snapshot.c b/sbin/hammer2/cmd_snapshot.c
new file mode 100644 (file)
index 0000000..2d46b9e
--- /dev/null
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "hammer2.h"
+
+/*
+ * The snapshot is named <PFSNAME>_<YYYYMMDD.HHMMSS.TRANSID> unless
+ * overridden by a label.
+ *
+ * When local non-cache media is involved the media is
+ * first synchronized and the snapshot is then based on
+ * the media.
+ *
+ * If the media is remote the snapshot is created on the remote
+ * end (if you have sufficient administrative rights) and a local
+ * ADMIN or CACHE PFS is created with a connection to the snapshot
+ * on the remote.
+ *
+ * If the client has snapshot rights to multiple remotes then TBD.
+ */
diff --git a/sbin/hammer2/crypto.c b/sbin/hammer2/crypto.c
new file mode 100644 (file)
index 0000000..480568c
--- /dev/null
@@ -0,0 +1,534 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "hammer2.h"
+
+/*
+ * Synchronously negotiate crypto for a new session.  This must occur
+ * within 10 seconds or the connection is error'd out.
+ *
+ * We work off the IP address and/or reverse DNS.  The IP address is
+ * checked first, followed by the IP address at various levels of granularity,
+ * followed by the full domain name and domain names at various levels of
+ * granularity.
+ *
+ *     /etc/hammer2/remote/<name>.pub  - Contains a public key
+ *     /etc/hammer2/remote/<name>.none - Indicates no encryption (empty file)
+ *                                       (e.g. localhost.none).
+ *
+ * We first attempt to locate a public key file based on the peer address or
+ * peer FQDN.
+ *
+ *     <name>.none     - No further negotiation is needed.  We simply return.
+ *                       All communication proceeds without encryption.
+ *                       No public key handshake occurs in this situation.
+ *                       (both ends must match).
+ *
+ *     <name>.pub      - We have located the public key for the peer.  Both
+ *                       sides transmit a block encrypted with their private
+ *                       keys and the peer's public key.
+ *
+ *                       Both sides receive a block and decrypt it.
+ *
+ *                       Both sides formulate a reply using the decrypted
+ *                       block and transmit it.
+ *
+ *                       communication proceeds with the negotiated session
+ *                       key (typically AES-256-CBC).
+ *
+ * If we fail to locate the appropriate file and no floating.db exists the
+ * connection is terminated without further action.
+ *
+ * If floating.db exists the connection proceeds with a floating negotiation.
+ */
+/*
+ * Overlay of the generic, IPv4 and IPv6 socket address structures.
+ * hammer2_crypto_negotiate() passes the generic member (sa) to
+ * getpeername()/getnameinfo() with sizeof() the whole union as the
+ * available length.
+ */
+typedef union {
+       struct sockaddr sa;
+       struct sockaddr_in sa_in;
+       struct sockaddr_in6 sa_in6;
+} sockaddr_any_t;
+
+/*
+ * Synchronously negotiate session crypto with the peer on iocom->sock_fd.
+ *
+ * The peer is identified by its numeric address.  A readable <addr>.pub
+ * under HAMMER2_PATH_REMOTE supplies the peer's RSA public key and the
+ * handshake block is double-encrypted (our private key, then the peer's
+ * public key).  If only <addr>.none exists the handshake block is sent
+ * and received without RSA wrapping.  Our own rsa.pub / rsa.prv are
+ * loaded from HAMMER2_DEFAULT_DIR in either case.
+ *
+ * On success the session key and both IVs are derived by XORing the
+ * local and remote handshake blocks, the rx/tx cipher contexts are
+ * initialized and HAMMER2_IOCOMF_CRYPTED is set.  On a detected failure
+ * iocom->ioq_rx.error is set and HAMMER2_IOCOMF_EOF is flagged.
+ */
+void
+hammer2_crypto_negotiate(hammer2_iocom_t *iocom)
+{
+       sockaddr_any_t sa;
+       socklen_t salen = sizeof(sa);
+       char peername[128];
+       char realname[128];
+       hammer2_handshake_t handtx;
+       hammer2_handshake_t handrx;
+       char buf1[sizeof(handtx)];
+       char buf2[sizeof(handtx)];
+       char *ptr;
+       char *out;
+       char *path = NULL;      /* freed at done: on every exit path */
+       struct stat st;
+       FILE *fp;
+       RSA *keys[3] = { NULL, NULL, NULL };    /* peer pub, our pub, our prv */
+       size_t i;
+       size_t blksize;
+       size_t blkmask;
+       ssize_t n;
+       int fd;
+
+       /*
+        * Get the peer IP address for the connection as a string.
+        */
+       if (getpeername(iocom->sock_fd, &sa.sa, &salen) < 0) {
+               iocom->ioq_rx.error = HAMMER2_IOQ_ERROR_NOPEER;
+               iocom->flags |= HAMMER2_IOCOMF_EOF;
+               if (DebugOpt)
+                       fprintf(stderr, "accept: getpeername() failed\n");
+               goto done;
+       }
+
+       /*
+        * getnameinfo() reports failure with a non-zero EAI_* code whose
+        * sign is not specified, so test against zero rather than < 0.
+        */
+       if (getnameinfo(&sa.sa, salen, peername, sizeof(peername),
+                       NULL, 0, NI_NUMERICHOST) != 0) {
+               iocom->ioq_rx.error = HAMMER2_IOQ_ERROR_NOPEER;
+               iocom->flags |= HAMMER2_IOCOMF_EOF;
+               if (DebugOpt)
+                       fprintf(stderr, "accept: cannot decode sockaddr\n");
+               goto done;
+       }
+       if (DebugOpt) {
+               if (realhostname_sa(realname, sizeof(realname),
+                                   &sa.sa, salen) == HOSTNAME_FOUND) {
+                       fprintf(stderr, "accept from %s (%s)\n",
+                               peername, realname);
+               } else {
+                       fprintf(stderr, "accept from %s\n", peername);
+               }
+       }
+
+       /*
+        * Find the remote host's public key.  If there is none, the peer
+        * must at least be whitelisted via a <addr>.none file.
+        */
+       asprintf(&path, "%s/%s.pub", HAMMER2_PATH_REMOTE, peername);
+       if ((fp = fopen(path, "r")) == NULL) {
+               free(path);
+               asprintf(&path, "%s/%s.none",
+                        HAMMER2_PATH_REMOTE, peername);
+               if (stat(path, &st) < 0) {
+                       iocom->ioq_rx.error = HAMMER2_IOQ_ERROR_NORKEY;
+                       iocom->flags |= HAMMER2_IOCOMF_EOF;
+                       if (DebugOpt)
+                               fprintf(stderr, "auth failure: unknown host\n");
+                       goto done;
+               }
+               if (DebugOpt)
+                       fprintf(stderr, "auth succeeded, unencrypted link\n");
+       }
+       if (fp) {
+               keys[0] = PEM_read_RSA_PUBKEY(fp, NULL, NULL, NULL);
+               fclose(fp);
+               if (keys[0] == NULL) {
+                       iocom->ioq_rx.error = HAMMER2_IOQ_ERROR_KEYFMT;
+                       iocom->flags |= HAMMER2_IOCOMF_EOF;
+                       if (DebugOpt)
+                               fprintf(stderr,
+                                       "auth failure: bad key format\n");
+                       goto done;
+               }
+       }
+
+       /*
+        * Get our public and private keys
+        */
+       free(path);
+       asprintf(&path, HAMMER2_DEFAULT_DIR "/rsa.pub");
+       if ((fp = fopen(path, "r")) == NULL) {
+               iocom->ioq_rx.error = HAMMER2_IOQ_ERROR_NOLKEY;
+               iocom->flags |= HAMMER2_IOCOMF_EOF;
+               goto done;
+       }
+       keys[1] = PEM_read_RSA_PUBKEY(fp, NULL, NULL, NULL);
+       fclose(fp);
+       if (keys[1] == NULL) {
+               iocom->ioq_rx.error = HAMMER2_IOQ_ERROR_KEYFMT;
+               iocom->flags |= HAMMER2_IOCOMF_EOF;
+               if (DebugOpt)
+                       fprintf(stderr, "auth failure: bad host key format\n");
+               goto done;
+       }
+
+       free(path);
+       asprintf(&path, HAMMER2_DEFAULT_DIR "/rsa.prv");
+       if ((fp = fopen(path, "r")) == NULL) {
+               iocom->ioq_rx.error = HAMMER2_IOQ_ERROR_NOLKEY;
+               iocom->flags |= HAMMER2_IOCOMF_EOF;
+               if (DebugOpt)
+                       fprintf(stderr, "auth failure: bad host key format\n");
+               goto done;
+       }
+       keys[2] = PEM_read_RSAPrivateKey(fp, NULL, NULL, NULL);
+       fclose(fp);
+       if (keys[2] == NULL) {
+               iocom->ioq_rx.error = HAMMER2_IOQ_ERROR_KEYFMT;
+               iocom->flags |= HAMMER2_IOCOMF_EOF;
+               if (DebugOpt)
+                       fprintf(stderr, "auth failure: bad host key format\n");
+               goto done;
+       }
+       free(path);
+       path = NULL;
+
+       /*
+        * public key encrypt/decrypt block size.  All three keys must use
+        * the same modulus size and the handshake structure must be a
+        * whole number of blocks.  Without a peer key the handshake is
+        * transferred as one block.
+        *
+        * NOTE(review): the (i & blkmask) arithmetic in the read loop
+        * assumes blksize is a power of two -- confirm for
+        * sizeof(hammer2_handshake_t).
+        */
+       if (keys[0]) {
+               blksize = (size_t)RSA_size(keys[0]);
+               if (blksize != (size_t)RSA_size(keys[1]) ||
+                   blksize != (size_t)RSA_size(keys[2]) ||
+                   sizeof(handtx) % blksize != 0) {
+                       iocom->ioq_rx.error = HAMMER2_IOQ_ERROR_KEYFMT;
+                       iocom->flags |= HAMMER2_IOCOMF_EOF;
+                       if (DebugOpt)
+                               fprintf(stderr, "auth failure: "
+                                               "key size mismatch\n");
+                       goto done;
+               }
+       } else {
+               blksize = sizeof(handtx);
+       }
+       blkmask = blksize - 1;
+
+       bzero(&handrx, sizeof(handrx));
+       bzero(&handtx, sizeof(handtx));
+
+       /*
+        * Fill all unused fields (particularly all junk fields) with random
+        * data, and also set the session key.
+        */
+       fd = open("/dev/urandom", O_RDONLY);
+       if (fd < 0 ||
+           fstat(fd, &st) < 0 ||       /* something wrong */
+           S_ISREG(st.st_mode) ||      /* supposed to be a RNG dev! */
+           read(fd, &handtx, sizeof(handtx)) != sizeof(handtx)) {
+urandfail:
+               if (fd >= 0)
+                       close(fd);
+               iocom->ioq_rx.error = HAMMER2_IOQ_ERROR_BADURANDOM;
+               iocom->flags |= HAMMER2_IOCOMF_EOF;
+               if (DebugOpt)
+                       fprintf(stderr, "auth failure: bad rng\n");
+               goto done;
+       }
+       /* handrx is still all zeros here and serves as the reference */
+       if (bcmp(&handrx, &handtx, sizeof(handtx)) == 0)
+               goto urandfail;                 /* read all zeros */
+       close(fd);
+       /* ERR_load_crypto_strings(); openssl debugging */
+
+       /*
+        * Handshake with the remote.
+        *
+        *      Encrypt with my private and remote's public
+        *      Decrypt with my private and remote's public
+        *
+        * When encrypting we have to make sure our buffer fits within the
+        * modulus, which typically requires bit 7 of the first byte to be
+        * zero.  To be safe make sure that bit 7 and bit 6 are zero.
+        */
+       snprintf(handtx.quickmsg, sizeof(handtx.quickmsg), "Testing 1 2 3");
+       handtx.magic = HAMMER2_MSGHDR_MAGIC;
+       handtx.version = 1;
+       handtx.flags = 0;
+       assert(sizeof(handtx.verf) * 4 == sizeof(handtx.sess));
+       bzero(handtx.verf, sizeof(handtx.verf));
+
+       handtx.pad1[0] &= 0x3f; /* message must fit within modulus */
+       handtx.pad2[0] &= 0x3f; /* message must fit within modulus */
+
+       /* verf[] is the XOR-fold of sess[], four session bytes per entry */
+       for (i = 0; i < sizeof(handtx.sess); ++i)
+               handtx.verf[i / 4] ^= handtx.sess[i];
+
+       /*
+        * Write handshake buffer to remote
+        */
+       for (i = 0; i < sizeof(handtx); i += blksize) {
+               ptr = (char *)&handtx + i;
+               if (keys[0]) {
+                       /*
+                        * Since we are double-encrypting we have to make
+                        * sure that the result of the first stage does
+                        * not blow out the modulus for the second stage.
+                        *
+                        * The pointer is pointing to the pad*[] area so
+                        * we can mess with that until the first stage
+                        * is legal.
+                        *
+                        * A failed first stage leaves buf1 undefined, so
+                        * stop retrying instead of testing its contents.
+                        */
+                       do {
+                               ++*(int *)(ptr + 4);
+                               if (RSA_private_encrypt(blksize, ptr, buf1,
+                                           keys[2], RSA_NO_PADDING) < 0) {
+                                       iocom->ioq_rx.error =
+                                               HAMMER2_IOQ_ERROR_KEYXCHGFAIL;
+                                       break;
+                               }
+                       } while (buf1[0] & 0xC0);
+                       if (iocom->ioq_rx.error)
+                               break;
+
+                       if (RSA_public_encrypt(blksize, buf1, buf2,
+                                           keys[0], RSA_NO_PADDING) < 0) {
+                               iocom->ioq_rx.error =
+                                       HAMMER2_IOQ_ERROR_KEYXCHGFAIL;
+                               break;
+                       }
+                       out = buf2;
+               } else {
+                       /*
+                        * No peer key: send the handshake block itself.
+                        * buf2 is never filled in on this path.
+                        */
+                       out = ptr;
+               }
+               if (write(iocom->sock_fd, out, blksize) != (ssize_t)blksize) {
+                       fprintf(stderr, "WRITE ERROR\n");
+               }
+       }
+       if (iocom->ioq_rx.error) {
+               iocom->flags |= HAMMER2_IOCOMF_EOF;
+               if (DebugOpt)
+                       fprintf(stderr, "auth failure: key exchange failure "
+                                       "during encryption\n");
+               goto done;
+       }
+
+       /*
+        * Read handshake buffer from remote.  Reads may be short, so data
+        * is accumulated until a block boundary is reached and each
+        * completed block is then decrypted in place.
+        */
+       i = 0;
+       while (i < sizeof(handrx)) {
+               ptr = (char *)&handrx + i;
+               n = read(iocom->sock_fd, ptr, blksize - (i & blkmask));
+               if (n <= 0)
+                       break;
+               ptr -= (i & blkmask);   /* back up to start of the block */
+               i += n;
+               if (keys[0] && (i & blkmask) == 0) {
+                       if (RSA_private_decrypt(blksize, ptr, buf1,
+                                          keys[2], RSA_NO_PADDING) < 0)
+                               iocom->ioq_rx.error =
+                                               HAMMER2_IOQ_ERROR_KEYXCHGFAIL;
+                       if (RSA_public_decrypt(blksize, buf1, ptr,
+                                          keys[0], RSA_NO_PADDING) < 0)
+                               iocom->ioq_rx.error =
+                                               HAMMER2_IOQ_ERROR_KEYXCHGFAIL;
+               }
+       }
+       if (iocom->ioq_rx.error) {
+               iocom->flags |= HAMMER2_IOCOMF_EOF;
+               if (DebugOpt)
+                       fprintf(stderr, "auth failure: key exchange failure "
+                                       "during decryption\n");
+               goto done;
+       }
+
+       /*
+        * Validate the received data.  Try to make this a constant-time
+        * algorithm.
+        */
+       if (i != sizeof(handrx)) {
+keyxchgfail:
+               iocom->ioq_rx.error = HAMMER2_IOQ_ERROR_KEYXCHGFAIL;
+               iocom->flags |= HAMMER2_IOCOMF_EOF;
+               if (DebugOpt)
+                       fprintf(stderr, "auth failure: key exchange failure\n");
+               goto done;
+       }
+
+       if (handrx.magic == HAMMER2_MSGHDR_MAGIC_REV) {
+               handrx.version = bswap16(handrx.version);
+               handrx.flags = bswap32(handrx.flags);
+       }
+
+       /*
+        * Folding sess[] back into verf[] must leave every verf byte zero.
+        * The bytes are OR'd together rather than summed so that non-zero
+        * bytes cannot cancel each other out when char is signed.
+        */
+       for (i = 0; i < sizeof(handrx.sess); ++i)
+               handrx.verf[i / 4] ^= handrx.sess[i];
+       n = 0;
+       for (i = 0; i < sizeof(handrx.verf); ++i)
+               n |= (unsigned char)handrx.verf[i];
+       n |= (handrx.version != 1);
+       if (n != 0)
+               goto keyxchgfail;
+
+       /*
+        * Calculate the session key and initialize the iv[].  The first
+        * half of sess[] yields the key and the second half the IV; the
+        * same IV is used for both directions.
+        */
+       assert(HAMMER2_AES_KEY_SIZE * 2 == sizeof(handrx.sess));
+       for (i = 0; i < HAMMER2_AES_KEY_SIZE; ++i) {
+               iocom->sess[i] = handrx.sess[i] ^ handtx.sess[i];
+               iocom->ioq_rx.iv[i] = handrx.sess[HAMMER2_AES_KEY_SIZE + i] ^
+                                     handtx.sess[HAMMER2_AES_KEY_SIZE + i];
+               iocom->ioq_tx.iv[i] = handrx.sess[HAMMER2_AES_KEY_SIZE + i] ^
+                                     handtx.sess[HAMMER2_AES_KEY_SIZE + i];
+       }
+
+       /*
+        * The key material is secret; only dump it when debugging.
+        */
+       if (DebugOpt) {
+               printf("sess: ");
+               for (i = 0; i < HAMMER2_AES_KEY_SIZE; ++i)
+                       printf("%02x", (unsigned char)iocom->sess[i]);
+               printf("\n");
+               printf("iv: ");
+               for (i = 0; i < HAMMER2_AES_KEY_SIZE; ++i)
+                       printf("%02x", (unsigned char)iocom->ioq_rx.iv[i]);
+               printf("\n");
+       }
+
+       EVP_CIPHER_CTX_init(&iocom->ioq_rx.ctx);
+       EVP_DecryptInit_ex(&iocom->ioq_rx.ctx, HAMMER2_AES_TYPE_EVP, NULL,
+                          iocom->sess, iocom->ioq_rx.iv);
+       EVP_CIPHER_CTX_set_padding(&iocom->ioq_rx.ctx, 0);
+
+       EVP_CIPHER_CTX_init(&iocom->ioq_tx.ctx);
+       EVP_EncryptInit_ex(&iocom->ioq_tx.ctx, HAMMER2_AES_TYPE_EVP, NULL,
+                          iocom->sess, iocom->ioq_tx.iv);
+       EVP_CIPHER_CTX_set_padding(&iocom->ioq_tx.ctx, 0);
+
+       iocom->flags |= HAMMER2_IOCOMF_CRYPTED;
+
+       if (DebugOpt)
+               fprintf(stderr, "auth success: %s\n", handrx.quickmsg);
+done:
+       free(path);
+       if (keys[0])
+               RSA_free(keys[0]);
+       if (keys[1])
+               RSA_free(keys[1]);
+       if (keys[2])
+               RSA_free(keys[2]);
+}
+
+/*
+ * Decrypt, in place, whatever not-yet-decrypted data is pending in the
+ * ioq's fifo (the span between fifo_cdx and fifo_end).
+ *
+ * Only the portion that survives masking with ~HAMMER2_AES_KEY_MASK is
+ * processed; any remainder stays in the fifo for a later call.  Data is
+ * moved through a small bounce buffer so that the cipher never reads
+ * and writes the same memory.  Nothing is done on an unencrypted link.
+ */
+void
+hammer2_crypto_decrypt(hammer2_iocom_t *iocom, hammer2_ioq_t *ioq)
+{
+       char bounce[512];
+       int total;
+       int chunk;
+       int off;
+
+       if (!(iocom->flags & HAMMER2_IOCOMF_CRYPTED))
+               return;
+
+       total = ioq->fifo_end - ioq->fifo_cdx;
+       total &= ~HAMMER2_AES_KEY_MASK;
+       if (total == 0)
+               return;
+
+       off = 0;
+       while (off < total) {
+               /* at most one bounce buffer's worth per pass */
+               chunk = total - off;
+               if (chunk > (int)sizeof(bounce))
+                       chunk = (int)sizeof(bounce);
+
+               bcopy(ioq->buf + ioq->fifo_cdx + off, bounce, chunk);
+               EVP_DecryptUpdate(&ioq->ctx,
+                                 ioq->buf + ioq->fifo_cdx + off, &chunk,
+                                 bounce, chunk);
+               /* chunk now holds the byte count the cipher produced */
+               off += chunk;
+       }
+       ioq->fifo_cdx += total;
+}
+
+/*
+ * Decrypt, in place, the auxiliary data buffer attached to a message.
+ *
+ * Work begins at byte offset (already) and continues to msg->aux_size,
+ * which is asserted to have no bits set in HAMMER2_AES_KEY_MASK.  Each
+ * chunk is first copied to a bounce buffer so the cipher input and
+ * output never alias.  Nothing is done on an unencrypted link.
+ */
+void
+hammer2_crypto_decrypt_aux(hammer2_iocom_t *iocom, hammer2_ioq_t *ioq,
+                          hammer2_msg_t *msg, int already)
+{
+       char bounce[512];
+       int p_len;
+       int chunk;
+       int off;
+
+       if (!(iocom->flags & HAMMER2_IOCOMF_CRYPTED))
+               return;
+
+       p_len = msg->aux_size;
+       assert((p_len & HAMMER2_AES_KEY_MASK) == 0);
+       if (p_len == 0)
+               return;
+
+       for (off = already; off < p_len; off += chunk) {
+               /* at most one bounce buffer's worth per pass */
+               chunk = p_len - off;
+               if (chunk > (int)sizeof(bounce))
+                       chunk = (int)sizeof(bounce);
+
+               bcopy(msg->aux_data + off, bounce, chunk);
+               EVP_DecryptUpdate(&ioq->ctx,
+                                 msg->aux_data + off, &chunk,
+                                 bounce, chunk);
+               /* chunk now holds the byte count the cipher produced */
+       }
+}
+
+/*
+ * Encrypt the data described by iov[0..n-1] into the ioq's fifo and
+ * rewrite iov[0] to describe the encrypted bytes still to be written
+ * (fifo_beg up to fifo_cdx).
+ *
+ * Bytes already encrypted by an earlier call (fifo_cdx - fifo_beg) are
+ * skipped at the front of the iovec list, and encryption stops once the
+ * fifo buffer is full.
+ *
+ * Returns the iovec count the caller should use: (n) unchanged on an
+ * unencrypted link, otherwise 1.
+ */
+int
+hammer2_crypto_encrypt(hammer2_iocom_t *iocom, hammer2_ioq_t *ioq,
+                      struct iovec *iov, int n)
+{
+       int room;       /* free bytes left in the fifo buffer */
+       int skip;       /* previously encrypted bytes to step over */
+       int len;
+       int idx;
+
+       if (!(iocom->flags & HAMMER2_IOCOMF_CRYPTED))
+               return (n);
+
+       room = sizeof(ioq->buf) - ioq->fifo_cdx;
+       skip = ioq->fifo_cdx - ioq->fifo_beg;
+
+       idx = 0;
+       while (idx < n) {
+               len = iov[idx].iov_len;
+               if (len > skip) {
+                       /* encrypt the unprocessed tail of this element */
+                       len -= skip;
+                       if (len > room)
+                               len = room;
+                       EVP_EncryptUpdate(&ioq->ctx,
+                                         ioq->buf + ioq->fifo_cdx, &len,
+                                         (char *)iov[idx].iov_base + skip,
+                                         len);
+                       ioq->fifo_cdx += len;
+                       ioq->fifo_end += len;
+                       room -= len;
+                       if (room == 0)
+                               break;
+                       skip = 0;
+               } else {
+                       /* element was fully consumed by an earlier call */
+                       skip -= len;
+               }
+               ++idx;
+       }
+
+       iov[0].iov_base = ioq->buf + ioq->fifo_beg;
+       iov[0].iov_len = ioq->fifo_cdx - ioq->fifo_beg;
+
+       return (1);
+}
+
+/*
+ * Account for (nact) encrypted bytes having been written out of the
+ * ioq's fifo.  fifo_beg is advanced and, once it catches up with
+ * fifo_end, all three fifo indices are rewound to zero.  Nothing is
+ * done on an unencrypted link or when nact is zero.
+ */
+void
+hammer2_crypto_encrypt_wrote(hammer2_iocom_t *iocom, hammer2_ioq_t *ioq,
+                            int nact)
+{
+       if ((iocom->flags & HAMMER2_IOCOMF_CRYPTED) == 0 || nact == 0)
+               return;
+
+       ioq->fifo_beg += nact;
+       if (ioq->fifo_beg != ioq->fifo_end)
+               return;
+
+       /* fifo fully drained, start over at the front of the buffer */
+       ioq->fifo_end = 0;
+       ioq->fifo_cdx = 0;
+       ioq->fifo_beg = 0;
+}
diff --git a/sbin/hammer2/hammer2.h b/sbin/hammer2/hammer2.h
new file mode 100644 (file)
index 0000000..e7642af
--- /dev/null
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Rollup headers for hammer2 utility
+ */
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <sys/mount.h>
+#include <sys/file.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <sys/wait.h>
+#include <sys/tty.h>
+#include <sys/endian.h>
+
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <arpa/inet.h>
+#include <netdb.h>
+
+#include <vfs/hammer2/hammer2_disk.h>
+#include <vfs/hammer2/hammer2_mount.h>
+#include <vfs/hammer2/hammer2_ioctl.h>
+#include <vfs/hammer2/hammer2_network.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <stddef.h>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <signal.h>
+#include <string.h>
+#include <unistd.h>
+#include <ctype.h>
+#include <uuid.h>
+#include <assert.h>
+#include <pthread.h>
+#include <poll.h>
+
+#include <libutil.h>
+
+#include "network.h"
+
+/*
+ * Configuration directory, and the subdirectory that crypto.c searches
+ * for per-peer <name>.pub / <name>.none files.
+ */
+#define HAMMER2_DEFAULT_DIR    "/etc/hammer2"
+#define HAMMER2_PATH_REMOTE    HAMMER2_DEFAULT_DIR "/remote"
+
+/*
+ * Global option/state variables shared by the utility's source files.
+ */
+extern int DebugOpt;
+extern int VerboseOpt;
+extern int NormalExit;
+
+/*
+ * General support routines.
+ */
+int hammer2_ioctl_handle(const char *sel_path);
+void hammer2_demon(void *(*func)(void *), void *arg);
+void hammer2_bswap_head(hammer2_msg_hdr_t *head);
+
+/*
+ * cmd_remote_* entry points.
+ */
+int cmd_remote_connect(const char *sel_path, const char *url);
+int cmd_remote_disconnect(const char *sel_path, const char *url);
+int cmd_remote_status(const char *sel_path, int all_opt);
+
+/*
+ * cmd_pfs_* entry points.
+ */
+int cmd_pfs_list(const char *sel_path);
+int cmd_pfs_create(const char *sel_path, const char *name,
+                       uint8_t pfs_type, const char *uuid_str);
+int cmd_pfs_delete(const char *sel_path, const char *name);
+
+/*
+ * Remaining cmd_* entry points.
+ */
+int cmd_service(void);
+int cmd_leaf(const char *sel_path);
+int cmd_shell(const char *hostname);
+int cmd_show(const char *devpath);
+int cmd_rsainit(const char *dir_path);
+int cmd_rsaenc(const char **keys, int nkeys);
+int cmd_rsadec(const char **keys, int nkeys);
+
+/*
+ * iocom / ioq setup and teardown, message allocation and replies.
+ */
+void hammer2_ioq_init(hammer2_iocom_t *iocom, hammer2_ioq_t *ioq);
+void hammer2_ioq_done(hammer2_iocom_t *iocom, hammer2_ioq_t *ioq);
+void hammer2_iocom_init(hammer2_iocom_t *iocom, int sock_fd, int alt_fd);
+void hammer2_iocom_done(hammer2_iocom_t *iocom);
+hammer2_msg_t *hammer2_allocmsg(hammer2_iocom_t *iocom,
+                       uint32_t cmd, int aux_size);
+hammer2_msg_t *hammer2_allocreply(hammer2_msg_t *msg,
+                       uint32_t cmd, int aux_size);
+void hammer2_replymsg(hammer2_msg_t *msg, uint16_t error);
+void hammer2_freemsg(hammer2_msg_t *msg);
+
+/*
+ * iocom core loop (takes receive, send and alt callbacks) and the
+ * message read/write primitives.
+ */
+void hammer2_iocom_core(hammer2_iocom_t *iocom,
+                       void (*iocom_recvmsg)(hammer2_iocom_t *),
+                       void (*iocom_sendmsg)(hammer2_iocom_t *),
+                       void (*iocom_altmsg)(hammer2_iocom_t *));
+hammer2_msg_t *hammer2_ioq_read(hammer2_iocom_t *iocom);
+void hammer2_ioq_write(hammer2_msg_t *msg);
+
+void hammer2_ioq_stream(hammer2_msg_t *msg, int reply);
+void hammer2_iocom_drain(hammer2_iocom_t *iocom);
+void hammer2_iocom_flush(hammer2_iocom_t *iocom);
+
+/*
+ * Session crypto, implemented in crypto.c.
+ */
+void hammer2_crypto_negotiate(hammer2_iocom_t *iocom);
+void hammer2_crypto_decrypt(hammer2_iocom_t *iocom, hammer2_ioq_t *ioq);
+void hammer2_crypto_decrypt_aux(hammer2_iocom_t *iocom, hammer2_ioq_t *ioq,
+                       hammer2_msg_t *msg, int already);
+int hammer2_crypto_encrypt(hammer2_iocom_t *iocom, hammer2_ioq_t *ioq,
+                       struct iovec *iov, int n);
+void hammer2_crypto_encrypt_wrote(hammer2_iocom_t *iocom, hammer2_ioq_t *ioq,
+                       int nact);
+
+/*
+ * Value-to-string conversion helpers.
+ */
+const char *hammer2_time64_to_str(uint64_t htime64, char **strp);
+const char *hammer2_uuid_to_str(uuid_t *uuid, char **strp);
+const char *hammer2_iptype_to_str(uint8_t type);
+const char *hammer2_pfstype_to_str(uint8_t type);
+
+/*
+ * hammer2_shell_remote() is the handler for HAMMER2_DBG_SHELL messages.
+ * NOTE(review): msg_printf() looks like printf-style output directed at
+ * a message -- confirm against its definition.
+ */
+void hammer2_shell_remote(hammer2_msg_t *msg);
+void msg_printf(hammer2_msg_t *msg, const char *ctl, ...);
diff --git a/sbin/hammer2/icrc.c b/sbin/hammer2/icrc.c
new file mode 100644 (file)
index 0000000..82cadcd
--- /dev/null
@@ -0,0 +1,147 @@
+/*-
+ * Copyright (c) 2005-2010 Daniel Braniss <danny@cs.huji.ac.il>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+/*
+ | iSCSI
+ | $Id: isc_subr.c 560 2009-05-07 07:37:49Z danny $
+ */
+
+#include <sys/types.h>
+#include <sys/uuid.h>
+
+#include <vfs/hammer2/hammer2_disk.h>
+
+/*****************************************************************/
+/*                                                               */
+/* CRC LOOKUP TABLE                                              */
+/* ================                                              */
+/* The following CRC lookup table was generated automagically    */
+/* by the Rocksoft^tm Model CRC Algorithm Table Generation       */
+/* Program V1.0 using the following model parameters:            */
+/*                                                               */
+/*    Width   : 4 bytes.                                         */
+/*    Poly    : 0x1EDC6F41L                                      */
+/*    Reverse : TRUE.                                            */
+/*                                                               */
+/* For more information on the Rocksoft^tm Model CRC Algorithm,  */
+/* see the document titled "A Painless Guide to CRC Error        */
+/* Detection Algorithms" by Ross Williams                        */
+/* (ross@guest.adelaide.edu.au.). This document is likely to be  */
+/* in the FTP archive "ftp.adelaide.edu.au/pub/rocksoft".        */
+/*                                                               */
+/*****************************************************************/
+
+/*
+ * Byte-indexed lookup table for the reflected 0x1EDC6F41 polynomial
+ * described above.  The table is never written, so it is declared const
+ * to let the compiler place it in read-only storage and reject stores.
+ */
+static const uint32_t crc32Table[256] = {
+    0x00000000L, 0xF26B8303L, 0xE13B70F7L, 0x1350F3F4L,
+    0xC79A971FL, 0x35F1141CL, 0x26A1E7E8L, 0xD4CA64EBL,
+    0x8AD958CFL, 0x78B2DBCCL, 0x6BE22838L, 0x9989AB3BL,
+    0x4D43CFD0L, 0xBF284CD3L, 0xAC78BF27L, 0x5E133C24L,
+    0x105EC76FL, 0xE235446CL, 0xF165B798L, 0x030E349BL,
+    0xD7C45070L, 0x25AFD373L, 0x36FF2087L, 0xC494A384L,
+    0x9A879FA0L, 0x68EC1CA3L, 0x7BBCEF57L, 0x89D76C54L,
+    0x5D1D08BFL, 0xAF768BBCL, 0xBC267848L, 0x4E4DFB4BL,
+    0x20BD8EDEL, 0xD2D60DDDL, 0xC186FE29L, 0x33ED7D2AL,
+    0xE72719C1L, 0x154C9AC2L, 0x061C6936L, 0xF477EA35L,
+    0xAA64D611L, 0x580F5512L, 0x4B5FA6E6L, 0xB93425E5L,
+    0x6DFE410EL, 0x9F95C20DL, 0x8CC531F9L, 0x7EAEB2FAL,
+    0x30E349B1L, 0xC288CAB2L, 0xD1D83946L, 0x23B3BA45L,
+    0xF779DEAEL, 0x05125DADL, 0x1642AE59L, 0xE4292D5AL,
+    0xBA3A117EL, 0x4851927DL, 0x5B016189L, 0xA96AE28AL,
+    0x7DA08661L, 0x8FCB0562L, 0x9C9BF696L, 0x6EF07595L,
+    0x417B1DBCL, 0xB3109EBFL, 0xA0406D4BL, 0x522BEE48L,
+    0x86E18AA3L, 0x748A09A0L, 0x67DAFA54L, 0x95B17957L,
+    0xCBA24573L, 0x39C9C670L, 0x2A993584L, 0xD8F2B687L,
+    0x0C38D26CL, 0xFE53516FL, 0xED03A29BL, 0x1F682198L,
+    0x5125DAD3L, 0xA34E59D0L, 0xB01EAA24L, 0x42752927L,
+    0x96BF4DCCL, 0x64D4CECFL, 0x77843D3BL, 0x85EFBE38L,
+    0xDBFC821CL, 0x2997011FL, 0x3AC7F2EBL, 0xC8AC71E8L,
+    0x1C661503L, 0xEE0D9600L, 0xFD5D65F4L, 0x0F36E6F7L,
+    0x61C69362L, 0x93AD1061L, 0x80FDE395L, 0x72966096L,
+    0xA65C047DL, 0x5437877EL, 0x4767748AL, 0xB50CF789L,
+    0xEB1FCBADL, 0x197448AEL, 0x0A24BB5AL, 0xF84F3859L,
+    0x2C855CB2L, 0xDEEEDFB1L, 0xCDBE2C45L, 0x3FD5AF46L,
+    0x7198540DL, 0x83F3D70EL, 0x90A324FAL, 0x62C8A7F9L,
+    0xB602C312L, 0x44694011L, 0x5739B3E5L, 0xA55230E6L,
+    0xFB410CC2L, 0x092A8FC1L, 0x1A7A7C35L, 0xE811FF36L,
+    0x3CDB9BDDL, 0xCEB018DEL, 0xDDE0EB2AL, 0x2F8B6829L,
+    0x82F63B78L, 0x709DB87BL, 0x63CD4B8FL, 0x91A6C88CL,
+    0x456CAC67L, 0xB7072F64L, 0xA457DC90L, 0x563C5F93L,
+    0x082F63B7L, 0xFA44E0B4L, 0xE9141340L, 0x1B7F9043L,
+    0xCFB5F4A8L, 0x3DDE77ABL, 0x2E8E845FL, 0xDCE5075CL,
+    0x92A8FC17L, 0x60C37F14L, 0x73938CE0L, 0x81F80FE3L,
+    0x55326B08L, 0xA759E80BL, 0xB4091BFFL, 0x466298FCL,
+    0x1871A4D8L, 0xEA1A27DBL, 0xF94AD42FL, 0x0B21572CL,
+    0xDFEB33C7L, 0x2D80B0C4L, 0x3ED04330L, 0xCCBBC033L,
+    0xA24BB5A6L, 0x502036A5L, 0x4370C551L, 0xB11B4652L,
+    0x65D122B9L, 0x97BAA1BAL, 0x84EA524EL, 0x7681D14DL,
+    0x2892ED69L, 0xDAF96E6AL, 0xC9A99D9EL, 0x3BC21E9DL,
+    0xEF087A76L, 0x1D63F975L, 0x0E330A81L, 0xFC588982L,
+    0xB21572C9L, 0x407EF1CAL, 0x532E023EL, 0xA145813DL,
+    0x758FE5D6L, 0x87E466D5L, 0x94B49521L, 0x66DF1622L,
+    0x38CC2A06L, 0xCAA7A905L, 0xD9F75AF1L, 0x2B9CD9F2L,
+    0xFF56BD19L, 0x0D3D3E1AL, 0x1E6DCDEEL, 0xEC064EEDL,
+    0xC38D26C4L, 0x31E6A5C7L, 0x22B65633L, 0xD0DDD530L,
+    0x0417B1DBL, 0xF67C32D8L, 0xE52CC12CL, 0x1747422FL,
+    0x49547E0BL, 0xBB3FFD08L, 0xA86F0EFCL, 0x5A048DFFL,
+    0x8ECEE914L, 0x7CA56A17L, 0x6FF599E3L, 0x9D9E1AE0L,
+    0xD3D3E1ABL, 0x21B862A8L, 0x32E8915CL, 0xC083125FL,
+    0x144976B4L, 0xE622F5B7L, 0xF5720643L, 0x07198540L,
+    0x590AB964L, 0xAB613A67L, 0xB831C993L, 0x4A5A4A90L,
+    0x9E902E7BL, 0x6CFBAD78L, 0x7FAB5E8CL, 0x8DC0DD8FL,
+    0xE330A81AL, 0x115B2B19L, 0x020BD8EDL, 0xF0605BEEL,
+    0x24AA3F05L, 0xD6C1BC06L, 0xC5914FF2L, 0x37FACCF1L,
+    0x69E9F0D5L, 0x9B8273D6L, 0x88D28022L, 0x7AB90321L,
+    0xAE7367CAL, 0x5C18E4C9L, 0x4F48173DL, 0xBD23943EL,
+    0xF36E6F75L, 0x0105EC76L, 0x12551F82L, 0xE03E9C81L,
+    0x34F4F86AL, 0xC69F7B69L, 0xD5CF889DL, 0x27A40B9EL,
+    0x79B737BAL, 0x8BDCB4B9L, 0x988C474DL, 0x6AE7C44EL,
+    0xBE2DA0A5L, 0x4C4623A6L, 0x5F16D052L, 0xAD7D5351L
+};
+
+/*
+ * Compute the table-driven CRC of buf[0..size) in one shot, starting
+ * from a zero seed.  The running value is inverted before the first byte
+ * is folded in and again before it is returned.
+ */
+uint32_t
+hammer2_icrc32(const void *buf, size_t size)
+{
+     const uint8_t *bytes = buf;
+     uint32_t acc = 0xffffffff;        /* zero seed, pre-inverted */
+     size_t i;
+
+     for (i = 0; i < size; ++i)
+         acc = (acc >> 8) ^ crc32Table[(acc ^ bytes[i]) & 0xff];
+     return acc ^ 0xffffffff;
+}
+
+/*
+ * Continue a table-driven CRC over buf[0..size), seeded with the value
+ * passed in crc.  The seed is inverted on entry and the result inverted
+ * on exit, so feeding a previous return value back in as crc extends
+ * that computation across another buffer.
+ */
+uint32_t
+hammer2_icrc32c(const void *buf, size_t size, uint32_t crc)
+{
+     const uint8_t *bytes = buf;
+     uint32_t acc = crc ^ 0xffffffff;
+     size_t i;
+
+     for (i = 0; i < size; ++i)
+         acc = (acc >> 8) ^ crc32Table[(acc ^ bytes[i]) & 0xff];
+     return acc ^ 0xffffffff;
+}
diff --git a/sbin/hammer2/main.c b/sbin/hammer2/main.c
new file mode 100644 (file)
index 0000000..a43f5cd
--- /dev/null
@@ -0,0 +1,318 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "hammer2.h"
+
+/* Prints the command summary to stderr and exits with the given code. */
+static void usage(int code);
+
+/* Set to 1 by the -d option. */
+int DebugOpt;
+/* Incremented once for every -v option given. */
+int VerboseOpt;
+int NormalExit = 1;    /* if set to 0 main() has to pthread_exit() */
+
+/*
+ * Entry point for the hammer2 utility.
+ *
+ * Parses the core options, then dispatches on the first non-option
+ * argument (the command).  Each command handler's return value becomes
+ * the process exit code, except that when NormalExit has been cleared
+ * the main thread calls pthread_exit() instead of returning.
+ */
+int
+main(int ac, char **av)
+{
+       const char *sel_path = NULL;
+       const char *uuid_str = NULL;
+       const char *arg;
+       int pfs_type = HAMMER2_PFSTYPE_NONE;
+       int quick_opt = 0;      /* NOTE(review): recorded but not yet used */
+       int all_opt = 0;
+       int ecode = 0;
+       int ch;
+
+       srandomdev();
+
+       /*
+        * Core options
+        */
+       while ((ch = getopt(ac, av, "adqs:t:u:v")) != -1) {
+               switch(ch) {
+               case 'a':
+                       all_opt = 1;
+                       break;
+               case 'd':
+                       DebugOpt = 1;
+                       break;
+               case 'q':
+                       /*
+                        * Quick mode - do not block verifying certain
+                        * operations such as (connect).
+                        */
+                       quick_opt = 1;
+                       break;
+               case 's':
+                       sel_path = optarg;
+                       break;
+               case 't':
+                       /*
+                        * set node type for mkpfs
+                        */
+                       if (strcasecmp(optarg, "ADMIN") == 0) {
+                               pfs_type = HAMMER2_PFSTYPE_ADMIN;
+                       } else if (strcasecmp(optarg, "CACHE") == 0) {
+                               pfs_type = HAMMER2_PFSTYPE_CACHE;
+                       } else if (strcasecmp(optarg, "COPY") == 0) {
+                               pfs_type = HAMMER2_PFSTYPE_COPY;
+                       } else if (strcasecmp(optarg, "SLAVE") == 0) {
+                               pfs_type = HAMMER2_PFSTYPE_SLAVE;
+                       } else if (strcasecmp(optarg, "SOFT_SLAVE") == 0) {
+                               pfs_type = HAMMER2_PFSTYPE_SOFT_SLAVE;
+                       } else if (strcasecmp(optarg, "SOFT_MASTER") == 0) {
+                               pfs_type = HAMMER2_PFSTYPE_SOFT_MASTER;
+                       } else if (strcasecmp(optarg, "MASTER") == 0) {
+                               pfs_type = HAMMER2_PFSTYPE_MASTER;
+                       } else {
+                               fprintf(stderr, "-t: Unrecognized node type\n");
+                               usage(1);
+                       }
+                       break;
+               case 'u':
+                       /*
+                        * set uuid for mkpfs, else one will be generated
+                        * (required for all except the MASTER node_type)
+                        */
+                       uuid_str = optarg;
+                       break;
+               case 'v':
+                       ++VerboseOpt;
+                       break;
+               default:
+                       /*
+                        * getopt() returns '?' both for an unknown option
+                        * and for a missing option argument, so ch itself
+                        * never identifies the culprit.  The offending
+                        * option character is reported in optopt.
+                        */
+                       fprintf(stderr, "Invalid or incomplete option: -%c\n",
+                               optopt);
+                       usage(1);
+                       /* not reached */
+                       break;
+               }
+       }
+
+       /*
+        * Adjust, then process the command
+        */
+       ac -= optind;
+       av += optind;
+       if (ac < 1) {
+               fprintf(stderr, "Missing command\n");
+               usage(1);
+               /* not reached */
+       }
+
+       if (strcmp(av[0], "connect") == 0) {
+               /*
+                * Add cluster connection
+                */
+               if (ac < 2) {
+                       fprintf(stderr, "connect: missing argument\n");
+                       usage(1);
+               }
+               ecode = cmd_remote_connect(sel_path, av[1]);
+       } else if (strcmp(av[0], "disconnect") == 0) {
+               /*
+                * Remove cluster connection
+                */
+               if (ac < 2) {
+                       fprintf(stderr, "disconnect: missing argument\n");
+                       usage(1);
+               }
+               ecode = cmd_remote_disconnect(sel_path, av[1]);
+       } else if (strcmp(av[0], "status") == 0) {
+               /*
+                * Get status of PFS and its connections (-a for all PFSs)
+                */
+               ecode = cmd_remote_status(sel_path, all_opt);
+       } else if (strcmp(av[0], "pfs-list") == 0) {
+               /*
+                * List all PFSs
+                */
+               ecode = cmd_pfs_list(sel_path);
+       } else if (strcmp(av[0], "pfs-create") == 0) {
+               /*
+                * Create new PFS using pfs_type
+                */
+               if (ac < 2) {
+                       fprintf(stderr, "pfs-create: requires name\n");
+                       usage(1);
+               }
+               ecode = cmd_pfs_create(sel_path, av[1], pfs_type, uuid_str);
+       } else if (strcmp(av[0], "pfs-delete") == 0) {
+               /*
+                * Delete a PFS by name
+                */
+               if (ac < 2) {
+                       fprintf(stderr, "pfs-delete: requires name\n");
+                       usage(1);
+               }
+               ecode = cmd_pfs_delete(sel_path, av[1]);
+       } else if (strcmp(av[0], "snapshot") == 0) {
+               /*
+                * Create snapshot with optional pfs-type and optional
+                * label override.
+                *
+                * NOTE(review): no handler is invoked here yet, so this
+                * command is currently accepted and does nothing.
+                */
+       } else if (strcmp(av[0], "service") == 0) {
+               /*
+                * Start the service daemon.  This daemon accepts
+                * connections from local and remote clients, handles
+                * the security handshake, and manages the core messaging
+                * protocol.
+                */
+               ecode = cmd_service();
+       } else if (strcmp(av[0], "leaf") == 0) {
+               /*
+                * Start the management daemon for a specific PFS.
+                *
+                * This will typically connect to the local master node
+                * daemon, register the PFS, and then pass its side of
+                * the socket descriptor to the kernel HAMMER2 VFS via an
+                * ioctl().  The process and/or thread context remains in the
+                * kernel until the PFS is unmounted or the connection is
+                * lost, then returns from the ioctl.
+                *
+                * It is possible to connect directly to a remote master node
+                * instead of the local master node in situations where
+                * encryption is not desired or no local master node is
+                * desired.  This is not recommended because it represents
+                * a single point of failure for the PFS's communications.
+                *
+                * Direct kernel<->kernel communication between HAMMER2 VFSs
+                * is theoretically possible for directly-connected
+                * registrations (i.e. where the spanning tree is degenerate),
+                * but not recommended.  We specifically try to reduce the
+                * complexity of the HAMMER2 VFS kernel code.
+                */
+               ecode = cmd_leaf(sel_path);
+       } else if (strcmp(av[0], "shell") == 0) {
+               /*
+                * Connect to the command line monitor in the hammer2 master
+                * node for the machine using HAMMER2_DBG_SHELL messages.
+                */
+               ecode = cmd_shell((ac < 2) ? NULL : av[1]);
+       } else if (strcmp(av[0], "rsainit") == 0) {
+               /*
+                * Initialize a RSA keypair.  If no target directory is
+                * specified we default to "/etc/hammer2".
+                */
+               arg = (ac < 2) ? HAMMER2_DEFAULT_DIR : av[1];
+               ecode = cmd_rsainit(arg);
+       } else if (strcmp(av[0], "rsaenc") == 0) {
+               /*
+                * Encrypt the input symmetrically by running it through
+                * the specified public and/or private key files.
+                *
+                * If no key files are specified data is encoded using
+                * "/etc/hammer2/rsa.pub".
+                *
+                * WARNING: no padding is added, data stream must contain
+                *          random padding for this to be secure.
+                *
+                * Used for debugging only
+                */
+               if (ac == 1) {
+                       const char *rsapath = HAMMER2_DEFAULT_DIR "/rsa.pub";
+                       ecode = cmd_rsaenc(&rsapath, 1);
+               } else {
+                       ecode = cmd_rsaenc((const char **)&av[1], ac - 1);
+               }
+       } else if (strcmp(av[0], "rsadec") == 0) {
+               /*
+                * Decrypt the input symmetrically by running it through
+                * the specified public and/or private key files.
+                *
+                * If no key files are specified data is decoded using
+                * "/etc/hammer2/rsa.prv".
+                *
+                * WARNING: no padding is added, data stream must contain
+                *          random padding for this to be secure.
+                *
+                * Used for debugging only
+                */
+               if (ac == 1) {
+                       const char *rsapath = HAMMER2_DEFAULT_DIR "/rsa.prv";
+                       ecode = cmd_rsadec(&rsapath, 1);
+               } else {
+                       ecode = cmd_rsadec((const char **)&av[1], ac - 1);
+               }
+       } else if (strcmp(av[0], "show") == 0) {
+               /*
+                * Raw dump of filesystem.  Use -v to check all crc's, and
+                * -vv to dump bulk file data.
+                *
+                * NOTE(review): the result of cmd_show() is not recorded
+                * in ecode; confirm whether it reports a status.
+                */
+               if (ac != 2) {
+                       fprintf(stderr, "show: requires device path\n");
+                       usage(1);
+               } else {
+                       cmd_show(av[1]);
+               }
+       } else {
+               fprintf(stderr, "Unrecognized command: %s\n", av[0]);
+               usage(1);
+       }
+
+       /*
+        * In DebugMode we may wind up starting several pthreads in the
+        * original process, in which case we have to let them run and
+        * not actually exit.
+        */
+       if (NormalExit) {
+               return (ecode);
+       } else {
+               pthread_exit(NULL);
+               _exit(2);       /* NOT REACHED */
+       }
+}
+
+/*
+ * Print the option and command summary to stderr, then terminate the
+ * process with the supplied exit code.  Never returns.
+ *
+ * The text lists every option accepted by main()'s getopt() string and
+ * every command main() dispatches on.
+ */
+static
+void
+usage(int code)
+{
+       fprintf(stderr,
+               "hammer2 [-adqv] [-s path] [-t type] [-u uuid] command...\n"
+               "    -a                 All PFSs (for status)\n"
+               "    -d                 Debug mode\n"
+               "    -q                 Quick mode (do not block)\n"
+               "    -s path            Select filesystem\n"
+               "    -t type            PFS type for pfs-create\n"
+               "    -u uuid            uuid for pfs-create\n"
+               "    -v                 Verbose (may be repeated)\n"
+               "\n"
+               "    connect <target>   Add cluster link\n"
+               "    disconnect <target> Del cluster link\n"
+               "    status             Report cluster status\n"
+               "    pfs-list           List PFSs\n"
+               "    pfs-create <label> Create a PFS\n"
+               "    pfs-delete <label> Destroy a PFS\n"
+               "    snapshot           Snapshot a PFS\n"
+               "    service            Start service daemon\n"
+               "    leaf               Start pfs leaf daemon\n"
+               "    shell [<host>]     Connect to debug shell\n"
+               "    rsainit [<dir>]    Initialize rsa fields\n"
+               "    rsaenc [<key>...]  Encrypt input with rsa keys (debug)\n"
+               "    rsadec [<key>...]  Decrypt input with rsa keys (debug)\n"
+               "    show <devpath>     Raw hammer2 media dump\n"
+       );
+       exit(code);
+}
diff --git a/sbin/hammer2/msg.c b/sbin/hammer2/msg.c
new file mode 100644 (file)
index 0000000..4fc4439
--- /dev/null
@@ -0,0 +1,937 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "hammer2.h"
+
+/*
+ * Prepare a low-level ioq for use: the structure is cleared, its
+ * message queue is set up empty, and its state is set to
+ * HAMMER2_MSGQ_STATE_HEADER1.  The iocom argument is unused.
+ */
+void
+hammer2_ioq_init(hammer2_iocom_t *iocom __unused, hammer2_ioq_t *ioq)
+{
+       bzero(ioq, sizeof(*ioq));
+       TAILQ_INIT(&ioq->msgq);
+       ioq->state = HAMMER2_MSGQ_STATE_HEADER1;
+}
+
+/*
+ * Tear down an ioq.  Every message still queued on ioq->msgq, plus the
+ * message held in ioq->msg (if any), is handed to hammer2_freemsg().
+ * The iocom argument is unused.
+ */
+void
+hammer2_ioq_done(hammer2_iocom_t *iocom __unused, hammer2_ioq_t *ioq)
+{
+       hammer2_msg_t *pending;
+
+       /* Drain the queue from the front until it is empty. */
+       for (;;) {
+               pending = TAILQ_FIRST(&ioq->msgq);
+               if (pending == NULL)
+                       break;
+               TAILQ_REMOVE(&ioq->msgq, pending, entry);
+               hammer2_freemsg(pending);
+       }
+
+       /* Detach the ioq's current message before releasing it. */
+       pending = ioq->msg;
+       if (pending != NULL) {
+               ioq->msg = NULL;
+               hammer2_freemsg(pending);
+       }
+}
+
+/*
+ * Initialize a low-level communications channel.
+ *
+ * Clears the iocom, sets up its two free lists and its rx/tx ioqs,
+ * records the descriptors, and starts with the RREQ and WIDLE flags set.
+ * Session crypto is then negotiated synchronously, after which sock_fd
+ * (if valid) is switched to non-blocking mode for the iocom core.
+ *
+ * sock_fd - socket descriptor for the link, or negative if none
+ * alt_fd  - alternate input descriptor, or negative if none
+ */
+void
+hammer2_iocom_init(hammer2_iocom_t *iocom, int sock_fd, int alt_fd)
+{
+       int fl;
+
+       bzero(iocom, sizeof(*iocom));
+
+       TAILQ_INIT(&iocom->freeq);
+       TAILQ_INIT(&iocom->freeq_aux);
+       iocom->sock_fd = sock_fd;
+       iocom->alt_fd = alt_fd;
+       iocom->flags = HAMMER2_IOCOMF_RREQ | HAMMER2_IOCOMF_WIDLE;
+       hammer2_ioq_init(iocom, &iocom->ioq_rx);
+       hammer2_ioq_init(iocom, &iocom->ioq_tx);
+
+       /*
+        * Negotiate session crypto synchronously.  This will mark the
+        * connection as error'd if it fails.
+        */
+       hammer2_crypto_negotiate(iocom);
+
+       /*
+        * Make sure our fds are set to non-blocking for the iocom core.
+        *
+        * F_SETFL replaces the whole set of file status flags, so fetch
+        * the current flags first and only add O_NONBLOCK.  If the flags
+        * cannot be read, fall back to setting O_NONBLOCK alone.
+        */
+       if (sock_fd >= 0) {
+               fl = fcntl(sock_fd, F_GETFL);
+               if (fl < 0)
+                       fl = 0;
+               fcntl(sock_fd, F_SETFL, fl | O_NONBLOCK);
+       }
+#if 0
+       /* if line buffered our single fgets() should be fine */
+       if (alt_fd >= 0)
+               fcntl(alt_fd, F_SETFL, O_NONBLOCK);
+#endif
+}
+
+/*
+ * Tear down a communications channel.
+ *
+ * Marks the socket descriptor invalid, tears down both ioqs (which
+ * moves their messages onto the iocom free lists via hammer2_freemsg()),
+ * and then releases every message cached on the free lists.
+ *
+ * Both free lists are drained completely; freeing only the head entry
+ * would leak every other cached message and its aux buffer.
+ */
+void
+hammer2_iocom_done(hammer2_iocom_t *iocom)
+{
+       hammer2_msg_t *msg;
+
+       iocom->sock_fd = -1;
+       hammer2_ioq_done(iocom, &iocom->ioq_rx);
+       hammer2_ioq_done(iocom, &iocom->ioq_tx);
+       while ((msg = TAILQ_FIRST(&iocom->freeq)) != NULL) {
+               TAILQ_REMOVE(&iocom->freeq, msg, entry);
+               free(msg);
+       }
+       while ((msg = TAILQ_FIRST(&iocom->freeq_aux)) != NULL) {
+               TAILQ_REMOVE(&iocom->freeq_aux, msg, entry);
+               free(msg->aux_data);
+               msg->aux_data = NULL;
+               free(msg);
+       }
+}
+
+/*
+ * Allocate a new one-way message.
+ *
+ * A cached message is reused from the iocom's free lists when one is
+ * available (freeq_aux when an aux buffer is requested, freeq otherwise);
+ * failing that a new one is malloc()'d.  aux_size is rounded up to the
+ * message alignment, and the aux buffer is reallocated whenever the
+ * cached size differs from the requested one.
+ *
+ * The header is zeroed for the number of bytes encoded in the size field
+ * of cmd, then aux_icrc is cleared and cmd is stored.
+ *
+ * Allocation failure is treated as fatal and trips an assertion rather
+ * than being dereferenced or leaving aux_size describing a NULL buffer.
+ */
+hammer2_msg_t *
+hammer2_allocmsg(hammer2_iocom_t *iocom, uint32_t cmd, int aux_size)
+{
+       hammer2_msg_t *msg;
+       int hbytes;
+
+       if (aux_size) {
+               aux_size = (aux_size + HAMMER2_MSG_ALIGNMASK) &
+                          ~HAMMER2_MSG_ALIGNMASK;
+               if ((msg = TAILQ_FIRST(&iocom->freeq_aux)) != NULL)
+                       TAILQ_REMOVE(&iocom->freeq_aux, msg, entry);
+       } else {
+               if ((msg = TAILQ_FIRST(&iocom->freeq)) != NULL)
+                       TAILQ_REMOVE(&iocom->freeq, msg, entry);
+       }
+       if (msg == NULL) {
+               msg = malloc(sizeof(*msg));
+               assert(msg != NULL);
+               msg->iocom = iocom;
+               msg->aux_data = NULL;
+               msg->aux_size = 0;
+       }
+       if (msg->aux_size != aux_size) {
+               if (msg->aux_data) {
+                       free(msg->aux_data);
+                       msg->aux_data = NULL;
+                       msg->aux_size = 0;
+               }
+               if (aux_size) {
+                       msg->aux_data = malloc(aux_size);
+                       assert(msg->aux_data != NULL);
+                       msg->aux_size = aux_size;
+               }
+       }
+       msg->flags = 0;
+       hbytes = (cmd & HAMMER2_MSGF_SIZE) * HAMMER2_MSG_ALIGN;
+       if (hbytes)
+               bzero(&msg->any.head, hbytes);
+       msg->any.head.aux_icrc = 0;
+       msg->any.head.cmd = cmd;
+
+       return (msg);
+}
+
+/*
+ * Build a reply (one-way or streaming) for a received message.  The
+ * original message's header is left untouched, so this may be called
+ * repeatedly to produce several replies to the same message.
+ *
+ * Passing cmd == 0 reuses msg->any.head.cmd as the base reply command.
+ * The reply header starts as a copy of the original with source and
+ * target exchanged, REPLY set, CREATE/DELETE cleared and aux_icrc zeroed.
+ * When the message has persistent state, a pending CREATE in its lrep
+ * field is moved onto the reply command.
+ */
+hammer2_msg_t *
+hammer2_allocreply(hammer2_msg_t *msg, uint32_t cmd, int aux_size)
+{
+       hammer2_persist_t *state;
+       hammer2_msg_t *reply;
+
+       /* Replying to something that is itself a reply is a caller bug. */
+       assert((msg->any.head.cmd & HAMMER2_MSGF_REPLY) == 0);
+
+       if (cmd == 0) {
+               cmd = msg->any.head.cmd;
+       }
+       reply = hammer2_allocmsg(msg->iocom, cmd, aux_size);
+
+       /* Clone the header, then fix up the fields that differ. */
+       reply->any.head = msg->any.head;
+       reply->any.head.aux_icrc = 0;
+       reply->any.head.target = msg->any.head.source;
+       reply->any.head.source = msg->any.head.target;
+       reply->any.head.cmd = (cmd | HAMMER2_MSGF_REPLY) &
+                             ~(HAMMER2_MSGF_CREATE | HAMMER2_MSGF_DELETE);
+
+       state = msg->persist;
+       if (state == NULL) {
+               return (reply);
+       }
+
+       /* Hand over a pending CREATE once; DELETE stays set in lrep. */
+       assert(state->lrep & HAMMER2_MSGF_DELETE);
+       reply->any.head.cmd |= state->lrep & HAMMER2_MSGF_CREATE;
+       state->lrep &= ~HAMMER2_MSGF_CREATE;
+       return (reply);
+}
+
+/*
+ * Return a message to its iocom for later reuse.  Nothing is released
+ * here; the message is appended to one of the iocom's two free lists,
+ * chosen by whether it still carries an aux buffer.
+ *
+ * NOTE: aux_size can be 0 with a non-NULL aux_data, which is why the
+ *       test is on the pointer rather than on the size.
+ */
+void
+hammer2_freemsg(hammer2_msg_t *msg)
+{
+       hammer2_iocom_t *owner = msg->iocom;
+
+       if (msg->aux_data == NULL) {
+               TAILQ_INSERT_TAIL(&owner->freeq, msg, entry);
+               return;
+       }
+       TAILQ_INSERT_TAIL(&owner->freeq_aux, msg, entry);
+}
+
+/*
+ * I/O core loop for an iocom.
+ *
+ * Installs the three callbacks on the iocom and then loops until the
+ * HAMMER2_IOCOMF_EOF flag is set.  Each pass polls the socket (and the
+ * alternate descriptor when alt_fd >= 0) and dispatches:
+ *
+ *   recvmsg_func - socket readable, or RREQ is clear
+ *   sendmsg_func - not WIDLE, and socket writable or WREQ is clear
+ *   altmsg_func  - alternate descriptor readable
+ *
+ * The poll timeout is 5 seconds, reduced to zero whenever a callback is
+ * going to be run regardless of the poll result.
+ */
+void
+hammer2_iocom_core(hammer2_iocom_t *iocom,
+                  void (*recvmsg_func)(hammer2_iocom_t *),
+                  void (*sendmsg_func)(hammer2_iocom_t *),
+                  void (*altmsg_func)(hammer2_iocom_t *))
+{
+       struct pollfd fds[2];
+       int timeout;
+
+       iocom->recvmsg_callback = recvmsg_func;
+       iocom->sendmsg_callback = sendmsg_func;
+       iocom->altmsg_callback = altmsg_func;
+
+       while ((iocom->flags & HAMMER2_IOCOMF_EOF) == 0) {
+               timeout = 5000;
+
+               fds[0].fd = iocom->sock_fd;
+               fds[0].events = 0;
+               fds[0].revents = 0;
+
+               if (iocom->flags & HAMMER2_IOCOMF_RREQ)
+                       fds[0].events |= POLLIN;
+               else
+                       timeout = 0;
+               if ((iocom->flags & HAMMER2_IOCOMF_WIDLE) == 0) {
+                       if (iocom->flags & HAMMER2_IOCOMF_WREQ)
+                               fds[0].events |= POLLOUT;
+                       else
+                               timeout = 0;
+               }
+
+               if (iocom->alt_fd >= 0) {
+                       /*
+                        * Assign rather than OR: fds[1] lives on the
+                        * stack and its events field has no prior value.
+                        */
+                       fds[1].fd = iocom->alt_fd;
+                       fds[1].events = POLLIN;
+                       fds[1].revents = 0;
+                       poll(fds, 2, timeout);
+               } else {
+                       poll(fds, 1, timeout);
+               }
+               if ((fds[0].revents & POLLIN) ||
+                   (iocom->flags & HAMMER2_IOCOMF_RREQ) == 0) {
+                       iocom->recvmsg_callback(iocom);
+               }
+               if ((iocom->flags & HAMMER2_IOCOMF_WIDLE) == 0) {
+                       if ((fds[0].revents & POLLOUT) ||
+                           (iocom->flags & HAMMER2_IOCOMF_WREQ) == 0) {
+                               iocom->sendmsg_callback(iocom);
+                       }
+               }
+               if (iocom->alt_fd >= 0 && (fds[1].revents & POLLIN))
+                       iocom->altmsg_callback(iocom);
+       }
+}
+
+/*
+ * Read the next ready message from the ioq, issuing I/O if needed.
+ * Caller should retry on a read-event when NULL is returned.
+ *
+ * If an error occurs during reception a HAMMER2_LNK_ERROR msg will
+ * be returned (and the caller must not call us again after that).
+ */
+hammer2_msg_t *
+hammer2_ioq_read(hammer2_iocom_t *iocom)
+{
+       hammer2_ioq_t *ioq = &iocom->ioq_rx;
+       hammer2_msg_t *msg;
+       hammer2_msg_hdr_t *head;
+       ssize_t n;
+       int bytes;
+       int flags;
+       int nmax;
+       uint16_t xcrc16;
+       uint32_t xcrc32;
+
+       /*
+        * If a message is already pending we can just remove and
+        * return it.
+        */
+       if ((msg = TAILQ_FIRST(&ioq->msgq)) != NULL) {
+               TAILQ_REMOVE(&ioq->msgq, msg, entry);
+               return(msg);
+       }
+
+       /*
+        * Message read in-progress (msg is NULL at the moment).  We don't
+        * allocate a msg until we have its core header.
+        */
+       bytes = ioq->fifo_end - ioq->fifo_beg;
+       nmax = sizeof(ioq->buf) - ioq->fifo_end;
+       msg = ioq->msg;
+
+       switch(ioq->state) {
+       case HAMMER2_MSGQ_STATE_HEADER1:
+               /*
+                * Load the primary header, fail on any non-trivial read
+                * error or on EOF.  Since the primary header is the same
+                * size is the message alignment it will never straddle
+                * the end of the buffer.
+                */
+               if (bytes < (int)sizeof(msg->any.head)) {
+                       n = read(iocom->sock_fd,
+                                ioq->buf + ioq->fifo_end,
+                                nmax);
+                       if (n <= 0) {
+                               if (n == 0) {
+                                       ioq->error = HAMMER2_IOQ_ERROR_EOF;
+                                       break;
+                               }
+                               if (errno != EINTR &&
+                                   errno != EINPROGRESS &&
+                                   errno != EAGAIN) {
+                                       ioq->error = HAMMER2_IOQ_ERROR_SOCK;
+                                       break;
+                               }
+                               n = 0;
+                               /* fall through */
+                       }
+                       ioq->fifo_end += n;
+                       bytes += n;
+                       nmax -= n;
+               }
+
+               /*
+                * Insufficient data accumulated (msg is NULL, caller will
+                * retry on event).
+                */
+               assert(msg == NULL);
+               if (bytes < (int)sizeof(msg->any.head))
+                       break;
+
+               /*
+                * Calculate the header, decrypt data received so far.
+                * Data will be decrypted in-place.  Partial blocks are
+                * not immediately decrypted.
+                */
+               hammer2_crypto_decrypt(iocom, ioq);
+               flags = 0;
+               head = (void *)(ioq->buf + ioq->fifo_beg);
+
+               /*
+                * Check and fixup the core header.  Note that the icrc
+                * has to be calculated before any fixups, but the crc
+                * fields in the msg may have to be swapped like everything
+                * else.
+                */
+               if (head->magic != HAMMER2_MSGHDR_MAGIC &&
+                   head->magic != HAMMER2_MSGHDR_MAGIC_REV) {
+                       ioq->error = HAMMER2_IOQ_ERROR_SYNC;
+                       break;
+               }
+
+               xcrc32 = hammer2_icrc32((char *)head + HAMMER2_MSGHDR_CRCOFF,
+                                       HAMMER2_MSGHDR_CRCBYTES);
+               if (head->magic == HAMMER2_MSGHDR_MAGIC_REV) {
+                       hammer2_bswap_head(head);
+                       flags |= HAMMER2_MSGX_BSWAPPED;
+               }
+               xcrc16 = (uint16_t)xcrc32 ^ (uint16_t)(xcrc32 >> 16);
+               if (xcrc16 != head->icrc1) {
+                       ioq->error = HAMMER2_IOQ_ERROR_HCRC;
+                       break;
+               }
+
+               /*
+                * Calculate the full header size and aux data size
+                */
+               ioq->hbytes = (head->cmd & HAMMER2_MSGF_SIZE) *
+                             HAMMER2_MSG_ALIGN;
+               ioq->abytes = head->aux_bytes * HAMMER2_MSG_ALIGN;
+               if (ioq->hbytes < (int)sizeof(msg->any.head) ||
+                   ioq->hbytes > (int)sizeof(msg->any) ||
+                   ioq->abytes > HAMMER2_MSGAUX_MAX) {
+                       ioq->error = HAMMER2_IOQ_ERROR_FIELD;
+                       break;
+               }
+
+               /*
+                * Finally allocate the message and copy the core header
+                * to the embedded extended header.
+                *
+                * Initialize msg->aux_size to 0 and use it to track
+                * the amount of data copied from the stream.
+                */
+               msg = hammer2_allocmsg(iocom, 0, ioq->abytes);
+               msg->aux_size = 0;
+               msg->flags = flags;
+               ioq->msg = msg;
+
+               /*
+                * We are either done or we fall-through
+                */
+               if (ioq->hbytes == sizeof(msg->any.head) && ioq->abytes == 0) {
+                       bcopy(head, &msg->any.head, sizeof(msg->any.head));
+                       ioq->fifo_beg += ioq->hbytes;
+                       break;
+               }
+
+               /*
+                * Fall through to the next state.  Make sure that the
+                * extended header does not straddle the end of the buffer.
+                * We still want to issue larger reads into our buffer,
+                * book-keeping is easier if we don't bcopy() yet.
+                */
+               if (bytes + nmax < ioq->hbytes) {
+                       bcopy(ioq->buf + ioq->fifo_beg, ioq->buf, bytes);
+                       ioq->fifo_cdx -= ioq->fifo_beg;
+                       ioq->fifo_beg = 0;
+                       ioq->fifo_end = bytes;
+                       nmax = sizeof(ioq->buf) - ioq->fifo_end;
+               }
+               ioq->state = HAMMER2_MSGQ_STATE_HEADER2;
+               /* fall through */
+       case HAMMER2_MSGQ_STATE_HEADER2:
+               /*
+                * Fill out the extended header.
+                */
+               assert(msg != NULL);
+               if (bytes < ioq->hbytes) {
+                       n = read(iocom->sock_fd,
+                                msg->any.buf + ioq->fifo_end,
+                                nmax);
+                       if (n <= 0) {
+                               if (n == 0) {
+                                       ioq->error = HAMMER2_IOQ_ERROR_EOF;
+                                       break;
+                               }
+                               if (errno != EINTR &&
+                                   errno != EINPROGRESS &&
+                                   errno != EAGAIN) {
+                                       ioq->error = HAMMER2_IOQ_ERROR_SOCK;
+                                       break;
+                               }
+                               n = 0;
+                               /* fall through */
+                       }
+                       ioq->fifo_end += n;
+                       bytes += n;
+                       nmax -= n;
+               }
+
+               /*
+                * Insufficient data accumulated (set msg NULL so caller will
+                * retry on event).
+                */
+               if (bytes < ioq->hbytes) {
+                       msg = NULL;
+                       break;
+               }
+
+               /*
+                * Calculate the extended header, decrypt data received
+                * so far.
+                */
+               hammer2_crypto_decrypt(iocom, ioq);
+               head = (void *)(ioq->buf + ioq->fifo_beg);
+
+               /*
+                * Check the crc on the extended header
+                */
+               if (ioq->hbytes > (int)sizeof(hammer2_msg_hdr_t)) {
+                       xcrc32 = hammer2_icrc32(head + 1,
+                                               ioq->hbytes - sizeof(*head));
+                       xcrc16 = (uint16_t)xcrc32 ^ (uint16_t)(xcrc32 >> 16);
+                       if (head->icrc2 != xcrc16) {
+                               ioq->error = HAMMER2_IOQ_ERROR_XCRC;
+                               break;
+                       }
+               }
+
+               /*
+                * Copy the extended header into the msg and adjust the
+                * FIFO.
+                */
+               bcopy(head, &msg->any, ioq->hbytes);
+
+               /*
+                * We are either done or we fall-through.
+                */
+               if (ioq->abytes == 0) {
+                       ioq->fifo_beg += ioq->hbytes;
+                       break;
+               }
+
+               /*
+                * Must adjust nmax and bytes (and the state) when falling
+                * through.
+                */
+               ioq->fifo_beg += ioq->hbytes;
+               nmax -= ioq->hbytes;
+               bytes -= ioq->hbytes;
+               ioq->state = HAMMER2_MSGQ_STATE_AUXDATA1;
+               /* fall through */
+       case HAMMER2_MSGQ_STATE_AUXDATA1:
+               /*
+                * Copy the partial or complete payload from remaining
+                * bytes in the FIFO.  We have to fall-through either
+                * way so we can check the crc.
+                */
+               assert(msg->aux_size == 0);
+               ioq->already = ioq->fifo_cdx - ioq->fifo_beg;
+               if (ioq->already > ioq->abytes)
+                       ioq->already = ioq->abytes;
+               if (bytes >= ioq->abytes) {
+                       bcopy(ioq->buf + ioq->fifo_beg, msg->aux_data,
+                             ioq->abytes);
+                       msg->aux_size = ioq->abytes;
+                       ioq->fifo_beg += ioq->abytes;
+                       if (ioq->fifo_cdx < ioq->fifo_beg)
+                               ioq->fifo_cdx = ioq->fifo_beg;
+                       bytes -= ioq->abytes;
+               } else if (bytes) {
+                       bcopy(ioq->buf + ioq->fifo_beg, msg->aux_data,
+                             bytes);
+                       msg->aux_size = bytes;
+                       ioq->fifo_beg += bytes;
+                       if (ioq->fifo_cdx < ioq->fifo_beg)
+                               ioq->fifo_cdx = ioq->fifo_beg;
+                       bytes = 0;
+               }
+               ioq->state = HAMMER2_MSGQ_STATE_AUXDATA2;
+               /* fall through */
+       case HAMMER2_MSGQ_STATE_AUXDATA2:
+               /*
+                * Read the remainder of the payload directly into the
+                * msg->aux_data buffer.
+                */
+               assert(msg);
+               if (msg->aux_size < ioq->abytes) {
+                       assert(bytes == 0);
+                       n = read(iocom->sock_fd,
+                                msg->aux_data + msg->aux_size,
+                                ioq->abytes - msg->aux_size);
+                       if (n <= 0) {
+                               if (n == 0) {
+                                       ioq->error = HAMMER2_IOQ_ERROR_EOF;
+                                       break;
+                               }
+                               if (errno != EINTR &&
+                                   errno != EINPROGRESS &&
+                                   errno != EAGAIN) {
+                                       ioq->error = HAMMER2_IOQ_ERROR_SOCK;
+                                       break;
+                               }
+                               n = 0;
+                               /* fall through */
+                       }
+                       msg->aux_size += n;
+               }
+
+               /*
+                * Insufficient data accumulated (set msg NULL so caller will
+                * retry on event).
+                */
+               if (msg->aux_size < ioq->abytes) {
+                       msg = NULL;
+                       break;
+               }
+               assert(msg->aux_size == ioq->abytes);
+               hammer2_crypto_decrypt_aux(iocom, ioq, msg, ioq->already);
+
+               /*
+                * Check aux_icrc, then we are done.
+                */
+               xcrc32 = hammer2_icrc32(msg->aux_data, msg->aux_size);
+               if (xcrc32 != msg->any.head.aux_icrc) {
+                       ioq->error = HAMMER2_IOQ_ERROR_ACRC;
+                       break;
+               }
+               break;
+       case HAMMER2_MSGQ_STATE_ERROR:
+       default:
+               /*
+                * We don't double-return errors, the caller should not
+                * have called us again after getting an error msg.
+                */
+               assert(0);
+               break;
+       }
+
+       /*
+        * Check the message sequence.  The iv[] should prevent any
+        * possibility of a replay but we add this check anyway.
+        */
+       if (msg && ioq->error == 0) {
+               if ((msg->any.head.salt & 255) != (ioq->seq & 255)) {
+                       ioq->error = HAMMER2_IOQ_ERROR_MSGSEQ;
+               } else {
+                       ++ioq->seq;
+               }
+       }
+
+       /*
+        * Handle error, RREQ, or completion
+        *
+        * NOTE: nmax and bytes are invalid at this point, we don't bother
+        *       to update them when breaking out.
+        */
+       if (ioq->error) {
+               /*
+                * An unrecoverable error occured during processing,
+                * return a special error message.  Try to leave the
+                * ioq state alone for post-mortem debugging.
+                *
+                * Link error messages are returned as one-way messages,
+                * so no flags get set.  Source and target is 0 (link-level),
+                * msgid is 0 (link-level).  All we really need to do is
+                * set up magic, cmd, and error.
+                */
+               assert(ioq->msg == msg);
+               if (msg == NULL)
+                       msg = hammer2_allocmsg(iocom, 0, 0);
+               else
+                       ioq->msg = NULL;
+
+               if (msg->aux_data) {
+                       free(msg->aux_data);
+                       msg->aux_data = NULL;
+                       msg->aux_size = 0;
+               }
+               bzero(&msg->any.head, sizeof(msg->any.head));
+               msg->any.head.magic = HAMMER2_MSGHDR_MAGIC;
+               msg->any.head.cmd = HAMMER2_LNK_ERROR;
+               msg->any.head.error = ioq->error;
+               ioq->state = HAMMER2_MSGQ_STATE_ERROR;
+               iocom->flags |= HAMMER2_IOCOMF_EOF;
+       } else if (msg == NULL) {
+               /*
+                * Insufficient data received to finish building the message,
+                * set RREQ and return NULL.
+                *
+                * Leave ioq->msg intact.
+                * Leave the FIFO intact.
+                */
+               iocom->flags |= HAMMER2_IOCOMF_RREQ;
+#if 0
+               ioq->fifo_cdx = 0;
+               ioq->fifo_beg = 0;
+               ioq->fifo_end = 0;
+#endif
+       } else {
+               /*
+                * Return msg, clear the FIFO if it is now empty.
+                * Flag RREQ if the caller needs to wait for a read-event
+                * or not.
+                *
+                * The fifo has already been advanced past the message.
+                * Trivially reset the FIFO indices if possible.
+                */
+               if (ioq->fifo_beg == ioq->fifo_end) {
+                       iocom->flags |= HAMMER2_IOCOMF_RREQ;
+                       ioq->fifo_cdx = 0;
+                       ioq->fifo_beg = 0;
+                       ioq->fifo_end = 0;
+               } else {
+                       iocom->flags &= ~HAMMER2_IOCOMF_RREQ;
+               }
+               ioq->state = HAMMER2_MSGQ_STATE_HEADER1;
+               ioq->msg = NULL;
+       }
+       return (msg);
+}
+
+/*
+ * Calculate the header and data crc's and queue a low-level message for
+ * transmission on msg->iocom.  If aux_icrc is non-zero the aux_data crc
+ * is assumed to have already been set.
+ *
+ * msg must be non-NULL (the iocom is taken from msg->iocom).  The message
+ * is added to the transmit queue but not necessarily flushed: a flush is
+ * only issued from here once enough messages have accumulated and no
+ * write event is pending.  Call hammer2_iocom_flush() to force a flush.
+ *
+ * If the transmit side has already failed (ioq->error is set) the message
+ * is discarded together with anything else still queued.
+ */
+void
+hammer2_ioq_write(hammer2_msg_t *msg)
+{
+       hammer2_iocom_t *iocom;
+       hammer2_ioq_t *ioq;
+       uint16_t xcrc16;
+       uint32_t xcrc32;
+       int hbytes;
+
+       /*
+        * Validate msg before it is dereferenced.
+        */
+       assert(msg);
+       iocom = msg->iocom;
+       ioq = &iocom->ioq_tx;
+
+       /*
+        * The link is already dead.  Queue the msg so the drain disposes
+        * of it along with everything else.
+        */
+       if (ioq->error) {
+               TAILQ_INSERT_TAIL(&ioq->msgq, msg, entry);
+               ++ioq->msgcount;
+               hammer2_iocom_drain(iocom);
+               return;
+       }
+
+       /*
+        * Finish populating the msg fields.  The salt ensures that the iv[]
+        * array is ridiculously randomized and we also re-seed our PRNG
+        * every 32768 messages just to be sure.
+        *
+        * The low 8 bits of the salt carry the transmit sequence number.
+        *
+        * NOTE(review): random() returns a long; the shift assumes the
+        * result fits or is truncated into salt -- confirm salt's type.
+        */
+       msg->any.head.magic = HAMMER2_MSGHDR_MAGIC;
+       msg->any.head.salt = (random() << 8) | (ioq->seq & 255);
+       ++ioq->seq;
+       if ((ioq->seq & 32767) == 0)
+               srandomdev();
+
+       /*
+        * Calculate aux_icrc if 0, calculate icrc2, and finally
+        * calculate icrc1.  The aux data size must be a multiple of the
+        * message alignment because aux_bytes is stored in alignment units.
+        */
+       assert((msg->aux_size & HAMMER2_MSG_ALIGNMASK) == 0);
+       if (msg->aux_size && msg->any.head.aux_icrc == 0) {
+               xcrc32 = hammer2_icrc32(msg->aux_data, msg->aux_size);
+               msg->any.head.aux_icrc = xcrc32;
+       }
+       msg->any.head.aux_bytes = msg->aux_size / HAMMER2_MSG_ALIGN;
+
+       /*
+        * icrc2 covers the extended header (everything after the core
+        * header) and is zero when there is no extended header.
+        */
+       if ((msg->any.head.cmd & HAMMER2_MSGF_SIZE) >
+           sizeof(msg->any.head) / HAMMER2_MSG_ALIGN) {
+               hbytes = (msg->any.head.cmd & HAMMER2_MSGF_SIZE) *
+                       HAMMER2_MSG_ALIGN;
+               hbytes -= sizeof(msg->any.head);
+               xcrc32 = hammer2_icrc32(&msg->any.head + 1, hbytes);
+               xcrc16 = (uint16_t)xcrc32 ^ (uint16_t)(xcrc32 >> 16);
+               msg->any.head.icrc2 = xcrc16;
+       } else {
+               msg->any.head.icrc2 = 0;
+       }
+
+       /*
+        * icrc1 must be calculated last, after the other header fields
+        * have been filled in.
+        */
+       xcrc32 = hammer2_icrc32(msg->any.buf + HAMMER2_MSGHDR_CRCOFF,
+                               HAMMER2_MSGHDR_CRCBYTES);
+       xcrc16 = (uint16_t)xcrc32 ^ (uint16_t)(xcrc32 >> 16);
+       msg->any.head.icrc1 = xcrc16;
+
+       /*
+        * XXX Encrypt the message
+        */
+
+       /*
+        * Enqueue the message.
+        */
+       TAILQ_INSERT_TAIL(&ioq->msgq, msg, entry);
+       ++ioq->msgcount;
+       iocom->flags &= ~HAMMER2_IOCOMF_WIDLE;
+
+       /*
+        * Flush if we know we can write (WREQ not set) and if
+        * sufficient messages have accumulated.  Otherwise hold
+        * off to avoid piecemeal system calls.
+        */
+       if (iocom->flags & HAMMER2_IOCOMF_WREQ)
+               return;
+       if (ioq->msgcount < HAMMER2_IOQ_MAXIOVEC / 2)
+               return;
+       hammer2_iocom_flush(iocom);
+}
+
+/*
+ * Push queued transmit messages out the socket.
+ *
+ * Up to HAMMER2_IOQ_MAXIOVEC header/aux-data segments are gathered from
+ * ioq_tx into an iovec, run through the crypto layer and handed to
+ * writev().  Messages that were written completely are moved to the
+ * iocom's free queues; a partially written head message is tracked via
+ * ioq->hbytes / ioq->abytes so the next flush resumes where this one
+ * stopped.
+ */
+void
+hammer2_iocom_flush(hammer2_iocom_t *iocom)
+{
+       hammer2_ioq_t *ioq = &iocom->ioq_tx;
+       struct iovec iov[HAMMER2_IOQ_MAXIOVEC];
+       hammer2_msg_queue_t *recycle;
+       hammer2_msg_t *msg;
+       ssize_t queued = 0;     /* total bytes described by iov[] */
+       ssize_t wrote;          /* bytes accepted by writev() */
+       int iovcnt = 0;
+       int hbytes;
+       int abytes;
+       int hskip;
+       int askip;
+       int first;
+
+       /*
+        * Gather segments.  Only the first message can have been
+        * partially transmitted already, so only it gets skip offsets.
+        */
+       TAILQ_FOREACH(msg, &ioq->msgq, entry) {
+               first = (iovcnt == 0);
+               hskip = first ? ioq->hbytes : 0;
+               askip = first ? ioq->abytes : 0;
+               hbytes = (msg->any.head.cmd & HAMMER2_MSGF_SIZE) *
+                        HAMMER2_MSG_ALIGN;
+               abytes = msg->aux_size;
+
+               if (hbytes - hskip > 0) {
+                       iov[iovcnt].iov_base = (char *)&msg->any.head + hskip;
+                       iov[iovcnt].iov_len = hbytes - hskip;
+                       queued += hbytes - hskip;
+                       if (++iovcnt == HAMMER2_IOQ_MAXIOVEC)
+                               break;
+               }
+               if (abytes - askip > 0) {
+                       assert(msg->aux_data != NULL);
+                       iov[iovcnt].iov_base = msg->aux_data + askip;
+                       iov[iovcnt].iov_len = abytes - askip;
+                       queued += abytes - askip;
+                       if (++iovcnt == HAMMER2_IOQ_MAXIOVEC)
+                               break;
+               }
+       }
+
+       /* Nothing queued, nothing to do. */
+       if (iovcnt == 0)
+               return;
+
+       /*
+        * Let the crypto layer have the iovec first.  It may stage the
+        * data in the fifo and rewrite iov[]; with encryption disabled
+        * the iovec is used as-is.  hammer2_crypto_encrypt_wrote() is its
+        * counterpart once the write has been issued.
+        */
+       iovcnt = hammer2_crypto_encrypt(iocom, ioq, iov, iovcnt);
+
+       /*
+        * Issue the write.  Transient failures just request a write
+        * event; anything else kills the transmit side.
+        */
+       wrote = writev(iocom->sock_fd, iov, iovcnt);
+       if (wrote < 0) {
+               if (errno == EINTR ||
+                   errno == EINPROGRESS ||
+                   errno == EAGAIN) {
+                       iocom->flags |= HAMMER2_IOCOMF_WREQ;
+               } else {
+                       ioq->error = HAMMER2_IOQ_ERROR_SOCK;
+                       hammer2_iocom_drain(iocom);
+               }
+               return;
+       }
+       hammer2_crypto_encrypt_wrote(iocom, ioq, wrote);
+
+       /*
+        * A short write means the descriptor is backed up, ask for a
+        * write event.
+        */
+       if (wrote != queued)
+               iocom->flags |= HAMMER2_IOCOMF_WREQ;
+       else
+               iocom->flags &= ~HAMMER2_IOCOMF_WREQ;
+
+       /*
+        * Account for the bytes written, retiring each message that
+        * went out in full.  The loop ends with msg == NULL only when
+        * the whole queue was retired.
+        */
+       for (;;) {
+               msg = TAILQ_FIRST(&ioq->msgq);
+               if (msg == NULL)
+                       break;
+               hbytes = (msg->any.head.cmd & HAMMER2_MSGF_SIZE) *
+                        HAMMER2_MSG_ALIGN;
+               abytes = msg->aux_size;
+
+               /* header only partially written */
+               if (wrote < hbytes - ioq->hbytes) {
+                       ioq->hbytes += wrote;
+                       break;
+               }
+               wrote -= hbytes - ioq->hbytes;
+               ioq->hbytes = hbytes;
+
+               /* aux data only partially written */
+               if (wrote < abytes - ioq->abytes) {
+                       ioq->abytes += wrote;
+                       break;
+               }
+               wrote -= abytes - ioq->abytes;
+
+               /* message fully written, recycle it */
+               TAILQ_REMOVE(&ioq->msgq, msg, entry);
+               --ioq->msgcount;
+               ioq->hbytes = 0;
+               ioq->abytes = 0;
+               recycle = msg->aux_data ? &iocom->freeq_aux : &iocom->freeq;
+               TAILQ_INSERT_TAIL(recycle, msg, entry);
+       }
+
+       /* queue fully drained: idle, no write event needed */
+       if (msg == NULL) {
+               iocom->flags |= HAMMER2_IOCOMF_WIDLE;
+               iocom->flags &= ~HAMMER2_IOCOMF_WREQ;
+       }
+
+       /* a failed transmit side is reported as EOF and stays idle */
+       if (ioq->error) {
+               iocom->flags |= HAMMER2_IOCOMF_EOF |
+                               HAMMER2_IOCOMF_WIDLE;
+               iocom->flags &= ~HAMMER2_IOCOMF_WREQ;
+       }
+}
+
+/*
+ * Throw away every message still queued on ioq_tx and fix up the flags
+ * so that no further write events are requested.  Received messages are
+ * deliberately left alone: the caller is expected to pull our contrived
+ * terminal error msg off the read side to learn that the connection
+ * failed.
+ */
+void
+hammer2_iocom_drain(hammer2_iocom_t *iocom)
+{
+       hammer2_ioq_t *txq = &iocom->ioq_tx;
+       hammer2_msg_t *victim;
+
+       for (;;) {
+               victim = TAILQ_FIRST(&txq->msgq);
+               if (victim == NULL)
+                       break;
+               TAILQ_REMOVE(&txq->msgq, victim, entry);
+               --txq->msgcount;
+               hammer2_freemsg(victim);
+       }
+
+       /* transmit side is now idle and wants no write event */
+       iocom->flags = (iocom->flags | HAMMER2_IOCOMF_WIDLE) &
+                      ~HAMMER2_IOCOMF_WREQ;
+}
+
+/*
+ * Shortcut around the normal hammer2_allocreply() mechanic: the received
+ * message itself is turned into the final reply, carrying the supplied
+ * error code, and is queued for transmission.  Usable for the final reply
+ * of one-way, one-off, or streaming commands.
+ *
+ * Replying to a one-way message is something of an oxymoron, but the
+ * debug (DBG) protocol relies on it.
+ *
+ * The reply carries no aux data.
+ *
+ * Ownership of (msg) passes to this function.
+ */
+void
+hammer2_replymsg(hammer2_msg_t *msg, uint16_t error)
+{
+       hammer2_persist_t *state;
+       uint16_t reply_target;
+
+       /* must not already be a reply */
+       assert((msg->any.head.cmd & HAMMER2_MSGF_REPLY) == 0);
+
+       /* reverse the routing: the old source becomes the new target */
+       reply_target = msg->any.head.source;
+       msg->any.head.source = msg->any.head.target;
+       msg->any.head.target = reply_target;
+
+       msg->any.head.error = error;
+       msg->any.head.cmd |= HAMMER2_MSGF_REPLY;
+       msg->aux_size = 0;
+
+       /*
+        * With persistent state attached, move the CREATE/DELETE bits
+        * recorded for the reply direction into the outgoing command and
+        * clear them from the state.
+        */
+       state = msg->persist;
+       if (state != NULL) {
+               assert(state->lrep & HAMMER2_MSGF_DELETE);
+               msg->any.head.cmd |= state->lrep & (HAMMER2_MSGF_CREATE |
+                                                   HAMMER2_MSGF_DELETE);
+               state->lrep &= ~(HAMMER2_MSGF_CREATE | HAMMER2_MSGF_DELETE);
+       }
+       hammer2_ioq_write(msg);
+}
diff --git a/sbin/hammer2/network.h b/sbin/hammer2/network.h
new file mode 100644 (file)
index 0000000..eeaeb20
--- /dev/null
@@ -0,0 +1,316 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <openssl/rsa.h>       /* public/private key functions */
+#include <openssl/pem.h>       /* public/private key file load */
+#include <openssl/err.h>
+#include <openssl/evp.h>       /* aes_256_cbc functions */
+
+/***************************************************************************
+ *                             CRYPTO HANDSHAKE                           *
+ ***************************************************************************
+ *
+ * The initial public-key exchange is implemented by transmitting a
+ * 512-byte buffer to the other side in a symmetrical fashion.  This
+ * buffer contains the following:
+ *
+ * (1) A random session key.  512 bits is specified.  We use aes_256_cbc()
+ *     and initialize the key with the first 256 bits and the iv[] with
+ *     the second.  Note that the transmitted and received session
+ *     keys are XOR'd together to create the session key used for
+ *     communications (so even if the verifier is compromised the session
+ *     will still be gobbly gook if the public key has not been completely
+ *     broken).
+ *
+ * (2) A verifier to determine that the decode was successful.  It encodes
+ *     an XOR of each group of 4 bytes from the session key.
+ *
+ * (3) Additional configuration and additional random data.
+ *
+ *     - The hammer2 message header magic for endian detect
+ *
+ *     - The hammer2 protocol version.  The two sides agree on the
+ *      smaller of the two.
+ *
+ *     - All unused fields (junk*) are filled with random data.
+ *
+ * This structure must be exactly 512 bytes and expects to use 256-byte
+ * RSA keys.
+ */
+struct hammer2_handshake {
+       /* field comments give the hex byte offset within the 512 bytes */
+       char pad1[8];           /* 000 */
+       uint16_t magic;         /* 008 HAMMER2_MSGHDR_MAGIC for endian detect */
+       uint16_t version;       /* 00A hammer2 protocol version */
+       uint32_t flags;         /* 00C protocol extension flags */
+       uint8_t sess[64];       /* 010 512-bit session key */
+       /*
+        * NOTE(review): the description above this structure says the
+        * verifier is an XOR of each group of 4 session-key bytes (which
+        * matches 64 / 4 = 16 bytes), not a plain complement of sess --
+        * confirm against the crypto code.
+        */
+       uint8_t verf[16];       /* 050 verifier = ~sess */
+       char quickmsg[32];      /* 060 reason for connecting */
+       char junk080[128];      /* 080-0FF */
+       char pad2[8];           /* 100-107 */
+       char junk100[256-8];    /* 108-1FF */
+};
+
+typedef struct hammer2_handshake hammer2_handshake_t;
+
+/*
+ * Session cipher selection.  HAMMER2_AES_KEY_SIZE is in bytes and sizes
+ * both the session key and the iv[] arrays declared below.
+ *
+ * HAMMER2_AES_TYPE_STR is the cipher name as a string literal.  The '#'
+ * operator only stringifies inside a function-like macro, and the extra
+ * level of indirection is needed so HAMMER2_AES_TYPE is expanded before
+ * it is stringified.
+ */
+#define HAMMER2_AES_KEY_SIZE   32
+#define HAMMER2_AES_KEY_MASK   (HAMMER2_AES_KEY_SIZE - 1)
+#define HAMMER2_AES_TYPE       aes_256_cbc
+#define HAMMER2_AES_TYPE_EVP   EVP_aes_256_cbc()
+#define HAMMER2_AES_STR1(x)    #x
+#define HAMMER2_AES_STR(x)     HAMMER2_AES_STR1(x)
+#define HAMMER2_AES_TYPE_STR   HAMMER2_AES_STR(HAMMER2_AES_TYPE)
+
+/***************************************************************************
+ *                             LOW LEVEL MESSAGING                        *
+ ***************************************************************************
+ *
+ * hammer2_msg - A standalone copy of a message, typically referenced by
+ *              or embedded in other structures, or used with I/O queues.
+ *
+ * These structures are strictly temporary, so they do not have to be
+ * particularly optimized for size.  All possible message headers are
+ * directly embedded (any), and the message may contain a reference
+ * to allocated auxiliary data.  The structure is recycled quite often
+ * by a connection.
+ *
+ * This structure is typically not used for storing persistent message
+ * state (see hammer2_persist for that).
+ */
+/* forward declarations, hammer2_msg only holds pointers to these */
+struct hammer2_iocom;
+struct hammer2_persist;
+
+struct hammer2_msg {
+       struct hammer2_iocom *iocom;    /* connection the msg belongs to */
+       struct hammer2_persist  *persist; /* persistent state or NULL */
+       TAILQ_ENTRY(hammer2_msg) entry; /* queue */
+       char            *aux_data;      /* aux-data if any */
+       int             aux_size;       /* bytes of aux_data in use */
+       int             flags;          /* HAMMER2_MSGX_* */
+       hammer2_any_t   any;            /* raw extended msg header */
+};
+
+typedef struct hammer2_msg hammer2_msg_t;
+
+/* tail queue of messages linked through hammer2_msg.entry */
+TAILQ_HEAD(hammer2_msg_queue, hammer2_msg);
+typedef struct hammer2_msg_queue hammer2_msg_queue_t;
+
+/* hammer2_msg.flags: received header was byte-swapped to host order */
+#define HAMMER2_MSGX_BSWAPPED  0x0001
+
+/*
+ * hammer2_ioq - An embedded component of hammer2_iocom (one instance for
+ * receive, one for transmit), holds state for the buffering and parsing
+ * of incoming and outgoing messages.
+ *
+ * Several fields are used differently by the two directions:
+ *
+ * hbytes/abytes - On receive, the header and aux data sizes of the
+ *                message currently being parsed.  On transmit, how many
+ *                header and aux data bytes of the first queued message
+ *                have already been written.
+ *
+ * seq          - On receive, the low 8 bits are checked against the
+ *                salt of each incoming message.  On transmit, the low
+ *                8 bits are stamped into the salt of each outgoing one.
+ */
+struct hammer2_ioq {
+       /* receive parser state, advanced in this order */
+       enum { HAMMER2_MSGQ_STATE_HEADER1,
+              HAMMER2_MSGQ_STATE_HEADER2,
+              HAMMER2_MSGQ_STATE_AUXDATA1,
+              HAMMER2_MSGQ_STATE_AUXDATA2,
+              HAMMER2_MSGQ_STATE_ERROR } state;
+       int             fifo_beg;               /* buffered data */
+       int             fifo_cdx;               /* encrypt/decrypt index */
+       int             fifo_end;               /* end of buffered data */
+       int             hbytes;                 /* header size */
+       int             abytes;                 /* aux_data size */
+       int             already;                /* aux_data already decrypted */
+       int             error;                  /* HAMMER2_IOQ_ERROR_* or 0 */
+       int             seq;                    /* salt sequencer */
+       int             msgcount;               /* msgs queued on msgq */
+       EVP_CIPHER_CTX  ctx;
+       char            iv[HAMMER2_AES_KEY_SIZE]; /* encrypt or decrypt iv[] */
+       hammer2_msg_t   *msg;                   /* rx msg being assembled */
+       hammer2_msg_queue_t msgq;               /* pending messages */
+       char            buf[HAMMER2_MSGBUF_SIZE]; /* staging buffer */
+};
+
+typedef struct hammer2_ioq hammer2_ioq_t;
+
+/*
+ * Values stored in hammer2_ioq.error.  On the receive side the value is
+ * also reported to the caller in the error field of the synthesized
+ * HAMMER2_LNK_ERROR message.
+ */
+#define HAMMER2_IOQ_ERROR_SYNC         1       /* bad magic / out of sync */
+#define HAMMER2_IOQ_ERROR_EOF          2       /* unexpected EOF */
+#define HAMMER2_IOQ_ERROR_SOCK         3       /* read()/writev() error on socket */
+#define HAMMER2_IOQ_ERROR_FIELD                4       /* invalid field */
+#define HAMMER2_IOQ_ERROR_HCRC         5       /* core header crc bad */
+#define HAMMER2_IOQ_ERROR_XCRC         6       /* ext header crc bad */
+#define HAMMER2_IOQ_ERROR_ACRC         7       /* aux data crc bad */
+#define HAMMER2_IOQ_ERROR_STATE                8       /* bad state */
+#define HAMMER2_IOQ_ERROR_NOPEER       9       /* bad socket peer */
+#define HAMMER2_IOQ_ERROR_NORKEY       10      /* no remote keyfile found */
+#define HAMMER2_IOQ_ERROR_NOLKEY       11      /* no local keyfile found */
+#define HAMMER2_IOQ_ERROR_KEYXCHGFAIL  12      /* key exchange failed */
+#define HAMMER2_IOQ_ERROR_KEYFMT       13      /* key file format problem */
+#define HAMMER2_IOQ_ERROR_BADURANDOM   14      /* /dev/urandom is bad */
+#define HAMMER2_IOQ_ERROR_MSGSEQ       15      /* message sequence error */
+
+/* maximum number of iovec entries handed to a single writev() */
+#define HAMMER2_IOQ_MAXIOVEC    16
+
+/*
+ * hammer2_iocom - governs a messaging stream connection
+ *
+ * Holds one hammer2_ioq for each direction plus the free lists that
+ * transmitted messages are recycled onto.
+ */
+struct hammer2_iocom {
+       hammer2_ioq_t   ioq_rx;                 /* receive side state */
+       hammer2_ioq_t   ioq_tx;                 /* transmit side state */
+       hammer2_msg_queue_t freeq;              /* free msgs hdr only */
+       hammer2_msg_queue_t freeq_aux;          /* free msgs w/aux_data */
+       /*
+        * NOTE(review): presumably invoked by the event loop for read,
+        * write and alt_fd events respectively -- confirm in the core loop.
+        */
+       void    (*recvmsg_callback)(struct hammer2_iocom *);
+       void    (*sendmsg_callback)(struct hammer2_iocom *);
+       void    (*altmsg_callback)(struct hammer2_iocom *);
+       int     sock_fd;                        /* comm socket or pipe */
+       int     alt_fd;                         /* thread signal, tty, etc */
+       int     flags;                          /* HAMMER2_IOCOMF_* */
+       int     rxmisc;
+       int     txmisc;
+       char    sess[HAMMER2_AES_KEY_SIZE];     /* aes_256_cbc key */
+};
+
+typedef struct hammer2_iocom hammer2_iocom_t;
+
+#define HAMMER2_IOCOMF_EOF     0x00000001      /* EOF or ERROR on desc */
+#define HAMMER2_IOCOMF_RREQ    0x00000002      /* request read-data event */
+#define HAMMER2_IOCOMF_WREQ    0x00000004      /* request write-avail event */
+#define HAMMER2_IOCOMF_WIDLE   0x00000008      /* request write-avail event */
+#define HAMMER2_IOCOMF_SIGNAL  0x00000010
+#define HAMMER2_IOCOMF_CRYPTED 0x00000020      /* encrypt enabled */
+
+/***************************************************************************
+ *                             HIGH LEVEL MESSAGING                       *
+ ***************************************************************************
+ *
+ * Persistent state is stored via the hammer2_persist structure.
+ *
+ * The structure holds one 32-bit word per direction: lcmd for commands
+ * and lrep for replies.
+ *
+ * NOTE(review): how these words are encoded and updated is not visible
+ * here -- confirm against the messaging code before relying on them.
+ */
+struct hammer2_persist {
+       uint32_t        lcmd;           /* recent command direction */
+       uint32_t        lrep;           /* recent reply direction */
+};
+
+typedef struct hammer2_persist hammer2_persist_t;
+
+#if 0
+
+
+
+/*
+ * The global registration structure consolidates information accumulated
+ * via the spanning tree algorithm and tells us which connection (link)
+ * is the best path to get to any given registration.
+ *
+ * glob_node   - Splay entry for this registration in the global index
+ *               of all registrations.
+ *
+ * glob_entry  - tailq entry when this registration's best_span element
+ *               has changed state.
+ *
+ * span_list   - Head of a simple list of spanning tree entries which
+ *               we use to determine the best link.
+ *
+ * best_span   - Which of the span structures on span_list is the best
+ *               one.
+ *
+ * source_root - Splay tree root indexing all messages sent from this
+ *               registration.  The messages are indexed by
+ *               {linkid,msgid} XXX
+ *
+ * target_root - Splay tree root indexing all messages being sent to
+ *               this registration.  The messages are indexed by
+ *               {linkid,msgid}. XXX
+ *
+ *
+ * Whenever spanning tree data causes a registration's best_span field to
+ * change that registration is transmitted as spanning tree data to every
+ * active link.  Note that pure clients to the cluster, of which there can
+ * be millions, typically do not transmit spanning tree data to each other.
+ *
+ * Each registration is assigned a unique linkid local to the node (another
+ * node might assign a different linkid to the same registration).  This
+ * linkid must be persistent as long as messages are active and is used
+ * to identify the message source and target.
+ */
+TAILQ_HEAD(hammer2_span_list, hammer2_span);
+typedef struct hammer2_span_list hammer2_span_list_t;
+
+struct hammer2_reg {
+       SPLAY_ENTRY(hammer2_reg) glob_node;     /* index of registrations */
+       TAILQ_ENTRY(hammer2_reg) glob_entry;    /* when modified */
+       hammer2_span_list_t     span_list;      /* list of hammer2_span's */
+       hammer2_span_t          *best_span;     /* best span entry */
+       hammer2_pmsg_splay_head_t source_root;  /* msgs sent from reg */
+       hammer2_pmsg_splay_head_t target_root;  /* msgs sent to reg */
+       uuid_t  pfs_id;                         /* key field */
+       uuid_t  pfs_fsid;                       /* key field */
+       uint32_t linkid;
+       int     flags;
+       int     refs;
+};
+
+#define HAMMER2_PROTO_REGF_MODIFIED    0x0001
+
+/*
+ * Each link (connection) collects spanning tree data received via the
+ * link and stores it in these span structures.
+ */
+struct hammer2_span {
+       TAILQ_ENTRY(hammer2_span)       span_entry;     /* from hammer2_reg */
+       SPLAY_ENTRY(hammer2_span)       span_node;      /* from hammer2_link */
+       hammer2_reg_t                   *reg;
+       hammer2_link_t                  *link;
+       int                             weight;
+};
+
+/*
+ * Most hammer2 messages represent transactions and have persistent state
+ * which must be recorded.  Some messages, such as cache states and inode
+ * representations are very long-lasting transactions.
+ *
+ * Each node in the graph must keep track of the message state in order
+ * to perform the proper action when a connection is lost.  To do this
+ * the message is indexed on the source and target (global) registration,
+ * and the actual span element the message was received on and transmitted
+ * to is recorded (allowing us to retrieve the physical links involved).
+ *
+ * The {source_reg, target_reg, msgid} uniquely identifies a message.  Any
+ * streaming operations using the same msgid use the same rendezvous.
+ *
+ * It is important to note that recorded state must use the same physical
+ * link (and thus the same chain of links across the graph) as was 'forged'
+ * by the initial message for that msgid.  If the source span a message is
+ * received on does not match the recorded source, or the recorded target
+ * is no longer routable, the message will be returned or generate an ABORT
+ * with LINKFAIL as appropriate.
+ */
+struct hammer2_pmsg {
+       SPLAY_ENTRY(hammer2_pmsg) source_reg;
+       SPLAY_ENTRY(hammer2_pmsg) target_reg;
+       hammer2_span_t  *source;
+       hammer2_span_t  *target;
+       uint16_t        msgid;
+};
+
+#endif
diff --git a/sbin/hammer2/subs.c b/sbin/hammer2/subs.c
new file mode 100644 (file)
index 0000000..a1d526e
--- /dev/null
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "hammer2.h"
+
+/*
+ * Open sel_path (the current directory when sel_path is NULL) read-only
+ * and probe it with HAMMER2IOC_VERSION_GET.
+ *
+ * Returns a descriptor the caller can execute ioctl()'s on, or -1 after
+ * printing a diagnostic to stderr when the open or the probe fails.
+ */
+int
+hammer2_ioctl_handle(const char *sel_path)
+{
+       struct hammer2_ioc_version vers;
+       int handle;
+
+       if (!sel_path)
+               sel_path = ".";
+
+       if ((handle = open(sel_path, O_RDONLY, 0)) < 0) {
+               fprintf(stderr, "hammer2: Unable to open %s: %s\n",
+                       sel_path, strerror(errno));
+               return (-1);
+       }
+
+       /*
+        * A descriptor which answers the version ioctl is handed back
+        * to the caller as-is.
+        */
+       if (ioctl(handle, HAMMER2IOC_VERSION_GET, &vers) >= 0)
+               return (handle);
+
+       fprintf(stderr, "hammer2: '%s' is not a hammer2 filesystem\n",
+               sel_path);
+       close(handle);
+       return (-1);
+}
+
+/*
+ * Execute the specified function as a detached independent process/daemon,
+ * unless we are in debug mode.  If we are in debug mode the function is
+ * executed as a pthread in the current process.
+ *
+ * func and arg are handed to pthread_create() unchanged.
+ *
+ * Debug mode:  the thread is started, NormalExit is cleared and the
+ *              call returns to the caller.
+ * Daemon mode: the original process returns once the intermediate
+ *              child has exited.  The detached grandchild runs func
+ *              and never returns from this function.
+ *
+ * A failed first fork(), or a failed pthread_create() in debug mode, is
+ * reported on stderr and terminates the process with exit code 1.
+ */
+void
+hammer2_demon(void *(*func)(void *), void *arg)
+{
+       pthread_t thread = NULL;
+       pid_t pid;
+       int ttyfd;
+       int error;
+
+       /*
+        * Do not disconnect in debug mode
+        */
+       if (DebugOpt) {
+               /* pthread_create() returns the error number directly */
+               error = pthread_create(&thread, NULL, func, arg);
+               if (error) {
+                       fprintf(stderr, "hammer2: pthread_create(): %s\n",
+                               strerror(error));
+                       exit(1);
+               }
+               NormalExit = 0;
+               return;
+       }
+
+       /*
+        * Otherwise disconnect us.  Double-fork to get rid of the ppid
+        * association and disconnect the TTY.
+        */
+       if ((pid = fork()) < 0) {
+               fprintf(stderr, "hammer2: fork(): %s\n", strerror(errno));
+               exit(1);
+       }
+       if (pid > 0) {
+               /*
+                * Reap the intermediate child.  Only retry when the wait
+                * was interrupted by a signal; any other failure (e.g.
+                * ECHILD) would otherwise make this loop spin forever.
+                */
+               while (waitpid(pid, NULL, 0) < 0) {
+                       if (errno != EINTR)
+                               break;
+               }
+               return;         /* parent returns */
+       }
+
+       /*
+        * Get rid of the TTY/session before double-forking to finish off
+        * the ppid.
+        */
+       ttyfd = open("/dev/null", O_RDWR);
+       if (ttyfd >= 0) {
+               if (ttyfd != 0)
+                       dup2(ttyfd, 0);
+               if (ttyfd != 1)
+                       dup2(ttyfd, 1);
+               if (ttyfd != 2)
+                       dup2(ttyfd, 2);
+               if (ttyfd > 2)
+                       close(ttyfd);
+       }
+
+       ttyfd = open("/dev/tty", O_RDWR);
+       if (ttyfd >= 0) {
+               ioctl(ttyfd, TIOCNOTTY, 0);
+               close(ttyfd);
+       }
+       setsid();
+
+       /*
+        * Second fork to disconnect ppid (the original parent waits for
+        * us to exit).
+        */
+       if ((pid = fork()) < 0) {
+               _exit(2);
+       }
+       if (pid > 0)
+               _exit(0);
+
+       /*
+        * The double child.  stderr already points at /dev/null here, so
+        * a thread creation failure can only be signalled via the exit
+        * code.
+        */
+       setsid();
+       if (pthread_create(&thread, NULL, func, arg) != 0)
+               _exit(2);
+       pthread_exit(NULL);
+       _exit(2);       /* NOT REACHED */
+}
+
+/*
+ * This swaps endian for a hammer2_msg_hdr.  Note that the extended
+ * header is not adjusted, just the core header.
+ *
+ * The swap is performed in place on *head and nothing is returned.
+ * Each field listed below is run through bswap16() or bswap32() and
+ * stored back into the same field.
+ */
+void
+hammer2_bswap_head(hammer2_msg_hdr_t *head)
+{
+       head->magic     = bswap16(head->magic);
+       head->icrc1     = bswap16(head->icrc1);
+       head->salt      = bswap32(head->salt);
+       head->source    = bswap16(head->source);
+       head->target    = bswap16(head->target);
+       head->msgid     = bswap32(head->msgid);
+       head->cmd       = bswap32(head->cmd);
+       head->error     = bswap16(head->error);
+       head->resv05    = bswap16(head->resv05);
+       head->icrc2     = bswap16(head->icrc2);
+       head->aux_bytes = bswap16(head->aux_bytes);
+       head->aux_icrc  = bswap32(head->aux_icrc);
+}
+
+/*
+ * Format a 64-bit timestamp as "DD-Mon-YYYY HH:MM:SS" in local time.
+ *
+ * htime64 is divided by 1000000 to obtain the seconds handed to
+ * localtime(), i.e. it is treated as a count of microseconds.
+ *
+ * *strp must be NULL or a pointer obtained from malloc().  Any previous
+ * buffer is freed and replaced by a fresh 64-byte buffer which the
+ * caller owns.  Returns *strp, which is NULL only if the allocation
+ * failed.  If the time cannot be converted the buffer holds the string
+ * "invalid-time" instead.
+ */
+const char *
+hammer2_time64_to_str(uint64_t htime64, char **strp)
+{
+       struct tm *tp;
+       time_t t;
+
+       if (*strp) {
+               free(*strp);
+               *strp = NULL;
+       }
+       *strp = malloc(64);
+       if (*strp == NULL)
+               return (NULL);
+       t = htime64 / 1000000;
+       tp = localtime(&t);
+
+       /*
+        * localtime() may return NULL for an unrepresentable time and
+        * strftime() returns 0 when the result does not fit; never hand
+        * back an undefined buffer in either case.
+        */
+       if (tp == NULL ||
+           strftime(*strp, 64, "%d-%b-%Y %H:%M:%S", tp) == 0)
+               snprintf(*strp, 64, "invalid-time");
+       return (*strp);
+}
+
+/*
+ * Convert *uuid to its string form via uuid_to_string().
+ *
+ * Any string previously stored in *strp is freed first and *strp is
+ * reset before the conversion.  Returns the new value of *strp; the
+ * status reported by uuid_to_string() is not examined.
+ */
+const char *
+hammer2_uuid_to_str(uuid_t *uuid, char **strp)
+{
+       uint32_t rc;
+
+       free(*strp);            /* free(NULL) is a no-op */
+       *strp = NULL;
+       uuid_to_string(uuid, strp, &rc);
+
+       return (*strp);
+}
+
+/*
+ * Map a HAMMER2_OBJTYPE_* code to a constant name string.  Codes with
+ * no case below map to "ILLEGAL".  The result is a string literal and
+ * must not be freed.
+ */
+const char *
+hammer2_iptype_to_str(uint8_t type)
+{
+       const char *name;
+
+       switch(type) {
+       case HAMMER2_OBJTYPE_UNKNOWN:
+               name = "UNKNOWN";
+               break;
+       case HAMMER2_OBJTYPE_DIRECTORY:
+               name = "DIR";
+               break;
+       case HAMMER2_OBJTYPE_REGFILE:
+               name = "FILE";
+               break;
+       case HAMMER2_OBJTYPE_FIFO:
+               name = "FIFO";
+               break;
+       case HAMMER2_OBJTYPE_CDEV:
+               name = "CDEV";
+               break;
+       case HAMMER2_OBJTYPE_BDEV:
+               name = "BDEV";
+               break;
+       case HAMMER2_OBJTYPE_SOFTLINK:
+               name = "SOFTLINK";
+               break;
+       case HAMMER2_OBJTYPE_HARDLINK:
+               name = "HARDLINK";
+               break;
+       case HAMMER2_OBJTYPE_SOCKET:
+               name = "SOCKET";
+               break;
+       case HAMMER2_OBJTYPE_WHITEOUT:
+               name = "WHITEOUT";
+               break;
+       default:
+               name = "ILLEGAL";
+               break;
+       }
+       return (name);
+}
+
+/*
+ * Map a HAMMER2_PFSTYPE_* code to a constant name string.  Codes with
+ * no case below map to "ILLEGAL".  The result is a string literal and
+ * must not be freed.
+ */
+const char *
+hammer2_pfstype_to_str(uint8_t type)
+{
+       const char *name;
+
+       switch(type) {
+       case HAMMER2_PFSTYPE_NONE:
+               name = "NONE";
+               break;
+       case HAMMER2_PFSTYPE_ADMIN:
+               name = "ADMIN";
+               break;
+       case HAMMER2_PFSTYPE_CACHE:
+               name = "CACHE";
+               break;
+       case HAMMER2_PFSTYPE_COPY:
+               name = "COPY";
+               break;
+       case HAMMER2_PFSTYPE_SLAVE:
+               name = "SLAVE";
+               break;
+       case HAMMER2_PFSTYPE_SOFT_SLAVE:
+               name = "SOFT_SLAVE";
+               break;
+       case HAMMER2_PFSTYPE_SOFT_MASTER:
+               name = "SOFT_MASTER";
+               break;
+       case HAMMER2_PFSTYPE_MASTER:
+               name = "MASTER";
+               break;
+       default:
+               name = "ILLEGAL";
+               break;
+       }
+       return (name);
+}
diff --git a/sbin/mount_hammer2/Makefile b/sbin/mount_hammer2/Makefile
new file mode 100644 (file)
index 0000000..f3f4fc8
--- /dev/null
@@ -0,0 +1,7 @@
+PROG=  mount_hammer2
+SRCS=  mount_hammer2.c
+MAN=
+
+CFLAGS+= -I${.CURDIR}/..
+
+.include <bsd.prog.mk>
diff --git a/sbin/mount_hammer2/mount_hammer2.c b/sbin/mount_hammer2/mount_hammer2.c
new file mode 100644 (file)
index 0000000..7a2688a
--- /dev/null
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#include <sys/types.h>
+#include <sys/mount.h>
+#include <vfs/hammer2/hammer2_mount.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+
+/*
+ * Usage: mount_hammer2 volume mtpt
+ *
+ * Mounts the hammer2 volume named by argv[1] on the mount point argv[2]
+ * using the vfs registered under the name "hammer2".
+ *
+ * Exits 0 on success.  Exits 1 after printing a diagnostic to stderr
+ * when too few arguments are given, the hammer2 vfs cannot be found,
+ * or mount() fails.
+ */
+int
+main(int argc, char *argv[])
+{
+       struct hammer2_mount_info info;
+       struct vfsconf vfc;
+       char *mountpt;
+       int error;
+       int mount_flags;
+
+       bzero(&info, sizeof(info));
+       mount_flags = 0;
+
+       if (argc < 3) {
+               fprintf(stderr, "usage: mount_hammer2 volume mtpt\n");
+               exit(1);
+       }
+
+       error = getvfsbyname("hammer2", &vfc);
+       if (error) {
+               fprintf(stderr, "hammer2 vfs not loaded\n");
+               exit(1);
+       }
+
+       info.volume = argv[1];
+       info.hflags = 0;
+       mountpt = argv[2];
+
+       error = mount(vfc.vfc_name, mountpt, mount_flags, &info);
+       if (error) {
+               /* perror() supplies the ": " separator itself */
+               perror("mount");
+               exit(1);
+       }
+       return (0);
+}
diff --git a/sbin/newfs_hammer2/Makefile b/sbin/newfs_hammer2/Makefile
new file mode 100644 (file)
index 0000000..f306feb
--- /dev/null
@@ -0,0 +1,12 @@
+#
+#
+PROG=  newfs_hammer2
+MAN=   newfs_hammer2.8
+CFLAGS+= -I${.CURDIR}/../../sys -I${.CURDIR}/../hammer2
+SRCS= newfs_hammer2.c hammer2_icrc.c
+
+.PATH: ${.CURDIR}/../../sys/libkern
+.PATH: ${.CURDIR}/../../sys/vfs/hammer2
+SRCS+= crc32.c
+
+.include <bsd.prog.mk>
diff --git a/sbin/newfs_hammer2/newfs_hammer2.8 b/sbin/newfs_hammer2/newfs_hammer2.8
new file mode 100644 (file)
index 0000000..c42516b
--- /dev/null
@@ -0,0 +1,180 @@
+.\" Copyright (c) 2011 The DragonFly Project.  All rights reserved.
+.\"
+.\" This code is derived from software contributed to The DragonFly Project
+.\" by Matthew Dillon <dillon@backplane.com>
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\"
+.\" 1. Redistributions of source code must retain the above copyright
+.\"    notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\"    notice, this list of conditions and the following disclaimer in
+.\"    the documentation and/or other materials provided with the
+.\"    distribution.
+.\" 3. Neither the name of The DragonFly Project nor the names of its
+.\"    contributors may be used to endorse or promote products derived
+.\"    from this software without specific, prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+.\" ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+.\" LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+.\" FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+.\" COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+.\" INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+.\" BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+.\" LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+.\" AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+.\" OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+.\" OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.Dd May 23, 2011
+.Dt NEWFS_HAMMER2 8
+.Os
+.Sh NAME
+.Nm newfs_hammer2
+.Nd construct a new HAMMER2 file system
+.Sh SYNOPSIS
+.Nm
+.Fl L Ar label
+.Op Fl f
+.Op Fl b Ar bootsize
+.Op Fl r Ar redosize
+.Op Fl V Ar version
+.Ar special ...
+.Sh DESCRIPTION
+The
+.Nm
+utility creates a
+.Nm HAMMER2
+file system on device(s)
+.Ar special .
+If multiple devices are specified a single
+.Nm HAMMER2
+file system is created
+which spans all of them.
+Each
+.Ar special
+will constitute a volume which the
+.Nm HAMMER2
+file system is built on.
+.Nm HAMMER2
+file systems are sector-size agnostic, however the
+.Dx
+implementation requires the sector size to be no larger than 16K.
+.Nm HAMMER2
+file systems start at a relative offset of 0 and may only be created
+under out-of-band disk labels
+.Po
+.Xr disklabel64 5
+or
+.Xr gpt 8
+labels
+.Pc ,
+or in
+.Xr disklabel32 5
+partitions which do not overlap the label area (have a starting sector
+greater than 16).
+.Pp
+.Nm HAMMER2
+file systems are designed for large storage systems, up to 1 Exabyte, and
+will not operate efficiently on small storage systems.
+The minimum recommended file system size is 50GB.
+.Nm HAMMER2
+must reserve 500MB to 1GB of its storage for reblocking and UNDO/REDO.
+In addition,
+.Nm HAMMER2
+file systems operating normally, with full history
+retention and daily snapshots, do not immediately reclaim space when
+files are deleted.
+A regular system maintenance job runs once a day by
+.Xr periodic 8
+to handle reclamation.
+.Pp
+.Nm HAMMER2
+works best when the machine's normal workload would not otherwise fill
+the file system up in the course of 60 days of operation.
+.Pp
+The options are as follows:
+.Bl -tag -width indent
+.It Fl L Ar label
+All
+.Nm HAMMER2
+file systems must be named and names should be unique on a
+per-machine basis.
+.It Fl b Ar bootsize
+Specify a fixed area in which a boot related kernel and data can be stored.
+The
+.Ar bootsize
+is specified in bytes.
+By default a boot area of approximately 4MB will be created.
+.It Fl f
+Force operation.
+This is needed for the creation of a
+.Nm HAMMER2
+file system less than 10GB in size or
+with less than 500MB UNDO/REDO FIFO.
+This should not be used under normal circumstances.
+.It Fl r Ar redosize
+Specify the size of the fixed REDO FIFO.
+The
+.Ar redosize
+is specified in bytes.
+By default 0.1% of the root
+volume's size is used, with a reasonable minimum and a reasonable cap.
+The UNDO/REDO FIFO is used to sequence meta-data out to the media for
+instant crash recovery.
+.It Fl V Ar version
+Specify the
+.Nm HAMMER2
+file system version to format.
+By default
+.Nm
+formats the file system using the highest production version number
+supported by the
+.Nm HAMMER2
+VFS by checking the
+.Va vfs.hammer2.supported_version
+sysctl.
+If you need to maintain compatibility with an older version of
+.Nm HAMMER2
+you may specify the version with this option.
+.El
+.Pp
+The
+.Ar bootsize
+and
+.Ar redosize
+must be given with a suffix of
+.Cm K , M , G
+or
+.Cm T
+meaning kilobyte, megabyte, gigabyte and terabyte.
+Lowercase suffixes are also accepted.
+.Sh EXAMPLES
+.Bd -literal -offset indent
+newfs_hammer2 -L Home /dev/ad0s1d
+.Ed
+.Pp
+Create a file system named
+.Sq Home
+on
+.Pa /dev/ad0s1d .
+.Sh DIAGNOSTICS
+Exit status is 0 on success and 1 on error.
+.Sh SEE ALSO
+.Xr disklabel32 5 ,
+.Xr disklabel64 5 ,
+.Xr HAMMER2 5 ,
+.Xr fdisk 8 ,
+.Xr gpt 8 ,
+.Xr newfs 8
+.Sh HISTORY
+The
+.Nm
+utility first appeared in
+.Dx 1.11 .
+.Sh AUTHORS
+.An Matthew Dillon Aq dillon@backplane.com
diff --git a/sbin/newfs_hammer2/newfs_hammer2.c b/sbin/newfs_hammer2/newfs_hammer2.c
new file mode 100644 (file)
index 0000000..629dff2
--- /dev/null
@@ -0,0 +1,763 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/types.h>
+#include <sys/diskslice.h>
+#include <sys/diskmbr.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/sysctl.h>
+#include <vfs/hammer2/hammer2_disk.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <stddef.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <assert.h>
+#include <err.h>
+#include <uuid.h>
+
+static hammer2_off_t check_volume(const char *path, int *fdp);
+static int64_t getsize(const char *str, int64_t minval, int64_t maxval, int pw);
+static const char *sizetostr(hammer2_off_t size);
+static uint64_t nowtime(void);
+static void usage(void);
+
+static void format_hammer2(int fd, hammer2_off_t total_space,
+                               hammer2_off_t free_space);
+static void alloc_direct(hammer2_off_t *basep, hammer2_blockref_t *bref,
+                               size_t bytes);
+static hammer2_key_t dirhash(const unsigned char *name, size_t len);
+
+static int Hammer2Version = -1;
+static int ForceOpt = 0;
+static uuid_t Hammer2_FSType;  /* static filesystem type id for HAMMER2 */
+static uuid_t Hammer2_FSId;    /* unique filesystem id in volu header */
+static uuid_t Hammer2_SPFSId;  /* PFS id in super-root inode */
+static uuid_t Hammer2_RPFSId;  /* PFS id in root inode */
+static const char *Label = "ROOT";
+static hammer2_off_t BootAreaSize;
+static hammer2_off_t AuxAreaSize;
+
+#define GIG    ((hammer2_off_t)1024*1024*1024)
+
+/*
+ * newfs_hammer2 entry point.
+ *
+ * Generates the filesystem and PFS uuids, parses the command line,
+ * sizes the boot and aux (redo) areas, formats the single device named
+ * on the command line and prints a summary of the result.
+ *
+ * Returns 0 on success.  Argument and sizing errors terminate the
+ * process with exit code 1 via errx(), exit() or usage().
+ */
+int
+main(int ac, char **av)
+{
+       uint32_t status;
+       hammer2_off_t total_space;
+       hammer2_off_t free_space;
+       hammer2_off_t reserved_space;
+       int ch;
+       int fd = -1;
+       char *fsidstr;
+       char *spfsidstr;
+       char *rpfsidstr;
+
+       /*
+        * Sanity check basic filesystem structures.  No cookies for us
+        * if it gets broken!
+        */
+       assert(sizeof(hammer2_volume_data_t) == HAMMER2_VOLUME_BYTES);
+       assert(sizeof(hammer2_inode_data_t) == HAMMER2_INODE_BYTES);
+       assert(sizeof(hammer2_blockref_t) == HAMMER2_BLOCKREF_BYTES);
+
+       /*
+        * Generate a filesystem id and lookup the filesystem type
+        */
+       srandomdev();
+       uuidgen(&Hammer2_FSId, 1);
+       uuidgen(&Hammer2_SPFSId, 1);
+       uuidgen(&Hammer2_RPFSId, 1);
+       uuid_from_string(HAMMER2_UUID_STRING, &Hammer2_FSType, &status);
+       /*uuid_name_lookup(&Hammer2_FSType, "DragonFly HAMMER2", &status);*/
+       if (status != uuid_s_ok) {
+               errx(1, "unable to parse the built-in HAMMER2 "
+                       "filesystem type uuid");
+       }
+
+       /*
+        * Parse arguments.  errx() appends the newline itself, so the
+        * format strings below carry none.
+        */
+       while ((ch = getopt(ac, av, "fL:b:r:V:")) != -1) {
+               switch(ch) {
+               case 'f':
+                       ForceOpt = 1;
+                       break;
+               case 'L':
+                       Label = optarg;
+                       if (strlen(Label) > HAMMER2_INODE_MAXNAME) {
+                               errx(1, "Root directory label too long "
+                                       "(%d chars max)",
+                                       (int)HAMMER2_INODE_MAXNAME);
+                       }
+                       break;
+               case 'b':
+                       BootAreaSize = getsize(optarg,
+                                              HAMMER2_NEWFS_ALIGN,
+                                              HAMMER2_BOOT_MAX_BYTES, 2);
+                       break;
+               case 'r':
+                       AuxAreaSize = getsize(optarg,
+                                             HAMMER2_NEWFS_ALIGN,
+                                             HAMMER2_REDO_MAX_BYTES, 2);
+                       break;
+               case 'V':
+                       Hammer2Version = strtol(optarg, NULL, 0);
+                       if (Hammer2Version < HAMMER2_VOL_VERSION_MIN ||
+                           Hammer2Version >= HAMMER2_VOL_VERSION_WIP) {
+                               errx(1,
+                                    "I don't understand how to format "
+                                    "HAMMER2 version %d",
+                                    Hammer2Version);
+                       }
+                       break;
+               default:
+                       usage();
+                       break;
+               }
+       }
+
+       /*
+        * No -V given: ask the loaded VFS which version it supports,
+        * falling back to the compiled-in default when the sysctl is
+        * unavailable.
+        */
+       if (Hammer2Version < 0) {
+               size_t olen = sizeof(Hammer2Version);
+               Hammer2Version = HAMMER2_VOL_VERSION_DEFAULT;
+               if (sysctlbyname("vfs.hammer2.supported_version",
+                                &Hammer2Version, &olen, NULL, 0) == 0) {
+                       if (Hammer2Version >= HAMMER2_VOL_VERSION_WIP) {
+                               Hammer2Version = HAMMER2_VOL_VERSION_WIP - 1;
+                               fprintf(stderr,
+                                       "newfs_hammer2: WARNING: HAMMER2 VFS "
+                                       "supports higher version than I "
+                                       "understand,\n"
+                                       "using version %d\n",
+                                       Hammer2Version);
+                       }
+               } else {
+                       fprintf(stderr,
+                               "newfs_hammer2: WARNING: HAMMER2 VFS not "
+                               "loaded, cannot get version info.\n"
+                               "Using version %d\n",
+                               HAMMER2_VOL_VERSION_DEFAULT);
+               }
+       }
+
+       /*
+        * Collect volume information.
+        */
+       ac -= optind;
+       av += optind;
+
+       if (ac != 1) {
+               fprintf(stderr, "Exactly one disk device must be specified\n");
+               exit(1);
+       }
+       total_space = check_volume(av[0], &fd);
+
+       /*
+        * ~typically 8MB alignment to avoid edge cases for reserved blocks
+        * and so raid stripes (if any) operate efficiently.
+        */
+       total_space &= ~HAMMER2_VOLUME_ALIGNMASK64;
+
+       /*
+        * Calculate defaults for the boot area size and round to the
+        * volume alignment boundary.
+        */
+       if (BootAreaSize == 0) {
+               BootAreaSize = HAMMER2_BOOT_NOM_BYTES;
+               while (BootAreaSize > total_space / 20)
+                       BootAreaSize >>= 1;
+               if (BootAreaSize < HAMMER2_BOOT_MIN_BYTES)
+                       BootAreaSize = HAMMER2_BOOT_MIN_BYTES;
+       } else if (BootAreaSize < HAMMER2_BOOT_MIN_BYTES) {
+               BootAreaSize = HAMMER2_BOOT_MIN_BYTES;
+       }
+       BootAreaSize = (BootAreaSize + HAMMER2_VOLUME_ALIGNMASK64) &
+                      ~HAMMER2_VOLUME_ALIGNMASK64;
+
+       /*
+        * Calculate defaults for the redo area size and round to the
+        * volume alignment boundary.
+        */
+       if (AuxAreaSize == 0) {
+               AuxAreaSize = HAMMER2_REDO_NOM_BYTES;
+               while (AuxAreaSize > total_space / 20)
+                       AuxAreaSize >>= 1;
+               if (AuxAreaSize < HAMMER2_REDO_MIN_BYTES)
+                       AuxAreaSize = HAMMER2_REDO_MIN_BYTES;
+       } else if (AuxAreaSize < HAMMER2_REDO_MIN_BYTES) {
+               AuxAreaSize = HAMMER2_REDO_MIN_BYTES;
+       }
+       AuxAreaSize = (AuxAreaSize + HAMMER2_VOLUME_ALIGNMASK64) &
+                      ~HAMMER2_VOLUME_ALIGNMASK64;
+
+       /*
+        * We'll need to stuff this in the volume header soon.
+        */
+       uuid_to_string(&Hammer2_FSId, &fsidstr, &status);
+       uuid_to_string(&Hammer2_SPFSId, &spfsidstr, &status);
+       uuid_to_string(&Hammer2_RPFSId, &rpfsidstr, &status);
+
+       /*
+        * Calculate the amount of reserved space.  HAMMER2_ZONE_SEG (4MB)
+        * is reserved at the beginning of every 2GB of storage, rounded up.
+        * Thus a 200MB filesystem will still have a 4MB reserve area.
+        *
+        * We also include the boot and redo areas in the reserve.  The
+        * reserve is used to help 'df' calculate the amount of available
+        * space.
+        */
+       reserved_space = ((total_space + HAMMER2_ZONE_MASK64) /
+                         HAMMER2_ZONE_BYTES64) * HAMMER2_ZONE_SEG64;
+
+       /*
+        * Refuse volumes which cannot even hold the fixed areas; the
+        * subtraction below would otherwise go out of range and a bogus
+        * free-space figure would be handed to format_hammer2().
+        */
+       if (reserved_space + BootAreaSize + AuxAreaSize > total_space) {
+               errx(1, "%s is too small (%s) to hold the reserved, "
+                       "boot and aux areas",
+                       av[0], sizetostr(total_space));
+       }
+
+       free_space = total_space - reserved_space -
+                    BootAreaSize - AuxAreaSize;
+
+       format_hammer2(fd, total_space, free_space);
+       fsync(fd);
+       close(fd);
+
+       printf("---------------------------------------------\n");
+       printf("total-size:       %s (%jd bytes)\n",
+              sizetostr(total_space),
+              (intmax_t)total_space);
+       printf("root-label:       %s\n", Label);
+       printf("version:            %d\n", Hammer2Version);
+       printf("boot-area-size:   %s\n", sizetostr(BootAreaSize));
+       printf("aux-area-size:    %s\n", sizetostr(AuxAreaSize));
+       printf("topo-reserved:    %s\n", sizetostr(reserved_space));
+       printf("free-space:       %s\n", sizetostr(free_space));
+       printf("fsid:             %s\n", fsidstr);
+       printf("supr-pfsid:       %s\n", spfsidstr);
+       printf("root-pfsid:       %s\n", rpfsidstr);
+       printf("\n");
+
+       return(0);
+}
+
+/*
+ * Print the command synopsis to stderr and exit with status 1.
+ *
+ * The argument handling in this file requires exactly one device operand
+ * ("Exactly one disk device must be specified"), so the synopsis lists a
+ * single "special".
+ */
+static
+void
+usage(void)
+{
+       fprintf(stderr,
+               "usage: newfs_hammer2 -L label [-f] [-b bootsize] "
+               "[-r redosize] [-V version] special\n"
+       );
+       exit(1);
+}
+
+/*
+ * Format a byte count as a short human readable string.
+ *
+ * The value is printed with two decimals, scaled by powers of 1024 to
+ * KB, MB, GB or TB.  A unit is selected as soon as the size reaches half
+ * of it (so 512KB prints as 0.50MB); sizes below 512 are printed as a
+ * plain number without a suffix.
+ *
+ * The result lives in a static buffer which is overwritten by the next
+ * call, so the string must be consumed before calling again.
+ */
+static
+const char *
+sizetostr(hammer2_off_t size)
+{
+       static char buf[32];
+       const long long kb = 1024;
+       const long long mb = 1024 * 1024;
+       const long long gb = 1024 * 1024 * 1024LL;
+       const long long tb = 1024 * 1024 * 1024LL * 1024LL;
+       const double bytes = (double)size;
+
+       /* Test the largest unit first; each threshold is half a unit. */
+       if (size >= tb / 2)
+               snprintf(buf, sizeof(buf), "%6.2fTB", bytes / tb);
+       else if (size >= gb / 2)
+               snprintf(buf, sizeof(buf), "%6.2fGB", bytes / gb);
+       else if (size >= mb / 2)
+               snprintf(buf, sizeof(buf), "%6.2fMB", bytes / mb);
+       else if (size >= kb / 2)
+               snprintf(buf, sizeof(buf), "%6.2fKB", bytes / kb);
+       else
+               snprintf(buf, sizeof(buf), "%6.2f", bytes);
+       return(buf);
+}
+
+/*
+ * Convert a size argument to a 64 bit signed integer with various
+ * requirements.
+ *
+ * str is parsed with strtoll() (base auto-detected) and must be followed
+ * by exactly one unit suffix: k, m, g or t in either case, scaling the
+ * number by 1024, 1024^2, 1024^3 or 1024^4 respectively.
+ *
+ * minval and maxval bound the scaled result (inclusive).  powerof2 is a
+ * bit mask: bit 0 requires the result to be a power of 2 (a result of 0
+ * is let through, as it always has been) and bit 1 requires the result
+ * to have no bits set in HAMMER2_NEWFS_ALIGNMASK.
+ *
+ * Any violation is fatal: a message is printed with errx() and the
+ * program exits with status 1.  A number which cannot be scaled without
+ * overflowing 64 bits is reported as out of range instead of wrapping.
+ */
+static int64_t
+getsize(const char *str, int64_t minval, int64_t maxval, int powerof2)
+{
+       /* Largest int64_t value, built without needing any limit macros */
+       const int64_t int64_max = (int64_t)(~(uint64_t)0 >> 1);
+       int64_t unit = 1;
+       int64_t val;
+       char *ptr;
+
+       val = strtoll(str, &ptr, 0);
+       if (ptr == str) {
+               errx(1, "Missing number in '%s'", str);
+               /* not reached */
+       }
+       switch(*ptr) {
+       case 't':
+       case 'T':
+               unit = 1024LL * 1024 * 1024 * 1024;
+               break;
+       case 'g':
+       case 'G':
+               unit = 1024LL * 1024 * 1024;
+               break;
+       case 'm':
+       case 'M':
+               unit = 1024LL * 1024;
+               break;
+       case 'k':
+       case 'K':
+               unit = 1024LL;
+               break;
+       default:
+               errx(1, "Unknown suffix in number '%s'", str);
+               /* not reached */
+       }
+       if (ptr[1]) {
+               errx(1, "Unknown suffix in number '%s'", str);
+               /* not reached */
+       }
+
+       /*
+        * Refuse to scale values whose product would not fit in 64 bits.
+        * This also catches strtoll() saturating on out-of-range input.
+        */
+       if (val > int64_max / unit) {
+               errx(1, "Value too large: %s, max is %s",
+                    str, sizetostr(maxval));
+               /* not reached */
+       }
+       if (val < -int64_max / unit) {
+               errx(1, "Value too small: %s, min is %s",
+                    str, sizetostr(minval));
+               /* not reached */
+       }
+       val *= unit;
+
+       if (val < minval) {
+               errx(1, "Value too small: %s, min is %s",
+                    str, sizetostr(minval));
+               /* not reached */
+       }
+       if (val > maxval) {
+               errx(1, "Value too large: %s, max is %s",
+                    str, sizetostr(maxval));
+               /* not reached */
+       }
+
+       /*
+        * A power of 2 has exactly one bit set.  Written so that no
+        * signed shift or overflow is needed to evaluate the test.
+        */
+       if ((powerof2 & 1) && val != 0 &&
+           (val < 0 || (val & (val - 1)) != 0)) {
+               errx(1, "Value not power of 2: %s", str);
+               /* not reached */
+       }
+       if ((powerof2 & 2) && (val & HAMMER2_NEWFS_ALIGNMASK)) {
+               errx(1, "Value not an integral multiple of %dK: %s",
+                    HAMMER2_NEWFS_ALIGN / 1024, str);
+               /* not reached */
+       }
+       return(val);
+}
+
+/*
+ * Return the current time as reported by gettimeofday(), expressed as a
+ * single count of microseconds (seconds * 1000000 + microseconds).
+ */
+static uint64_t
+nowtime(void)
+{
+       struct timeval now;
+
+       gettimeofday(&now, NULL);
+       return((uint64_t)(now.tv_sec * 1000000LL + now.tv_usec));
+}
+
+/*
+ * Open the volume and figure out how big it is.
+ *
+ * path is opened read+write and the descriptor is stored in *fdp; it is
+ * left open for the caller.  If the DIOCGPART ioctl succeeds the size is
+ * taken from the returned partition info after validating it, otherwise
+ * the path is treated as a regular file and its st_size is used.
+ *
+ * The size found is printed and returned.  Any failure is fatal (err()
+ * or errx() with exit status 1).
+ */
+static
+hammer2_off_t
+check_volume(const char *path, int *fdp)
+{
+       struct partinfo pinfo;
+       struct stat st;
+       hammer2_off_t size;
+
+       /*
+        * Get basic information about the volume
+        */
+       *fdp = open(path, O_RDWR);
+       if (*fdp < 0)
+               err(1, "Unable to open %s R+W", path);
+       if (ioctl(*fdp, DIOCGPART, &pinfo) < 0) {
+               /*
+                * Allow the formatting of regular files as HAMMER2 volumes
+                */
+               if (fstat(*fdp, &st) < 0)
+                       err(1, "Unable to stat %s", path);
+               size = st.st_size;
+       } else {
+               /*
+                * When formatting a block device as a HAMMER2 volume the
+                * sector size must be compatible.  HAMMER2 uses 64K
+                * filesystem buffers but logical buffers for direct I/O
+                * can be as small as HAMMER2_LOGSIZE (16KB).
+                */
+               if (pinfo.reserved_blocks) {
+                       errx(1, "HAMMER2 cannot be placed in a partition "
+                               "which overlaps the disklabel or MBR");
+               }
+
+               /*
+                * Reject a zero (or negative) sector size up front so the
+                * modulo below can never divide by zero.
+                */
+               if (pinfo.media_blksize <= 0 ||
+                   pinfo.media_blksize > HAMMER2_PBUFSIZE ||
+                   HAMMER2_PBUFSIZE % pinfo.media_blksize) {
+                       errx(1, "A media sector size of %d is not supported",
+                            pinfo.media_blksize);
+               }
+               size = pinfo.media_size;
+       }
+       printf("Volume %-15s size %s\n", path, sizetostr(size));
+       return (size);
+}
+
+/*
+ * Create the volume header, the super-root directory inode, and
+ * the writable snapshot subdirectory (named via the label) which
+ * is to be the initial mount point, or at least the first mount point.
+ *
+ * [----reserved_area----][boot_area][aux_area]
+ * [[vol_hdr]...         ]                      [sroot][root]
+ *
+ * The sroot and root inodes eat 512 bytes each.  newfs labels can only be
+ * 64 bytes so the root (snapshot) inode does not need to extend past 512
+ * bytes.  We use the correct hash slot, but note that because
+ * directory hashes are chained 16x, any slot in the inode will work.
+ *
+ * Also format the allocation map.
+ *
+ * fd is the open volume; everything is written to it with pwrite().
+ * total_space is recorded in the volume header as volu_size and is also
+ * used to locate the last block of the volume, which is written once to
+ * prove it is reachable.  free_space is recorded as both allocator_size
+ * and allocator_free.
+ *
+ * Any short or failed write is fatal: a message is printed with perror()
+ * and the program exits with status 1.
+ *
+ * NOTE: The passed total_space is 8MB-aligned to avoid edge cases.
+ *
+ * NOTE(review): the result of the malloc() for the work buffer is not
+ *              checked before it is used.
+ */
+static
+void
+format_hammer2(int fd, hammer2_off_t total_space, hammer2_off_t free_space)
+{
+       char *buf = malloc(HAMMER2_PBUFSIZE);
+       hammer2_volume_data_t *vol;
+       hammer2_inode_data_t *rawip;
+       hammer2_blockref_t sroot_blockref;
+       hammer2_blockref_t root_blockref;
+       uint64_t now;
+       hammer2_off_t volu_base = 0;
+       hammer2_off_t boot_base = HAMMER2_ZONE_SEG;
+       hammer2_off_t aux_base = boot_base + BootAreaSize;
+       hammer2_off_t alloc_base = aux_base + AuxAreaSize;
+       hammer2_off_t tmp_base;
+       size_t n;
+       int i;
+
+       /*
+        * Clear the entire reserve for the first 2G segment and
+        * make sure we can write to the last block.
+        */
+       bzero(buf, HAMMER2_PBUFSIZE);
+       tmp_base = volu_base;
+       for (i = 0; i < HAMMER2_ZONE_BLOCKS_SEG; ++i) {
+               n = pwrite(fd, buf, HAMMER2_PBUFSIZE, tmp_base);
+               if (n != HAMMER2_PBUFSIZE) {
+                       perror("write");
+                       exit(1);
+               }
+               tmp_base += HAMMER2_PBUFSIZE;
+       }
+
+       n = pwrite(fd, buf, HAMMER2_PBUFSIZE,
+                  volu_base + total_space - HAMMER2_PBUFSIZE);
+       if (n != HAMMER2_PBUFSIZE) {
+               perror("write (at-end-of-volume)");
+               exit(1);
+       }
+
+       /*
+        * Reserve space for the super-root inode and the root inode.
+        * Put them in the same 64K block.
+        */
+       assert((alloc_base & HAMMER2_PBUFMASK) == 0);
+
+       alloc_base &= ~HAMMER2_PBUFMASK64;
+       alloc_direct(&alloc_base, &sroot_blockref, HAMMER2_INODE_BYTES);
+       alloc_direct(&alloc_base, &root_blockref, HAMMER2_INODE_BYTES);
+       assert(((sroot_blockref.data_off ^ root_blockref.data_off) &
+               HAMMER2_OFF_MASK_HI) == 0);
+
+       bzero(buf, HAMMER2_PBUFSIZE);
+       now = nowtime();
+
+       /*
+        * Format the root directory inode, which is left empty.
+        */
+       rawip = (void *)(buf + (HAMMER2_OFF_MASK_LO & root_blockref.data_off));
+       rawip->version = HAMMER2_INODE_VERSION_ONE;
+       rawip->ctime = now;
+       rawip->mtime = now;
+       /* rawip->atime = now; NOT IMPL MUST BE ZERO */
+       rawip->btime = now;
+       rawip->type = HAMMER2_OBJTYPE_DIRECTORY;
+       rawip->mode = 0755;
+       rawip->inum = 1;                /* root inode, inumber 1 */
+       rawip->nlinks = 1;              /* directory link count compat */
+
+       rawip->name_len = strlen(Label);
+       bcopy(Label, rawip->filename, rawip->name_len);
+       rawip->name_key = dirhash(rawip->filename, rawip->name_len);
+
+       /*
+        * Compression mode and supported copyids.
+        */
+       rawip->comp_algo = HAMMER2_COMP_AUTOZERO;
+
+       rawip->pfs_id = Hammer2_RPFSId;
+       rawip->pfs_type = HAMMER2_PFSTYPE_MASTER;
+       rawip->op_flags |= HAMMER2_OPFLAG_PFSROOT;
+
+       /* rawip->u.blockset is left empty */
+
+       /*
+        * The root blockref will be stored in the super-root inode as
+        * the only directory entry.  The copyid here is the actual copyid
+        * of the storage ref.
+        *
+        * The key field for a directory entry's blockref is essentially
+        * the name key for the entry.
+        */
+       root_blockref.key = rawip->name_key;
+       root_blockref.copyid = HAMMER2_COPYID_LOCAL;
+       root_blockref.keybits = 0;
+       root_blockref.check.iscsi32.value =
+                                       hammer2_icrc32(rawip, sizeof(*rawip));
+       root_blockref.type = HAMMER2_BREF_TYPE_INODE;
+       root_blockref.methods = HAMMER2_ENC_CHECKMETHOD(HAMMER2_CHECK_ICRC) |
+                               HAMMER2_ENC_COMPMETHOD(HAMMER2_COMP_AUTOZERO);
+
+       /*
+        * Format the super-root directory inode, giving it one directory
+        * entry (root_blockref) and fixup the icrc method.
+        *
+        * The superroot contains one directory entry pointing at the root
+        * inode (named via the label).  Inodes contain one blockset which
+        * is fully associative so we can put the entry anywhere without
+        * having to worry about the hash.  Use index 0.
+        */
+       rawip = (void *)(buf + (HAMMER2_OFF_MASK_LO & sroot_blockref.data_off));
+       rawip->version = HAMMER2_INODE_VERSION_ONE;
+       rawip->ctime = now;
+       rawip->mtime = now;
+       /* rawip->atime = now; NOT IMPL MUST BE ZERO */
+       rawip->btime = now;
+       rawip->type = HAMMER2_OBJTYPE_DIRECTORY;
+       rawip->mode = 0700;             /* super-root - root only */
+       rawip->inum = 0;                /* super root inode, inumber 0 */
+       rawip->nlinks = 2;              /* directory link count compat */
+
+       rawip->name_len = 0;            /* super-root is unnamed */
+       rawip->name_key = 0;
+
+       rawip->comp_algo = HAMMER2_COMP_AUTOZERO;
+
+       /*
+        * The super-root is flagged as a PFS and typically given its own
+        * random FSID, making it possible to mirror an entire HAMMER2 disk
+        * snapshots and all if desired.  PFS ids are used to match up
+        * mirror sources and targets and cluster copy sources and targets.
+        */
+       rawip->pfs_id = Hammer2_SPFSId;
+       rawip->pfs_type = HAMMER2_PFSTYPE_MASTER;
+       rawip->op_flags |= HAMMER2_OPFLAG_PFSROOT;
+
+       /*
+        * The super-root has one directory entry pointing at the named
+        * root inode.
+        */
+       rawip->u.blockset.blockref[0] = root_blockref;
+
+       /*
+        * The sroot blockref will be stored in the volume header.
+        */
+       sroot_blockref.copyid = HAMMER2_COPYID_LOCAL;
+       sroot_blockref.keybits = 0;
+       sroot_blockref.check.iscsi32.value =
+                                       hammer2_icrc32(rawip, sizeof(*rawip));
+       sroot_blockref.type = HAMMER2_BREF_TYPE_INODE;
+       sroot_blockref.methods = HAMMER2_ENC_CHECKMETHOD(HAMMER2_CHECK_ICRC) |
+                                HAMMER2_ENC_COMPMETHOD(HAMMER2_COMP_AUTOZERO);
+
+       /*
+        * Write out the 64K HAMMER2 block containing the root and sroot.
+        */
+       n = pwrite(fd, buf, HAMMER2_PBUFSIZE,
+                  root_blockref.data_off & HAMMER2_OFF_MASK_HI);
+       if (n != HAMMER2_PBUFSIZE) {
+               perror("write");
+               exit(1);
+       }
+
+       /*
+        * Format the volume header.
+        *
+        * The volume header points to sroot_blockref.  Also be absolutely
+        * sure that allocator_beg is set.
+        */
+       bzero(buf, HAMMER2_PBUFSIZE);
+       vol = (void *)buf;
+
+       vol->magic = HAMMER2_VOLUME_ID_HBO;
+       vol->boot_beg = boot_base;
+       vol->boot_end = boot_base + BootAreaSize;
+       vol->aux_beg = aux_base;
+       vol->aux_end = aux_base + AuxAreaSize;
+       vol->volu_size = total_space;
+       vol->version = Hammer2Version;
+       vol->flags = 0;
+
+       vol->fsid = Hammer2_FSId;
+       vol->fstype = Hammer2_FSType;
+
+       vol->allocator_size = free_space;
+       vol->allocator_free = free_space;
+       vol->allocator_beg = alloc_base;
+
+       vol->sroot_blockset.blockref[0] = sroot_blockref;
+       vol->mirror_tid = 0;
+       vol->alloc_tid = 16;
+       vol->icrc_sects[HAMMER2_VOL_ICRC_SECT1] =
+                       hammer2_icrc32((char *)vol + HAMMER2_VOLUME_ICRC1_OFF,
+                                      HAMMER2_VOLUME_ICRC1_SIZE);
+
+       /*
+        * Set ICRC_SECT0 after all remaining elements of sect0 have been
+        * populated in the volume header.  Note that ICRC_SECT* (except for
+        * SECT0) are part of sect0.
+        */
+       vol->icrc_sects[HAMMER2_VOL_ICRC_SECT0] =
+                       hammer2_icrc32((char *)vol + HAMMER2_VOLUME_ICRC0_OFF,
+                                      HAMMER2_VOLUME_ICRC0_SIZE);
+       vol->icrc_volheader =
+                       hammer2_icrc32((char *)vol + HAMMER2_VOLUME_ICRCVH_OFF,
+                                      HAMMER2_VOLUME_ICRCVH_SIZE);
+
+       /*
+        * Write the volume header and all alternates.  Copies whose
+        * offset would fall at or beyond total_space are not written.
+        */
+       for (i = 0; i < HAMMER2_NUM_VOLHDRS; ++i) {
+               if (i * HAMMER2_ZONE_BYTES64 >= total_space)
+                       break;
+               n = pwrite(fd, buf, HAMMER2_PBUFSIZE,
+                          volu_base + i * HAMMER2_ZONE_BYTES64);
+               if (n != HAMMER2_PBUFSIZE) {
+                       perror("write");
+                       exit(1);
+               }
+       }
+
+       /*
+        * Cleanup
+        */
+       free(buf);
+}
+
+/*
+ * Carve a power-of-2 sized allocation out of the linear space at *basep.
+ *
+ * bytes must be a non-zero power of 2 (asserted).  Its log2 (the radix),
+ * raised to HAMMER2_MIN_RADIX if it is smaller, is stored in bref->vradix
+ * and OR'd into bref->data_off together with the current base.  All other
+ * fields of *bref are zeroed.  *basep is then advanced by (1 << radix).
+ */
+static void
+alloc_direct(hammer2_off_t *basep, hammer2_blockref_t *bref, size_t bytes)
+{
+       int radix;
+
+       radix = 0;
+       assert(bytes);
+       while ((bytes & 1) == 0) {
+               bytes >>= 1;
+               ++radix;
+       }
+       assert(bytes == 1);
+       if (radix < HAMMER2_MIN_RADIX)
+               radix = HAMMER2_MIN_RADIX;
+
+       bzero(bref, sizeof(*bref));
+       bref->data_off = *basep | radix;
+       bref->vradix = radix;
+
+       /*
+        * Shift in the offset type rather than in unsigned int so that
+        * a radix of 32 or more advances the base correctly.
+        */
+       *basep += (hammer2_off_t)1 << radix;
+}
+
+/*
+ * Directory name hash, derived from HAMMER1's directory hash algorithm #1
+ * but built on the iscsi crc (hammer2_icrc32) instead of the old crc32.
+ *
+ * Layout of the returned 64-bit key:
+ *
+ *     bits 32-63      sum of the crcs of the name's fields, where fields
+ *                     are the non-empty runs between the separators
+ *                     '.', '-', '_' and '~'.  Bit 63 is then forced to 1
+ *                     (HAMMER1 sets it to 0); bit63=0 is used for hidden
+ *                     hardlinked inodes.
+ *     bits 16-31      taken from the crc of the entire name folded onto
+ *                     itself, which reduces degenerate hash collisions.
+ *     bit  15         always set, so readdir can strip bit 63 and still
+ *                     keep 0x0000-0x7FFF free for artificial entries
+ *                     ('.' and '..').
+ *     bits 0-14       zero.
+ */
+static hammer2_key_t
+dirhash(const unsigned char *name, size_t len)
+{
+       uint64_t hashkey;
+       uint32_t fieldsum = 0;
+       uint32_t whole;
+       size_t seg = 0;                 /* start of the current field */
+       size_t cur;
+
+       /*
+        * Add up the crc of every non-empty field.
+        */
+       for (cur = 0; cur < len; ++cur) {
+               switch (name[cur]) {
+               case '.':
+               case '-':
+               case '_':
+               case '~':
+                       if (cur > seg) {
+                               fieldsum += hammer2_icrc32(name + seg,
+                                                          cur - seg);
+                       }
+                       seg = cur + 1;
+                       break;
+               default:
+                       break;
+               }
+       }
+       if (seg < len)
+               fieldsum += hammer2_icrc32(name + seg, len - seg);
+
+       /*
+        * Field sum goes into the top 32 bits with bit 63 forced on.
+        */
+       hashkey = (uint64_t)(fieldsum | 0x80000000U) << 32;
+
+       /*
+        * Whole-name crc, folded, supplies bits 16-31.
+        */
+       whole = hammer2_icrc32(name, len);
+       whole ^= whole << 16;
+       hashkey |= whole & 0xFFFF0000U;
+
+       /*
+        * Bit 15 is always set.
+        */
+       hashkey |= 0x8000U;
+
+       return (hashkey);
+}
diff --git a/sys/vfs/hammer2/CHANGES b/sys/vfs/hammer2/CHANGES
new file mode 100644 (file)
index 0000000..e9243b0
--- /dev/null
@@ -0,0 +1,10 @@
+
+                           DESIGN CHANGES & ISSUES
+
+* Indirect blocks have to be fully associative (all 1024 entries) for now,
+  I haven't figured out a way to break it down into smaller associative
+  blocks without breaking copies.
+
+* (temporary) all data blocks are 64K at the moment.
+
+* currently directory cookies are non-linear.
diff --git a/sys/vfs/hammer2/DESIGN b/sys/vfs/hammer2/DESIGN
new file mode 100644 (file)
index 0000000..fe10627
--- /dev/null
@@ -0,0 +1,402 @@
+
+                           HAMMER2 DESIGN DOCUMENT
+
+                               Matthew Dillon
+                                08-Feb-2012
+                            dillon@backplane.com
+
+* These features have been speced in the media structures.
+
+* Implementation work has begun.
+
+* A working filesystem with some features implemented is expected by July 2012.
+
+* A fully functional filesystem with most (but not all) features is expected
+  by the end of 2012.
+
+* All elements of the filesystem have been designed except for the freemap
+  (which isn't needed for initial work).  8MB per 2GB of filesystem
+  storage has been reserved for the freemap.  The design of the freemap
+  is expected to be completely speced by mid-year.
+
+* This is my only project this year.  I'm not going to be doing any major
+  kernel bug hunting this year.
+
+                               Feature List
+
+* Multiple roots (allowing snapshots to be mounted).  This is implemented
+  via the super-root concept.  When mounting a HAMMER2 filesystem you specify
+  a device path and a directory name in the super-root.
+
+* HAMMER1 had PFS's.  HAMMER2 does not.  Instead, in HAMMER2 any directory
+  in the tree can be configured as a PFS, causing all elements recursively
+  underneath that directory to become a part of that PFS.
+
+* Writable snapshots.  Any subdirectory tree can be snapshotted.  Snapshots
+  show up in the super-root.  It is possible to snapshot a subdirectory
+  and then later snapshot a parent of that subdirectory... really there are
+  no limitations here.
+
+* Directory sub-hierarchy based quotas and space and inode usage tracking.
+  Any directory sub-tree, whether at a mount point or not, tracks aggregate
+  inode use and data space use.  This is stored in the directory inode all
+  the way up the chain.
+
+* Incremental queueless mirroring / mirroring-streams.  Because HAMMER2 is
+  block-oriented and copy-on-write each blockref tracks both direct
+  modifications to the referenced data via (modify_tid) and indirect
+  modifications to the referenced data or any sub-tree via (mirror_tid).
+  This makes it possible to do an incremental scan of meta-data that covers
+  only changes made since the mirror_tid recorded in a prior-run.
+
+  This feature is also intended to be used to locate recently allocated
+  blocks and thus be able to fixup the freemap after a crash.
+
+  HAMMER2 mirroring works a bit differently than HAMMER1 mirroring in
+  that HAMMER2 does not keep track of 'deleted' records.  Instead any
+  recursion by the mirroring code which finds that (modify_tid) has
+  been updated must also send the direct block table or indirect block
+  table state it winds up recursing through so the target can check
+  similar key ranges and locate elements to be deleted.  This can be
+  avoided if the mirroring stream is mostly caught up in that very recent
+  deletions will be cached in memory and can be queried, allowing shorter
+  record deletions to be passed in the stream instead.
+
+* Will support multiple compression algorithms configured on subdirectory
+  tree basis and on a file basis.  Up to 64K block compression will be used.
+  Only compression ratios near powers of 2 that are at least 2:1 (e.g. 2:1,
+  4:1, 8:1, etc) will work in this scheme because physical block allocations
+  in HAMMER2 are always power-of-2.
+
+  Compression algorithm #0 will mean no compression and no zero-checking.
+  Compression algorithm #1 will mean zero-checking but no other compression.
+  Real compression will be supported starting with algorithm 2.
+
+* Zero detection on write (writing all-zeros), which requires the data
+  buffer to be scanned, will be supported as compression algorithm #1.
+  This allows the writing of 0's to create holes and will be the default
+  compression algorithm for HAMMER2.
+
+* Copies support for redundancy.  Each copy has its own blockref.  The
+  blockrefs representing the copies must exist within the same blockset
+  (set of 8 blockrefs), though I may relax this requirement in the
+  implementation.
+
+  The design is such that the filesystem should be able to function at
+  full speed even if disks are pulled or inserted, as long as at least one
+  good copy is present.  A background task will be needed to resynchronize
+  missing copies (or remove excessive copies in the case where the copies
+  value is reduced on a live filesystem).
+
+  Copies are specified using the same copyinfo[] array that is used to
+  specify cluster interconnections for PFS's.
+
+* Clusterable with MESI cache coherency and dynamic granularity.
+  The media format for HAMMER1 was less conducive to logical clustering
+  than I had hoped so I was never able to get that aspect of my personal goals
+  working with HAMMER1.  HAMMER2 effectively solves the issues that cropped
+  up with HAMMER1 (mainly that HAMMER1's B-Tree did not reflect the logical
+  file/directory hierarchy, making cache coherency very difficult).
+
+* Hardlinks will be supported.  All other standard features will be supported
+  too of course.  Hardlinks in this sort of filesystem require significant
+  work.
+
+* The media blockref structure is now large enough to support up to a 192-bit
+  check value, which would typically be a cryptographic hash of some sort.
+  Multiple check value algorithms will be supported with the default being
+  a simple 32-bit iSCSI CRC.
+
+* Fully verified deduplication will be supported and automatic (and
+  necessary in many respects).
+
+* Non-verified de-duplication will be supported as a configurable option on
+  a file or subdirectory tree.  Non-verified deduplication would use the
+  largest available check code (192 bits) and not bother to verify data
+  matches during the dedup pass, which is necessary on extremely large
+  filesystems with a great deal of deduplicable data (as otherwise a large
+  chunk of the media would have to be read to implement the dedup).
+
+  This feature is intended only for those files where occasional corruption
+  is ok, such as in a large data store of farmed web content.
+
+                               GENERAL DESIGN
+
+HAMMER2 generally implements a copy-on-write block design for the filesystem,
+which is very different from HAMMER1's B-Tree design.  Because the design
+is copy-on-write it can be trivially snapshotted simply by referencing an
+existing block, and because the media structures logically match a standard
+filesystem directory/file hierarchy snapshots and other similar operations
+can be trivially performed on an entire subdirectory tree at any level in
+the filesystem.
+
+The copy-on-write nature of the filesystem implies that any modification
+whatsoever will have to eventually synchronize new disk blocks all the way
+to the super-root of the filesystem and the volume header itself.  This forms
+the basis for crash recovery.  All disk writes are to new blocks except for
+the volume header, thus allowing all writes to run concurrently except for
+the volume header update at the end.
+
+Clearly this method requires intermediate modifications to the chain to be
+cached so multiple modifications can be aggregated prior to being
+synchronized.  One advantage, however, is that the cache can be flushed at
+any time WITHOUT having to allocate yet another new block when further
+modifications are made as long as the volume header has not yet been flushed.
+This means that buffer cache overhead is very well bounded and can handle
+filesystem operations of any complexity even on boxes with very small amounts
+of physical memory.
+
+I intend to implement a shortcut to make fsync()'s run fast, and that is to
+allow deep updates to blockrefs to shortcut to auxiliary space in the
+volume header to satisfy the fsync requirement.  The related blockref is
+then recorded when the filesystem is mounted after a crash and the update
+chain is reconstituted when a matching blockref is encountered again during
+normal operation of the filesystem.
+
+Basically this means that no real work needs to be done at mount-time
+even after a crash.
+
+Directories are hashed, and another major design element is that directory
+entries ARE INODES.  They are one and the same.  In addition to directory
+entries being inodes the data for very small files (512 bytes or smaller)
+can be directly embedded in the inode (overloaded onto the same space that
+the direct blockref array uses).  This should result in very high
+performance.
+
+Inode numbers are not spatially referenced, which complicates NFS servers
+but doesn't complicate anything else.  The inode number is stored in the
+inode itself, an absolutely necessary feature in order to support the
+hugely flexible snapshots that we want to have in HAMMER2.
+
+                                 HARDLINKS
+
+Hardlinks are a particularly sticky problem for HAMMER2 due to the lack of
+a spatial reference to the inode number.  We do not want to have to have
+an index of inode numbers for any basic HAMMER2 feature if we can help it.
+
+Hardlinks are handled by placing the inode for a multiply-hardlinked file
+in the closest common parent directory.  If "a/x" and "a/y" are hardlinked
+the inode for the hardlinked file will be placed in directory "a", e.g.
+"a/3239944", but it will be invisible and will be in an out-of-band namespace.
+The directory entries "a/x" and "a/y" will be given the same inode number
+but in fact just be placemarks that cause HAMMER2 to recurse upwards through
+the directory tree to find the invisible inode number.
+
+Because directories are hashed and a different namespace (hash key range)
+is used for hardlinked inodes, standard directory scans are able to trivially
+skip this invisible namespace and inode-specific lookups can restrict their
+lookup to within this space.
+
+The nature of snapshotting makes handling link-count 2->1 and 1->2 cases
+trivial.  Basically the inode media structure is copied as needed to break-up
+or re-form the standard directory entry/inode.  There are no backpointers in
+HAMMER2 and no reference counts on the blocks (see FREEMAP NOTES below), so
+it is an utterly trivial operation.
+
+                               FREEMAP NOTES
+
+In order to implement fast snapshots (and writable snapshots for that
+matter), HAMMER2 does NOT ref-count allocations.  The freemap which
+is still under design just won't do that.  All the freemap does is
+keep track of 100% free blocks.
+
+This not only trivializes all the snapshot features it also trivializes
+hardlink handling and solves the problem of keeping the freemap synchronized
+in the event of a crash.  Now all we have to do after a crash is make
+sure blocks allocated before the freemap was flushed are properly
+marked as allocated in the allocmap.  This is a trivial exercise using the
+same algorithm the mirror streaming code uses (which is very similar to
+HAMMER1)... an incremental meta-data scan that covers only the blocks that
+might have been allocated between the last allocation map sync and now.
+
+Thus the freemap does not have to be synchronized during a fsync().
+
+The complexity is in figuring out what can be freed... that is, when one
+can mark blocks in the freemap as being free.  HAMMER2 implements this as
+a background task which essentially must scan available meta-data to
+determine which blocks are not being referenced.
+
+Part of the ongoing design work is finding ways to reduce the scope of this
+meta-data scan so the entire filesystem's meta-data does not need to be
+scanned (though in tests with HAMMER1, even full meta-data scans have
+turned out to be fairly low cost).  In other words, it's an area that we
+can continue to improve on as the filesystem matures.  Not only that, but
+we can completely change the freemap algorithms without creating
+incompatibilities (at worst simply having to require that a R+W mount do
+a full meta-data scan when upgrading or downgrading the freemap algorithm).
+
+                                 CLUSTERING
+
+Clustering, as always, is the most difficult bit but we have some advantages
+with HAMMER2 that we did not have with HAMMER1.  First, HAMMER2's media
+structures generally follow the kernel's filesystem hierarchy.  Second,
+HAMMER2's writable snapshots make it possible to implement several forms
+of multi-master clustering.
+
+The mount device path you specify serves to bootstrap your entry into
+the cluster.  This can be local media or directly specify a network
+cluster connection (or several).  When a local media mount is used the
+volume header is scanned for local copies and the best volume header is
+selected from all available copies.  Multiple devices may be specified for
+redundancy.
+
+The volume header on local media also contains cluster connection
+specifications keyed by super-root pfsid.  Network connections are
+maintained to all targets.  ALL ELEMENTS ARE TREATED ACCORDING TO TYPE
+NO MATTER WHICH ONE YOU MOUNT FROM.
+
+The actual networked cluster may be far larger than the elements you list
+in the hammer2_copy_data[] array, but your machine will only make direct
+connections as specified by the array.
+
+In the simplest case you simply network a few machines together as ring 0
+masters and each client connects directly to all the masters (and/or are
+the masters themselves).  Thus any quorum operation is straight-forward.
+These master nodes are labeled 'ring 0'.
+
+If you have too many clients to reasonably connect directly you set up
+sub-clusters as satellites.  This is called 'ring 1'.  Ring 1 may contain
+several sub-clusters.  A client then connects to all the nodes in a
+particular sub-cluster (typically 3).  The quorum protocol runs as per
+normal except that once the operation is resolved against the sub-cluster
+an aggregation must be resolved against the master nodes (ring 0).  The
+sub-cluster does this for the client... all the client sees is the normal
+quorum operation against the sub-cluster.
+
+Since each node in the sub-cluster connects to all master nodes we get
+a multiplication.  If we set a reasonable upper limit of, say, 256
+connections at each master node then ring 1 may contain 85 sub-clusters x 3
+nodes in each sub-cluster.
+
+In the most complex case when one wishes to support potentially millions
+of clients then further fan-out is required into ring 2, ring 3, and
+so forth.  However, each sub-cluster in ring 2 must only connect to
+1 sub-cluster in ring 1 (otherwise the cache state will become mightily
+confused).  Using reasonable metrics this will allow ring 2 to contain
+85 * 85 = 7225 sub-clusters.  At this point you could have 1000 clients
+connect to each sub-cluster and support 7.2 million clients, but if that
+isn't enough going to another ring will support 614M clients, and so forth.
+
+Each ring imposes additional latencies for cache operations but the key
+to making this work efficiently is that the satellite clusters can negotiate
+coarse-grained cache coherency locks with the next lower ring and then
+fan-out finer-grained locks to the next higher ring.  Since caching can
+occur anywhere (including on the connecting client), it is the cache
+coherency lock that ultimately dictates efficiency and allows a client
+(or satellite) to access large amounts of data from local storage.
+
+Modifying operations, particularly commits, also have higher latencies
+when multiple rings are in use.  In this situation it is possible to
+short-cut localized operations by having competing clients connect to
+sub-clusters which are near each other topologically... having the
+competing clients connect to the same sub-cluster would be the most optimal.
+
+In addition, sub-clusters (typically in ring 1) can act in SOFT_MASTER mode
+which allows the sub-cluster to acknowledge a full commit within its own
+quorum only, and then resolve asynchronously to the masters in ring 0.
+
+The nodes in these intermediate rings can be pure proxies with only memory
+caches, use local media for persistent cache, or use local media to
+completely slave the filesystem.
+
+    ADMIN      - Media does not participate, administrative proxy only
+    CACHE      - Media only acts as a persistent cache
+    COPY       - Media only acts as a local copy
+    SLAVE      - Media is a RO slave that can be mounted RW
+
+    SOFT_SLAVE - This is a SLAVE which can become writable when
+                 the quorum is not available, but is not guaranteed
+                 to be able to be merged back when the quorum becomes
+                 available again.  Elements which cannot be merged
+                 back remain localized and writable until manual
+                 or scripted intervention recombines them.
+
+    SOFT_MASTER        - Similar to the above but can form a sub-cluster
+                 and run the quorum protocol within the sub-cluster
+                 to serve machines that connect to the sub-cluster
+                 when the master cluster is not available.
+
+                 The SOFT_MASTER nodes in a sub-cluster must be
+                 fully interconnected with each other.
+
+    MASTER     - This is a MASTER node in the quorum protocol.
+
+                 The MASTER nodes in a cluster must be fully
+                 interconnected with each other.
+
+There are four major protocols:
+
+    Quorum protocol
+
+       This protocol is used between MASTER nodes to vote on operations
+       and resolve deadlocks.
+
+       This protocol is used between SOFT_MASTER nodes in a sub-cluster
+       to vote on operations, resolve deadlocks, determine what the latest
+       transaction id for an element is, and to perform commits.
+
+    Cache sub-protocol
+
+       This is the MESI sub-protocol which runs under the Quorum
+       protocol.  This protocol is used to maintain cache state for
+       sub-trees to ensure that operations remain cache coherent.
+
+       Depending on administrative rights this protocol may or may
+       not allow a leaf node in the cluster to hold a cache element
+       indefinitely.  The administrative controller may preemptively
+       downgrade a leaf with insufficient administrative rights
+       without giving it a chance to synchronize any modified state
+       back to the cluster.
+
+    Proxy protocol
+
+       The Quorum and Cache protocols only operate between MASTER
+       and SOFT_MASTER nodes.  All other node types must use the
+       Proxy protocol to perform similar actions.  This protocol
+       differs in that proxy requests are typically sent to just
+       one adjacent node and that node then maintains state and
+       forwards the request or performs the required operation.
+       When the link is lost to the proxy, the proxy automatically
+       forwards a deletion of the state to the other nodes based on
+       what it has recorded.
+
+       If a leaf has insufficient administrative rights it may not
+       be allowed to actually initiate a quorum operation and may only
+       be allowed to maintain partial MESI cache state or perhaps none
+       at all (since cache state can block other machines in the
+       cluster).  Instead a leaf with insufficient rights will have to
+       make do with a preemptive loss of cache state and any allowed
+       modifying operations will have to be forwarded to the proxy which
+       continues forwarding it until a node with sufficient administrative
+       rights is encountered.
+
+       To reduce issues and give the cluster more breadth, sub-clusters
+       made up of SOFT_MASTERs can be formed in order to provide full
+       cache coherency within a subset of machines and yet still tie them
+       into a greater cluster that they normally would not have such
+       access to.  This effectively makes it possible to create a two
+       or three-tier fan-out of groups of machines which are cache-coherent
+       within the group, but perhaps not between groups, and use other
+       means to synchronize between the groups.
+
+    Media protocol
+
+       This is basically the physical media protocol.
+
+There are lots of ways to implement multi-master environments using the
+above core features but the implementation is going to be fairly complex
+even with HAMMER2's feature set.
+
+Keep in mind that modifications propagate all the way to the super-root
+and volume header, so in any clustered arrangement the use of (modify_tid)
+and (mirror_tid) is critical in determining the synchronization state of
+portion(s) of the filesystem.
+
+Specifically, since any modification propagates to the root the (mirror_tid)
+in higher level directories is going to be in a constant state of flux.  This
+state of flux DOES NOT invalidate the cache state for these higher levels
+of directories.  Instead, the (modify_tid) is used on a node-by-node basis
+to determine cache state at any given level, and (mirror_tid) is used to
+determine whether any recursively underlying state is desynchronized.
+The inode structure also has two additional transaction ids used to optimize
+path lookups, stat, and directory lookup/scan operations.
diff --git a/sys/vfs/hammer2/Makefile b/sys/vfs/hammer2/Makefile
new file mode 100644 (file)
index 0000000..f0b2ffe
--- /dev/null
@@ -0,0 +1,12 @@
+# Makefile for hammer2 vfs
+# Builds the hammer2 kernel module (KMOD) from SRCS via bsd.kmod.mk;
+# -DINVARIANTS is appended to CFLAGS for every compile.
+.PATH: ${.CURDIR}
+
+CFLAGS+= -DINVARIANTS
+KMOD=  hammer2
+SRCS=  hammer2_vfsops.c hammer2_vnops.c hammer2_inode.c
+SRCS+= hammer2_chain.c hammer2_freemap.c hammer2_subr.c hammer2_icrc.c
+SRCS+= hammer2_ioctl.c
+
+.include <bsd.kmod.mk>
diff --git a/sys/vfs/hammer2/TODO b/sys/vfs/hammer2/TODO
new file mode 100644 (file)
index 0000000..df6ae9d
--- /dev/null
@@ -0,0 +1,56 @@
+* Nesting problems in the flusher.
+
+* Inefficient vfsync due to thousands of file buffers, one per-vnode.
+  (need to aggregate using a device buffer?)
+
+* Adjust the flusher to unlock the parent after the child is locked,
+  then restart if the parent changed out from under us.  This will
+  greatly reduce namecache contention.
+
+* Use bp->b_dep to interlock the buffer with the chain structure so the
+  strategy code can calculate the crc and assert that the chain is marked
+  modified (not yet flushed).
+
+* Deleted inode not reachable via tree for volume flush but still reachable
+  via fsync/inactive/reclaim.  Its tree can be destroyed at that point.
+
+* The direct write code needs to invalidate any underlying physical buffers.
+  Direct write needs to be implemented.
+
+* Make sure a resized block (hammer2_chain_resize()) calculates a new
+  hash code in the parent bref
+
+* The freemap allocator needs to getblk/clrbuf/bdwrite any partial
+  block allocations (less than 64KB) that allocate out of a new 64K
+  block, to avoid causing a read-before-write I/O.
+
+* Check flush race upward recursion setting SUBMODIFIED vs downward
+  recursion checking SUBMODIFIED then locking (must clear before the
+  recursion and might need additional synchronization)
+
+* There is definitely a flush race in the hardlink implementation between
+  the forwarding entries and the actual (hidden) hardlink inode.
+
+  This will require us to associate a small hard-link-adjust structure
+  with the chain whenever we create or delete hardlinks, on top of
+  adjusting the hardlink inode itself.  Any actual flush to the media
+  has to synchronize the correct nlinks value based on whether related
+  created or deleted hardlinks were also flushed.
+
+* When a directory entry is created and also if an indirect block is
+  created and entries moved into it, the directory seek position can
+  potentially become incorrect during a scan.
+
+* When a directory entry is deleted a directory seek position depending
+  on that key can cause readdir to skip entries.
+
+* TWO PHASE COMMIT - store two data offsets in the chain, and
+  hammer2_chain_delete() needs to leave the chain intact if MODIFIED2 is
+  set on its buffer until the flusher gets to it?
+
+
+                               OPTIMIZATIONS
+
+* If a file is unlinked but its descriptor is left open and used, we
+  should allow data blocks on-media to be reused since there is no
+  topology left to point at them.
diff --git a/sys/vfs/hammer2/donew b/sys/vfs/hammer2/donew
new file mode 100755 (executable)
index 0000000..db573ea
--- /dev/null
@@ -0,0 +1,5 @@
+#!/bin/csh
+# Unmount /mnt, then run newfs_hammer2 -L ROOT on /dev/da0s1d (hard-coded).
+
+umount /mnt
+newfs_hammer2 -L ROOT /dev/da0s1d
diff --git a/sys/vfs/hammer2/donew2 b/sys/vfs/hammer2/donew2
new file mode 100755 (executable)
index 0000000..d98c5a2
--- /dev/null
@@ -0,0 +1,5 @@
+#!/bin/csh
+# Unmount /mnt, then run newfs_hammer2 -L ROOT on /dev/da0s1b (hard-coded).
+
+umount /mnt
+newfs_hammer2 -L ROOT /dev/da0s1b
diff --git a/sys/vfs/hammer2/dossd b/sys/vfs/hammer2/dossd
new file mode 100755 (executable)
index 0000000..946bf7b
--- /dev/null
@@ -0,0 +1,11 @@
+#!/bin/csh
+# Remount helper: reload hammer2.ko if absent, mount /dev/da0s1d@ROOT on /mnt.
+
+umount /mnt >& /dev/null
+kldunload hammer2.ko >& /dev/null
+kldstat | fgrep hammer2.ko >& /dev/null
+if ( $status > 0 ) then
+    kldload /usr/obj/usr/src/sys/vfs/hammer2/hammer2.ko
+endif
+mount_hammer2 /dev/da0s1d@ROOT /mnt
+sysctl vfs.hammer2.debug=0
diff --git a/sys/vfs/hammer2/dossd2 b/sys/vfs/hammer2/dossd2
new file mode 100755 (executable)
index 0000000..124869c
--- /dev/null
@@ -0,0 +1,11 @@
+#!/bin/csh
+# Remount helper: reload hammer2.ko if absent, mount /dev/da0s1b@ROOT on /mnt.
+
+umount /mnt >& /dev/null
+kldunload hammer2.ko >& /dev/null
+kldstat | fgrep hammer2.ko >& /dev/null
+if ( $status > 0 ) then
+    kldload /usr/obj/usr/src/sys/vfs/hammer2/hammer2.ko
+endif
+mount_hammer2 /dev/da0s1b@ROOT /mnt
+sysctl vfs.hammer2.debug=0
diff --git a/sys/vfs/hammer2/dotest b/sys/vfs/hammer2/dotest
new file mode 100755 (executable)
index 0000000..803de0c
--- /dev/null
@@ -0,0 +1,11 @@
+#!/bin/csh
+# Remount helper: reload hammer2.ko if absent, mount /dev/vn0@ROOT on /mnt.
+
+# ./mkvntest
+umount /mnt >& /dev/null
+kldunload hammer2.ko >& /dev/null
+kldstat | fgrep hammer2.ko >& /dev/null
+if ( $status > 0 ) then
+    kldload /usr/obj/usr/src/sys/vfs/hammer2/hammer2.ko
+endif
+mount_hammer2 /dev/vn0@ROOT /mnt
diff --git a/sys/vfs/hammer2/hammer2.h b/sys/vfs/hammer2/hammer2.h
new file mode 100644 (file)
index 0000000..265c2c8
--- /dev/null
@@ -0,0 +1,468 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * This header file contains structures used internally by the HAMMER2
+ * implementation.  See hammer2_disk.h for on-disk structures.
+ */
+
+#ifndef _VFS_HAMMER2_HAMMER2_H_
+#define _VFS_HAMMER2_HAMMER2_H_
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/kernel.h>
+#include <sys/conf.h>
+#include <sys/systm.h>
+#include <sys/tree.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/vnode.h>
+#include <sys/proc.h>
+#include <sys/mountctl.h>
+#include <sys/priv.h>
+#include <sys/stat.h>
+#include <sys/globaldata.h>
+#include <sys/lockf.h>
+#include <sys/buf.h>
+#include <sys/queue.h>
+#include <sys/limits.h>
+#include <sys/buf2.h>
+#include <sys/signal2.h>
+#include <sys/tree.h>
+
+#include "hammer2_disk.h"
+#include "hammer2_mount.h"
+#include "hammer2_ioctl.h"
+
+struct hammer2_chain;
+struct hammer2_inode;
+struct hammer2_mount;
+struct hammer2_pfsmount;
+
+/*
+ * The chain structure tracks blockref recursions all the way to
+ * the root volume.  These consist of indirect blocks, inodes,
+ * and eventually the volume header.
+ *
+ * The chain structure is embedded in the hammer2_mount, hammer2_inode,
+ * and other system memory structures.  The chain structure typically
+ * implements the reference count and busy flag for the larger structure.
+ *
+ * It is always possible to track a chain element all the way back to the
+ * root by following the (parent) links.  (index) is a type-dependent index
+ * in the parent indicating where in the parent the chain element resides.
+ *
+ * When a blockref is added or deleted the related chain element is marked
+ * modified and all of its parents are marked SUBMODIFIED (the parent
+ * recursion can stop once we hit a node that is already marked SUBMODIFIED).
+ * A deleted chain element must remain intact until synchronized against
+ * its parent.
+ *
+ * The blockref at (parent, index) is not adjusted until the modified chain
+ * element is flushed and unmarked.  Until then the child's blockref may
+ * not match the blockref at (parent, index).
+ */
+SPLAY_HEAD(hammer2_chain_splay, hammer2_chain);
+
+struct hammer2_chain {                 /* one element of a blockref chain */
+       struct hammer2_blockref bref;           /* this element's blockref */
+       struct hammer2_blockref bref_flush;     /* synchronized w/MOVED bit */
+       struct hammer2_chain *parent;           /* return chain to root */
+       struct hammer2_chain_splay shead;       /* splay head (children? confirm) */
+       SPLAY_ENTRY(hammer2_chain) snode;       /* hammer2_chain_splay linkage */
+       TAILQ_ENTRY(hammer2_chain) flush_node;  /* flush deferral list */
+       union {                                 /* typed views of one pointer */
+               struct hammer2_inode *ip;
+               struct hammer2_indblock *np;
+               struct hammer2_data *dp;
+               void *mem;
+       } u;                    /* presumably the embedding struct - confirm */
+
+       struct buf      *bp;            /* buffer cache (ro) */
+       hammer2_media_data_t *data;     /* modified copy of data (rw) */
+       u_int           bytes;          /* physical size of data */
+       struct lock     lk;             /* lockmgr lock */
+       int             index;          /* index in parent */
+       u_int           refs;           /* reference count */
+       u_int           busy;           /* soft-busy */
+       u_int           flags;          /* HAMMER2_CHAIN_* bits */
+};
+
+typedef struct hammer2_chain hammer2_chain_t;
+
+int hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2);
+SPLAY_PROTOTYPE(hammer2_chain_splay, hammer2_chain, snode, hammer2_chain_cmp);
+
+/*
+ * MOVED - This bit is set during the flush when the MODIFIED bit is cleared,
+ *        indicating that the parent's blocktable must inherit a change to
+ *        the bref (typically a block reallocation)
+ *
+ *        It must also be set in situations where a chain is not MODIFIED
+ *        but whose bref has changed (typically due to fields other than
+ *        a block reallocation).
+ */
+#define HAMMER2_CHAIN_MODIFIED         0x00000001      /* active mods */
+#define HAMMER2_CHAIN_DIRTYEMBED       0x00000002      /* inode embedded */
+#define HAMMER2_CHAIN_DIRTYBP          0x00000004      /* dirty on unlock */
+#define HAMMER2_CHAIN_SUBMODIFIED      0x00000008      /* 1+ subs modified */
+#define HAMMER2_CHAIN_DELETED          0x00000010
+#define HAMMER2_CHAIN_INITIAL          0x00000020      /* initial create */
+#define HAMMER2_CHAIN_FLUSHED          0x00000040      /* flush on unlock */
+#define HAMMER2_CHAIN_MOVED            0x00000080      /* bref changed */
+#define HAMMER2_CHAIN_IOFLUSH          0x00000100      /* bawrite on put */
+#define HAMMER2_CHAIN_DEFERRED         0x00000200      /* on a deferral list*/
+#define HAMMER2_CHAIN_DESTROYED                0x00000400      /* destroying */
+#define HAMMER2_CHAIN_MODIFIED_AUX     0x00000800      /* hmp->vchain only */
+#define HAMMER2_CHAIN_MODIFY_TID       0x00001000      /* mod updates field */
+#define HAMMER2_CHAIN_MOUNTED          0x00002000      /* PFS is mounted */
+
+/*
+ * Flags passed to hammer2_chain_lookup() and hammer2_chain_next()
+ */
+#define HAMMER2_LOOKUP_NOLOCK          0x00000001      /* ref only */
+#define HAMMER2_LOOKUP_NODATA          0x00000002      /* data left NULL */
+
+/*
+ * Flags passed to hammer2_chain_modify() and hammer2_chain_resize()
+ *
+ * NOTE: OPTDATA allows us to avoid instantiating buffers for INDIRECT
+ *      blocks in the INITIAL-create state.
+ *
+ * NOTE: NO_MODIFY_TID tells the function to not set HAMMER2_CHAIN_MODIFY_TID
+ *      when marking the chain modified (used when a sub-chain modification
+ *      propagates upward).
+ */
+#define HAMMER2_MODIFY_NOSUB           0x00000001      /* do not set SUBMOD */
+#define HAMMER2_MODIFY_OPTDATA         0x00000002      /* data can be NULL */
+#define HAMMER2_MODIFY_NO_MODIFY_TID   0x00000004
+
+/*
+ * Flags passed to hammer2_chain_lock()
+ */
+#define HAMMER2_RESOLVE_NEVER          1
+#define HAMMER2_RESOLVE_MAYBE          2
+#define HAMMER2_RESOLVE_ALWAYS         3
+
+/*
+ * Cluster different types of storage together for allocations
+ */
+#define HAMMER2_FREECACHE_INODE                0
+#define HAMMER2_FREECACHE_INDIR                1
+#define HAMMER2_FREECACHE_DATA         2
+#define HAMMER2_FREECACHE_UNUSED3      3
+#define HAMMER2_FREECACHE_TYPES                4
+
+/*
+ * BMAP read-ahead maximum parameters
+ */
+#define HAMMER2_BMAP_COUNT             16      /* max bmap read-ahead */
+#define HAMMER2_BMAP_BYTES             (HAMMER2_PBUFSIZE * HAMMER2_BMAP_COUNT)
+
+/*
+ * Misc
+ */
+#define HAMMER2_FLUSH_DEPTH_LIMIT      40      /* stack recursion limit */
+
+/*
+ * HAMMER2 IN-MEMORY CACHE OF MEDIA STRUCTURES
+ *
+ * There is an in-memory representation of all on-media data structures.
+ *
+ * When accessed read-only the data will be mapped to the related buffer
+ * cache buffer.
+ *
+ * When accessed read-write (marked modified) a kmalloc()'d copy of the
+ * data is created which can then be modified.  The copy is destroyed when a
+ * filesystem block is allocated to replace it.
+ *
+ * Active inodes (those with vnodes attached) will maintain the kmalloc()'d
+ * copy for both the read-only and the read-write case.  The combination of
+ * (bp) and (data) determines whether (data) was allocated or not.
+ *
+ * The in-memory representation may remain cached (for example in order to
+ * placemark clustering locks) even after the related data has been
+ * detached.
+ */
+
+/*
+ * A hammer2 inode.
+ */
+struct hammer2_inode {
+       struct hammer2_mount    *hmp;           /* Global mount */
+       struct hammer2_pfsmount *pmp;           /* PFS mount */
+       struct hammer2_inode    *pip;           /* parent inode */
+       struct vnode            *vp;            /* vnode (see ITOV()) */
+       hammer2_chain_t         chain;          /* embedded chain */
+       struct hammer2_inode_data ip_data;      /* in-memory inode data */
+       struct lockf            advlock;        /* lockf state (advisory?) */
+       u_int                   depth;          /* directory depth */
+       hammer2_off_t           delta_dcount;   /* adjust data_count */
+       hammer2_off_t           delta_icount;   /* adjust inode_count */
+};
+
+typedef struct hammer2_inode hammer2_inode_t;
+
+/*
+ * A hammer2 indirect block (in-memory); holds only its embedded chain.
+ */
+struct hammer2_indblock {
+       hammer2_chain_t         chain;          /* embedded chain */
+};
+
+typedef struct hammer2_indblock hammer2_indblock_t;
+
+/*
+ * A hammer2 data block (in-memory); holds only its embedded chain.
+ */
+struct hammer2_data {
+       hammer2_chain_t         chain;          /* embedded chain */
+};
+
+typedef struct hammer2_data hammer2_data_t;
+
+struct hammer2_freecache {     /* element of hammer2_mount.freecache[][] */
+       hammer2_off_t   bulk;   /* NOTE(review): meaning not shown here */
+       hammer2_off_t   single; /* NOTE(review): meaning not shown here */
+};
+
+typedef struct hammer2_freecache hammer2_freecache_t;
+
+/*
+ * Global (per device) mount structure for device (aka vp->v_mount->hmp)
+ */
+struct hammer2_mount {
+       struct vnode    *devvp;         /* device vnode */
+       int             ronly;          /* read-only mount */
+       int             pmp_count;      /* PFS mounts backed by us */
+       TAILQ_ENTRY(hammer2_mount) mntentry; /* hammer2_mntlist */
+
+       struct malloc_type *minode;     /* malloc type (for inodes? confirm) */
+       int             ninodes;
+       int             maxinodes;
+
+       struct malloc_type *mchain;     /* malloc type (for chains? confirm) */
+       int             nipstacks;
+       int             maxipstacks;
+       hammer2_chain_t vchain;         /* anchor chain */
+       hammer2_chain_t *schain;        /* super-root */
+       struct lock     alloclk;        /* lockmgr lock */
+       struct lock     voldatalk;      /* lockmgr lock */
+
+       hammer2_volume_data_t voldata;  /* presumably guarded by voldatalk */
+       hammer2_freecache_t freecache[HAMMER2_FREECACHE_TYPES][HAMMER2_MAX_RADIX+1];
+};
+
+typedef struct hammer2_mount hammer2_mount_t;
+
+/*
+ * Per-PFS mount structure for device (aka vp->v_mount)
+ */
+struct hammer2_pfsmount {
+       struct mount            *mp;            /* kernel mount */
+       struct hammer2_mount    *hmp;           /* device global mount */
+       hammer2_chain_t         *rchain;        /* PFS root chain */
+       hammer2_inode_t         *iroot;         /* PFS root inode */
+       struct netexport        export;         /* nfs export */
+       int                     ronly;          /* read-only mount */
+};
+
+typedef struct hammer2_pfsmount hammer2_pfsmount_t;
+
+#if defined(_KERNEL)
+
+MALLOC_DECLARE(M_HAMMER2);
+
+#define VTOI(vp)       ((hammer2_inode_t *)(vp)->v_data)
+#define ITOV(ip)       ((ip)->vp)
+
+static __inline
+hammer2_pfsmount_t *
+MPTOPMP(struct mount *mp)      /* kernel mount -> per-PFS mount */
+{
+       return ((hammer2_pfsmount_t *)mp->mnt_data);    /* kept in mnt_data */
+}
+
+static __inline
+hammer2_mount_t *
+MPTOHMP(struct mount *mp)      /* kernel mount -> device global mount */
+{
+       return (((hammer2_pfsmount_t *)mp->mnt_data)->hmp);     /* via pfsmount */
+}
+
+extern struct vop_ops hammer2_vnode_vops;
+extern struct vop_ops hammer2_spec_vops;
+extern struct vop_ops hammer2_fifo_vops;
+
+extern int hammer2_debug;
+extern int hammer2_cluster_enable;
+extern int hammer2_hardlink_enable;
+extern long hammer2_iod_file_read;
+extern long hammer2_iod_meta_read;
+extern long hammer2_iod_indr_read;
+extern long hammer2_iod_file_write;
+extern long hammer2_iod_meta_write;
+extern long hammer2_iod_indr_write;
+extern long hammer2_iod_volu_write;
+extern long hammer2_ioa_file_read;
+extern long hammer2_ioa_meta_read;
+extern long hammer2_ioa_indr_read;
+extern long hammer2_ioa_file_write;
+extern long hammer2_ioa_meta_write;
+extern long hammer2_ioa_indr_write;
+extern long hammer2_ioa_volu_write;
+
+/*
+ * hammer2_subr.c
+ */
+void hammer2_inode_lock_ex(hammer2_inode_t *ip);
+void hammer2_inode_unlock_ex(hammer2_inode_t *ip);
+void hammer2_inode_lock_sh(hammer2_inode_t *ip);
+void hammer2_inode_unlock_sh(hammer2_inode_t *ip);
+void hammer2_inode_busy(hammer2_inode_t *ip);
+void hammer2_inode_unbusy(hammer2_inode_t *ip);
+void hammer2_voldata_lock(hammer2_mount_t *hmp);
+void hammer2_voldata_unlock(hammer2_mount_t *hmp);
+
+void hammer2_mount_exlock(hammer2_mount_t *hmp);
+void hammer2_mount_shlock(hammer2_mount_t *hmp);
+void hammer2_mount_unlock(hammer2_mount_t *hmp);
+
+int hammer2_get_dtype(hammer2_inode_t *ip);
+int hammer2_get_vtype(hammer2_inode_t *ip);
+u_int8_t hammer2_get_obj_type(enum vtype vtype);
+void hammer2_time_to_timespec(u_int64_t xtime, struct timespec *ts);
+u_int64_t hammer2_timespec_to_time(struct timespec *ts);
+u_int32_t hammer2_to_unix_xid(uuid_t *uuid);
+void hammer2_guid_to_uuid(uuid_t *uuid, u_int32_t guid);
+
+hammer2_key_t hammer2_dirhash(const unsigned char *name, size_t len);
+int hammer2_bytes_to_radix(size_t bytes);
+
+int hammer2_calc_logical(hammer2_inode_t *ip, hammer2_off_t uoff,
+                        hammer2_key_t *lbasep, hammer2_key_t *leofp);
+void hammer2_update_time(uint64_t *timep);
+
+/*
+ * hammer2_inode.c
+ */
+struct vnode *hammer2_igetv(hammer2_inode_t *ip, int *errorp);
+
+void hammer2_inode_lock_nlinks(hammer2_inode_t *ip);
+void hammer2_inode_unlock_nlinks(hammer2_inode_t *ip);
+hammer2_inode_t *hammer2_inode_alloc(hammer2_pfsmount_t *pmp, void *data);
+void hammer2_inode_free(hammer2_inode_t *ip);
+void hammer2_inode_ref(hammer2_inode_t *ip);
+void hammer2_inode_drop(hammer2_inode_t *ip);
+int hammer2_inode_calc_alloc(hammer2_key_t filesize);
+
+int hammer2_inode_create(hammer2_inode_t *dip,
+                       struct vattr *vap, struct ucred *cred,
+                       const uint8_t *name, size_t name_len,
+                       hammer2_inode_t **nipp);
+
+int hammer2_inode_duplicate(hammer2_inode_t *dip,
+                       hammer2_inode_t *oip, hammer2_inode_t **nipp,
+                       const uint8_t *name, size_t name_len);
+int hammer2_inode_connect(hammer2_inode_t *dip, hammer2_inode_t *oip,
+                       const uint8_t *name, size_t name_len);
+
+int hammer2_unlink_file(hammer2_inode_t *dip,
+                       const uint8_t *name, size_t name_len,
+                       int isdir, hammer2_inode_t *retain_ip);
+int hammer2_hardlink_consolidate(hammer2_inode_t **ipp, hammer2_inode_t *tdip);
+int hammer2_hardlink_deconsolidate(hammer2_inode_t *dip,
+                       hammer2_chain_t **chainp, hammer2_inode_t **ipp);
+int hammer2_hardlink_find(hammer2_inode_t *dip, hammer2_chain_t **chainp,
+                       hammer2_inode_t **ipp);
+
+/*
+ * hammer2_chain.c
+ */
+void hammer2_modify_volume(hammer2_mount_t *hmp);
+hammer2_chain_t *hammer2_chain_alloc(hammer2_mount_t *hmp,
+                               hammer2_blockref_t *bref);
+void hammer2_chain_free(hammer2_mount_t *hmp, hammer2_chain_t *chain);
+void hammer2_chain_ref(hammer2_mount_t *hmp, hammer2_chain_t *chain);
+void hammer2_chain_drop(hammer2_mount_t *hmp, hammer2_chain_t *chain);
+int hammer2_chain_lock(hammer2_mount_t *hmp, hammer2_chain_t *chain, int how);
+void hammer2_chain_moved(hammer2_mount_t *hmp, hammer2_chain_t *chain);
+void hammer2_chain_modify(hammer2_mount_t *hmp, hammer2_chain_t *chain,
+                               int flags);
+void hammer2_chain_resize(hammer2_inode_t *ip, hammer2_chain_t *chain,
+                               int nradix, int flags);
+void hammer2_chain_unlock(hammer2_mount_t *hmp, hammer2_chain_t *chain);
+hammer2_chain_t *hammer2_chain_find(hammer2_mount_t *hmp,
+                               hammer2_chain_t *parent, int index);
+hammer2_chain_t *hammer2_chain_get(hammer2_mount_t *hmp,
+                               hammer2_chain_t *parent,
+                               int index, int flags);
+hammer2_chain_t *hammer2_chain_lookup(hammer2_mount_t *hmp,
+                               hammer2_chain_t **parentp,
+                               hammer2_key_t key_beg, hammer2_key_t key_end,
+                               int flags);
+hammer2_chain_t *hammer2_chain_next(hammer2_mount_t *hmp,
+                               hammer2_chain_t **parentp,
+                               hammer2_chain_t *chain,
+                               hammer2_key_t key_beg, hammer2_key_t key_end,
+                               int flags);
+hammer2_chain_t *hammer2_chain_create(hammer2_mount_t *hmp,
+                               hammer2_chain_t *parent,
+                               hammer2_chain_t *chain,
+                               hammer2_key_t key, int keybits,
+                               int type, size_t bytes);
+void hammer2_chain_delete(hammer2_mount_t *hmp, hammer2_chain_t *parent,
+                               hammer2_chain_t *chain, int retain);
+void hammer2_chain_flush(hammer2_mount_t *hmp, hammer2_chain_t *chain,
+                               hammer2_tid_t modify_tid);
+void hammer2_chain_commit(hammer2_mount_t *hmp, hammer2_chain_t *chain);
+
+/*
+ * hammer2_ioctl.c
+ */
+int hammer2_ioctl(hammer2_inode_t *ip, u_long com, void *data,
+                               int fflag, struct ucred *cred);
+
+/*
+ * hammer2_freemap.c
+ */
+hammer2_off_t hammer2_freemap_alloc(hammer2_mount_t *hmp,
+                               int type, size_t bytes);
+void hammer2_freemap_free(hammer2_mount_t *hmp, hammer2_off_t data_off,
+                               int type);
+
+#endif /* _KERNEL */
+#endif /* !_VFS_HAMMER2_HAMMER2_H_ */
diff --git a/sys/vfs/hammer2/hammer2_chain.c b/sys/vfs/hammer2/hammer2_chain.c
new file mode 100644 (file)
index 0000000..b04684b
--- /dev/null
@@ -0,0 +1,2779 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * This subsystem handles direct and indirect block searches, recursions,
+ * creation, and deletion.  Chains of blockrefs are tracked and modifications
+ * are flagged for propagation... eventually all the way back to the volume
+ * header.
+ */
+
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/types.h>
+#include <sys/lock.h>
+#include <sys/uuid.h>
+
+#include "hammer2.h"
+
+static int hammer2_indirect_optimize;  /* XXX SYSCTL */
+
+static hammer2_chain_t *hammer2_chain_create_indirect(
+                       hammer2_mount_t *hmp, hammer2_chain_t *parent,
+                       hammer2_key_t key, int keybits);
+
+/*
+ * Splay tree
+ */
+SPLAY_GENERATE(hammer2_chain_splay, hammer2_chain, snode, hammer2_chain_cmp);
+
+/*
+ * Comparison function handed to SPLAY_GENERATE for the chain splay tree.
+ * Only the index field of each chain is examined.
+ *
+ * Returns chain2's index minus chain1's index: positive when chain1 has
+ * the lower index, zero when the indices match, negative otherwise.
+ */
+int
+hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2)
+{
+       int delta;
+
+       delta = chain2->index - chain1->index;
+       return (delta);
+}
+
+/*
+ * Walk upward from chain->parent and flag each ancestor SUBMODIFIED so
+ * flushes can find modified elements.  The walk ends when there are no
+ * more parents or at the first ancestor which already has SUBMODIFIED
+ * set.
+ *
+ * The chain passed in is never flagged itself, and a SUBMODIFIED bit
+ * that might already be set on it is ignored.
+ *
+ * hmp is not used by this function.
+ *
+ * XXX rename of parent can create a SMP race
+ */
+static void
+hammer2_chain_parent_setsubmod(hammer2_mount_t *hmp, hammer2_chain_t *chain)
+{
+       hammer2_chain_t *scan;
+
+       for (scan = chain->parent; scan != NULL; scan = scan->parent) {
+               if (scan->flags & HAMMER2_CHAIN_SUBMODIFIED)
+                       break;
+               atomic_set_int(&scan->flags, HAMMER2_CHAIN_SUBMODIFIED);
+       }
+}
+
+/*
+ * Allocate the in-memory structure backing the specified bref and return
+ * the chain embedded in it.  The chain is not connected to any parent
+ * (index is -1), is returned exclusively locked, and has refs set to 1.
+ *
+ * INODE brefs allocate a hammer2_inode_t from hmp->minode, INDIRECT and
+ * DATA brefs allocate from hmp->mchain.  VOLUME and unrecognized types
+ * panic.
+ *
+ * chain->bytes is derived from the bits of bref->data_off selected by
+ * HAMMER2_OFF_MASK_RADIX, used as a power-of-2 exponent.
+ */
+hammer2_chain_t *
+hammer2_chain_alloc(hammer2_mount_t *hmp, hammer2_blockref_t *bref)
+{
+       hammer2_chain_t *chain;
+       hammer2_inode_t *inode;
+       hammer2_indblock_t *indblk;
+       hammer2_data_t *dblk;
+       u_int bytes;
+
+       bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
+
+       /*
+        * Build the system structure matching the bref type and point
+        * chain at the chain embedded inside it.
+        */
+       switch (bref->type) {
+       case HAMMER2_BREF_TYPE_INODE:
+               inode = kmalloc(sizeof(*inode), hmp->minode,
+                               M_WAITOK | M_ZERO);
+               inode->hmp = hmp;
+               chain = &inode->chain;
+               chain->u.ip = inode;
+               lockinit(&chain->lk, "inode", 0, LK_CANRECURSE);
+               break;
+       case HAMMER2_BREF_TYPE_INDIRECT:
+               indblk = kmalloc(sizeof(*indblk), hmp->mchain,
+                                M_WAITOK | M_ZERO);
+               chain = &indblk->chain;
+               chain->u.np = indblk;
+               lockinit(&chain->lk, "iblk", 0, LK_CANRECURSE);
+               break;
+       case HAMMER2_BREF_TYPE_DATA:
+               dblk = kmalloc(sizeof(*dblk), hmp->mchain,
+                              M_WAITOK | M_ZERO);
+               chain = &dblk->chain;
+               chain->u.dp = dblk;
+               lockinit(&chain->lk, "dblk", 0, LK_CANRECURSE);
+               break;
+       case HAMMER2_BREF_TYPE_VOLUME:
+               chain = NULL;
+               panic("hammer2_chain_alloc volume type illegal for op");
+               /* NOT REACHED */
+       default:
+               chain = NULL;
+               panic("hammer2_chain_alloc: unrecognized blockref type: %d",
+                     bref->type);
+               /* NOT REACHED */
+       }
+
+       chain->bytes = bytes;
+       chain->refs = 1;
+       chain->index = -1;              /* no slot in a parent yet */
+       chain->bref = *bref;
+
+       /*
+        * bref_flush is only seeded when the bref carries a real media
+        * offset.  Without one the caller has to wait until the chain has
+        * been modified/block-allocated before a blockref can be
+        * synchronized with its (future) parent.
+        */
+       if (bref->data_off & ~HAMMER2_OFF_MASK_RADIX)
+               chain->bref_flush = *bref;
+
+       lockmgr(&chain->lk, LK_EXCLUSIVE);
+
+       return (chain);
+}
+
+/*
+ * Release the memory behind a disconnected chain element.
+ *
+ * The chain must no longer have a device buffer or resolved data
+ * attached, and an inode must no longer have a vnode associated with it.
+ * For INODE and VOLUME chains the data pointer is simply cleared first.
+ *
+ * The backing structure (chain->u.mem) is returned to hmp->minode for
+ * inodes and to hmp->mchain for everything else.  Nothing is freed when
+ * chain->u.mem is already NULL.
+ */
+void
+hammer2_chain_free(hammer2_mount_t *hmp, hammer2_chain_t *chain)
+{
+       void *backing;
+
+       switch (chain->bref.type) {
+       case HAMMER2_BREF_TYPE_INODE:
+       case HAMMER2_BREF_TYPE_VOLUME:
+               chain->data = NULL;
+               break;
+       default:
+               break;
+       }
+
+       KKASSERT(chain->bp == NULL);
+       KKASSERT(chain->data == NULL);
+       KKASSERT(chain->bref.type != HAMMER2_BREF_TYPE_INODE ||
+                chain->u.ip->vp == NULL);
+
+       backing = chain->u.mem;
+       if (backing == NULL)
+               return;
+
+       chain->u.mem = NULL;
+       if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
+               kfree(backing, hmp->minode);
+       else
+               kfree(backing, hmp->mchain);
+}
+
+/*
+ * Add a reference to a chain element (for shared access).  The chain
+ * element must already have at least 1 ref controlled by the caller;
+ * this is asserted before the count is bumped.
+ *
+ * The increment itself is atomic.  hmp is not used by this function.
+ */
+void
+hammer2_chain_ref(hammer2_mount_t *hmp, hammer2_chain_t *chain)
+{
+       KKASSERT(chain->refs > 0);
+       atomic_add_int(&chain->refs, 1);
+}
+
+/*
+ * Drop the caller's reference to the chain element.  If the ref count
+ * reaches zero the chain element and its related structure (typically an
+ * inode or indirect block) will be freed and the parent will be
+ * recursively dropped.
+ *
+ * MOVED and MODIFIED elements hold additional references so it should not
+ * be possible for the count on a modified element to drop to 0.
+ *
+ * The chain element must NOT be locked by the caller.
+ *
+ * The parent might or might not be locked by the caller but if so it
+ * will also be referenced so we shouldn't recurse upward.
+ *
+ * The count is adjusted with compare-and-set.  If the count changed
+ * after it was sampled the same chain is retried.  The "recursion" on
+ * the parent is carried out by the enclosing while loop, not by a
+ * nested call.
+ *
+ * NOTE(review): the 1->0 path removes the chain from parent->shead
+ *              whenever DELETED is not set, without testing parent for
+ *              NULL.  This presumably relies on every non-DELETED chain
+ *              that can reach zero refs having a parent -- confirm for
+ *              chains that were allocated but never inserted.
+ */
+void
+hammer2_chain_drop(hammer2_mount_t *hmp, hammer2_chain_t *chain)
+{
+       hammer2_chain_t *parent;
+       u_int refs;
+
+       while (chain) {
+               /*
+                * Sample the count once per iteration, the cmpset calls
+                * below detect any change made after this point.
+                */
+               refs = chain->refs;
+               cpu_ccfence();
+               KKASSERT(refs > 0);
+               if (refs == 1) {
+                       /*
+                        * Possible 1->0 transition.  The parent (if any)
+                        * is locked before the transition is attempted
+                        * and stays locked until the chain has been
+                        * removed from the parent's splay tree.
+                        */
+                       KKASSERT(chain != &hmp->vchain);
+                       parent = chain->parent;
+                       if (parent)
+                               lockmgr(&parent->lk, LK_EXCLUSIVE);
+                       if (atomic_cmpset_int(&chain->refs, 1, 0)) {
+                               /*
+                                * Succeeded, recurse and drop parent.
+                                * These chain elements should be synchronized
+                                * so no delta data or inode count updates
+                                * should be needed.
+                                */
+                               KKASSERT((chain->flags &
+                                         (HAMMER2_CHAIN_MOVED |
+                                          HAMMER2_CHAIN_MODIFIED)) == 0);
+                               if (!(chain->flags & HAMMER2_CHAIN_DELETED)) {
+                                       SPLAY_REMOVE(hammer2_chain_splay,
+                                                    &parent->shead, chain);
+                                       atomic_set_int(&chain->flags,
+                                                      HAMMER2_CHAIN_DELETED);
+                                       /* parent refs dropped via recursion */
+                               }
+                               chain->parent = NULL;
+                               if (parent)
+                                       lockmgr(&parent->lk, LK_RELEASE);
+                               hammer2_chain_free(hmp, chain);
+                               chain = parent;
+                               /* recurse on parent */
+                       } else {
+                               if (parent)
+                                       lockmgr(&parent->lk, LK_RELEASE);
+                               /* retry the same chain */
+                       }
+               } else {
+                       if (atomic_cmpset_int(&chain->refs, refs, refs - 1)) {
+                               /*
+                                * Succeeded, count did not reach zero so
+                                * cut out of the loop.
+                                */
+                               break;
+                       }
+                       /* retry the same chain */
+               }
+       }
+}
+
+/*
+ * Ref and lock a chain element, acquiring its data with I/O if necessary,
+ * and specify how you would like the data to be resolved.
+ *
+ * Returns 0 on success or an error code if the data could not be acquired.
+ * The chain element is locked either way.
+ *
+ * The lock is allowed to recurse, multiple locking ops will aggregate
+ * the requested resolve types.  Once data is assigned it will not be
+ * removed until the last unlock.
+ *
+ * HAMMER2_RESOLVE_NEVER - Do not resolve the data element.
+ *                        (typically used to avoid device/logical buffer
+ *                         aliasing for data)
+ *
+ * HAMMER2_RESOLVE_MAYBE - Do not resolve data elements for chains in
+ *                        the INITIAL-create state (indirect blocks only).
+ *
+ *                        Do not resolve data elements for DATA chains.
+ *                        (typically used to avoid device/logical buffer
+ *                         aliasing for data)
+ *
+ * HAMMER2_RESOLVE_ALWAYS- Always resolve the data element.
+ *
+ *
+ * NOTE: Embedded elements (volume header, inodes) are always resolved
+ *      regardless.
+ *
+ * NOTE: Specifying HAMMER2_RESOLVE_ALWAYS on a newly-created non-embedded
+ *      element will instantiate and zero its buffer, and flush it on
+ *      release.
+ *
+ * NOTE: (data) elements are normally locked RESOLVE_NEVER or RESOLVE_MAYBE
+ *      so as not to instantiate a device buffer, which could alias against
+ *      a logical file buffer.  However, if ALWAYS is specified the
+ *      device buffer will be instantiated anyway.
+ */
+int
+hammer2_chain_lock(hammer2_mount_t *hmp, hammer2_chain_t *chain, int how)
+{
+       hammer2_blockref_t *bref;
+       hammer2_off_t pbase;
+       hammer2_off_t peof;
+       size_t boff;
+       size_t bbytes;
+       int error;
+       char *bdata;
+
+       /*
+        * Lock the element.  Under certain conditions this might end up
+        * being a recursive lock.
+        */
+       KKASSERT(chain->refs > 0);
+       atomic_add_int(&chain->refs, 1);
+       lockmgr(&chain->lk, LK_EXCLUSIVE);
+
+       /*
+        * If we already have a valid data pointer no further action is
+        * necessary.
+        */
+       if (chain->data)
+               return (0);
+
+       /*
+        * Do we have to resolve the data?
+        */
+       switch(how) {
+       case HAMMER2_RESOLVE_NEVER:
+               return(0);
+       case HAMMER2_RESOLVE_MAYBE:
+               if (chain->flags & HAMMER2_CHAIN_INITIAL)
+                       return(0);
+               if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
+                       return(0);
+               /* fall through */
+       case HAMMER2_RESOLVE_ALWAYS:
+               break;
+       }
+
+       /*
+        * We must resolve to a device buffer, either by issuing I/O or
+        * by creating a zero-fill element.  We do not mark the buffer
+        * dirty when creating a zero-fill element (the hammer2_chain_modify()
+        * API must still be used to do that).
+        *
+        * The device buffer is variable-sized in powers of 2 down
+        * to HAMMER2_MINALLOCSIZE (typically 1K).  A 64K physical storage
+        * chunk always contains buffers of the same size. (XXX)
+        *
+        * The minimum physical IO size may be larger than the variable
+        * block size.
+        */
+       bref = &chain->bref;
+
+       if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
+               bbytes = HAMMER2_MINIOSIZE;
+       pbase = bref->data_off & ~(hammer2_off_t)(bbytes - 1);
+       peof = (pbase + HAMMER2_PBUFSIZE64) & ~HAMMER2_PBUFMASK64;
+       boff = bref->data_off & HAMMER2_OFF_MASK & (bbytes - 1);
+       KKASSERT(pbase != 0);
+
+       /*
+        * The getblk() optimization can only be used on newly created
+        * elements if the physical block size matches the request.
+        */
+       if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
+           chain->bytes == bbytes) {
+               chain->bp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
+               error = 0;
+       } else if (hammer2_cluster_enable) {
+               error = cluster_read(hmp->devvp, peof, pbase, bbytes,
+                                    HAMMER2_PBUFSIZE, HAMMER2_PBUFSIZE,
+                                    &chain->bp);
+       } else {
+               error = bread(hmp->devvp, pbase, bbytes, &chain->bp);
+       }
+
+       if (error) {
+               kprintf("hammer2_chain_get: I/O error %016jx: %d\n",
+                       (intmax_t)pbase, error);
+               bqrelse(chain->bp);
+               chain->bp = NULL;
+               return (error);
+       }
+
+       /*
+        * Zero the data area if the chain is in the INITIAL-create state.
+        */
+       bdata = (char *)chain->bp->b_data + boff;
+       if (chain->flags & HAMMER2_CHAIN_INITIAL) {
+               bzero(bdata, chain->bytes);
+               chain->bp->b_flags |= B_CACHE;
+               bdirty(chain->bp);
+       }
+
+       /*
+        * Setup the data pointer, either pointing it to an embedded data
+        * structure and copying the data from the buffer, or pointing it
+        * into the buffer.
+        *
+        * The buffer is not retained when copying to an embedded data
+        * structure in order to avoid potential deadlocks or recursions
+        * on the same physical buffer.
+        */
+       switch (bref->type) {
+       case HAMMER2_BREF_TYPE_VOLUME:
+               /*
+                * Copy data from bp to embedded buffer
+                */
+               panic("hammer2_chain_lock: called on unresolved volume header");
+#if 0
+               /* NOT YET */
+               KKASSERT(pbase == 0);
+               KKASSERT(chain->bytes == HAMMER2_PBUFSIZE);
+               bcopy(bdata, &hmp->voldata, chain->bytes);
+               chain->data = (void *)&hmp->voldata;
+               bqrelse(chain->bp);
+               chain->bp = NULL;
+#endif
+               break;
+       case HAMMER2_BREF_TYPE_INODE:
+               /*
+                * Copy data from bp to embedded buffer, do not retain the
+                * device buffer.
+                */
+               bcopy(bdata, &chain->u.ip->ip_data, chain->bytes);
+               chain->data = (void *)&chain->u.ip->ip_data;
+               bqrelse(chain->bp);
+               chain->bp = NULL;
+               break;
+       case HAMMER2_BREF_TYPE_INDIRECT:
+       case HAMMER2_BREF_TYPE_DATA:
+       default:
+               /*
+                * Point data at the device buffer and leave bp intact.
+                */
+               chain->data = (void *)bdata;
+               break;
+       }
+       return (0);
+}
+
+/*
+ * Dispose of the device buffer attached to a chain on its final unlock.
+ * The caller guarantees chain->bp is not NULL.
+ *
+ * Bumps the write statistics for dirty buffers, clears chain->data,
+ * hands the buffer back to the buffer cache according to the DIRTYBP and
+ * IOFLUSH flags (both are cleared when found set), and clears chain->bp.
+ */
+static void
+hammer2_chain_unlock_bp(hammer2_chain_t *chain)
+{
+       long *statp;
+
+       /*
+        * Statistics.  Only buffers flagged DIRTYBP are counted, against
+        * the hammer2_ioa_* counters when IOFLUSH is also set and against
+        * the hammer2_iod_* counters otherwise.
+        */
+       if (chain->flags & HAMMER2_CHAIN_DIRTYBP) {
+               if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
+                       switch(chain->bref.type) {
+                       case HAMMER2_BREF_TYPE_DATA:
+                               statp = &hammer2_ioa_file_write;
+                               break;
+                       case HAMMER2_BREF_TYPE_INODE:
+                               statp = &hammer2_ioa_meta_write;
+                               break;
+                       case HAMMER2_BREF_TYPE_INDIRECT:
+                               statp = &hammer2_ioa_indr_write;
+                               break;
+                       default:
+                               statp = &hammer2_ioa_volu_write;
+                               break;
+                       }
+               } else {
+                       switch(chain->bref.type) {
+                       case HAMMER2_BREF_TYPE_DATA:
+                               statp = &hammer2_iod_file_write;
+                               break;
+                       case HAMMER2_BREF_TYPE_INODE:
+                               statp = &hammer2_iod_meta_write;
+                               break;
+                       case HAMMER2_BREF_TYPE_INDIRECT:
+                               statp = &hammer2_iod_indr_write;
+                               break;
+                       default:
+                               statp = &hammer2_iod_volu_write;
+                               break;
+                       }
+               }
+               ++*statp;
+       }
+
+       /*
+        * If a device buffer was used for data be sure to destroy the
+        * buffer when we are done to avoid aliases (XXX what about the
+        * underlying VM pages?).
+        */
+       if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
+               chain->bp->b_flags |= B_RELBUF;
+
+       chain->data = NULL;
+
+       if (chain->flags & HAMMER2_CHAIN_DIRTYBP) {
+               atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
+               if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
+                       atomic_clear_int(&chain->flags,
+                                        HAMMER2_CHAIN_IOFLUSH);
+                       chain->bp->b_flags |= B_RELBUF;
+                       cluster_awrite(chain->bp);
+               } else {
+                       chain->bp->b_flags |= B_CLUSTEROK;
+                       bdwrite(chain->bp);
+               }
+       } else if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
+               atomic_clear_int(&chain->flags, HAMMER2_CHAIN_IOFLUSH);
+               chain->bp->b_flags |= B_RELBUF;
+               brelse(chain->bp);
+       } else {
+               /* bp might still be dirty */
+               bqrelse(chain->bp);
+       }
+       chain->bp = NULL;
+}
+
+/*
+ * Unlock and deref a chain element.
+ *
+ * Releasing a recursive lock only undoes the lock and the ref taken with
+ * it.  On the last lock release any non-embedded data (chain->bp) is
+ * retired, then the lock is released and the chain is dropped.
+ *
+ * Pointers to embedded or unresolved data (chain->bp is NULL) are left
+ * alone, e.g. an inode's data pointer is NOT nulled out.
+ */
+void
+hammer2_chain_unlock(hammer2_mount_t *hmp, hammer2_chain_t *chain)
+{
+       /*
+        * Undo a recursive lock
+        */
+       if (lockcountnb(&chain->lk) > 1) {
+               KKASSERT(chain->refs > 1);
+               atomic_add_int(&chain->refs, -1);
+               lockmgr(&chain->lk, LK_RELEASE);
+               return;
+       }
+
+       /*
+        * Last release, clean out the bp if there is one.
+        */
+       if (chain->bp != NULL)
+               hammer2_chain_unlock_bp(chain);
+
+       lockmgr(&chain->lk, LK_RELEASE);
+       hammer2_chain_drop(hmp, chain);
+}
+
+/*
+ * Resize the chain's physical storage allocation.  Chains can be resized
+ * smaller without reallocating the storage.  Resizing larger will reallocate
+ * the storage.
+ *
+ * Must be passed a locked chain.
+ *
+ * If you want the resize code to copy the data to the new block then the
+ * caller should lock the chain RESOLVE_MAYBE or RESOLVE_ALWAYS.
+ *
+ * If the caller already holds a logical buffer containing the data and
+ * intends to bdwrite() that buffer resolve with RESOLVE_NEVER.  The resize
+ * operation will then not copy the data.
+ *
+ * This function is mostly used with DATA blocks locked RESOLVE_NEVER in order
+ * to avoid instantiating a device buffer that conflicts with the vnode
+ * data buffer.
+ *
+ * ip     - supplies the mount (ip->hmp) and has its delta_dcount adjusted
+ *          by the change in allocation size.
+ * chain  - DATA or INDIRECT chain to resize (asserted), never the volume
+ *          chain.
+ * nradix - the new size is (1 << nradix) bytes.
+ *
+ * NOTE: As currently coded a new block is always allocated from the
+ *      freemap when the size changes, including when shrinking (see the
+ *      relocation comment in the body).
+ *
+ * XXX flags currently ignored, uses chain->bp to detect data/no-data.
+ */
+void
+hammer2_chain_resize(hammer2_inode_t *ip, hammer2_chain_t *chain,
+                    int nradix, int flags)
+{
+       hammer2_mount_t *hmp = ip->hmp;
+       struct buf *nbp;
+       hammer2_off_t pbase;
+       size_t obytes;
+       size_t nbytes;
+       size_t bbytes;
+       int boff;
+       char *bdata;
+       int error;
+
+       /*
+        * Only data and indirect blocks can be resized for now
+        */
+       KKASSERT(chain != &hmp->vchain);
+       KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
+                chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT);
+
+       /*
+        * Nothing to do if the element is already the proper size
+        */
+       obytes = chain->bytes;
+       nbytes = 1U << nradix;
+       if (obytes == nbytes)
+               return;
+
+       /*
+        * Set MODIFIED and add a chain ref to prevent destruction.  Both
+        * modified flags share the same ref.
+        *
+        * If the chain is already marked MODIFIED then we can safely
+        * return the previous allocation to the pool without having to
+        * worry about snapshots.
+        */
+       if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) {
+               atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED |
+                                             HAMMER2_CHAIN_MODIFY_TID);
+               hammer2_chain_ref(hmp, chain);
+       } else {
+               hammer2_freemap_free(hmp, chain->bref.data_off,
+                                    chain->bref.type);
+       }
+
+       /*
+        * Relocate the block, even if making it smaller (because different
+        * block sizes may be in different regions).
+        */
+       chain->bref.data_off = hammer2_freemap_alloc(hmp, chain->bref.type,
+                                                    nbytes);
+       chain->bytes = nbytes;
+       ip->delta_dcount += (ssize_t)(nbytes - obytes); /* XXX atomic */
+
+       /*
+        * The device buffer may be larger than the allocation size.
+        */
+       if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
+               bbytes = HAMMER2_MINIOSIZE;
+       pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
+       boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);
+
+       /*
+        * Only copy the data if resolved, otherwise the caller is
+        * responsible.
+        */
+       if (chain->bp) {
+               KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
+                        chain->bref.type == HAMMER2_BREF_TYPE_DATA);
+               KKASSERT(chain != &hmp->vchain);        /* safety */
+
+               /*
+                * The getblk() optimization can only be used if the
+                * physical block size matches the request.
+                */
+               if (nbytes == bbytes) {
+                       nbp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
+                       error = 0;
+               } else {
+                       error = bread(hmp->devvp, pbase, bbytes, &nbp);
+                       KKASSERT(error == 0);
+               }
+               bdata = (char *)nbp->b_data + boff;
+
+               /*
+                * Copy what fits.  When growing, zero the portion of the
+                * new block beyond the old data.
+                */
+               if (nbytes < obytes) {
+                       bcopy(chain->data, bdata, nbytes);
+               } else {
+                       bcopy(chain->data, bdata, obytes);
+                       bzero(bdata + obytes, nbytes - obytes);
+               }
+
+               /*
+                * NOTE: The INITIAL state of the chain is left intact.
+                *
+                * NOTE: Because of the reallocation we have to set DIRTYBP
+                *       if INITIAL is not set.
+                *
+                * NOTE: We set B_NOCACHE to throw away the previous bp and
+                *       any VM backing store, even if it was dirty.
+                *       Otherwise we run the risk of a logical/device
+                *       conflict on reallocation.
+                */
+               chain->bp->b_flags |= B_RELBUF | B_NOCACHE;
+               brelse(chain->bp);
+               chain->bp = nbp;
+               chain->data = (void *)bdata;
+               if ((chain->flags & HAMMER2_CHAIN_INITIAL) == 0)
+                       atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
+       }
+       hammer2_chain_parent_setsubmod(hmp, chain);
+}
+
+/*
+ * Convert a locked chain that was retrieved read-only to read-write.
+ *
+ * If not already marked modified a new physical block will be allocated
+ * and assigned to the bref.
+ *
+ * Non-data blocks - The chain should be locked to at least the RESOLVE_MAYBE
+ *                  level or the COW operation will not work.
+ *
+ * Data blocks    - The chain is usually locked RESOLVE_NEVER so as not to
+ *                  run the data through the device buffers.
+ *
+ * flags:
+ *     HAMMER2_MODIFY_NO_MODIFY_TID - do not set HAMMER2_CHAIN_MODIFY_TID
+ *                                    on the chain.
+ *     HAMMER2_MODIFY_OPTDATA       - data instantiation is optional; if
+ *                                    the chain has no device buffer none
+ *                                    is created.
+ *     HAMMER2_MODIFY_NOSUB         - do not propagate SUBMODIFIED to the
+ *                                    parents.
+ *
+ * NOTE: When the chain is already MODIFIED the function returns early
+ *      and does not propagate SUBMODIFIED again, unless a device buffer
+ *      still has to be instantiated (no OPTDATA and chain->bp is NULL).
+ */
+void
+hammer2_chain_modify(hammer2_mount_t *hmp, hammer2_chain_t *chain, int flags)
+{
+       struct buf *nbp;
+       int error;
+       hammer2_off_t pbase;
+       size_t bbytes;
+       size_t boff;
+       void *bdata;
+
+       /*
+        * Tells flush that modify_tid must be updated, otherwise only
+        * mirror_tid is updated.  This is the default.
+        */
+       if ((flags & HAMMER2_MODIFY_NO_MODIFY_TID) == 0)
+               atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFY_TID);
+
+       /*
+        * If the chain is already marked MODIFIED we can just return.
+        *
+        * The exception is a chain without a device buffer when the caller
+        * did not make the data optional, in which case the buffer is
+        * instantiated (skip1) without allocating another block.
+        */
+       if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
+               if ((flags & HAMMER2_MODIFY_OPTDATA) == 0 &&
+                   chain->bp == NULL) {
+                       goto skip1;
+               }
+               return;
+       }
+
+       /*
+        * Set MODIFIED and add a chain ref to prevent destruction.  Both
+        * modified flags share the same ref.
+        */
+       atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
+       hammer2_chain_ref(hmp, chain);
+
+       /*
+        * We must allocate the copy-on-write block.
+        *
+        * If the data is embedded no other action is required.
+        *
+        * If the data is not embedded we acquire and clear the
+        * new block.  If chain->data is not NULL we then do the
+        * copy-on-write.  chain->data will then be repointed to the new
+        * buffer and the old buffer will be released.
+        *
+        * For newly created elements with no prior allocation we go
+        * through the copy-on-write steps except without the copying part.
+        */
+       if (chain != &hmp->vchain) {
+               if ((hammer2_debug & 0x0001) &&
+                   (chain->bref.data_off & HAMMER2_OFF_MASK)) {
+                       kprintf("Replace %d\n", chain->bytes);
+               }
+               chain->bref.data_off =
+                       hammer2_freemap_alloc(hmp, chain->bref.type,
+                                             chain->bytes);
+               /* XXX failed allocation */
+       }
+
+       /*
+        * If data instantiation is optional and the chain has no current
+        * data association (typical for DATA and newly-created INDIRECT
+        * elements), don't instantiate the buffer now.
+        */
+       if ((flags & HAMMER2_MODIFY_OPTDATA) && chain->bp == NULL)
+               goto skip2;
+
+skip1:
+       /*
+        * Setting the DIRTYBP flag will cause the buffer to be dirtied or
+        * written-out on unlock.  This bit is independent of the MODIFIED
+        * bit because the chain may still need meta-data adjustments done
+        * by virtue of MODIFIED for its parent, and the buffer can be
+        * flushed out (possibly multiple times) by the OS before that.
+        *
+        * Clearing the INITIAL flag (for indirect blocks) indicates that
+        * a zero-fill buffer has been instantiated.
+        */
+       atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
+       atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
+
+       /*
+        * We currently should never instantiate a device buffer for a
+        * data chain.
+        */
+       KKASSERT(chain->bref.type != HAMMER2_BREF_TYPE_DATA);
+
+       /*
+        * Execute COW operation
+        */
+       switch(chain->bref.type) {
+       case HAMMER2_BREF_TYPE_VOLUME:
+       case HAMMER2_BREF_TYPE_INODE:
+               /*
+                * The data is embedded, no copy-on-write operation is
+                * needed.
+                */
+               KKASSERT(chain->bp == NULL);
+               break;
+       case HAMMER2_BREF_TYPE_DATA:
+       case HAMMER2_BREF_TYPE_INDIRECT:
+               /*
+                * Perform the copy-on-write operation
+                */
+               KKASSERT(chain != &hmp->vchain);        /* safety */
+               /*
+                * The device buffer may be larger than the allocation size.
+                */
+               if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
+                       bbytes = HAMMER2_MINIOSIZE;
+               pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
+               boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);
+
+               /*
+                * The getblk() optimization can only be used if the
+                * physical block size matches the request.
+                */
+               if (chain->bytes == bbytes) {
+                       nbp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
+                       error = 0;
+               } else {
+                       error = bread(hmp->devvp, pbase, bbytes, &nbp);
+                       KKASSERT(error == 0);
+               }
+               bdata = (char *)nbp->b_data + boff;
+
+               /*
+                * Copy or zero-fill on write depending on whether
+                * chain->data exists or not.
+                */
+               if (chain->data) {
+                       bcopy(chain->data, bdata, chain->bytes);
+                       KKASSERT(chain->bp != NULL);
+               } else {
+                       bzero(bdata, chain->bytes);
+               }
+
+               /*
+                * Release the old buffer (if any) and repoint the chain
+                * at the new one.
+                */
+               if (chain->bp) {
+                       chain->bp->b_flags |= B_RELBUF;
+                       brelse(chain->bp);
+               }
+               chain->bp = nbp;
+               chain->data = bdata;
+               break;
+       default:
+               panic("hammer2_chain_modify: illegal non-embedded type %d",
+                     chain->bref.type);
+               break;
+
+       }
+skip2:
+       if ((flags & HAMMER2_MODIFY_NOSUB) == 0)
+               hammer2_chain_parent_setsubmod(hmp, chain);
+}
+
+/*
+ * Mark the volume as having been modified.  This short-cut version
+ * does not have to lock the volume's chain, which allows the ioctl
+ * code to make adjustments to connections without deadlocking.
+ *
+ * The HAMMER2_CHAIN_MODIFIED_AUX bit is set atomically in
+ * hmp->vchain.flags while the voldata lock is held; the lock is
+ * released again before returning.
+ */
+void
+hammer2_modify_volume(hammer2_mount_t *hmp)
+{
+       hammer2_voldata_lock(hmp);
+       atomic_set_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED_AUX);
+       hammer2_voldata_unlock(hmp);
+}
+
+/*
+ * Search the parent's splay tree of in-memory children for the chain
+ * occupying the given index.  The parent must be locked by the caller.
+ *
+ * Returns the in-memory chain, or NULL when no chain is cached at that
+ * index.  This function itself does not ref or lock the returned chain.
+ *
+ * NOTE: A NULL return does not mean the slot is empty, a chain might
+ *       still exist on-media for this index.
+ */
+hammer2_chain_t *
+hammer2_chain_find(hammer2_mount_t *hmp, hammer2_chain_t *parent, int index)
+{
+       hammer2_chain_t key;
+
+       /* only the index field of the stack search key is filled in */
+       key.index = index;
+       return (SPLAY_FIND(hammer2_chain_splay, &parent->shead, &key));
+}
+
+/*
+ * Return a locked chain structure with all associated data acquired.
+ *
+ * Caller must lock the parent on call, the returned child will be locked.
+ *
+ * hmp    - mount the chain belongs to
+ * parent - locked parent chain whose blockref array is indexed
+ * index  - slot within the parent's blockref array
+ * flags  - HAMMER2_LOOKUP_* mask.  NODATA or NOLOCK select
+ *          HAMMER2_RESOLVE_NEVER, otherwise HAMMER2_RESOLVE_MAYBE is
+ *          used.  With NOLOCK the chain is returned referenced but
+ *          not locked.
+ *
+ * Panics if no in-memory chain exists for the index and the parent is
+ * in the INITIAL state, the parent's bref type is unrecognized, or the
+ * media bref at the index has type 0.
+ */
+hammer2_chain_t *
+hammer2_chain_get(hammer2_mount_t *hmp, hammer2_chain_t *parent,
+                 int index, int flags)
+{
+       hammer2_blockref_t *bref;
+       hammer2_chain_t *chain;
+       hammer2_chain_t dummy;
+       int how;
+
+       /*
+        * Figure out how to lock.  MAYBE can be used to optimize
+        * the initial-create state for indirect blocks.
+        */
+       if (flags & (HAMMER2_LOOKUP_NODATA | HAMMER2_LOOKUP_NOLOCK))
+               how = HAMMER2_RESOLVE_NEVER;
+       else
+               how = HAMMER2_RESOLVE_MAYBE;
+
+       /*
+        * First see if we have a (possibly modified) chain element cached
+        * for this (parent, index).  Acquire the data if necessary.
+        *
+        * If chain->data is non-NULL the chain should already be marked
+        * modified.
+        *
+        * dummy is only a search key; just its index field is set.
+        * A cached chain is ref'd (NOLOCK) or locked and returned
+        * without touching the media bref.
+        */
+       dummy.index = index;
+       chain = SPLAY_FIND(hammer2_chain_splay, &parent->shead, &dummy);
+       if (chain) {
+               if (flags & HAMMER2_LOOKUP_NOLOCK)
+                       hammer2_chain_ref(hmp, chain);
+               else
+                       hammer2_chain_lock(hmp, chain, how);
+               return(chain);
+       }
+
+       /*
+        * the get function must always succeed, panic if there's no
+        * data to index.
+        */
+       if (parent->flags & HAMMER2_CHAIN_INITIAL) {
+               panic("hammer2_chain_get: Missing bref(1)");
+               /* NOT REACHED */
+       }
+
+       /*
+        * Otherwise lookup the bref and issue I/O (switch on the parent)
+        *
+        * The index is range-checked against the blockref array that
+        * belongs to the parent's type before it is used.
+        */
+       switch(parent->bref.type) {
+       case HAMMER2_BREF_TYPE_INODE:
+               KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
+               bref = &parent->data->ipdata.u.blockset.blockref[index];
+               break;
+       case HAMMER2_BREF_TYPE_INDIRECT:
+               KKASSERT(parent->data != NULL);
+               KKASSERT(index >= 0 &&
+                        index < parent->bytes / sizeof(hammer2_blockref_t));
+               bref = &parent->data->npdata.blockref[index];
+               break;
+       case HAMMER2_BREF_TYPE_VOLUME:
+               KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
+               bref = &hmp->voldata.sroot_blockset.blockref[index];
+               break;
+       default:
+               bref = NULL;
+               panic("hammer2_chain_get: unrecognized blockref type: %d",
+                     parent->bref.type);
+       }
+       if (bref->type == 0) {
+               panic("hammer2_chain_get: Missing bref(2)");
+               /* NOT REACHED */
+       }
+
+       /*
+        * Allocate a chain structure representing the existing media
+        * entry.
+        *
+        * The locking operation we do later will issue I/O to read it.
+        */
+       chain = hammer2_chain_alloc(hmp, bref);
+
+       /*
+        * Link the chain into its parent.  Caller is expected to hold an
+        * exclusive lock on the parent.
+        *
+        * The parent gains one ref for the new splay entry.
+        */
+       chain->parent = parent;
+       chain->index = index;
+       if (SPLAY_INSERT(hammer2_chain_splay, &parent->shead, chain))
+               panic("hammer2_chain_link: collision");
+       KKASSERT(parent->refs > 0);
+       atomic_add_int(&parent->refs, 1);       /* for splay entry */
+
+       /*
+        * Additional linkage for inodes.  Reuse the parent pointer to
+        * find the parent directory.
+        *
+        * Indirect blocks are skipped by walking parent->parent; the
+        * pip/pmp/depth fields are only filled in when the walk ends
+        * on an inode.
+        */
+       if (bref->type == HAMMER2_BREF_TYPE_INODE) {
+               while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT)
+                       parent = parent->parent;
+               if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
+                       chain->u.ip->pip = parent->u.ip;
+                       chain->u.ip->pmp = parent->u.ip->pmp;
+                       chain->u.ip->depth = parent->u.ip->depth + 1;
+               }
+       }
+
+       /*
+        * Our new chain structure has already been referenced and locked
+        * but the lock code handles the I/O so call it to resolve the data.
+        * Then release one of our two exclusive locks.
+        *
+        * If NOLOCK is set the release will release the one-and-only lock.
+        */
+       if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) {
+               hammer2_chain_lock(hmp, chain, how);    /* recursive lock */
+               hammer2_chain_drop(hmp, chain);         /* excess ref */
+       }
+       lockmgr(&chain->lk, LK_RELEASE);                /* from alloc */
+
+       return (chain);
+}
+
+/*
+ * Locate any key between key_beg and key_end inclusive.  (*parentp)
+ * typically points to an inode but can also point to a related indirect
+ * block and this function will recurse upwards and find the inode again.
+ *
+ * WARNING!  THIS DOES NOT RETURN KEYS IN LOGICAL KEY ORDER!  ANY KEY
+ *          WITHIN THE RANGE CAN BE RETURNED.  HOWEVER, AN ITERATION
+ *          WHICH PICKS UP WHERE WE LEFT OFF WILL CONTINUE THE SCAN.
+ *
+ * (*parentp) must be exclusively locked and referenced and can be an inode
+ * or an existing indirect block within the inode.
+ *
+ * On return (*parentp) will be modified to point at the deepest parent chain
+ * element encountered during the search, as a helper for an insertion or
+ * deletion.   The new (*parentp) will be locked and referenced and the old
+ * will be unlocked and dereferenced (no change if they are both the same).
+ *
+ * The matching chain will be returned exclusively locked and referenced.
+ *
+ * NULL is returned if no match was found, but (*parentp) will still
+ * potentially be adjusted.
+ *
+ * This function will also recurse up the chain if the key is not within the
+ * current parent's range.  (*parentp) can never be set to NULL.  An iteration
+ * can simply allow (*parentp) to float inside the loop.
+ *
+ * flags is a mask of HAMMER2_LOOKUP_* bits and is passed through to
+ * hammer2_chain_get() and hammer2_chain_next().  With
+ * HAMMER2_LOOKUP_NOLOCK the direct-data shortcut returns the inode
+ * referenced rather than locked.
+ */
+hammer2_chain_t *
+hammer2_chain_lookup(hammer2_mount_t *hmp, hammer2_chain_t **parentp,
+                    hammer2_key_t key_beg, hammer2_key_t key_end,
+                    int flags)
+{
+       hammer2_chain_t *parent;
+       hammer2_chain_t *chain;
+       hammer2_chain_t *tmp;
+       hammer2_blockref_t *base;
+       hammer2_blockref_t *bref;
+       hammer2_key_t scan_beg;
+       hammer2_key_t scan_end;
+       int count = 0;
+       int i;
+
+       /*
+        * Recurse (*parentp) upward if necessary until the parent completely
+        * encloses the key range or we hit the inode.
+        *
+        * Each step unlocks the current parent, locks parent->parent with
+        * HAMMER2_RESOLVE_MAYBE and stores the new parent in (*parentp).
+        * Only indirect-block parents are walked; any other type ends the
+        * loop immediately.
+        */
+       parent = *parentp;
+       while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
+               scan_beg = parent->bref.key;
+               scan_end = scan_beg +
+                          ((hammer2_key_t)1 << parent->bref.keybits) - 1;
+               if (key_beg >= scan_beg && key_end <= scan_end)
+                       break;
+               hammer2_chain_ref(hmp, parent);         /* ref old parent */
+               hammer2_chain_unlock(hmp, parent);      /* unlock old parent */
+               parent = parent->parent;
+                                                       /* lock new parent */
+               hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_MAYBE);
+               hammer2_chain_drop(hmp, *parentp);      /* drop old parent */
+               *parentp = parent;                      /* new parent */
+       }
+
+again:
+       /*
+        * Locate the blockref array.  Currently we do a fully associative
+        * search through the array.
+        *
+        * base is the media blockref array (NULL for an indirect block in
+        * the INITIAL state) and count is the number of slots to scan.
+        */
+       switch(parent->bref.type) {
+       case HAMMER2_BREF_TYPE_INODE:
+               /*
+                * Special shortcut for embedded data returns the inode
+                * itself.  Callers must detect this condition and access
+                * the embedded data (the strategy code does this for us).
+                *
+                * This is only applicable to regular files and softlinks.
+                */
+               if (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
+                       if (flags & HAMMER2_LOOKUP_NOLOCK)
+                               hammer2_chain_ref(hmp, parent);
+                       else
+                               hammer2_chain_lock(hmp, parent,
+                                                  HAMMER2_RESOLVE_ALWAYS);
+                       return (parent);
+               }
+               base = &parent->data->ipdata.u.blockset.blockref[0];
+               count = HAMMER2_SET_COUNT;
+               break;
+       case HAMMER2_BREF_TYPE_INDIRECT:
+               /*
+                * Optimize indirect blocks in the INITIAL state to avoid
+                * I/O.
+                */
+               if (parent->flags & HAMMER2_CHAIN_INITIAL) {
+                       base = NULL;
+               } else {
+                       if (parent->data == NULL)
+                               panic("parent->data is NULL");
+                       base = &parent->data->npdata.blockref[0];
+               }
+               count = parent->bytes / sizeof(hammer2_blockref_t);
+               break;
+       case HAMMER2_BREF_TYPE_VOLUME:
+               base = &hmp->voldata.sroot_blockset.blockref[0];
+               count = HAMMER2_SET_COUNT;
+               break;
+       default:
+               panic("hammer2_chain_lookup: unrecognized blockref type: %d",
+                     parent->bref.type);
+               base = NULL;    /* safety */
+               count = 0;      /* safety */
+       }
+
+       /*
+        * If the element and key overlap we use the element.
+        *
+        * For each slot an in-memory chain's bref takes precedence over
+        * the media bref; slots with neither (or with a media bref of
+        * type 0) are skipped.
+        *
+        * If no slot overlaps, an exact-key lookup (key_beg == key_end)
+        * returns NULL and a range lookup continues through
+        * hammer2_chain_next() with a NULL chain.
+        */
+       bref = NULL;
+       for (i = 0; i < count; ++i) {
+               tmp = hammer2_chain_find(hmp, parent, i);
+               if (tmp) {
+                       bref = &tmp->bref;
+                       KKASSERT(bref->type != 0);
+               } else if (base == NULL || base[i].type == 0) {
+                       continue;
+               } else {
+                       bref = &base[i];
+               }
+               scan_beg = bref->key;
+               scan_end = scan_beg + ((hammer2_key_t)1 << bref->keybits) - 1;
+               if (key_beg <= scan_end && key_end >= scan_beg)
+                       break;
+       }
+       if (i == count) {
+               if (key_beg == key_end)
+                       return (NULL);
+               return (hammer2_chain_next(hmp, parentp, NULL,
+                                          key_beg, key_end, flags));
+       }
+
+       /*
+        * Acquire the new chain element.  If the chain element is an
+        * indirect block we must search recursively.
+        */
+       chain = hammer2_chain_get(hmp, parent, i, flags);
+       if (chain == NULL)
+               return (NULL);
+
+       /*
+        * If the chain element is an indirect block it becomes the new
+        * parent and we loop on it.
+        *
+        * The parent always has to be locked with at least RESOLVE_MAYBE,
+        * so it might need a fixup if the caller passed incompatible flags.
+        *
+        * NOLOCK: the chain came back ref'd only, so lock it and drop the
+        * extra ref.  NODATA: the chain came back locked, so lock it again
+        * with RESOLVE_MAYBE and then release one lock.
+        */
+       if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
+               hammer2_chain_unlock(hmp, parent);
+               *parentp = parent = chain;
+               if (flags & HAMMER2_LOOKUP_NOLOCK) {
+                       hammer2_chain_lock(hmp, chain, HAMMER2_RESOLVE_MAYBE);
+                       hammer2_chain_drop(hmp, chain); /* excess ref */
+               } else if (flags & HAMMER2_LOOKUP_NODATA) {
+                       hammer2_chain_lock(hmp, chain, HAMMER2_RESOLVE_MAYBE);
+                       hammer2_chain_unlock(hmp, chain);
+               }
+               goto again;
+       }
+
+       /*
+        * All done, return chain
+        */
+       return (chain);
+}
+
+/*
+ * After having issued a lookup we can iterate all matching keys.
+ *
+ * If chain is non-NULL we continue the iteration from just after its index.
+ *
+ * If chain is NULL we assume the parent was exhausted and continue the
+ * iteration at the next parent.
+ *
+ * parent must be locked on entry and remains locked throughout.  chain's
+ * lock status must match flags.
+ */
+hammer2_chain_t *
+hammer2_chain_next(hammer2_mount_t *hmp, hammer2_chain_t **parentp,
+                  hammer2_chain_t *chain,
+                  hammer2_key_t key_beg, hammer2_key_t key_end,
+                  int flags)
+{
+       hammer2_chain_t *parent;
+       hammer2_chain_t *tmp;
+       hammer2_blockref_t *base;
+       hammer2_blockref_t *bref;
+       hammer2_key_t scan_beg;
+       hammer2_key_t scan_end;
+       int i;
+       int count;
+
+       parent = *parentp;
+
+again:
+       /*
+        * Calculate the next index and recalculate the parent if necessary.
+        */
+       if (chain) {
+               /*
+                * Continue iteration within current parent.  If not NULL
+                * the passed-in chain may or may not be locked, based on
+                * the LOOKUP_NOLOCK flag (passed in as returned from lookup
+                * or a prior next).
+                */
+               i = chain->index + 1;
+               if (flags & HAMMER2_LOOKUP_NOLOCK)
+                       hammer2_chain_drop(hmp, chain);
+               else
+                       hammer2_chain_unlock(hmp, chain);
+
+               /*
+                * Any scan where the lookup returned degenerate data embedded
+                * in the inode has an invalid index and must terminate.
+                */
+               if (chain == parent)
+                       return(NULL);
+               chain = NULL;
+       } else if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT) {
+               /*
+                * We reached the end of the iteration.
+                */
+               return (NULL);
+       } else {
+               /*
+                * Continue iteration with next parent unless the current
+                * parent covers the range.
+                */
+               hammer2_chain_t *nparent;
+
+               scan_beg = parent->bref.key;
+               scan_end = scan_beg +
+                           ((hammer2_key_t)1 << parent->bref.keybits) - 1;
+               if (key_beg >= scan_beg && key_end <= scan_end)
+                       return (NULL);
+
+               i = parent->index + 1;
+               nparent = parent->parent;
+               hammer2_chain_ref(hmp, nparent);        /* ref new parent */
+               hammer2_chain_unlock(hmp, parent);      /* unlock old parent */
+                                                       /* lock new parent */
+               hammer2_chain_lock(hmp, nparent, HAMMER2_RESOLVE_MAYBE);
+               hammer2_chain_drop(hmp, nparent);       /* drop excess ref */
+               *parentp = parent = nparent;
+       }
+
+again2:
+       /*
+        * Locate the blockref array.  Currently we do a fully associative
+        * search through the array.
+        */
+       switch(parent->bref.type) {
+       case HAMMER2_BREF_TYPE_INODE:
+               base = &parent->data->ipdata.u.blockset.blockref[0];
+               count = HAMMER2_SET_COUNT;
+               break;
+       case HAMMER2_BREF_TYPE_INDIRECT:
+               if (parent->flags & HAMMER2_CHAIN_INITIAL) {
+                       base = NULL;
+               } else {
+                       KKASSERT(parent->data != NULL);
+                       base = &parent->data->npdata.blockref[0];
+               }
+               count = parent->bytes / sizeof(hammer2_blockref_t);
+               break;
+       case HAMMER2_BREF_TYPE_VOLUME:
+               base = &hmp->voldata.sroot_blockset.blockref[0];
+               count = HAMMER2_SET_COUNT;
+               break;
+       default:
+               panic("hammer2_chain_next: unrecognized blockref type: %d",
+                     parent->bref.type);
+               base = NULL;    /* safety */
+               count = 0;      /* safety */
+               break;
+       }
+       KKASSERT(i <= count);
+
+       /*
+        * Look for the key.  If we are unable to find a match and an exact
+        * match was requested we return NULL.  If a range was requested we
+        * run hammer2_chain_next() to iterate.
+        */
+       bref = NULL;
+       while (i < count) {
+               tmp = hammer2_chain_find(hmp, parent, i);
+               if (tmp) {
+                       bref = &tmp->bref;
+               } else if (base == NULL || base[i].type == 0) {
+                       ++i;
+                       continue;
+               } else {
+                       bref = &base[i];
+               }
+               scan_beg = bref->key;
+               scan_end = scan_beg + ((hammer2_key_t)1 << bref->keybits) - 1;
+               if (key_beg <= scan_end && key_end >= scan_beg)
+                       break;
+               ++i;
+       }
+
+       /*
+        * If we couldn't find a match recurse up a parent to continue the
+        * search.
+        */
+       if (i == count)
+             &n