Merge branches 'hammer2' and 'master' of ssh://crater.dragonflybsd.org/repository...
authorMatthew Dillon <dillon@apollo.backplane.com>
Sat, 11 Aug 2012 21:56:01 +0000 (14:56 -0700)
committerMatthew Dillon <dillon@apollo.backplane.com>
Sat, 11 Aug 2012 21:56:01 +0000 (14:56 -0700)
52 files changed:
lib/libstand/hammer2.c [new file with mode: 0644]
sbin/dhclient/dhclient.c
sbin/hammer2/Makefile [new file with mode: 0644]
sbin/hammer2/TODO [new file with mode: 0644]
sbin/hammer2/cmd_debug.c [new file with mode: 0644]
sbin/hammer2/cmd_leaf.c [new file with mode: 0644]
sbin/hammer2/cmd_pfs.c [new file with mode: 0644]
sbin/hammer2/cmd_remote.c [new file with mode: 0644]
sbin/hammer2/cmd_rsa.c [new file with mode: 0644]
sbin/hammer2/cmd_service.c [new file with mode: 0644]
sbin/hammer2/cmd_snapshot.c [new file with mode: 0644]
sbin/hammer2/cmd_stat.c [new file with mode: 0644]
sbin/hammer2/crypto.c [new file with mode: 0644]
sbin/hammer2/hammer2.h [new file with mode: 0644]
sbin/hammer2/icrc.c [new file with mode: 0644]
sbin/hammer2/main.c [new file with mode: 0644]
sbin/hammer2/msg.c [new file with mode: 0644]
sbin/hammer2/msg_lnk.c [new file with mode: 0644]
sbin/hammer2/network.h [new file with mode: 0644]
sbin/hammer2/subs.c [new file with mode: 0644]
sbin/mount_hammer2/Makefile [new file with mode: 0644]
sbin/mount_hammer2/mount_hammer2.c [new file with mode: 0644]
sbin/newfs_hammer2/Makefile [new file with mode: 0644]
sbin/newfs_hammer2/newfs_hammer2.8 [new file with mode: 0644]
sbin/newfs_hammer2/newfs_hammer2.c [new file with mode: 0644]
sys/vfs/hammer2/CHANGES [new file with mode: 0644]
sys/vfs/hammer2/DESIGN [new file with mode: 0644]
sys/vfs/hammer2/Makefile [new file with mode: 0644]
sys/vfs/hammer2/TODO [new file with mode: 0644]
sys/vfs/hammer2/donew [new file with mode: 0755]
sys/vfs/hammer2/donew2 [new file with mode: 0755]
sys/vfs/hammer2/dossd [new file with mode: 0755]
sys/vfs/hammer2/dossd2 [new file with mode: 0755]
sys/vfs/hammer2/dotest [new file with mode: 0755]
sys/vfs/hammer2/hammer2.h [new file with mode: 0644]
sys/vfs/hammer2/hammer2_ccms.c [new file with mode: 0644]
sys/vfs/hammer2/hammer2_ccms.h [new file with mode: 0644]
sys/vfs/hammer2/hammer2_chain.c [new file with mode: 0644]
sys/vfs/hammer2/hammer2_disk.h [new file with mode: 0644]
sys/vfs/hammer2/hammer2_freemap.c [new file with mode: 0644]
sys/vfs/hammer2/hammer2_icrc.c [new file with mode: 0644]
sys/vfs/hammer2/hammer2_inode.c [new file with mode: 0644]
sys/vfs/hammer2/hammer2_ioctl.c [new file with mode: 0644]
sys/vfs/hammer2/hammer2_ioctl.h [new file with mode: 0644]
sys/vfs/hammer2/hammer2_mount.h [new file with mode: 0644]
sys/vfs/hammer2/hammer2_msg.c [new file with mode: 0644]
sys/vfs/hammer2/hammer2_msgops.c [new file with mode: 0644]
sys/vfs/hammer2/hammer2_network.h [new file with mode: 0644]
sys/vfs/hammer2/hammer2_subr.c [new file with mode: 0644]
sys/vfs/hammer2/hammer2_vfsops.c [new file with mode: 0644]
sys/vfs/hammer2/hammer2_vnops.c [new file with mode: 0644]
sys/vfs/hammer2/mkvntest [new file with mode: 0755]

diff --git a/lib/libstand/hammer2.c b/lib/libstand/hammer2.c
new file mode 100644 (file)
index 0000000..0d4cf4e
--- /dev/null
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/uuid.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <string.h>
+#include <strings.h>
+#include <errno.h>
+
+#include <hammer2/hammer2_disk.h>
+
+struct hammer2 {
+       int                             fd;     /* Device fd */
+       struct hammer2_blockref         sroot;  /* Superroot blockref */
+};
+
+struct inode {
+       struct hammer2_inode_data       dat;    /* raw inode data */
+       off_t                           doff;   /* disk inode offset */
+};
+
+off_t blockoff(ref)
+       struct hammer2_blockref ref;
+{
+
+}
+
+hinit(hfs)
+       struct hammer2 *hfs;
+{
+       struct hammer2_volume_data volhdr;
+       ssize_t rc;
+       hammer2_crc_t crc0;
+
+       rc = pread(hfs->fd, &volhdr, HAMMER2_VOLUME_SIZE, 0);
+       if (volhdr.magic == HAMMER2_VOLUME_ID_HBO) {
+               printf("Valid HAMMER2 filesystem\n");
+       } else {
+               return (-1);
+       }
+
+       hfs->sroot = volhdr.sroot_blockref;
+       return (0);
+}
+
+shread(hfs, ino, buf, off, len)
+       struct hammer2 *hfs;
+       struct inode *ino;
+       char *buf;
+       off_t off;
+       size_t len;
+{
+       /*
+        * Read [off, off+len) from inode ino rather than from disk
+        * offsets; correctly decodes blockrefs/indirs/...
+        */
+}
+
+struct inode *hlookup1(hfs, ino, name)
+       struct hammer2 *hfs;
+       struct inode *ino;
+       char *name;
+{
+       static struct inode filino;
+       off_t off;
+       int rc;
+
+       bzero(&filino, sizeof(struct inode));
+
+       for (off = 0;
+            off < ino->dat.size;
+            off += sizeof(struct hammer2_inode_data))
+       {
+               rc = shread(hfs, ino, &filino.dat, off,
+                           sizeof(struct hammer2_inode_data));
+               if (rc != sizeof(struct hammer2_inode_data))
+                       continue;
+               if (strcmp(name, &filino.dat.filename) == 0)
+                       return (&filino);
+       }
+
+       return (NULL);
+}
+
+struct inode *hlookup(hfs, name)
+       struct hammer2 *hfs;
+       char *name;
+{
+       /* Name is of form /SUPERROOT/a/b/c/file */
+
+}
+
+void hstat(hfs, ino, sb)
+       struct hammer2 *hfs;
+       struct inode *ino;
+       struct stat *sb;
+{
+
+}
+
+main(argc, argv)
+       int argc;
+       char *argv[];
+{
+       struct hammer2 hammer2;
+       struct inode *ino;
+       struct stat sb;
+       int i;
+
+       if (argc < 2) {
+               fprintf(stderr, "usage: hammer2 <dev>\n");
+               exit(1);
+       }
+
+       hammer2.fd = open(argv[1], O_RDONLY);
+       if (hammer2.fd < 0) {
+               fprintf(stderr, "unable to open %s\n", argv[1]);
+               exit(1);
+       }
+
+       if (hinit(&hammer2)) {
+               fprintf(stderr, "invalid fs\n");
+               close(hammer2.fd);
+               exit(1);
+       }
+
+       for (i = 2; i < argc; i++) {
+               ino = hlookup(&hammer2, argv[i]);
+               if (ino == NULL) {
+                       fprintf(stderr, "hlookup %s\n", argv[i]);
+                       continue;
+               }
+               hstat(&hammer2, ino, &sb);
+
+               printf("%s %lld", argv[i], sb.st_size);
+
+       }
+}
index 9d421ed..7ec01ff 100644 (file)
@@ -170,15 +170,26 @@ routehandler(void)
        struct iaddr a;
        ssize_t n;
 
+       /*
+        * Read one message non-blocking, ignore signals.
+        */
        do {
                n = read(routefd, &msg, sizeof(msg));
        } while (n == -1 && errno == EINTR);
 
+       /*
+        * No operation if no messages ready or the message is
+        * not sized properly.
+        */
        rtm = (struct rt_msghdr *)msg;
        if (n < sizeof(rtm->rtm_msglen) || n < rtm->rtm_msglen ||
-           rtm->rtm_version != RTM_VERSION)
+           rtm->rtm_version != RTM_VERSION) {
                return;
+       }
 
+       /*
+        * Process the message.
+        */
        switch (rtm->rtm_type) {
        case RTM_NEWADDR:
                ifam = (struct ifa_msghdr *)rtm;
@@ -187,8 +198,10 @@ routehandler(void)
                if (findproto((char *)(ifam + 1), ifam->ifam_addrs) != AF_INET)
                        break;
                sa = get_ifa((char *)(ifam + 1), ifam->ifam_addrs);
-               if (sa == NULL)
+               if (sa == NULL) {
+                       warning("RTM_NEWADDR: No IFA");
                        goto die;
+               }
 
                if ((a.len = sizeof(struct in_addr)) > sizeof(a.iabuf))
                        error("king bula sez: len mismatch");
@@ -205,6 +218,7 @@ routehandler(void)
                        /* new addr is the one we set */
                        break;
 
+               warning("RTM_NEWADDR: Our IFA not found");
                goto die;
        case RTM_DELADDR:
                ifam = (struct ifa_msghdr *)rtm;
@@ -214,13 +228,16 @@ routehandler(void)
                        break;
                if (scripttime == 0 || t < scripttime + 10)
                        break;
+               warning("RTM_DELADDR: Third-party %d", (int)(t - scripttime));
                goto die;
        case RTM_IFINFO:
                ifm = (struct if_msghdr *)rtm;
                if (ifm->ifm_index != ifi->index)
                        break;
-               if ((rtm->rtm_flags & RTF_UP) == 0)
+               if ((rtm->rtm_flags & RTF_UP) == 0) {
+                       warning("RTM_IFINFO: Interface not up");
                        goto die;
+               }
 
                linkstat =
                    LINK_STATE_IS_UP(ifm->ifm_data.ifi_link_state) ? 1 : 0;
@@ -238,8 +255,10 @@ routehandler(void)
        case RTM_IFANNOUNCE:
                ifan = (struct if_announcemsghdr *)rtm;
                if (ifan->ifan_what == IFAN_DEPARTURE &&
-                   ifan->ifan_index == ifi->index)
+                   ifan->ifan_index == ifi->index) {
+                       warning("RTM_IFANNOUNCE: Interface departure");
                        goto die;
+               }
                break;
        default:
                break;
diff --git a/sbin/hammer2/Makefile b/sbin/hammer2/Makefile
new file mode 100644 (file)
index 0000000..3de282d
--- /dev/null
@@ -0,0 +1,19 @@
+PROG=  hammer2
+SRCS=  main.c subs.c icrc.c crypto.c
+SRCS+= cmd_remote.c cmd_snapshot.c cmd_pfs.c
+SRCS+= cmd_service.c cmd_leaf.c cmd_debug.c
+SRCS+= cmd_rsa.c cmd_stat.c
+SRCS+= msg.c msg_lnk.c
+#MAN=  hammer2.8
+NOMAN= TRUE
+DEBUG_FLAGS=-g
+
+CFLAGS+= -I${.CURDIR}/../../sys
+CFLAGS+= -pthread
+LDADD= -lm -lutil -lmd -lcrypto
+DPADD= ${LIBM} ${LIBUTIL} ${LIBMD} ${LIBCRYPTO}
+
+#.PATH: ${.CURDIR}/../../sys/libkern
+#SRCS+= crc32.c
+
+.include <bsd.prog.mk>
diff --git a/sbin/hammer2/TODO b/sbin/hammer2/TODO
new file mode 100644 (file)
index 0000000..33e49f9
--- /dev/null
@@ -0,0 +1,9 @@
+
+TODO:
+       msg_lnk.c  msg_lnk_signal rescans all connections.  What we really
+                  need to do is queue the conn (when possible) or the node
+                  (when possible), or just flag it w/ a global, and then
+                  only scan a subset.
+
+       link->dist propagation is still a bit screwy when there are a
+       lot of parallel network links.
diff --git a/sbin/hammer2/cmd_debug.c b/sbin/hammer2/cmd_debug.c
new file mode 100644 (file)
index 0000000..369e913
--- /dev/null
@@ -0,0 +1,543 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "hammer2.h"
+
+#define SHOW_TAB       2
+
+static void shell_rcvmsg(hammer2_msg_t *msg);
+static void shell_ttymsg(hammer2_iocom_t *iocom);
+static void hammer2_shell_parse(hammer2_msg_t *msg);
+
+/************************************************************************
+ *                                 SHELL                               *
+ ************************************************************************/
+
+int
+cmd_shell(const char *hostname)
+{
+       struct hammer2_iocom iocom;
+       hammer2_msg_t *msg;
+       int fd;
+
+       /*
+        * Connect to the target
+        */
+       fd = hammer2_connect(hostname);
+       if (fd < 0)
+               return 1;
+
+       /*
+        * Run the session.  The remote end transmits our prompt.
+        */
+       hammer2_iocom_init(&iocom, fd, 0, NULL, shell_rcvmsg, shell_ttymsg);
+       fcntl(0, F_SETFL, O_NONBLOCK);
+       printf("debug: connected\n");
+
+       msg = hammer2_msg_alloc(iocom.router, 0, HAMMER2_DBG_SHELL,
+                               NULL, NULL);
+       hammer2_msg_write(msg);
+       hammer2_iocom_core(&iocom);
+       fprintf(stderr, "debug: disconnected\n");
+       close(fd);
+       return 0;
+}
+
+/*
+ * Callback from hammer2_iocom_core() when messages might be present
+ * on the socket.
+ */
+static
+void
+shell_rcvmsg(hammer2_msg_t *msg)
+{
+       switch(msg->any.head.cmd & HAMMER2_MSGF_TRANSMASK) {
+       case HAMMER2_LNK_ERROR:
+       case HAMMER2_LNK_ERROR | HAMMER2_MSGF_REPLY:
+               /*
+                * One-way non-transactional LNK_ERROR messages typically
+                * indicate a connection failure.  Error code 0 is used by
+                * the debug shell to indicate no more results from last cmd.
+                */
+               if (msg->any.head.error) {
+                       fprintf(stderr, "Stream failure: %s\n",
+                               hammer2_msg_str(msg));
+               } else {
+                       write(1, "debug> ", 7);
+               }
+               break;
+       case HAMMER2_LNK_ERROR | HAMMER2_MSGF_DELETE:
+               /* ignore termination of LNK_CONN */
+               break;
+       case HAMMER2_DBG_SHELL:
+               /*
+                * We send the commands, not accept them.
+                * (one-way message, not transactional)
+                */
+               hammer2_msg_reply(msg, HAMMER2_MSG_ERR_NOSUPP);
+               break;
+       case HAMMER2_DBG_SHELL | HAMMER2_MSGF_REPLY:
+               /*
+                * A reply from the remote is data we copy to stdout.
+                * (one-way message, not transactional)
+                */
+               if (msg->aux_size) {
+                       msg->aux_data[msg->aux_size - 1] = 0;
+                       write(1, msg->aux_data, strlen(msg->aux_data));
+               }
+               break;
+       case HAMMER2_LNK_CONN | HAMMER2_MSGF_CREATE:
+               fprintf(stderr, "Debug Shell is ignoring received LNK_CONN\n");
+               hammer2_msg_reply(msg, HAMMER2_MSG_ERR_NOSUPP);
+               break;
+       case HAMMER2_LNK_CONN | HAMMER2_MSGF_DELETE:
+               break;
+       default:
+               /*
+                * Ignore any unknown messages, Terminate any unknown
+                * transactions with an error.
+                */
+               fprintf(stderr, "Unknown message: %s\n", hammer2_msg_str(msg));
+               if (msg->any.head.cmd & HAMMER2_MSGF_CREATE)
+                       hammer2_msg_reply(msg, HAMMER2_MSG_ERR_NOSUPP);
+               if (msg->any.head.cmd & HAMMER2_MSGF_DELETE)
+                       hammer2_msg_reply(msg, HAMMER2_MSG_ERR_NOSUPP);
+               break;
+       }
+}
+
+static
+void
+shell_ttymsg(hammer2_iocom_t *iocom)
+{
+       hammer2_msg_t *msg;
+       char buf[256];
+       size_t len;
+
+       if (fgets(buf, sizeof(buf), stdin) != NULL) {
+               len = strlen(buf);
+               if (len && buf[len - 1] == '\n')
+                       buf[--len] = 0;
+               ++len;
+               msg = hammer2_msg_alloc(iocom->router, len, HAMMER2_DBG_SHELL,
+                                       NULL, NULL);
+               bcopy(buf, msg->aux_data, len);
+               hammer2_msg_write(msg);
+       } else if (feof(stdin)) {
+               /*
+                * Set EOF flag without setting any error code for normal
+                * EOF.
+                */
+               iocom->flags |= HAMMER2_IOCOMF_EOF;
+       } else {
+               clearerr(stdin);
+       }
+}
+
+/*
+ * This is called from the master node to process a received debug
+ * shell command.  We process the command, outputting the results,
+ * then finish up by outputting another prompt.
+ */
+void
+hammer2_msg_dbg(hammer2_msg_t *msg)
+{
+       switch(msg->any.head.cmd & HAMMER2_MSGF_CMDSWMASK) {
+       case HAMMER2_DBG_SHELL:
+               /*
+                * This is a command which we must process.
+                * When we are finished we generate a final reply.
+                */
+               if (msg->aux_data)
+                       msg->aux_data[msg->aux_size - 1] = 0;
+               hammer2_shell_parse(msg);
+               hammer2_msg_reply(msg, 0);
+               break;
+       case HAMMER2_DBG_SHELL | HAMMER2_MSGF_REPLY:
+               /*
+                * A reply just prints out the string.  No newline is added
+                * (it is expected to be embedded if desired).
+                */
+               if (msg->aux_data)
+                       msg->aux_data[msg->aux_size - 1] = 0;
+               if (msg->aux_data)
+                       write(2, msg->aux_data, strlen(msg->aux_data));
+               break;
+       default:
+               hammer2_msg_reply(msg, HAMMER2_MSG_ERR_NOSUPP);
+               break;
+       }
+}
+
+static void shell_span(hammer2_router_t *router, char *cmdbuf);
+/*static void shell_tree(hammer2_router_t *router, char *cmdbuf);*/
+
+static void
+hammer2_shell_parse(hammer2_msg_t *msg)
+{
+       hammer2_router_t *router = msg->router;
+       char *cmdbuf = msg->aux_data;
+       char *cmd = strsep(&cmdbuf, " \t");
+
+       if (cmd == NULL || *cmd == 0) {
+               ;
+       } else if (strcmp(cmd, "span") == 0) {
+               shell_span(router, cmdbuf);
+       } else if (strcmp(cmd, "tree") == 0) {
+               shell_tree(router, cmdbuf);
+       } else if (strcmp(cmd, "help") == 0 || strcmp(cmd, "?") == 0) {
+               router_printf(router, "help            Command help\n");
+               router_printf(router, "span <host>     Span to target host\n");
+               router_printf(router, "tree            Dump spanning tree\n");
+       } else {
+               router_printf(router, "Unrecognized command: %s\n", cmd);
+       }
+}
+
+static void
+shell_span(hammer2_router_t *router, char *cmdbuf)
+{
+       const char *hostname = strsep(&cmdbuf, " \t");
+       pthread_t thread;
+       int fd;
+
+       /*
+        * Connect to the target
+        */
+       if (hostname == NULL) {
+               fd = -1;
+       } else {
+               fd = hammer2_connect(hostname);
+       }
+
+       /*
+        * Start master service
+        */
+       if (fd < 0) {
+               router_printf(router, "Connection to %s failed\n", hostname);
+       } else {
+               router_printf(router, "Connected to %s\n", hostname);
+               pthread_create(&thread, NULL,
+                              master_service, (void *)(intptr_t)fd);
+               /*pthread_join(thread, &res);*/
+       }
+}
+
+/*
+ * Returns text debug output to the original defined by (msg).  (msg) is
+ * not modified and stays intact.  We use a one-way message with REPLY set
+ * to distinguish between a debug command and debug terminal output.
+ *
+ * To prevent loops router_printf() can filter the message (cmd) related
+ * to the router_printf().  We filter out DBG messages.
+ */
+void
+router_printf(hammer2_router_t *router, const char *ctl, ...)
+{
+       hammer2_msg_t *rmsg;
+       va_list va;
+       char buf[1024];
+       size_t len;
+
+       va_start(va, ctl);
+       vsnprintf(buf, sizeof(buf), ctl, va);
+       va_end(va);
+       len = strlen(buf) + 1;
+
+       rmsg = hammer2_msg_alloc(router, len, HAMMER2_DBG_SHELL |
+                                             HAMMER2_MSGF_REPLY,
+                                NULL, NULL);
+       bcopy(buf, rmsg->aux_data, len);
+
+       hammer2_msg_write(rmsg);
+}
+
+/************************************************************************
+ *                             DEBUGSPAN                               *
+ ************************************************************************
+ *
+ * Connect to the target manually (not via the cluster list embedded in
+ * a hammer2 filesystem) and initiate the SPAN protocol.
+ */
+int
+cmd_debugspan(const char *hostname)
+{
+       pthread_t thread;
+       int fd;
+       void *res;
+
+       /*
+        * Connect to the target
+        */
+       fd = hammer2_connect(hostname);
+       if (fd < 0)
+               return 1;
+
+       printf("debugspan: connected to %s, starting CONN/SPAN\n", hostname);
+       pthread_create(&thread, NULL, master_service, (void *)(intptr_t)fd);
+       pthread_join(thread, &res);
+       return(0);
+}
+
+/************************************************************************
+ *                                 SHOW                                *
+ ************************************************************************/
+
+static void show_bref(int fd, int tab, int bi, hammer2_blockref_t *bref);
+static void tabprintf(int tab, const char *ctl, ...);
+
+int
+cmd_show(const char *devpath)
+{
+       hammer2_blockref_t broot;
+       int fd;
+
+       fd = open(devpath, O_RDONLY);
+       if (fd < 0) {
+               perror("open");
+               return 1;
+       }
+       bzero(&broot, sizeof(broot));
+       broot.type = HAMMER2_BREF_TYPE_VOLUME;
+       broot.data_off = 0 | HAMMER2_PBUFRADIX;
+       show_bref(fd, 0, 0, &broot);
+       close(fd);
+
+       return 0;
+}
+
+static void
+show_bref(int fd, int tab, int bi, hammer2_blockref_t *bref)
+{
+       hammer2_media_data_t media;
+       hammer2_blockref_t *bscan;
+       int bcount;
+       int i;
+       int didnl;
+       int namelen;
+       int obrace = 1;
+       size_t bytes;
+       const char *type_str;
+       char *str = NULL;
+
+       switch(bref->type) {
+       case HAMMER2_BREF_TYPE_EMPTY:
+               type_str = "empty";
+               break;
+       case HAMMER2_BREF_TYPE_INODE:
+               type_str = "inode";
+               break;
+       case HAMMER2_BREF_TYPE_INDIRECT:
+               type_str = "indblk";
+               break;
+       case HAMMER2_BREF_TYPE_DATA:
+               type_str = "data";
+               break;
+       case HAMMER2_BREF_TYPE_VOLUME:
+               type_str = "volume";
+               break;
+       default:
+               type_str = "unknown";
+               break;
+       }
+
+
+       tabprintf(tab, "%s.%-3d %016jx %016jx/%-2d mir=%016jx mod=%016jx ",
+              type_str, bi, (intmax_t)bref->data_off,
+              (intmax_t)bref->key, (intmax_t)bref->keybits,
+              (intmax_t)bref->mirror_tid, (intmax_t)bref->modify_tid);
+       tab += SHOW_TAB;
+
+       bytes = (size_t)1 << (bref->data_off & HAMMER2_OFF_MASK_RADIX);
+       if (bytes < HAMMER2_MINIOSIZE || bytes > sizeof(media)) {
+               printf("(bad block size %zd)\n", bytes);
+               return;
+       }
+       if (bref->type != HAMMER2_BREF_TYPE_DATA || VerboseOpt >= 1) {
+               lseek(fd, bref->data_off & ~HAMMER2_OFF_MASK_RADIX, 0);
+               if (read(fd, &media, bytes) != (ssize_t)bytes) {
+                       printf("(media read failed)\n");
+                       return;
+               }
+       }
+
+       bscan = NULL;
+       bcount = 0;
+       didnl = 0;
+
+       switch(bref->type) {
+       case HAMMER2_BREF_TYPE_EMPTY:
+               obrace = 0;
+               break;
+       case HAMMER2_BREF_TYPE_INODE:
+               printf("{\n");
+               if (media.ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
+                       /* no blockrefs */
+               } else {
+                       bscan = &media.ipdata.u.blockset.blockref[0];
+                       bcount = HAMMER2_SET_COUNT;
+               }
+               namelen = media.ipdata.name_len;
+               if (namelen > HAMMER2_INODE_MAXNAME)
+                       namelen = 0;
+               tabprintf(tab, "filename \"%*.*s\"\n",
+                         namelen, namelen, media.ipdata.filename);
+               tabprintf(tab, "version  %d\n", media.ipdata.version);
+               tabprintf(tab, "uflags   0x%08x\n",
+                         media.ipdata.uflags);
+               if (media.ipdata.rmajor || media.ipdata.rminor) {
+                       tabprintf(tab, "rmajor   %d\n",
+                                 media.ipdata.rmajor);
+                       tabprintf(tab, "rminor   %d\n",
+                                 media.ipdata.rminor);
+               }
+               tabprintf(tab, "ctime    %s\n",
+                         hammer2_time64_to_str(media.ipdata.ctime, &str));
+               tabprintf(tab, "mtime    %s\n",
+                         hammer2_time64_to_str(media.ipdata.mtime, &str));
+               tabprintf(tab, "atime    %s\n",
+                         hammer2_time64_to_str(media.ipdata.atime, &str));
+               tabprintf(tab, "btime    %s\n",
+                         hammer2_time64_to_str(media.ipdata.btime, &str));
+               tabprintf(tab, "uid      %s\n",
+                         hammer2_uuid_to_str(&media.ipdata.uid, &str));
+               tabprintf(tab, "gid      %s\n",
+                         hammer2_uuid_to_str(&media.ipdata.gid, &str));
+               tabprintf(tab, "type     %s\n",
+                         hammer2_iptype_to_str(media.ipdata.type));
+               tabprintf(tab, "opflgs   0x%02x\n",
+                         media.ipdata.op_flags);
+               tabprintf(tab, "capflgs  0x%04x\n",
+                         media.ipdata.cap_flags);
+               tabprintf(tab, "mode     %-7o\n",
+                         media.ipdata.mode);
+               tabprintf(tab, "inum     0x%016jx\n",
+                         media.ipdata.inum);
+               tabprintf(tab, "size     %ju\n",
+                         (uintmax_t)media.ipdata.size);
+               tabprintf(tab, "nlinks   %ju\n",
+                         (uintmax_t)media.ipdata.nlinks);
+               tabprintf(tab, "iparent  0x%016jx\n",
+                         (uintmax_t)media.ipdata.iparent);
+               tabprintf(tab, "name_key 0x%016jx\n",
+                         (uintmax_t)media.ipdata.name_key);
+               tabprintf(tab, "name_len %u\n",
+                         media.ipdata.name_len);
+               tabprintf(tab, "ncopies  %u\n",
+                         media.ipdata.ncopies);
+               tabprintf(tab, "compalg  %u\n",
+                         media.ipdata.comp_algo);
+               if (media.ipdata.op_flags & HAMMER2_OPFLAG_PFSROOT) {
+                       tabprintf(tab, "pfs_type %u (%s)\n",
+                                 media.ipdata.pfs_type,
+                                 hammer2_pfstype_to_str(media.ipdata.pfs_type));
+                       tabprintf(tab, "pfs_inum 0x%016jx\n",
+                                 (uintmax_t)media.ipdata.pfs_inum);
+                       tabprintf(tab, "pfs_clid %s\n",
+                                 hammer2_uuid_to_str(&media.ipdata.pfs_clid,
+                                                     &str));
+                       tabprintf(tab, "pfs_fsid %s\n",
+                                 hammer2_uuid_to_str(&media.ipdata.pfs_fsid,
+                                                     &str));
+               }
+               tabprintf(tab, "data_quota  %ju\n",
+                         (uintmax_t)media.ipdata.data_quota);
+               tabprintf(tab, "data_count  %ju\n",
+                         (uintmax_t)media.ipdata.data_count);
+               tabprintf(tab, "inode_quota %ju\n",
+                         (uintmax_t)media.ipdata.inode_quota);
+               tabprintf(tab, "inode_count %ju\n",
+                         (uintmax_t)media.ipdata.inode_count);
+               tabprintf(tab, "attr_tid    0x%016jx\n",
+                         (uintmax_t)media.ipdata.attr_tid);
+               if (media.ipdata.type == HAMMER2_OBJTYPE_DIRECTORY) {
+                       tabprintf(tab, "dirent_tid  %016jx\n",
+                                 (uintmax_t)media.ipdata.dirent_tid);
+               }
+               break;
+       case HAMMER2_BREF_TYPE_INDIRECT:
+               bscan = &media.npdata.blockref[0];
+               bcount = bytes / sizeof(hammer2_blockref_t);
+               didnl = 1;
+               printf("{\n");
+               break;
+       case HAMMER2_BREF_TYPE_DATA:
+               if (VerboseOpt >= 2) {
+                       printf("{\n");
+               } else {
+                       printf("\n");
+                       obrace = 0;
+               }
+               break;
+       case HAMMER2_BREF_TYPE_VOLUME:
+               bscan = &media.voldata.sroot_blockset.blockref[0];
+               bcount = HAMMER2_SET_COUNT;
+               printf("{\n");
+               break;
+       default:
+               break;
+       }
+       if (str)
+               free(str);
+       for (i = 0; i < bcount; ++i) {
+               if (bscan[i].type != HAMMER2_BREF_TYPE_EMPTY) {
+                       if (didnl == 0) {
+                               printf("\n");
+                               didnl = 1;
+                       }
+                       show_bref(fd, tab, i, &bscan[i]);
+               }
+       }
+       tab -= SHOW_TAB;
+       if (obrace) {
+               if (bref->type == HAMMER2_BREF_TYPE_INODE)
+                       tabprintf(tab, "} (%s.%d, \"%s\")\n",
+                                 type_str, bi, media.ipdata.filename);
+               else
+                       tabprintf(tab, "} (%s.%d)\n", type_str,bi);
+       }
+}
+
+static
+void
+tabprintf(int tab, const char *ctl, ...)
+{
+       va_list va;
+
+       printf("%*.*s", tab, tab, "");
+       va_start(va, ctl);
+       vprintf(ctl, va);
+       va_end(va);
+}
diff --git a/sbin/hammer2/cmd_leaf.c b/sbin/hammer2/cmd_leaf.c
new file mode 100644 (file)
index 0000000..a9026e2
--- /dev/null
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "hammer2.h"
+
+/*
+ * Start-up the leaf daemon for a PFS on this machine.
+ *
+ * One leaf daemon is run for each mounted PFS.  The daemon may multi-thread
+ * to improve performance if desired.  The daemon performs the following
+ * functions:
+ *
+ *     (1) Makes and maintains connections to all cluster nodes found for
+ *         the PFS, retrieved from the REMOTE configuration stored in
+ *         the HAMMER2 mount.  A localhost connection is always implied
+ *         (using the backbone), but also having more direct connections
+ *         can result in higher performance.
+ *
+ *         This also includes any required encryption or authentication.
+ *
+ *     (2) Runs the spanning tree protocol as a leaf, meaning that
+ *         the leaf daemon does not serve as a relay and the individual
+ *         connections made in (1) do not cross-connect.
+ *
+ *     (3) Obtains the PFS's registration and makes it available to the
+ *         cluster via the spanning tree protocol.
+ *
+ *     (4) Creates a communications pipe to the HAMMER2 VFS in the kernel
+ *         (installed via ioctl()) which the HAMMER2 VFS uses to accept and
+ *         communicate high-level requests.
+ *
+ *     (5) Performs all complex high-level messaging protocol operations,
+ *         such as quorum operations, maintains persistent cache state,
+ *         and so on and so forth.
+ *
+ * As you may have noted, the leaf daemon serves as an intermediary between
+ * the kernel and the rest of the cluster.  The kernel will issue high level
+ * protocol commands to the leaf which performs the protocol and sends a
+ * response.  The kernel does NOT have to deal with the quorum or other
+ * complex maintainance.
+ *
+ * Basically the kernel is simply another client from the point of view
+ * of the high-level protocols, requesting cache state locks and such from
+ * the leaf (in a degenerate situation one master lock is all that is needed).
+ * If the kernel PFS has local media storage that storage can be used for
+ * numerous purposes, such as caching, and in the degenerate non-clustered
+ * case simply represents the one-and-only master copy of the filesystem.
+ */
+int
+cmd_leaf(const char *sel_info)
+{
+       int ecode = 0;
+       int fd;
+
+       /*
+        * Obtain an ioctl descriptor and retrieve the registration info
+        * for the PFS.
+        */
+       if ((fd = hammer2_ioctl_handle(sel_info)) < 0)
+               return(1);
+
+       /*
+        * Start a daemon to interconnect the HAMMER2 PFS in-kernel to the
+        * master-node daemon.  This daemon's thread will spend most of its
+        * time in the kernel.
+        */
+/*     hammer2_demon(helper_pfs_interlink, (void *)(intptr_t)fd);*/
+       if (NormalExit)
+               close(fd);
+
+       return ecode;
+}
+
+#if 0
+/*
+ * LEAF interconnect between PFS and the messaging core.  We create a
+ * socket connection to the messaging core, register the PFS with the
+ * core, and then pass the messaging descriptor to the kernel.
+ *
+ * The kernel takes over operation of the interconnect until the filesystem
+ * is unmounted or the descriptor is lost or explicitly terminated via
+ * a hammer2 command.
+ *
+ * This is essentially a localhost connection, so we don't have to worry
+ * about encryption.  Any encryption will be handled by the messaging
+ * core.
+ */
+static
+void *
+leaf_connect(void *data)
+{
+       int fd;
+
+       fd = (int)(intptr_t)data;
+
+       return (NULL);
+}
+#endif
diff --git a/sbin/hammer2/cmd_pfs.c b/sbin/hammer2/cmd_pfs.c
new file mode 100644 (file)
index 0000000..03c3566
--- /dev/null
@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "hammer2.h"
+
+int
+cmd_pfs_list(const char *sel_path)
+{
+       hammer2_ioc_pfs_t pfs;
+       int ecode = 0;
+       int count = 0;
+       int fd;
+       uint32_t status;
+       char *pfs_id_str = NULL;
+
+       if ((fd = hammer2_ioctl_handle(sel_path)) < 0)
+               return(1);
+       bzero(&pfs, sizeof(pfs));
+
+       while ((pfs.name_key = pfs.name_next) != (hammer2_key_t)-1) {
+               if (ioctl(fd, HAMMER2IOC_PFS_GET, &pfs) < 0) {
+                       perror("ioctl");
+                       ecode = 1;
+                       break;
+               }
+               if (count == 0) {
+                       printf("Type        "
+                              "ClusterId (pfs_clid)                 "
+                              "Label\n");
+               }
+               switch(pfs.pfs_type) {
+               case HAMMER2_PFSTYPE_NONE:
+                       printf("NONE        ");
+                       break;
+               case HAMMER2_PFSTYPE_ADMIN:
+                       printf("ADMIN       ");
+                       break;
+               case HAMMER2_PFSTYPE_CACHE:
+                       printf("CACHE       ");
+                       break;
+               case HAMMER2_PFSTYPE_COPY:
+                       printf("COPY        ");
+                       break;
+               case HAMMER2_PFSTYPE_SLAVE:
+                       printf("SLAVE       ");
+                       break;
+               case HAMMER2_PFSTYPE_SOFT_SLAVE:
+                       printf("SOFT_SLAVE  ");
+                       break;
+               case HAMMER2_PFSTYPE_SOFT_MASTER:
+                       printf("SOFT_MASTER ");
+                       break;
+               case HAMMER2_PFSTYPE_MASTER:
+                       printf("MASTER      ");
+                       break;
+               default:
+                       printf("%02x          ", pfs.pfs_type);
+                       break;
+               }
+               uuid_to_string(&pfs.pfs_clid, &pfs_id_str, &status);
+               printf("%s ", pfs_id_str);
+               free(pfs_id_str);
+               pfs_id_str = NULL;
+               printf("%s\n", pfs.name);
+               ++count;
+       }
+       close(fd);
+
+       return (ecode);
+}
+
+int
+cmd_pfs_getid(const char *sel_path, const char *name, int privateid)
+{
+       hammer2_ioc_pfs_t pfs;
+       int ecode = 0;
+       int fd;
+       uint32_t status;
+       char *pfs_id_str = NULL;
+
+       if ((fd = hammer2_ioctl_handle(sel_path)) < 0)
+               return(1);
+       bzero(&pfs, sizeof(pfs));
+
+       snprintf(pfs.name, sizeof(pfs.name), "%s", name);
+       if (ioctl(fd, HAMMER2IOC_PFS_LOOKUP, &pfs) < 0) {
+               perror("ioctl");
+               ecode = 1;
+       } else {
+               if (privateid)
+                       uuid_to_string(&pfs.pfs_fsid, &pfs_id_str, &status);
+               else
+                       uuid_to_string(&pfs.pfs_clid, &pfs_id_str, &status);
+               printf("%s\n", pfs_id_str);
+               free(pfs_id_str);
+               pfs_id_str = NULL;
+       }
+       close(fd);
+       return (ecode);
+}
+
+
+int
+cmd_pfs_create(const char *sel_path, const char *name,
+              uint8_t pfs_type, const char *uuid_str)
+{
+       hammer2_ioc_pfs_t pfs;
+       int ecode = 0;
+       int fd;
+       uint32_t status;
+
+       /*
+        * Default to MASTER
+        */
+       if (pfs_type == HAMMER2_PFSTYPE_NONE) {
+               pfs_type = HAMMER2_PFSTYPE_MASTER;
+       }
+
+       if ((fd = hammer2_ioctl_handle(sel_path)) < 0)
+               return(1);
+       bzero(&pfs, sizeof(pfs));
+       snprintf(pfs.name, sizeof(pfs.name), "%s", name);
+       pfs.pfs_type = pfs_type;
+       if (uuid_str) {
+               uuid_from_string(uuid_str, &pfs.pfs_clid, &status);
+       } else {
+               uuid_create(&pfs.pfs_clid, &status);
+       }
+       if (status == uuid_s_ok)
+               uuid_create(&pfs.pfs_fsid, &status);
+       if (status == uuid_s_ok) {
+               if (ioctl(fd, HAMMER2IOC_PFS_CREATE, &pfs) < 0) {
+                       perror("ioctl");
+                       ecode = 1;
+               }
+       } else {
+               fprintf(stderr, "hammer2: pfs_create: badly formed uuid\n");
+               ecode = 1;
+       }
+       close(fd);
+       return (ecode);
+}
+
+int
+cmd_pfs_delete(const char *sel_path, const char *name)
+{
+       hammer2_ioc_pfs_t pfs;
+       int ecode = 0;
+       int fd;
+
+       if ((fd = hammer2_ioctl_handle(sel_path)) < 0)
+               return(1);
+       bzero(&pfs, sizeof(pfs));
+       snprintf(pfs.name, sizeof(pfs.name), "%s", name);
+
+       if (ioctl(fd, HAMMER2IOC_PFS_CREATE, &pfs) < 0) {
+               fprintf(stderr, "hammer2: pfs_delete(%s): %s\n",
+                       name, strerror(errno));
+               ecode = 1;
+       }
+       close(fd);
+
+       return (ecode);
+}
diff --git a/sbin/hammer2/cmd_remote.c b/sbin/hammer2/cmd_remote.c
new file mode 100644 (file)
index 0000000..97116b1
--- /dev/null
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "hammer2.h"
+
+int
+cmd_remote_connect(const char *sel_path, const char *url)
+{
+       hammer2_ioc_remote_t remote;
+       int ecode = 0;
+       int fd;
+
+       if ((fd = hammer2_ioctl_handle(sel_path)) < 0)
+               return(1);
+       bzero(&remote, sizeof(remote));
+       remote.copyid = -1;
+       remote.fd = -1;
+       if (strlen(url) >= sizeof(remote.copy1.path)) {
+               fprintf(stderr, "hammer2: connect: Path too long\n");
+               close(fd);
+               return(1);
+       }
+       snprintf(remote.copy1.path, sizeof(remote.copy1.path), "%s", url);
+       if (ioctl(fd, HAMMER2IOC_REMOTE_ADD, &remote) < 0) {
+               perror("ioctl");
+               ecode = 1;
+       }
+       close(fd);
+       return 0;;
+}
+
+int
+cmd_remote_disconnect(const char *sel_path, const char *url)
+{
+       hammer2_ioc_remote_t remote;
+       int ecode = 0;
+       int fd;
+
+       if ((fd = hammer2_ioctl_handle(sel_path)) < 0)
+               return(1);
+       bzero(&remote, sizeof(remote));
+       remote.copyid = -1;
+       remote.fd = -1;
+       if (strlen(url) >= sizeof(remote.copy1.path)) {
+               fprintf(stderr, "hammer2: disconnect: Path too long\n");
+               close(fd);
+               return(1);
+       }
+       snprintf(remote.copy1.path, sizeof(remote.copy1.path), "%s", url);
+       if (ioctl(fd, HAMMER2IOC_REMOTE_DEL, &remote) < 0) {
+               perror("ioctl");
+               ecode = 1;
+       }
+       close(fd);
+       return 0;;
+}
+
+int
+cmd_remote_status(const char *sel_path, int all_opt __unused)
+{
+       hammer2_ioc_remote_t remote;
+       int ecode = 0;
+       int count = 0;
+       int fd;
+
+       if ((fd = hammer2_ioctl_handle(sel_path)) < 0)
+               return(1);
+       bzero(&remote, sizeof(remote));
+
+       while ((remote.copyid = remote.nextid) >= 0) {
+               if (ioctl(fd, HAMMER2IOC_REMOTE_SCAN, &remote) < 0) {
+                       perror("ioctl");
+                       ecode = 1;
+                       break;
+               }
+               if (remote.copy1.copyid == 0)
+                       continue;
+               if (count == 0)
+                       printf("CPYID LABEL           STATUS PATH\n");
+               printf("%5d %-15s %c%c%c.%02x %s\n",
+                       remote.copy1.copyid,
+                       remote.copy1.label,
+                       '-', '-', '-',
+                       remote.copy1.priority,
+                       remote.copy1.path);
+               ++count;
+       }
+       if (count == 0)
+               printf("No linkages found\n");
+       return (ecode);
+}
diff --git a/sbin/hammer2/cmd_rsa.c b/sbin/hammer2/cmd_rsa.c
new file mode 100644 (file)
index 0000000..75d3893
--- /dev/null
@@ -0,0 +1,379 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "hammer2.h"
+
+/*
+ * Should be run as root.  Creates /etc/hammer2/rsa.{pub,prv} using
+ * an openssl command.
+ */
+int
+cmd_rsainit(const char *dir_path)
+{
+       struct stat st;
+       int ecode;
+       char *str1;
+       char *str2;
+       char *cmd;
+       mode_t old_umask;
+
+       /*
+        * Create the directory if necessary
+        */
+       if (stat(dir_path, &st) < 0) {
+               str1 = strdup(dir_path);
+               str2 = str1 - 1;
+
+               while ((str2 = strchr(str2 + 1, '/')) != NULL) {
+                       *str2 = 0;
+                       mkdir(str1, 0755);
+                       *str2 = '/';
+               }
+               mkdir(str1, 0700);
+               free(str1);
+       }
+       asprintf(&str1, "%s/rsa.prv", dir_path);
+       asprintf(&str2, "%s/rsa.pub", dir_path);
+
+       if (stat(str1, &st) < 0) {
+               old_umask = umask(077);
+               asprintf(&cmd, "openssl genrsa -out %s 2048", str1);
+               umask(old_umask);
+               ecode = system(cmd);
+               free(cmd);
+               chmod(str1, 0400);
+               if (ecode) {
+                       fprintf(stderr,
+                               "hammer2 rsainit: private key gen failed\n");
+                       free(str2);
+                       free(str1);
+                       return 1;
+               }
+               printf("hammer2 rsainit: created %s\n", str1);
+               remove(str2);
+       } else {
+               printf("hammer2 rsainit: Using existing private key in %s\n",
+                      str1);
+       }
+       if (stat(str2, &st) < 0) {
+               asprintf(&cmd, "openssl rsa -in %s -out %s -pubout",
+                        str1, str2);
+               ecode = system(cmd);
+               free(cmd);
+               if (ecode) {
+                       fprintf(stderr,
+                               "hammer2 rsainit: public key gen failed\n");
+                       free(str2);
+                       free(str1);
+                       return 1;
+               }
+               printf("hammer2 rsainit: created %s\n", str2);
+       } else {
+               printf("hammer2 rsainit: both keys already exist\n");
+       }
+       free(str2);
+       free(str1);
+
+       return 0;
+}
+
+int
+cmd_rsaenc(const char **keyfiles, int nkeys)
+{
+       RSA **keys = calloc(nkeys, sizeof(RSA *));
+       int *ispub = calloc(nkeys, sizeof(int));
+       int ecode = 0;
+       int blksize = 0;
+       int i;
+       int off;
+       int n;
+       unsigned char *data_in;
+       unsigned char *data_out;
+
+       for (i = 0; i < nkeys; ++i) {
+               FILE *fp;
+               const char *sfx;
+
+               sfx = strrchr(keyfiles[i], '.');
+               if (sfx && strcmp(sfx, ".pub") == 0) {
+                       fp = fopen(keyfiles[i], "r");
+                       if (fp == NULL) {
+                               fprintf(stderr, "hammer2 rsaenc: unable to "
+                                               "open %s\n", keyfiles[i]);
+                               ecode = 1;
+                               goto done;
+                       }
+                       keys[i] = PEM_read_RSA_PUBKEY(fp, NULL, NULL, NULL);
+                       ispub[i] = 1;
+                       fclose(fp);
+                       if (keys[i] == NULL) {
+                               fprintf(stderr, "hammer2 rsaenc: unable to "
+                                               "parse public key from %s\n",
+                                               keyfiles[i]);
+                               ecode = 1;
+                               goto done;
+                       }
+               } else if (sfx && strcmp(sfx, ".prv") == 0) {
+                       fp = fopen(keyfiles[i], "r");
+                       if (fp == NULL) {
+                               fprintf(stderr, "hammer2 rsaenc: unable to "
+                                               "open %s\n", keyfiles[i]);
+                               ecode = 1;
+                               goto done;
+                       }
+                       keys[i] = PEM_read_RSAPrivateKey(fp, NULL, NULL, NULL);
+                       fclose(fp);
+                       if (keys[i] == NULL) {
+                               fprintf(stderr, "hammer2 rsaenc: unable to "
+                                               "parse private key from %s\n",
+                                               keyfiles[i]);
+                               ecode = 1;
+                               goto done;
+                       }
+               } else {
+                       fprintf(stderr, "hammer2: rsaenc: key files must end "
+                                       "in .pub or .prv\n");
+                       ecode = 1;
+                       goto done;
+               }
+               if (i == 0)
+                       blksize = RSA_size(keys[i]);
+               else
+                       assert(blksize == RSA_size(keys[i]));
+       }
+       fprintf(stderr, "blksize %d\n", blksize);
+
+       /*
+        *
+        */
+       data_in = malloc(blksize);
+       data_out = malloc(blksize);
+       off = 0;
+       while ((n = read(0, data_in + off, blksize - off)) > 0) {
+               off += n;
+               if (off == blksize) {
+                       for (i = 0; i < nkeys; ++i) {
+                               if (ispub[i])
+                                       RSA_public_encrypt(blksize,
+                                                          data_in, data_out,
+                                                          keys[i],
+                                                          RSA_NO_PADDING);
+                               else
+                                       RSA_private_encrypt(blksize,
+                                                          data_in, data_out,
+                                                          keys[i],
+                                                          RSA_NO_PADDING);
+                               if (i + 1 != nkeys)
+                                       bcopy(data_out, data_in, blksize);
+                       }
+                       if (write(1, data_out, blksize) != blksize) {
+                               perror("write");
+                               ecode = 1;
+                               break;
+                       }
+                       off = 0;
+               }
+       }
+       if (off && ecode == 0) {
+               if (off < blksize)
+                       bzero(data_in + off, blksize - off);
+               for (i = 0; i < nkeys; ++i) {
+                       if (ispub[i])
+                               RSA_public_encrypt(blksize,
+                                                  data_in, data_out,
+                                                  keys[i],
+                                                  RSA_NO_PADDING);
+                       else
+                               RSA_private_encrypt(blksize,
+                                                  data_in, data_out,
+                                                  keys[i],
+                                                  RSA_NO_PADDING);
+                       if (i + 1 != nkeys)
+                               bcopy(data_out, data_in, blksize);
+               }
+               if (write(1, data_out, blksize) != blksize) {
+                       perror("write");
+                       ecode = 1;
+               }
+       }
+       if (n < 0) {
+               perror("read");
+               ecode = 1;
+       }
+       free(data_out);
+       free(data_in);
+done:
+       for (i = 0; i < nkeys; ++i) {
+               if (keys[i])
+                       RSA_free(keys[i]);
+       }
+       free(keys);
+       free(ispub);
+       return (ecode);
+}
+
+int
+cmd_rsadec(const char **keyfiles, int nkeys)
+{
+       RSA **keys = calloc(nkeys, sizeof(RSA *));
+       int *ispub = calloc(nkeys, sizeof(int));
+       int ecode = 0;
+       int blksize = 0;
+       int i;
+       int off;
+       int n;
+       unsigned char *data_in;
+       unsigned char *data_out;
+
+       for (i = 0; i < nkeys; ++i) {
+               FILE *fp;
+               const char *sfx;
+
+               sfx = strrchr(keyfiles[i], '.');
+               if (sfx && strcmp(sfx, ".pub") == 0) {
+                       fp = fopen(keyfiles[i], "r");
+                       if (fp == NULL) {
+                               fprintf(stderr, "hammer2 rsaenc: unable to "
+                                               "open %s\n", keyfiles[i]);
+                               ecode = 1;
+                               goto done;
+                       }
+                       keys[i] = PEM_read_RSA_PUBKEY(fp, NULL, NULL, NULL);
+                       ispub[i] = 1;
+                       fclose(fp);
+                       if (keys[i] == NULL) {
+                               fprintf(stderr, "hammer2 rsaenc: unable to "
+                                               "parse public key from %s\n",
+                                               keyfiles[i]);
+                               ecode = 1;
+                               goto done;
+                       }
+               } else if (sfx && strcmp(sfx, ".prv") == 0) {
+                       fp = fopen(keyfiles[i], "r");
+                       if (fp == NULL) {
+                               fprintf(stderr, "hammer2 rsaenc: unable to "
+                                               "open %s\n", keyfiles[i]);
+                               ecode = 1;
+                               goto done;
+                       }
+                       keys[i] = PEM_read_RSAPrivateKey(fp, NULL, NULL, NULL);
+                       fclose(fp);
+                       if (keys[i] == NULL) {
+                               fprintf(stderr, "hammer2 rsaenc: unable to "
+                                               "parse private key from %s\n",
+                                               keyfiles[i]);
+                               ecode = 1;
+                               goto done;
+                       }
+               } else {
+                       fprintf(stderr, "hammer2: rsaenc: key files must end "
+                                       "in .pub or .prv\n");
+                       ecode = 1;
+                       goto done;
+               }
+               if (i == 0)
+                       blksize = RSA_size(keys[i]);
+               else
+                       assert(blksize == RSA_size(keys[i]));
+       }
+
+       /*
+        *
+        */
+       data_in = malloc(blksize);
+       data_out = malloc(blksize);
+       off = 0;
+       while ((n = read(0, data_in + off, blksize - off)) > 0) {
+               off += n;
+               if (off == blksize) {
+                       for (i = 0; i < nkeys; ++i) {
+                               if (ispub[i])
+                                       RSA_public_decrypt(blksize,
+                                                          data_in, data_out,
+                                                          keys[i],
+                                                          RSA_NO_PADDING);
+                               else
+                                       RSA_private_decrypt(blksize,
+                                                          data_in, data_out,
+                                                          keys[i],
+                                                          RSA_NO_PADDING);
+                               if (i + 1 != nkeys)
+                                       bcopy(data_out, data_in, blksize);
+                       }
+                       if (write(1, data_out, blksize) != blksize) {
+                               perror("write");
+                               ecode = 1;
+                               break;
+                       }
+                       off = 0;
+               }
+       }
+       if (off) {
+               if (off < blksize)
+                       bzero(data_in + off, blksize - off);
+               for (i = 0; i < nkeys; ++i) {
+                       if (ispub[i])
+                               RSA_public_decrypt(blksize,
+                                                  data_in, data_out,
+                                                  keys[i],
+                                                  RSA_NO_PADDING);
+                       else
+                               RSA_private_decrypt(blksize,
+                                                  data_in, data_out,
+                                                  keys[i],
+                                                  RSA_NO_PADDING);
+                       if (i + 1 != nkeys)
+                               bcopy(data_out, data_in, blksize);
+               }
+               if (write(1, data_out, blksize) != blksize) {
+                       perror("write");
+                       ecode = 1;
+               }
+       }
+       if (n < 0) {
+               perror("read");
+               ecode = 1;
+       }
+       free(data_out);
+       free(data_in);
+done:
+       for (i = 0; i < nkeys; ++i) {
+               if (keys[i])
+                       RSA_free(keys[i]);
+       }
+       free(keys);
+       free(ispub);
+       return (ecode);
+}
diff --git a/sbin/hammer2/cmd_service.c b/sbin/hammer2/cmd_service.c
new file mode 100644 (file)
index 0000000..7564773
--- /dev/null
@@ -0,0 +1,349 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "hammer2.h"
+
+static void *master_accept(void *data);
+static void master_auth_signal(hammer2_router_t *router);
+static void master_auth_rxmsg(hammer2_msg_t *msg);
+static void master_link_signal(hammer2_router_t *router);
+static void master_link_rxmsg(hammer2_msg_t *msg);
+static void master_reconnect(const char *mntpt);
+
+/*
+ * Start-up the master listener daemon for the machine.
+ *
+ * The master listener serves as a rendezvous point in the cluster, accepting
+ * connections, performing registrations and authentications, maintaining
+ * the spanning tree, and keeping track of message state so disconnects can
+ * be handled properly.
+ *
+ * Once authenticated only low-level messaging protocols (which includes
+ * tracking persistent messages) are handled by this daemon.  This daemon
+ * does not run the higher level quorum or locking protocols.
+ *
+ * This daemon can also be told to maintain connections to other nodes,
+ * forming a messaging backbone, which in turn allows PFS's (if desired) to
+ * simply connect to the master daemon via localhost if desired.
+ * Backbones are specified via /etc/hammer2.conf.
+ */
+int
+cmd_service(void)
+{
+       struct sockaddr_in lsin;
+       int on;
+       int lfd;
+
+       /*
+        * Acquire socket and set options
+        */
+       if ((lfd = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
+               fprintf(stderr, "master_listen: socket(): %s\n",
+                       strerror(errno));
+               return 1;
+       }
+       on = 1;
+       setsockopt(lfd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on));
+
+       /*
+        * Setup listen port and try to bind.  If the bind fails we assume
+        * that a master listener process is already running and silently
+        * fail.
+        */
+       bzero(&lsin, sizeof(lsin));
+       lsin.sin_family = AF_INET;
+       lsin.sin_addr.s_addr = INADDR_ANY;
+       lsin.sin_port = htons(HAMMER2_LISTEN_PORT);
+       if (bind(lfd, (struct sockaddr *)&lsin, sizeof(lsin)) < 0) {
+               close(lfd);
+               if (QuietOpt == 0) {
+                       fprintf(stderr,
+                               "master listen: daemon already running\n");
+               }
+               return 0;
+       }
+       if (QuietOpt == 0)
+               fprintf(stderr, "master listen: startup\n");
+       listen(lfd, 50);
+
+       /*
+        * Fork and disconnect the controlling terminal and parent process,
+        * executing the specified function as a pthread.
+        *
+        * Returns to the original process which can then continue running.
+        * In debug mode this call will create the pthread without forking
+        * and set NormalExit to 0, instead of fork.
+        */
+       hammer2_demon(master_accept, (void *)(intptr_t)lfd);
+       if (NormalExit)
+               close(lfd);
+       return 0;
+}
+
+/*
+ * Master listen/accept thread.  Accept connections on the master socket,
+ * starting a pthread for each one.
+ */
+static
+void *
+master_accept(void *data)
+{
+       struct sockaddr_in asin;
+       socklen_t alen;
+       pthread_t thread;
+       hammer2_master_service_info_t *info;
+       int lfd = (int)(intptr_t)data;
+       int fd;
+       int i;
+       int count;
+       struct statfs *mntbuf = NULL;
+       struct statvfs *mntvbuf = NULL;
+
+       /*
+        * Nobody waits for us
+        */
+       setproctitle("hammer2 master listen");
+       pthread_detach(pthread_self());
+
+       /*
+        * Scan existing hammer2 mounts and reconnect to them using
+        * HAMMER2IOC_RECLUSTER.
+        */
+       count = getmntvinfo(&mntbuf, &mntvbuf, MNT_NOWAIT);
+       for (i = 0; i < count; ++i) {
+               if (strcmp(mntbuf[i].f_fstypename, "hammer2") == 0)
+                       master_reconnect(mntbuf[i].f_mntonname);
+       }
+
+       /*
+        * Accept connections and create pthreads to handle them after
+        * validating the IP.
+        */
+       for (;;) {
+               alen = sizeof(asin);
+               fd = accept(lfd, (struct sockaddr *)&asin, &alen);
+               if (fd < 0) {
+                       if (errno == EINTR)
+                               continue;
+                       break;
+               }
+               thread = NULL;
+               fprintf(stderr, "master_accept: accept fd %d\n", fd);
+               info = malloc(sizeof(*info));
+               bzero(info, sizeof(*info));
+               info->fd = fd;
+               info->detachme = 1;
+               pthread_create(&thread, NULL, master_service, info);
+       }
+       return (NULL);
+}
+
+/*
+ * Normally the mount program supplies a cluster communications
+ * descriptor to the hammer2 vfs on mount, but if you kill the service
+ * daemon and restart it that link will be lost.
+ *
+ * This procedure attempts to [re]connect to existing mounts when
+ * the service daemon is started up before going into its accept
+ * loop.
+ */
+static
+void
+master_reconnect(const char *mntpt)
+{
+       struct hammer2_ioc_recluster recls;
+       hammer2_master_service_info_t *info;
+       pthread_t thread;
+       int fd;
+       int pipefds[2];
+
+       fd = open(mntpt, O_RDONLY);
+       if (fd < 0) {
+               fprintf(stderr, "reconnect %s: no access to mount\n", mntpt);
+               return;
+       }
+       if (pipe(pipefds) < 0) {
+               fprintf(stderr, "reconnect %s: pipe() failed\n", mntpt);
+               return;
+       }
+       bzero(&recls, sizeof(recls));
+       recls.fd = pipefds[0];
+       if (ioctl(fd, HAMMER2IOC_RECLUSTER, &recls) < 0) {
+               fprintf(stderr, "reconnect %s: ioctl failed\n", mntpt);
+               close(pipefds[0]);
+               close(pipefds[1]);
+               close(fd);
+               return;
+       }
+       close(pipefds[0]);
+
+       info = malloc(sizeof(*info));
+       bzero(info, sizeof(*info));
+       info->fd = pipefds[1];
+       info->detachme = 1;
+       pthread_create(&thread, NULL, master_service, info);
+}
+
+/*
+ * Service an accepted connection (runs as a pthread)
+ *
+ * (also called from a couple of other places)
+ */
+void *
+master_service(void *data)
+{
+       hammer2_master_service_info_t *info = data;
+       hammer2_iocom_t iocom;
+
+       if (info->detachme)
+               pthread_detach(pthread_self());
+
+       hammer2_iocom_init(&iocom, info->fd, -1,
+                          master_auth_signal,
+                          master_auth_rxmsg,
+                          NULL);
+       hammer2_iocom_core(&iocom);
+
+       fprintf(stderr,
+               "iocom on fd %d terminated error rx=%d, tx=%d\n",
+               info->fd, iocom.ioq_rx.error, iocom.ioq_tx.error);
+       close(info->fd);
+       info->fd = -1;  /* safety */
+       free(info);
+
+       return (NULL);
+}
+
+/************************************************************************
+ *                         AUTHENTICATION                              *
+ ************************************************************************
+ *
+ * Callback via hammer2_iocom_core().
+ *
+ * Additional messaging-based authentication must occur before normal
+ * message operation.  The connection has already been encrypted at
+ * this point.
+ */
+static void master_auth_conn_rx(hammer2_msg_t *msg);
+
+static
+void
+master_auth_signal(hammer2_router_t *router)
+{
+       hammer2_msg_t *msg;
+
+       /*
+        * Transmit LNK_CONN, enabling the SPAN protocol if both sides
+        * agree.
+        *
+        * XXX put additional authentication states here
+        */
+       msg = hammer2_msg_alloc(router, 0, HAMMER2_LNK_CONN |
+                                                  HAMMER2_MSGF_CREATE,
+                               master_auth_conn_rx, NULL);
+       snprintf(msg->any.lnk_conn.label, sizeof(msg->any.lnk_conn.label), "*");
+       hammer2_msg_write(msg);
+
+       hammer2_router_restate(router,
+                             master_link_signal,
+                             master_link_rxmsg,
+                             NULL);
+}
+
+static
+void
+master_auth_conn_rx(hammer2_msg_t *msg)
+{
+       if (msg->any.head.cmd & HAMMER2_MSGF_DELETE)
+               hammer2_msg_reply(msg, 0);
+}
+
+static
+void
+master_auth_rxmsg(hammer2_msg_t *msg __unused)
+{
+}
+
+/************************************************************************
+ *                     POST-AUTHENTICATION SERVICE MSGS                *
+ ************************************************************************
+ *
+ * Callback via hammer2_iocom_core().
+ */
+static
+void
+master_link_signal(hammer2_router_t *router)
+{
+       hammer2_msg_lnk_signal(router);
+}
+
+static
+void
+master_link_rxmsg(hammer2_msg_t *msg)
+{
+       hammer2_state_t *state;
+       uint32_t cmd;
+
+       /*
+        * If the message state has a function established we just
+        * call the function, otherwise we call the appropriate
+        * link-level protocol related to the original command and
+        * let it sort it out.
+        *
+        * Non-transactional one-off messages, on the otherhand,
+        * might have REPLY set.
+        */
+       state = msg->state;
+       cmd = state ? state->msg->any.head.cmd : msg->any.head.cmd;
+
+       fprintf(stderr, "service-receive: %s\n", hammer2_msg_str(msg));
+
+       if (state && state->func) {
+               assert(state->func != NULL);
+               state->func(msg);
+       } else {
+               switch(cmd & HAMMER2_MSGF_PROTOS) {
+               case HAMMER2_MSG_PROTO_LNK:
+                       hammer2_msg_lnk(msg);
+                       break;
+               case HAMMER2_MSG_PROTO_DBG:
+                       hammer2_msg_dbg(msg);
+                       break;
+               default:
+                       hammer2_msg_reply(msg, HAMMER2_MSG_ERR_NOSUPP);
+                       break;
+               }
+       }
+}
diff --git a/sbin/hammer2/cmd_snapshot.c b/sbin/hammer2/cmd_snapshot.c
new file mode 100644 (file)
index 0000000..2d46b9e
--- /dev/null
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "hammer2.h"
+
+/*
+ * The snapshot is named <PFSNAME>_<YYYYMMDD.HHMMSS.TRANSID> unless
+ * overridden by a label.
+ *
+ * When local non-cache media is involved the media is
+ * first synchronized and the snapshot is then based on
+ * the media.
+ *
+ * If the media is remote the snapshot is created on the remote
+ * end (if you have sufficient administrative rights) and a local
+ * ADMIN or CACHE PFS is created with a connection to the snapshot
+ * on the remote.
+ *
+ * If the client has snapshot rights to multiple remotes then TBD.
+ */
diff --git a/sbin/hammer2/cmd_stat.c b/sbin/hammer2/cmd_stat.c
new file mode 100644 (file)
index 0000000..53e03ce
--- /dev/null
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "hammer2.h"
+
+/*
+ * Should be run as root.  Creates /etc/hammer2/rsa.{pub,prv} using
+ * an openssl command.
+ */
+int
+cmd_stat(int ac, const char **av)
+{
+       hammer2_ioc_inode_t ino;
+       const char *cdir = ".";
+       int ec = 0;
+       int w;
+       int i;
+       int fd;
+
+       if (ac == 0) {
+               ac = 1;
+               av = &cdir;
+       }
+       for (i = w = 0; i < ac; ++i) {
+               if (w < (int)strlen(av[i]))
+                       w = (int)strlen(av[i]);
+       }
+       if (w < 16)
+               w = 16;
+       printf("%-*.*s ncp data-use  inode-use kaddr\n", w, w, "PATH");
+       for (i = 0; i < ac; ++i) {
+               if ((fd = open(av[i], O_RDONLY)) < 0) {
+                       fprintf(stderr, "%s: %s\n", av[i], strerror(errno));
+                       ec = 1;
+                       continue;
+               }
+               if (ioctl(fd, HAMMER2IOC_INODE_GET, &ino) < 0) {
+                       fprintf(stderr, "%s: %s\n", av[i], strerror(errno));
+                       ec = 1;
+                       continue;
+               }
+               printf("%-*.*s ", w, w, av[i]);
+               printf("%3d ", ino.ip_data.ncopies);
+               printf("%9s ", sizetostr(ino.ip_data.data_count));
+               printf("%9s ", sizetostr(ino.ip_data.inode_count));
+               printf("%p ", ino.kdata);
+               if (ino.ip_data.data_quota || ino.ip_data.inode_quota) {
+                       printf(" quota ");
+                       printf("%12s", sizetostr(ino.ip_data.data_quota));
+                       printf("/%-12s", sizetostr(ino.ip_data.inode_quota));
+               }
+               printf("\n");
+       }
+       return ec;
+}
diff --git a/sbin/hammer2/crypto.c b/sbin/hammer2/crypto.c
new file mode 100644 (file)
index 0000000..fd5ac6f
--- /dev/null
@@ -0,0 +1,767 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ * by Alex Hornung <alexh@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "hammer2.h"
+#include <sys/endian.h>
+
+/*
+ * Setup crypto for pthreads
+ */
+static pthread_mutex_t *crypto_locks;
+int crypto_count;
+
+static int hammer2_crypto_gcm_init(hammer2_ioq_t *, char *, int, char *, int, int);
+static int hammer2_crypto_gcm_encrypt_chunk(hammer2_ioq_t *, char *, char *, int, int *);
+static int hammer2_crypto_gcm_decrypt_chunk(hammer2_ioq_t *, char *, char *, int, int *);
+
+/*
+ * NOTE: the order of this table needs to match the HAMMER2_CRYPTO_ALGO_*_IDX
+ *       defines in network.h.
+ */
+static struct crypto_algo crypto_algos[] = {
+       {
+               .name      = "aes-256-gcm",
+               .keylen    = HAMMER2_CRYPTO_GCM_KEY_SIZE,
+               .taglen    = HAMMER2_CRYPTO_GCM_TAG_SIZE,
+               .init      = hammer2_crypto_gcm_init,
+               .enc_chunk = hammer2_crypto_gcm_encrypt_chunk,
+               .dec_chunk = hammer2_crypto_gcm_decrypt_chunk
+       },
+       { NULL, 0, 0, NULL, NULL, NULL }
+};
+
+static
+unsigned long
+hammer2_crypto_id_callback(void)
+{
+       return ((unsigned long)(uintptr_t)pthread_self());
+}
+
+static
+void
+hammer2_crypto_locking_callback(int mode, int type,
+                               const char *file __unused, int line __unused)
+{
+       assert(type >= 0 && type < crypto_count);
+       if (mode & CRYPTO_LOCK) {
+               pthread_mutex_lock(&crypto_locks[type]);
+       } else {
+               pthread_mutex_unlock(&crypto_locks[type]);
+       }
+}
+
+void
+hammer2_crypto_setup(void)
+{
+       crypto_count = CRYPTO_num_locks();
+       crypto_locks = calloc(crypto_count, sizeof(crypto_locks[0]));
+       CRYPTO_set_id_callback(hammer2_crypto_id_callback);
+       CRYPTO_set_locking_callback(hammer2_crypto_locking_callback);
+}
+
+static
+int
+hammer2_crypto_gcm_init(hammer2_ioq_t *ioq, char *key, int klen,
+                       char *iv_fixed, int ivlen, int enc)
+{
+       int i, ok;
+
+       if (klen < HAMMER2_CRYPTO_GCM_KEY_SIZE ||
+           ivlen < HAMMER2_CRYPTO_GCM_IV_FIXED_SIZE) {
+               if (DebugOpt)
+                       fprintf(stderr, "Not enough key or iv material\n");
+               return -1;
+       }
+
+       printf("%s key: ", enc ? "Encryption" : "Decryption");
+       for (i = 0; i < HAMMER2_CRYPTO_GCM_KEY_SIZE; ++i)
+               printf("%02x", (unsigned char)key[i]);
+       printf("\n");
+
+       printf("%s iv:  ", enc ? "Encryption" : "Decryption");
+       for (i = 0; i < HAMMER2_CRYPTO_GCM_IV_FIXED_SIZE; ++i)
+               printf("%02x", (unsigned char)iv_fixed[i]);
+       printf(" (fixed part only)\n");
+
+       EVP_CIPHER_CTX_init(&ioq->ctx);
+
+       if (enc)
+               ok = EVP_EncryptInit_ex(&ioq->ctx, EVP_aes_256_gcm(), NULL,
+                                       key, NULL);
+       else
+               ok = EVP_DecryptInit_ex(&ioq->ctx, EVP_aes_256_gcm(), NULL,
+                                       key, NULL);
+       if (!ok)
+               goto fail;
+
+       /*
+        * According to the original Galois/Counter Mode of Operation (GCM)
+        * proposal, only IVs that are exactly 96 bits get used without any
+        * further processing. Other IV sizes cause the GHASH() operation
+        * to be applied to the IV, which is more costly.
+        *
+        * The NIST SP 800-38D also recommends using a 96 bit IV for the same
+        * reasons. We actually follow the deterministic construction
+        * recommended in NIST SP 800-38D with a 64 bit invocation field as an
+        * integer counter and a random, session-specific fixed field.
+        *
+        * This means that we can essentially use the same session key and
+        * IV fixed field for up to 2^64 invocations of the authenticated
+        * encryption or decryption.
+        *
+        * With a chunk size of 64 bytes, this adds up to 1 zettabyte of
+        * traffic.
+        */
+       ok = EVP_CIPHER_CTX_ctrl(&ioq->ctx, EVP_CTRL_GCM_SET_IVLEN,
+                                HAMMER2_CRYPTO_GCM_IV_SIZE, NULL);
+       if (!ok)
+               goto fail;
+
+       memset(ioq->iv, 0, HAMMER2_CRYPTO_GCM_IV_SIZE);
+       memcpy(ioq->iv, iv_fixed, HAMMER2_CRYPTO_GCM_IV_FIXED_SIZE);
+
+       /*
+        * Strictly speaking, padding is irrelevant with a counter mode
+        * encryption.
+        *
+        * However, setting padding to 0, even if using a counter mode such
+        * as GCM, will cause an error in _finish if the pt/ct size is not
+        * a multiple of the cipher block size.
+        */
+       EVP_CIPHER_CTX_set_padding(&ioq->ctx, 0);
+
+       return 0;
+
+fail:
+       if (DebugOpt)
+               fprintf(stderr, "Error during _gcm_init\n");
+       return -1;
+}
+
+static
+int
+_gcm_iv_increment(char *iv)
+{
+       /*
+        * Deterministic construction according to NIST SP 800-38D, with
+        * 64 bit invocation field as integer counter.
+        *
+        * In other words, our 96 bit IV consists of a 32 bit fixed field
+        * unique to the session and a 64 bit integer counter.
+        */
+
+       uint64_t *c = (uint64_t *)(&iv[HAMMER2_CRYPTO_GCM_IV_FIXED_SIZE]);
+
+       /* Increment invocation field integer counter */
+       *c = htobe64(be64toh(*c)+1);
+
+       /*
+        * Detect wrap-around, which means it is time to renegotiate
+        * the session to get a new key and/or fixed field.
+        */
+       return (c == 0) ? 0 : 1;
+}
+
+static
+int
+hammer2_crypto_gcm_encrypt_chunk(hammer2_ioq_t *ioq, char *ct, char *pt,
+                                int in_size, int *out_size)
+{
+       int ok;
+       int u_len, f_len;
+
+       *out_size = 0;
+
+       /* Re-initialize with new IV (but without redoing the key schedule) */
+       ok = EVP_EncryptInit_ex(&ioq->ctx, NULL, NULL, NULL, ioq->iv);
+       if (!ok)
+               goto fail;
+
+       ok = EVP_EncryptUpdate(&ioq->ctx, ct, &u_len, pt, in_size);
+       if (!ok)
+               goto fail;
+
+       ok = EVP_EncryptFinal(&ioq->ctx, ct + u_len, &f_len);
+       if (!ok)
+               goto fail;
+
+       /* Retrieve auth tag */
+       ok = EVP_CIPHER_CTX_ctrl(&ioq->ctx, EVP_CTRL_GCM_GET_TAG,
+                                HAMMER2_CRYPTO_GCM_TAG_SIZE,
+                                ct + u_len + f_len);
+       if (!ok)
+               goto fail;
+
+       ok = _gcm_iv_increment(ioq->iv);
+       if (!ok) {
+               ioq->error = HAMMER2_IOQ_ERROR_IVWRAP;
+               goto fail_out;
+       }
+
+       *out_size = u_len + f_len + HAMMER2_CRYPTO_GCM_TAG_SIZE;
+
+       return 0;
+
+fail:
+       ioq->error = HAMMER2_IOQ_ERROR_ALGO;
+fail_out:
+       if (DebugOpt)
+               fprintf(stderr, "error during encrypt_chunk\n");
+       return -1;
+}
+
+static
+int
+hammer2_crypto_gcm_decrypt_chunk(hammer2_ioq_t *ioq, char *ct, char *pt,
+                                int out_size, int *consume_size)
+{
+       int ok;
+       int u_len, f_len;
+
+       *consume_size = 0;
+
+       /* Re-initialize with new IV (but without redoing the key schedule) */
+       ok = EVP_DecryptInit_ex(&ioq->ctx, NULL, NULL, NULL, ioq->iv);
+       if (!ok) {
+               ioq->error = HAMMER2_IOQ_ERROR_ALGO;
+               goto fail_out;
+       }
+
+       ok = EVP_CIPHER_CTX_ctrl(&ioq->ctx, EVP_CTRL_GCM_SET_TAG,
+                                HAMMER2_CRYPTO_GCM_TAG_SIZE,
+                                ct + out_size);
+       if (!ok) {
+               ioq->error = HAMMER2_IOQ_ERROR_ALGO;
+               goto fail_out;
+       }
+
+       ok = EVP_DecryptUpdate(&ioq->ctx, pt, &u_len, ct, out_size);
+       if (!ok)
+               goto fail;
+
+       ok = EVP_DecryptFinal(&ioq->ctx, pt + u_len, &f_len);
+       if (!ok)
+               goto fail;
+
+       ok = _gcm_iv_increment(ioq->iv);
+       if (!ok) {
+               ioq->error = HAMMER2_IOQ_ERROR_IVWRAP;
+               goto fail_out;
+       }
+
+       *consume_size = u_len + f_len + HAMMER2_CRYPTO_GCM_TAG_SIZE;
+
+       return 0;
+
+fail:
+       ioq->error = HAMMER2_IOQ_ERROR_MACFAIL;
+fail_out:
+       if (DebugOpt)
+               fprintf(stderr, "error during decrypt_chunk (likely authentication error)\n");
+       return -1;
+}
+
+/*
+ * Synchronously negotiate crypto for a new session.  This must occur
+ * within 10 seconds or the connection is error'd out.
+ *
+ * We work off the IP address and/or reverse DNS.  The IP address is
+ * checked first, followed by the IP address at various levels of granularity,
+ * followed by the full domain name and domain names at various levels of
+ * granularity.
+ *
+ *     /etc/hammer2/remote/<name>.pub  - Contains a public key
+ *     /etc/hammer2/remote/<name>.none - Indicates no encryption (empty file)
+ *                                       (e.g. localhost.none).
+ *
+ * We first attempt to locate a public key file based on the peer address or
+ * peer FQDN.
+ *
+ *     <name>.none     - No further negotiation is needed.  We simply return.
+ *                       All communication proceeds without encryption.
+ *                       No public key handshake occurs in this situation.
+ *                       (both ends must match).
+ *
+ *     <name>.pub      - We have located the public key for the peer.  Both
+ *                       sides transmit a block encrypted with their private
+ *                       keys and the peer's public key.
+ *
+ *                       Both sides receive a block and decrypt it.
+ *
+ *                       Both sides formulate a reply using the decrypted
+ *                       block and transmit it.
+ *
+ *                       communication proceeds with the negotiated session
+ *                       key (typically AES-256-CBC).
+ *
+ * If we fail to locate the appropriate file and no floating.db exists the
+ * connection is terminated without further action.
+ *
+ * If floating.db exists the connection proceeds with a floating negotiation.
+ */
+typedef union {
+       struct sockaddr sa;
+       struct sockaddr_in sa_in;
+       struct sockaddr_in6 sa_in6;
+} sockaddr_any_t;
+
+void
+hammer2_crypto_negotiate(hammer2_iocom_t *iocom)
+{
+       sockaddr_any_t sa;
+       socklen_t salen = sizeof(sa);
+       char peername[128];
+       char realname[128];
+       hammer2_handshake_t handtx;
+       hammer2_handshake_t handrx;
+       char buf1[sizeof(handtx)];
+       char buf2[sizeof(handtx)];
+       char *ptr;
+       char *path;
+       struct stat st;
+       FILE *fp;
+       RSA *keys[3] = { NULL, NULL, NULL };
+       size_t i;
+       size_t blksize;
+       size_t blkmask;
+       ssize_t n;
+       int fd;
+       int error;
+
+       /*
+        * Get the peer IP address for the connection as a string.
+        */
+       if (getpeername(iocom->sock_fd, &sa.sa, &salen) < 0) {
+               iocom->ioq_rx.error = HAMMER2_IOQ_ERROR_NOPEER;
+               iocom->flags |= HAMMER2_IOCOMF_EOF;
+               if (DebugOpt)
+                       fprintf(stderr, "accept: getpeername() failed\n");
+               goto done;
+       }
+       if (getnameinfo(&sa.sa, salen, peername, sizeof(peername),
+                       NULL, 0, NI_NUMERICHOST) < 0) {
+               iocom->ioq_rx.error = HAMMER2_IOQ_ERROR_NOPEER;
+               iocom->flags |= HAMMER2_IOCOMF_EOF;
+               if (DebugOpt)
+                       fprintf(stderr, "accept: cannot decode sockaddr\n");
+               goto done;
+       }
+       if (DebugOpt) {
+               if (realhostname_sa(realname, sizeof(realname),
+                                   &sa.sa, salen) == HOSTNAME_FOUND) {
+                       fprintf(stderr, "accept from %s (%s)\n",
+                               peername, realname);
+               } else {
+                       fprintf(stderr, "accept from %s\n", peername);
+               }
+       }
+
+       /*
+        * Find the remote host's public key
+        *
+        * If the link is not to be encrypted (<ip>.none located) we shortcut
+        * the handshake entirely.  No buffers are exchanged.
+        */
+       asprintf(&path, "%s/%s.pub", HAMMER2_PATH_REMOTE, peername);
+       if ((fp = fopen(path, "r")) == NULL) {
+               free(path);
+               asprintf(&path, "%s/%s.none",
+                        HAMMER2_PATH_REMOTE, peername);
+               if (stat(path, &st) < 0) {
+                       iocom->ioq_rx.error = HAMMER2_IOQ_ERROR_NORKEY;
+                       iocom->flags |= HAMMER2_IOCOMF_EOF;
+                       if (DebugOpt)
+                               fprintf(stderr, "auth failure: unknown host\n");
+                       goto done;
+               }
+               if (DebugOpt)
+                       fprintf(stderr, "auth succeeded, unencrypted link\n");
+               goto done;
+       }
+       if (fp) {
+               keys[0] = PEM_read_RSA_PUBKEY(fp, NULL, NULL, NULL);
+               fclose(fp);
+               if (keys[0] == NULL) {
+                       iocom->ioq_rx.error = HAMMER2_IOQ_ERROR_KEYFMT;
+                       iocom->flags |= HAMMER2_IOCOMF_EOF;
+                       if (DebugOpt)
+                               fprintf(stderr,
+                                       "auth failure: bad key format\n");
+                       goto done;
+               }
+       }
+
+       /*
+        * Get our public and private keys
+        */
+       free(path);
+       asprintf(&path, HAMMER2_DEFAULT_DIR "/rsa.pub");
+       if ((fp = fopen(path, "r")) == NULL) {
+               iocom->ioq_rx.error = HAMMER2_IOQ_ERROR_NOLKEY;
+               iocom->flags |= HAMMER2_IOCOMF_EOF;
+               goto done;
+       }
+       keys[1] = PEM_read_RSA_PUBKEY(fp, NULL, NULL, NULL);
+       fclose(fp);
+       if (keys[1] == NULL) {
+               iocom->ioq_rx.error = HAMMER2_IOQ_ERROR_KEYFMT;
+               iocom->flags |= HAMMER2_IOCOMF_EOF;
+               if (DebugOpt)
+                       fprintf(stderr, "auth failure: bad host key format\n");
+               goto done;
+       }
+
+       free(path);
+       asprintf(&path, HAMMER2_DEFAULT_DIR "/rsa.prv");
+       if ((fp = fopen(path, "r")) == NULL) {
+               iocom->ioq_rx.error = HAMMER2_IOQ_ERROR_NOLKEY;
+               iocom->flags |= HAMMER2_IOCOMF_EOF;
+               if (DebugOpt)
+                       fprintf(stderr, "auth failure: bad host key format\n");
+               goto done;
+       }
+       keys[2] = PEM_read_RSAPrivateKey(fp, NULL, NULL, NULL);
+       fclose(fp);
+       if (keys[2] == NULL) {
+               iocom->ioq_rx.error = HAMMER2_IOQ_ERROR_KEYFMT;
+               iocom->flags |= HAMMER2_IOCOMF_EOF;
+               if (DebugOpt)
+                       fprintf(stderr, "auth failure: bad host key format\n");
+               goto done;
+       }
+       free(path);
+       path = NULL;
+
+       /*
+        * public key encrypt/decrypt block size.
+        */
+       if (keys[0]) {
+               blksize = (size_t)RSA_size(keys[0]);
+               if (blksize != (size_t)RSA_size(keys[1]) ||
+                   blksize != (size_t)RSA_size(keys[2]) ||
+                   sizeof(handtx) % blksize != 0) {
+                       iocom->ioq_rx.error = HAMMER2_IOQ_ERROR_KEYFMT;
+                       iocom->flags |= HAMMER2_IOCOMF_EOF;
+                       if (DebugOpt)
+                               fprintf(stderr, "auth failure: "
+                                               "key size mismatch\n");
+                       goto done;
+               }
+       } else {
+               blksize = sizeof(handtx);
+       }
+       blkmask = blksize - 1;
+
+       bzero(&handrx, sizeof(handrx));
+       bzero(&handtx, sizeof(handtx));
+
+       /*
+        * Fill all unused fields (particular all junk fields) with random
+        * data, and also set the session key.
+        */
+       fd = open("/dev/urandom", O_RDONLY);
+       if (fd < 0 ||
+           fstat(fd, &st) < 0 ||       /* something wrong */
+           S_ISREG(st.st_mode) ||      /* supposed to be a RNG dev! */
+           read(fd, &handtx, sizeof(handtx)) != sizeof(handtx)) {
+urandfail:
+               if (fd >= 0)
+                       close(fd);
+               iocom->ioq_rx.error = HAMMER2_IOQ_ERROR_BADURANDOM;
+               iocom->flags |= HAMMER2_IOCOMF_EOF;
+               if (DebugOpt)
+                       fprintf(stderr, "auth failure: bad rng\n");
+               goto done;
+       }
+       if (bcmp(&handrx, &handtx, sizeof(handtx)) == 0)
+               goto urandfail;                 /* read all zeros */
+       close(fd);
+       /* ERR_load_crypto_strings(); openssl debugging */
+
+       /*
+        * Handshake with the remote.
+        *
+        *      Encrypt with my private and remote's public
+        *      Decrypt with my private and remote's public
+        *
+        * When encrypting we have to make sure our buffer fits within the
+        * modulus, which typically requires bit 7 o the first byte to be
+        * zero.  To be safe make sure that bit 7 and bit 6 is zero.
+        */
+       snprintf(handtx.quickmsg, sizeof(handtx.quickmsg), "Testing 1 2 3");
+       handtx.magic = HAMMER2_MSGHDR_MAGIC;
+       handtx.version = 1;
+       handtx.flags = 0;
+       assert(sizeof(handtx.verf) * 4 == sizeof(handtx.sess));
+       bzero(handtx.verf, sizeof(handtx.verf));
+
+       handtx.pad1[0] &= 0x3f; /* message must fit within modulus */
+       handtx.pad2[0] &= 0x3f; /* message must fit within modulus */
+
+       for (i = 0; i < sizeof(handtx.sess); ++i)
+               handtx.verf[i / 4] ^= handtx.sess[i];
+
+       /*
+        * Write handshake buffer to remote
+        */
+       for (i = 0; i < sizeof(handtx); i += blksize) {
+               ptr = (char *)&handtx + i;
+               if (keys[0]) {
+                       /*
+                        * Since we are double-encrypting we have to make
+                        * sure that the result of the first stage does
+                        * not blow out the modulus for the second stage.
+                        *
+                        * The pointer is pointing to the pad*[] area so
+                        * we can mess with that until the first stage
+                        * is legal.
+                        */
+                       do {
+                               ++*(int *)(ptr + 4);
+                               if (RSA_private_encrypt(blksize, ptr, buf1,
+                                           keys[2], RSA_NO_PADDING) < 0) {
+                                       iocom->ioq_rx.error =
+                                               HAMMER2_IOQ_ERROR_KEYXCHGFAIL;
+                               }
+                       } while (buf1[0] & 0xC0);
+
+                       if (RSA_public_encrypt(blksize, buf1, buf2,
+                                           keys[0], RSA_NO_PADDING) < 0) {
+                               iocom->ioq_rx.error =
+                                       HAMMER2_IOQ_ERROR_KEYXCHGFAIL;
+                       }
+               }
+               if (write(iocom->sock_fd, buf2, blksize) != (ssize_t)blksize) {
+                       fprintf(stderr, "WRITE ERROR\n");
+               }
+       }
+       if (iocom->ioq_rx.error) {
+               iocom->flags |= HAMMER2_IOCOMF_EOF;
+               if (DebugOpt)
+                       fprintf(stderr, "auth failure: key exchange failure "
+                                       "during encryption\n");
+               goto done;
+       }
+
+       /*
+        * Read handshake buffer from remote
+        */
+       i = 0;
+       while (i < sizeof(handrx)) {
+               ptr = (char *)&handrx + i;
+               n = read(iocom->sock_fd, ptr, blksize - (i & blkmask));
+               if (n <= 0)
+                       break;
+               ptr -= (i & blkmask);
+               i += n;
+               if (keys[0] && (i & blkmask) == 0) {
+                       if (RSA_private_decrypt(blksize, ptr, buf1,
+                                          keys[2], RSA_NO_PADDING) < 0)
+                               iocom->ioq_rx.error =
+                                               HAMMER2_IOQ_ERROR_KEYXCHGFAIL;
+                       if (RSA_public_decrypt(blksize, buf1, ptr,
+                                          keys[0], RSA_NO_PADDING) < 0)
+                               iocom->ioq_rx.error =
+                                               HAMMER2_IOQ_ERROR_KEYXCHGFAIL;
+               }
+       }
+       if (iocom->ioq_rx.error) {
+               iocom->flags |= HAMMER2_IOCOMF_EOF;
+               if (DebugOpt)
+                       fprintf(stderr, "auth failure: key exchange failure "
+                                       "during decryption\n");
+               goto done;
+       }
+
+       /*
+        * Validate the received data.  Try to make this a constant-time
+        * algorithm.
+        */
+       if (i != sizeof(handrx)) {
+keyxchgfail:
+               iocom->ioq_rx.error = HAMMER2_IOQ_ERROR_KEYXCHGFAIL;
+               iocom->flags |= HAMMER2_IOCOMF_EOF;
+               if (DebugOpt)
+                       fprintf(stderr, "auth failure: key exchange failure\n");
+               goto done;
+       }
+
+       if (handrx.magic == HAMMER2_MSGHDR_MAGIC_REV) {
+               handrx.version = bswap16(handrx.version);
+               handrx.flags = bswap32(handrx.flags);
+       }
+       for (i = 0; i < sizeof(handrx.sess); ++i)
+               handrx.verf[i / 4] ^= handrx.sess[i];
+       n = 0;
+       for (i = 0; i < sizeof(handrx.verf); ++i)
+               n += handrx.verf[i];
+       if (handrx.version != 1)
+               ++n;
+       if (n != 0)
+               goto keyxchgfail;
+
+       /*
+        * Use separate session keys and session fixed IVs for receive and
+        * transmit.
+        */
+       error = crypto_algos[HAMMER2_CRYPTO_ALGO].init(&iocom->ioq_rx, handrx.sess,
+           crypto_algos[HAMMER2_CRYPTO_ALGO].keylen,
+           handrx.sess + crypto_algos[HAMMER2_CRYPTO_ALGO].keylen,
+           sizeof(handrx.sess) - crypto_algos[HAMMER2_CRYPTO_ALGO].keylen,
+           0 /* decryption */);
+       if (error)
+               goto keyxchgfail;
+
+       error = crypto_algos[HAMMER2_CRYPTO_ALGO].init(&iocom->ioq_tx, handtx.sess,
+           crypto_algos[HAMMER2_CRYPTO_ALGO].keylen,
+           handtx.sess + crypto_algos[HAMMER2_CRYPTO_ALGO].keylen,
+           sizeof(handtx.sess) - crypto_algos[HAMMER2_CRYPTO_ALGO].keylen,
+           1 /* encryption */);
+       if (error)
+               goto keyxchgfail;
+
+       iocom->flags |= HAMMER2_IOCOMF_CRYPTED;
+
+       if (DebugOpt)
+               fprintf(stderr, "auth success: %s\n", handrx.quickmsg);
+done:
+       if (path)
+               free(path);
+       if (keys[0])
+               RSA_free(keys[0]);
+       if (keys[1])
+               RSA_free(keys[1]);
+       if (keys[1])
+               RSA_free(keys[2]);
+}
+
+/*
+ * Decrypt pending data in the ioq's fifo.  The data is decrypted in-place.
+ */
+void
+hammer2_crypto_decrypt(hammer2_iocom_t *iocom __unused, hammer2_ioq_t *ioq)
+{
+       int p_len;
+       int used;
+       int error;
+       char buf[512];
+
+       /*
+        * fifo_beg to fifo_cdx is data already decrypted.
+        * fifo_cdn to fifo_end is data not yet decrypted.
+        */
+       p_len = ioq->fifo_end - ioq->fifo_cdn; /* data not yet decrypted */
+
+       if (p_len == 0)
+               return;
+
+       while (p_len >= crypto_algos[HAMMER2_CRYPTO_ALGO].taglen +
+           HAMMER2_CRYPTO_CHUNK_SIZE) {
+               bcopy(ioq->buf + ioq->fifo_cdn, buf,
+                     crypto_algos[HAMMER2_CRYPTO_ALGO].taglen +
+                     HAMMER2_CRYPTO_CHUNK_SIZE);
+               error = crypto_algos[HAMMER2_CRYPTO_ALGO].dec_chunk(
+                   ioq, buf,
+                   ioq->buf + ioq->fifo_cdx,
+                   HAMMER2_CRYPTO_CHUNK_SIZE,
+                   &used);
+#ifdef CRYPTO_DEBUG
+               printf("dec: p_len: %d, used: %d, fifo_cdn: %ju, fifo_cdx: %ju\n",
+                      p_len, used, ioq->fifo_cdn, ioq->fifo_cdx);
+#endif
+               p_len -= used;
+               ioq->fifo_cdn += used;
+               ioq->fifo_cdx += HAMMER2_CRYPTO_CHUNK_SIZE;
+#ifdef CRYPTO_DEBUG
+               printf("dec: p_len: %d, used: %d, fifo_cdn: %ju, fifo_cdx: %ju\n",
+                      p_len, used, ioq->fifo_cdn, ioq->fifo_cdx);
+#endif
+       }
+}
+
+/*
+ * *nactp is set to the number of ORIGINAL bytes consumed by the encrypter.
+ * The FIFO may contain more data.
+ */
+int
+hammer2_crypto_encrypt(hammer2_iocom_t *iocom __unused, hammer2_ioq_t *ioq,
+                      struct iovec *iov, int n, size_t *nactp)
+{
+       int p_len, used, ct_used;
+       int i;
+       int error;
+       size_t nmax;
+
+       nmax = sizeof(ioq->buf) - ioq->fifo_end;        /* max new bytes */
+
+       *nactp = 0;
+       for (i = 0; i < n && nmax; ++i) {
+               used = 0;
+               p_len = iov[i].iov_len;
+               assert((p_len & HAMMER2_MSG_ALIGNMASK) == 0);
+
+               while (p_len >= HAMMER2_CRYPTO_CHUNK_SIZE &&
+                   nmax >= HAMMER2_CRYPTO_CHUNK_SIZE +
+                   (size_t)crypto_algos[HAMMER2_CRYPTO_ALGO].taglen) {
+                       error = crypto_algos[HAMMER2_CRYPTO_ALGO].enc_chunk(
+                           ioq,
+                           ioq->buf + ioq->fifo_cdx,
+                           (char *)iov[i].iov_base + used,
+                           HAMMER2_CRYPTO_CHUNK_SIZE, &ct_used);
+#ifdef CRYPTO_DEBUG
+                       printf("nactp: %ju, p_len: %d, ct_used: %d, used: %d, nmax: %ju\n",
+                              *nactp, p_len, ct_used, used, nmax);
+#endif
+
+                       *nactp += (size_t)HAMMER2_CRYPTO_CHUNK_SIZE;    /* plaintext count */
+                       used += HAMMER2_CRYPTO_CHUNK_SIZE;
+                       p_len -= HAMMER2_CRYPTO_CHUNK_SIZE;
+
+                       ioq->fifo_cdx += (size_t)ct_used;       /* crypted count */
+                       ioq->fifo_cdn += (size_t)ct_used;       /* crypted count */
+                       ioq->fifo_end += (size_t)ct_used;
+                       nmax -= (size_t)ct_used;
+#ifdef CRYPTO_DEBUG
+                       printf("nactp: %ju, p_len: %d, ct_used: %d, used: %d, nmax: %ju\n",
+                              *nactp, p_len, ct_used, used, nmax);
+#endif
+               }
+       }
+       iov[0].iov_base = ioq->buf + ioq->fifo_beg;
+       iov[0].iov_len = ioq->fifo_cdx - ioq->fifo_beg;
+
+       return (1);
+}
diff --git a/sbin/hammer2/hammer2.h b/sbin/hammer2/hammer2.h
new file mode 100644 (file)
index 0000000..ddcbc08
--- /dev/null
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Rollup headers for hammer2 utility
+ */
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <sys/mount.h>
+#include <sys/file.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <sys/wait.h>
+#include <sys/tty.h>
+#include <sys/endian.h>
+
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <arpa/inet.h>
+#include <netdb.h>
+
+#include <vfs/hammer2/hammer2_disk.h>
+#include <vfs/hammer2/hammer2_mount.h>
+#include <vfs/hammer2/hammer2_ioctl.h>
+#include <vfs/hammer2/hammer2_network.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <stddef.h>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <signal.h>
+#include <string.h>
+#include <unistd.h>
+#include <ctype.h>
+#include <uuid.h>
+#include <assert.h>
+#include <pthread.h>
+#include <poll.h>
+
+#include <libutil.h>
+
+#include "network.h"
+
+#define HAMMER2_DEFAULT_DIR    "/etc/hammer2"
+#define HAMMER2_PATH_REMOTE    HAMMER2_DEFAULT_DIR "/remote"
+
+struct hammer2_idmap {
+       struct hammer2_idmap *next;
+       uint32_t ran_beg;       /* inclusive */
+       uint32_t ran_end;       /* inclusive */
+};
+
+typedef struct hammer2_idmap hammer2_idmap_t;
+
+struct hammer2_master_service_info {
+       int     fd;
+       int     detachme;
+};
+
+typedef struct hammer2_master_service_info hammer2_master_service_info_t;
+
+extern int DebugOpt;
+extern int VerboseOpt;
+extern int QuietOpt;
+extern int NormalExit;
+
+/*
+ * Hammer2 command APIs
+ */
+int hammer2_ioctl_handle(const char *sel_path);
+void hammer2_demon(void *(*func)(void *), void *arg);
+
+int cmd_remote_connect(const char *sel_path, const char *url);
+int cmd_remote_disconnect(const char *sel_path, const char *url);
+int cmd_remote_status(const char *sel_path, int all_opt);
+
+int cmd_pfs_getid(const char *sel_path, const char *name, int privateid);
+int cmd_pfs_list(const char *sel_path);
+int cmd_pfs_create(const char *sel_path, const char *name,
+                       uint8_t pfs_type, const char *uuid_str);
+int cmd_pfs_delete(const char *sel_path, const char *name);
+
+int cmd_service(void);
+int cmd_stat(int ac, const char **av);
+int cmd_leaf(const char *sel_path);
+int cmd_shell(const char *hostname);
+int cmd_debugspan(const char *hostname);
+int cmd_show(const char *devpath);
+int cmd_rsainit(const char *dir_path);
+int cmd_rsaenc(const char **keys, int nkeys);
+int cmd_rsadec(const char **keys, int nkeys);
+
+/*
+ * Msg support functions
+ */
+void hammer2_bswap_head(hammer2_msg_hdr_t *head);
+void hammer2_ioq_init(hammer2_iocom_t *iocom, hammer2_ioq_t *ioq);
+void hammer2_ioq_done(hammer2_iocom_t *iocom, hammer2_ioq_t *ioq);
+void hammer2_iocom_init(hammer2_iocom_t *iocom, int sock_fd, int alt_fd,
+                       void (*state_func)(hammer2_router_t *),
+                       void (*rcvmsg_func)(hammer2_msg_t *),
+                       void (*altmsg_func)(hammer2_iocom_t *));
+void hammer2_router_restate(hammer2_router_t *router,
+                       void (*state_func)(hammer2_router_t *),
+                       void (*rcvmsg_func)(hammer2_msg_t *),
+                       void (*altmsg_func)(hammer2_iocom_t *));
+void hammer2_router_signal(hammer2_router_t *router);
+void hammer2_iocom_done(hammer2_iocom_t *iocom);
+hammer2_msg_t *hammer2_msg_alloc(hammer2_router_t *router,
+                       size_t aux_size, uint32_t cmd,
+                       void (*func)(hammer2_msg_t *), void *data);
+void hammer2_msg_reply(hammer2_msg_t *msg, uint32_t error);
+void hammer2_msg_result(hammer2_msg_t *msg, uint32_t error);
+void hammer2_state_reply(hammer2_state_t *state, uint32_t error);
+
+void hammer2_msg_free(hammer2_msg_t *msg);
+
+void hammer2_iocom_core(hammer2_iocom_t *iocom);
+hammer2_msg_t *hammer2_ioq_read(hammer2_iocom_t *iocom);
+void hammer2_msg_write(hammer2_msg_t *msg);
+
+void hammer2_iocom_drain(hammer2_iocom_t *iocom);
+void hammer2_iocom_flush1(hammer2_iocom_t *iocom);
+void hammer2_iocom_flush2(hammer2_iocom_t *iocom);
+
+void hammer2_state_cleanuprx(hammer2_iocom_t *iocom, hammer2_msg_t *msg);
+void hammer2_state_free(hammer2_state_t *state);
+
+hammer2_router_t *hammer2_router_alloc(void);
+void hammer2_router_connect(hammer2_router_t *router);
+void hammer2_router_disconnect(hammer2_router_t **routerp);
+
+/*
+ * Msg protocol functions
+ */
+void hammer2_msg_lnk_signal(hammer2_router_t *router);
+void hammer2_msg_lnk(hammer2_msg_t *msg);
+void hammer2_msg_dbg(hammer2_msg_t *msg);
+
+/*
+ * Crypto functions
+ */
+void hammer2_crypto_setup(void);
+void hammer2_crypto_negotiate(hammer2_iocom_t *iocom);
+void hammer2_crypto_decrypt(hammer2_iocom_t *iocom, hammer2_ioq_t *ioq);
+int hammer2_crypto_encrypt(hammer2_iocom_t *iocom, hammer2_ioq_t *ioq,
+                       struct iovec *iov, int n, size_t *nactp);
+
+/*
+ * Misc functions
+ */
+const char *hammer2_time64_to_str(uint64_t htime64, char **strp);
+const char *hammer2_uuid_to_str(uuid_t *uuid, char **strp);
+const char *hammer2_iptype_to_str(uint8_t type);
+const char *hammer2_pfstype_to_str(uint8_t type);
+const char *sizetostr(hammer2_off_t size);
+const char * hammer2_basecmd_str(uint32_t cmd);
+const char *hammer2_msg_str(hammer2_msg_t *msg);
+int hammer2_connect(const char *hostname);
+
+void shell_tree(hammer2_router_t *router, char *cmdbuf);
+
+void *master_service(void *data);
+
+void hammer2_msg_debug(hammer2_iocom_t *iocom, hammer2_msg_t *msg);
+void router_printf(hammer2_router_t *router, const char *ctl, ...);
+void *hammer2_alloc(size_t bytes);
+void hammer2_free(void *ptr);
diff --git a/sbin/hammer2/icrc.c b/sbin/hammer2/icrc.c
new file mode 100644 (file)
index 0000000..eea4ea6
--- /dev/null
@@ -0,0 +1,147 @@
+/*-
+ * Copyright (c) 2005-2010 Daniel Braniss <danny@cs.huji.ac.il>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+/*
+ | iSCSI
+ | $Id: isc_subr.c 560 2009-05-07 07:37:49Z danny $
+ */
+
+#include <sys/types.h>
+#include <sys/uuid.h>
+
+#include <vfs/hammer2/hammer2_disk.h>
+
+/*****************************************************************/
+/*                                                               */
+/* CRC LOOKUP TABLE                                              */
+/* ================                                              */
+/* The following CRC lookup table was generated automagically    */
+/* by the Rocksoft^tm Model CRC Algorithm Table Generation       */
+/* Program V1.0 using the following model parameters:            */
+/*                                                               */
+/*    Width   : 4 bytes.                                         */
+/*    Poly    : 0x1EDC6F41L                                      */
+/*    Reverse : TRUE.                                            */
+/*                                                               */
+/* For more information on the Rocksoft^tm Model CRC Algorithm,  */
+/* see the document titled "A Painless Guide to CRC Error        */
+/* Detection Algorithms" by Ross Williams                        */
+/* (ross@guest.adelaide.edu.au.). This document is likely to be  */
+/* in the FTP archive "ftp.adelaide.edu.au/pub/rocksoft".        */
+/*                                                               */
+/*****************************************************************/
+
+static const uint32_t crc32Table[256] = {
+    0x00000000L, 0xF26B8303L, 0xE13B70F7L, 0x1350F3F4L,
+    0xC79A971FL, 0x35F1141CL, 0x26A1E7E8L, 0xD4CA64EBL,
+    0x8AD958CFL, 0x78B2DBCCL, 0x6BE22838L, 0x9989AB3BL,
+    0x4D43CFD0L, 0xBF284CD3L, 0xAC78BF27L, 0x5E133C24L,
+    0x105EC76FL, 0xE235446CL, 0xF165B798L, 0x030E349BL,
+    0xD7C45070L, 0x25AFD373L, 0x36FF2087L, 0xC494A384L,
+    0x9A879FA0L, 0x68EC1CA3L, 0x7BBCEF57L, 0x89D76C54L,
+    0x5D1D08BFL, 0xAF768BBCL, 0xBC267848L, 0x4E4DFB4BL,
+    0x20BD8EDEL, 0xD2D60DDDL, 0xC186FE29L, 0x33ED7D2AL,
+    0xE72719C1L, 0x154C9AC2L, 0x061C6936L, 0xF477EA35L,
+    0xAA64D611L, 0x580F5512L, 0x4B5FA6E6L, 0xB93425E5L,
+    0x6DFE410EL, 0x9F95C20DL, 0x8CC531F9L, 0x7EAEB2FAL,
+    0x30E349B1L, 0xC288CAB2L, 0xD1D83946L, 0x23B3BA45L,
+    0xF779DEAEL, 0x05125DADL, 0x1642AE59L, 0xE4292D5AL,
+    0xBA3A117EL, 0x4851927DL, 0x5B016189L, 0xA96AE28AL,
+    0x7DA08661L, 0x8FCB0562L, 0x9C9BF696L, 0x6EF07595L,
+    0x417B1DBCL, 0xB3109EBFL, 0xA0406D4BL, 0x522BEE48L,
+    0x86E18AA3L, 0x748A09A0L, 0x67DAFA54L, 0x95B17957L,
+    0xCBA24573L, 0x39C9C670L, 0x2A993584L, 0xD8F2B687L,
+    0x0C38D26CL, 0xFE53516FL, 0xED03A29BL, 0x1F682198L,
+    0x5125DAD3L, 0xA34E59D0L, 0xB01EAA24L, 0x42752927L,
+    0x96BF4DCCL, 0x64D4CECFL, 0x77843D3BL, 0x85EFBE38L,
+    0xDBFC821CL, 0x2997011FL, 0x3AC7F2EBL, 0xC8AC71E8L,
+    0x1C661503L, 0xEE0D9600L, 0xFD5D65F4L, 0x0F36E6F7L,
+    0x61C69362L, 0x93AD1061L, 0x80FDE395L, 0x72966096L,
+    0xA65C047DL, 0x5437877EL, 0x4767748AL, 0xB50CF789L,
+    0xEB1FCBADL, 0x197448AEL, 0x0A24BB5AL, 0xF84F3859L,
+    0x2C855CB2L, 0xDEEEDFB1L, 0xCDBE2C45L, 0x3FD5AF46L,
+    0x7198540DL, 0x83F3D70EL, 0x90A324FAL, 0x62C8A7F9L,
+    0xB602C312L, 0x44694011L, 0x5739B3E5L, 0xA55230E6L,
+    0xFB410CC2L, 0x092A8FC1L, 0x1A7A7C35L, 0xE811FF36L,
+    0x3CDB9BDDL, 0xCEB018DEL, 0xDDE0EB2AL, 0x2F8B6829L,
+    0x82F63B78L, 0x709DB87BL, 0x63CD4B8FL, 0x91A6C88CL,
+    0x456CAC67L, 0xB7072F64L, 0xA457DC90L, 0x563C5F93L,
+    0x082F63B7L, 0xFA44E0B4L, 0xE9141340L, 0x1B7F9043L,
+    0xCFB5F4A8L, 0x3DDE77ABL, 0x2E8E845FL, 0xDCE5075CL,
+    0x92A8FC17L, 0x60C37F14L, 0x73938CE0L, 0x81F80FE3L,
+    0x55326B08L, 0xA759E80BL, 0xB4091BFFL, 0x466298FCL,
+    0x1871A4D8L, 0xEA1A27DBL, 0xF94AD42FL, 0x0B21572CL,
+    0xDFEB33C7L, 0x2D80B0C4L, 0x3ED04330L, 0xCCBBC033L,
+    0xA24BB5A6L, 0x502036A5L, 0x4370C551L, 0xB11B4652L,
+    0x65D122B9L, 0x97BAA1BAL, 0x84EA524EL, 0x7681D14DL,
+    0x2892ED69L, 0xDAF96E6AL, 0xC9A99D9EL, 0x3BC21E9DL,
+    0xEF087A76L, 0x1D63F975L, 0x0E330A81L, 0xFC588982L,
+    0xB21572C9L, 0x407EF1CAL, 0x532E023EL, 0xA145813DL,
+    0x758FE5D6L, 0x87E466D5L, 0x94B49521L, 0x66DF1622L,
+    0x38CC2A06L, 0xCAA7A905L, 0xD9F75AF1L, 0x2B9CD9F2L,
+    0xFF56BD19L, 0x0D3D3E1AL, 0x1E6DCDEEL, 0xEC064EEDL,
+    0xC38D26C4L, 0x31E6A5C7L, 0x22B65633L, 0xD0DDD530L,
+    0x0417B1DBL, 0xF67C32D8L, 0xE52CC12CL, 0x1747422FL,
+    0x49547E0BL, 0xBB3FFD08L, 0xA86F0EFCL, 0x5A048DFFL,
+    0x8ECEE914L, 0x7CA56A17L, 0x6FF599E3L, 0x9D9E1AE0L,
+    0xD3D3E1ABL, 0x21B862A8L, 0x32E8915CL, 0xC083125FL,
+    0x144976B4L, 0xE622F5B7L, 0xF5720643L, 0x07198540L,
+    0x590AB964L, 0xAB613A67L, 0xB831C993L, 0x4A5A4A90L,
+    0x9E902E7BL, 0x6CFBAD78L, 0x7FAB5E8CL, 0x8DC0DD8FL,
+    0xE330A81AL, 0x115B2B19L, 0x020BD8EDL, 0xF0605BEEL,
+    0x24AA3F05L, 0xD6C1BC06L, 0xC5914FF2L, 0x37FACCF1L,
+    0x69E9F0D5L, 0x9B8273D6L, 0x88D28022L, 0x7AB90321L,
+    0xAE7367CAL, 0x5C18E4C9L, 0x4F48173DL, 0xBD23943EL,
+    0xF36E6F75L, 0x0105EC76L, 0x12551F82L, 0xE03E9C81L,
+    0x34F4F86AL, 0xC69F7B69L, 0xD5CF889DL, 0x27A40B9EL,
+    0x79B737BAL, 0x8BDCB4B9L, 0x988C474DL, 0x6AE7C44EL,
+    0xBE2DA0A5L, 0x4C4623A6L, 0x5F16D052L, 0xAD7D5351L
+};
+
+uint32_t
+hammer2_icrc32(const void *buf, size_t size)
+{
+     const uint8_t *p = buf;
+     uint32_t crc = 0;
+
+     crc = crc ^ 0xffffffff;
+     while (size--)
+         crc = crc32Table[(crc ^ *p++) & 0xff] ^ (crc >> 8);
+     crc = crc ^ 0xffffffff;
+     return crc;
+}
+
+uint32_t
+hammer2_icrc32c(const void *buf, size_t size, uint32_t crc)
+{
+     const uint8_t *p = buf;
+
+     crc = crc ^ 0xffffffff;
+     while (size--)
+         crc = crc32Table[(crc ^ *p++) & 0xff] ^ (crc >> 8);
+     crc = crc ^ 0xffffffff;
+     return crc;
+}
diff --git a/sbin/hammer2/main.c b/sbin/hammer2/main.c
new file mode 100644 (file)
index 0000000..1172c8d
--- /dev/null
@@ -0,0 +1,356 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "hammer2.h"
+
+static void usage(int code);
+
+int DebugOpt;
+int VerboseOpt;
+int QuietOpt;
+int NormalExit = 1;    /* if set to 0 main() has to pthread_exit() */
+
+int
+main(int ac, char **av)
+{
+       const char *sel_path = NULL;
+       const char *uuid_str = NULL;
+       const char *arg;
+       int pfs_type = HAMMER2_PFSTYPE_NONE;
+       int all_opt = 0;
+       int ecode = 0;
+       int ch;
+
+       srandomdev();
+       signal(SIGPIPE, SIG_IGN);
+       hammer2_crypto_setup();
+
+       /*
+        * Core options
+        */
+       while ((ch = getopt(ac, av, "adqs:t:u:v")) != -1) {
+               switch(ch) {
+               case 'a':
+                       all_opt = 1;
+                       break;
+               case 'd':
+                       DebugOpt = 1;
+                       break;
+               case 's':
+                       sel_path = optarg;
+                       break;
+               case 't':
+                       /*
+                        * set node type for mkpfs
+                        */
+                       if (strcasecmp(optarg, "ADMIN") == 0) {
+                               pfs_type = HAMMER2_PFSTYPE_ADMIN;
+                       } else if (strcasecmp(optarg, "CACHE") == 0) {
+                               pfs_type = HAMMER2_PFSTYPE_CACHE;
+                       } else if (strcasecmp(optarg, "COPY") == 0) {
+                               pfs_type = HAMMER2_PFSTYPE_COPY;
+                       } else if (strcasecmp(optarg, "SLAVE") == 0) {
+                               pfs_type = HAMMER2_PFSTYPE_SLAVE;
+                       } else if (strcasecmp(optarg, "SOFT_SLAVE") == 0) {
+                               pfs_type = HAMMER2_PFSTYPE_SOFT_SLAVE;
+                       } else if (strcasecmp(optarg, "SOFT_MASTER") == 0) {
+                               pfs_type = HAMMER2_PFSTYPE_SOFT_MASTER;
+                       } else if (strcasecmp(optarg, "MASTER") == 0) {
+                               pfs_type = HAMMER2_PFSTYPE_MASTER;
+                       } else {
+                               fprintf(stderr, "-t: Unrecognized node type\n");
+                               usage(1);
+                       }
+                       break;
+               case 'u':
+                       /*
+                        * set uuid for mkpfs, else one will be generated
+                        * (required for all except the MASTER node_type)
+                        */
+                       uuid_str = optarg;
+                       break;
+               case 'v':
+                       if (QuietOpt)
+                               --QuietOpt;
+                       else
+                               ++VerboseOpt;
+                       break;
+               case 'q':
+                       if (VerboseOpt)
+                               --VerboseOpt;
+                       else
+                               ++QuietOpt;
+                       break;
+               default:
+                       fprintf(stderr, "Unknown option: %c\n", ch);
+                       usage(1);
+                       /* not reached */
+                       break;
+               }
+       }
+
+       /*
+        * Adjust, then process the command
+        */
+       ac -= optind;
+       av += optind;
+       if (ac < 1) {
+               fprintf(stderr, "Missing command\n");
+               usage(1);
+               /* not reached */
+       }
+
+       if (strcmp(av[0], "connect") == 0) {
+               /*
+                * Add cluster connection
+                */
+               if (ac < 2) {
+                       fprintf(stderr, "connect: missing argument\n");
+                       usage(1);
+               }
+               ecode = cmd_remote_connect(sel_path, av[1]);
+       } else if (strcmp(av[0], "debugspan") == 0) {
+               /*
+                * Debug connection to the target hammer2 service and run
+                * the CONN/SPAN protocol.
+                */
+               if (ac < 2) {
+                       fprintf(stderr, "debugspan: requires hostname\n");
+                       usage(1);
+               }
+               ecode = cmd_debugspan(av[1]);
+       } else if (strcmp(av[0], "disconnect") == 0) {
+               /*
+                * Remove cluster connection
+                */
+               if (ac < 2) {
+                       fprintf(stderr, "disconnect: missing argument\n");
+                       usage(1);
+               }
+               ecode = cmd_remote_disconnect(sel_path, av[1]);
+       } else if (strcmp(av[0], "status") == 0) {
+               /*
+                * Get status of PFS and its connections (-a for all PFSs)
+                */
+               ecode = cmd_remote_status(sel_path, all_opt);
+       } else if (strcmp(av[0], "pfs-clid") == 0) {
+               /*
+                * Print cluster id (uuid) for specific PFS
+                */
+               if (ac < 2) {
+                       fprintf(stderr, "pfs-clid: requires name\n");
+                       usage(1);
+               }
+               ecode = cmd_pfs_getid(sel_path, av[1], 0);
+       } else if (strcmp(av[0], "pfs-fsid") == 0) {
+               /*
+                * Print private id (uuid) for specific PFS
+                */
+               if (ac < 2) {
+                       fprintf(stderr, "pfs-fsid: requires name\n");
+                       usage(1);
+               }
+               ecode = cmd_pfs_getid(sel_path, av[1], 1);
+       } else if (strcmp(av[0], "pfs-list") == 0) {
+               /*
+                * List all PFSs
+                */
+               ecode = cmd_pfs_list(sel_path);
+       } else if (strcmp(av[0], "pfs-create") == 0) {
+               /*
+                * Create new PFS using pfs_type
+                */
+               if (ac < 2) {
+                       fprintf(stderr, "pfs-create: requires name\n");
+                       usage(1);
+               }
+               ecode = cmd_pfs_create(sel_path, av[1], pfs_type, uuid_str);
+       } else if (strcmp(av[0], "pfs-delete") == 0) {
+               /*
+                * Delete a PFS by name
+                */
+               if (ac < 2) {
+                       fprintf(stderr, "pfs-delete: requires name\n");
+                       usage(1);
+               }
+               ecode = cmd_pfs_delete(sel_path, av[1]);
+       } else if (strcmp(av[0], "snapshot") == 0) {
+               /*
+                * Create snapshot with optional pfs-type and optional
+                * label override.
+                */
+       } else if (strcmp(av[0], "service") == 0) {
+               /*
+                * Start the service daemon.  This daemon accepts
+                * connections from local and remote clients, handles
+                * the security handshake, and manages the core messaging
+                * protocol.
+                */
+               ecode = cmd_service();
+       } else if (strcmp(av[0], "stat") == 0) {
+               ecode = cmd_stat(ac - 1, (const char **)&av[1]);
+       } else if (strcmp(av[0], "leaf") == 0) {
+               /*
+                * Start the management daemon for a specific PFS.
+                *
+                * This will typically connect to the local master node
+                * daemon, register the PFS, and then pass its side of
+                * the socket descriptor to the kernel HAMMER2 VFS via an
+                * ioctl().  The process and/or thread context remains in the
+                * kernel until the PFS is unmounted or the connection is
+                * lost, then returns from the ioctl.
+                *
+                * It is possible to connect directly to a remote master node
+                * instead of the local master node in situations where
+                * encryption is not desired or no local master node is
+                * desired.  This is not recommended because it represents
+                * a single point of failure for the PFS's communications.
+                *
+                * Direct kernel<->kernel communication between HAMMER2 VFSs
+                * is theoretically possible for directly-connected
+                * registrations (i.e. where the spanning tree is degenerate),
+                * but not recommended.  We specifically try to reduce the
+                * complexity of the HAMMER2 VFS kernel code.
+                */
+               ecode = cmd_leaf(sel_path);
+       } else if (strcmp(av[0], "shell") == 0) {
+               /*
+                * Connect to the command line monitor in the hammer2 master
+                * node for the machine using HAMMER2_DBG_SHELL messages.
+                */
+               ecode = cmd_shell((ac < 2) ? NULL : av[1]);
+       } else if (strcmp(av[0], "rsainit") == 0) {
+               /*
+                * Initialize a RSA keypair.  If no target directory is
+                * specified we default to "/etc/hammer2".
+                */
+               arg = (ac < 2) ? HAMMER2_DEFAULT_DIR : av[1];
+               ecode = cmd_rsainit(arg);
+       } else if (strcmp(av[0], "rsaenc") == 0) {
+               /*
+                * Encrypt the input symmetrically by running it through
+                * the specified public and/or private key files.
+                *
+                * If no key files are specified data is encoded using
+                * "/etc/hammer2/rsa.pub".
+                *
+                * WARNING: no padding is added, data stream must contain
+                *          random padding for this to be secure.
+                *
+                * Used for debugging only
+                */
+               if (ac == 1) {
+                       const char *rsapath = HAMMER2_DEFAULT_DIR "/rsa.pub";
+                       ecode = cmd_rsaenc(&rsapath, 1);
+               } else {
+                       ecode = cmd_rsaenc((const char **)&av[1], ac - 1);
+               }
+       } else if (strcmp(av[0], "rsadec") == 0) {
+               /*
+                * Decrypt the input symmetrically by running it through
+                * the specified public and/or private key files.
+                *
+                * If no key files are specified data is decoded using
+                * "/etc/hammer2/rsa.prv".
+                *
+                * WARNING: no padding is added, data stream must contain
+                *          random padding for this to be secure.
+                *
+                * Used for debugging only
+                */
+               if (ac == 1) {
+                       const char *rsapath = HAMMER2_DEFAULT_DIR "/rsa.prv";
+                       ecode = cmd_rsadec(&rsapath, 1);
+               } else {
+                       ecode = cmd_rsadec((const char **)&av[1], ac - 1);
+               }
+       } else if (strcmp(av[0], "show") == 0) {
+               /*
+                * Raw dump of filesystem.  Use -v to check all crc's, and
+                * -vv to dump bulk file data.
+                */
+               if (ac != 2) {
+                       fprintf(stderr, "show: requires device path\n");
+                       usage(1);
+               } else {
+                       cmd_show(av[1]);
+               }
+       } else {
+               fprintf(stderr, "Unrecognized command: %s\n", av[0]);
+               usage(1);
+       }
+
+       /*
+        * In DebugMode we may wind up starting several pthreads in the
+        * original process, in which case we have to let them run and
+        * not actually exit.
+        */
+       if (NormalExit) {
+               return (ecode);
+       } else {
+               pthread_exit(NULL);
+               _exit(2);       /* NOT REACHED */
+       }
+}
+
+static
+void
+usage(int code)
+{
+       fprintf(stderr,
+               "hammer2 [-s path] command...\n"
+               "    -s path            Select filesystem\n"
+               "    -t type            PFS type for pfs-create\n"
+               "    -u uuid            uuid for pfs-create\n"
+               "\n"
+               "    connect <target>   Add cluster link\n"
+               "    disconnect <target> Del cluster link\n"
+               "    status             Report cluster status\n"
+               "    pfs-list           List PFSs\n"
+               "    pfs-clid <label>   Print cluster id for specific PFS\n"
+               "    pfs-fsid <label>   Print private id for specific PFS\n"
+               "    pfs-create <label> Create a PFS\n"
+               "    pfs-delete <label> Destroy a PFS\n"
+               "    snapshot           Snapshot a PFS\n"
+               "    service            Start service daemon\n"
+               "    stat [<path>]      Return inode quota & config\n"
+               "    leaf               Start pfs leaf daemon\n"
+               "    shell [<host>]     Connect to debug shell\n"
+               "    debugspan <target> Connect to target, run CONN/SPAN\n"
+               "    rsainit            Initialize rsa fields\n"
+               "    show devpath       Raw hammer2 media dump\n"
+       );
+       exit(code);
+}
diff --git a/sbin/hammer2/msg.c b/sbin/hammer2/msg.c
new file mode 100644 (file)
index 0000000..f4cb65c
--- /dev/null
@@ -0,0 +1,2239 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "hammer2.h"
+
+static int hammer2_state_msgrx(hammer2_msg_t *msg);
+static void hammer2_state_cleanuptx(hammer2_msg_t *msg);
+
+/*
+ * ROUTER TREE - Represents available routes for message routing, indexed
+ *              by their spanid.  The router structure is embedded in
+ *              either an iocom, h2span_link, or h2span_relay (see msg_lnk.c).
+ */
+int
+hammer2_router_cmp(hammer2_router_t *router1, hammer2_router_t *router2)
+{
+       if (router1->target < router2->target)
+               return(-1);
+       if (router1->target > router2->target)
+               return(1);
+       return(0);
+}
+
+RB_GENERATE(hammer2_router_tree, hammer2_router, rbnode, hammer2_router_cmp);
+
+static pthread_mutex_t router_mtx;
+static struct hammer2_router_tree router_ltree = RB_INITIALIZER(router_ltree);
+static struct hammer2_router_tree router_rtree = RB_INITIALIZER(router_rtree);
+
+/*
+ * STATE TREE - Represents open transactions which are indexed by their
+ *             {router,msgid} relative to the governing iocom.
+ *
+ *             router is usually iocom->router since state isn't stored
+ *             for relayed messages.
+ */
+int
+hammer2_state_cmp(hammer2_state_t *state1, hammer2_state_t *state2)
+{
+#if 0
+       if (state1->router < state2->router)
+               return(-1);
+       if (state1->router > state2->router)
+               return(1);
+#endif
+       if (state1->msgid < state2->msgid)
+               return(-1);
+       if (state1->msgid > state2->msgid)
+               return(1);
+       return(0);
+}
+
+RB_GENERATE(hammer2_state_tree, hammer2_state, rbnode, hammer2_state_cmp);
+
+/*
+ * Initialize a low-level ioq
+ */
+void
+hammer2_ioq_init(hammer2_iocom_t *iocom __unused, hammer2_ioq_t *ioq)
+{
+       bzero(ioq, sizeof(*ioq));
+       ioq->state = HAMMER2_MSGQ_STATE_HEADER1;
+       TAILQ_INIT(&ioq->msgq);
+}
+
+/*
+ * Cleanup queue.
+ *
+ * caller holds iocom->mtx.
+ */
+void
+hammer2_ioq_done(hammer2_iocom_t *iocom __unused, hammer2_ioq_t *ioq)
+{
+       hammer2_msg_t *msg;
+
+       while ((msg = TAILQ_FIRST(&ioq->msgq)) != NULL) {
+               assert(0);      /* shouldn't happen */
+               TAILQ_REMOVE(&ioq->msgq, msg, qentry);
+               hammer2_msg_free(msg);
+       }
+       if ((msg = ioq->msg) != NULL) {
+               ioq->msg = NULL;
+               hammer2_msg_free(msg);
+       }
+}
+
+/*
+ * Initialize a low-level communications channel.
+ *
+ * NOTE: The signal_func() is called at least once from the loop and can be
+ *      re-armed via hammer2_iocom_restate().
+ */
+void
+hammer2_iocom_init(hammer2_iocom_t *iocom, int sock_fd, int alt_fd,
+                  void (*signal_func)(hammer2_router_t *),
+                  void (*rcvmsg_func)(hammer2_msg_t *),
+                  void (*altmsg_func)(hammer2_iocom_t *))
+{
+       struct stat st;
+
+       bzero(iocom, sizeof(*iocom));
+
+       iocom->router = hammer2_router_alloc();
+       iocom->router->signal_callback = signal_func;
+       iocom->router->rcvmsg_callback = rcvmsg_func;
+       iocom->router->altmsg_callback = altmsg_func;
+       /* we do not call hammer2_router_connect() for iocom routers */
+
+       pthread_mutex_init(&iocom->mtx, NULL);
+       RB_INIT(&iocom->router->staterd_tree);
+       RB_INIT(&iocom->router->statewr_tree);
+       TAILQ_INIT(&iocom->freeq);
+       TAILQ_INIT(&iocom->freeq_aux);
+       TAILQ_INIT(&iocom->router->txmsgq);
+       iocom->router->iocom = iocom;
+       iocom->sock_fd = sock_fd;
+       iocom->alt_fd = alt_fd;
+       iocom->flags = HAMMER2_IOCOMF_RREQ;
+       if (signal_func)
+               iocom->flags |= HAMMER2_IOCOMF_SWORK;
+       hammer2_ioq_init(iocom, &iocom->ioq_rx);
+       hammer2_ioq_init(iocom, &iocom->ioq_tx);
+       if (pipe(iocom->wakeupfds) < 0)
+               assert(0);
+       fcntl(iocom->wakeupfds[0], F_SETFL, O_NONBLOCK);
+       fcntl(iocom->wakeupfds[1], F_SETFL, O_NONBLOCK);
+
+       /*
+        * Negotiate session crypto synchronously.  This will mark the
+        * connection as error'd if it fails.  If this is a pipe it's
+        * a linkage that we set up ourselves to the filesystem and there
+        * is no crypto.
+        */
+       if (fstat(sock_fd, &st) < 0)
+               assert(0);
+       if (S_ISSOCK(st.st_mode))
+               hammer2_crypto_negotiate(iocom);
+
+       /*
+        * Make sure our fds are set to non-blocking for the iocom core.
+        */
+       if (sock_fd >= 0)
+               fcntl(sock_fd, F_SETFL, O_NONBLOCK);
+#if 0
+       /* if line buffered our single fgets() should be fine */
+       if (alt_fd >= 0)
+               fcntl(alt_fd, F_SETFL, O_NONBLOCK);
+#endif
+}
+
+/*
+ * May only be called from a callback from iocom_core.
+ *
+ * Adjust state machine functions, set flags to guarantee that both
+ * the recevmsg_func and the sendmsg_func is called at least once.
+ */
+void
+hammer2_router_restate(hammer2_router_t *router,
+                  void (*signal_func)(hammer2_router_t *),
+                  void (*rcvmsg_func)(hammer2_msg_t *msg),
+                  void (*altmsg_func)(hammer2_iocom_t *))
+{
+       router->signal_callback = signal_func;
+       router->rcvmsg_callback = rcvmsg_func;
+       router->altmsg_callback = altmsg_func;
+       if (signal_func)
+               router->iocom->flags |= HAMMER2_IOCOMF_SWORK;
+       else
+               router->iocom->flags &= ~HAMMER2_IOCOMF_SWORK;
+}
+
+void
+hammer2_router_signal(hammer2_router_t *router)
+{
+       if (router->signal_callback)
+               router->iocom->flags |= HAMMER2_IOCOMF_SWORK;
+}
+
+/*
+ * Cleanup a terminating iocom.
+ *
+ * Caller should not hold iocom->mtx.  The iocom has already been disconnected
+ * from all possible references to it.
+ */
+void
+hammer2_iocom_done(hammer2_iocom_t *iocom)
+{
+       hammer2_msg_t *msg;
+
+       if (iocom->sock_fd >= 0) {
+               close(iocom->sock_fd);
+               iocom->sock_fd = -1;
+       }
+       if (iocom->alt_fd >= 0) {
+               close(iocom->alt_fd);
+               iocom->alt_fd = -1;
+       }
+       hammer2_ioq_done(iocom, &iocom->ioq_rx);
+       hammer2_ioq_done(iocom, &iocom->ioq_tx);
+       if ((msg = TAILQ_FIRST(&iocom->freeq)) != NULL) {
+               TAILQ_REMOVE(&iocom->freeq, msg, qentry);
+               free(msg);
+       }
+       if ((msg = TAILQ_FIRST(&iocom->freeq_aux)) != NULL) {
+               TAILQ_REMOVE(&iocom->freeq_aux, msg, qentry);
+               free(msg->aux_data);
+               msg->aux_data = NULL;
+               free(msg);
+       }
+       if (iocom->wakeupfds[0] >= 0) {
+               close(iocom->wakeupfds[0]);
+               iocom->wakeupfds[0] = -1;
+       }
+       if (iocom->wakeupfds[1] >= 0) {
+               close(iocom->wakeupfds[1]);
+               iocom->wakeupfds[1] = -1;
+       }
+       pthread_mutex_destroy(&iocom->mtx);
+}
+
+/*
+ * Allocate a new one-way message.
+ */
+hammer2_msg_t *
+hammer2_msg_alloc(hammer2_router_t *router, size_t aux_size, uint32_t cmd,
+                 void (*func)(hammer2_msg_t *), void *data)
+{
+       hammer2_state_t *state = NULL;
+       hammer2_iocom_t *iocom = router->iocom;
+       hammer2_msg_t *msg;
+       int hbytes;
+
+       pthread_mutex_lock(&iocom->mtx);
+       if (aux_size) {
+               aux_size = (aux_size + HAMMER2_MSG_ALIGNMASK) &
+                          ~HAMMER2_MSG_ALIGNMASK;
+               if ((msg = TAILQ_FIRST(&iocom->freeq_aux)) != NULL)
+                       TAILQ_REMOVE(&iocom->freeq_aux, msg, qentry);
+       } else {
+               if ((msg = TAILQ_FIRST(&iocom->freeq)) != NULL)
+                       TAILQ_REMOVE(&iocom->freeq, msg, qentry);
+       }
+       if ((cmd & (HAMMER2_MSGF_CREATE | HAMMER2_MSGF_REPLY)) ==
+           HAMMER2_MSGF_CREATE) {
+               /*
+                * Create state when CREATE is set without REPLY.
+                *
+                * NOTE: CREATE in txcmd handled by hammer2_msg_write()
+                * NOTE: DELETE in txcmd handled by hammer2_state_cleanuptx()
+                */
+               state = malloc(sizeof(*state));
+               bzero(state, sizeof(*state));
+               state->iocom = iocom;
+               state->flags = HAMMER2_STATE_DYNAMIC;
+               state->msgid = (uint64_t)(uintptr_t)state;
+               state->router = router;
+               state->txcmd = cmd & ~(HAMMER2_MSGF_CREATE |
+                                      HAMMER2_MSGF_DELETE);
+               state->rxcmd = HAMMER2_MSGF_REPLY;
+               state->func = func;
+               state->any.any = data;
+               pthread_mutex_lock(&iocom->mtx);
+               RB_INSERT(hammer2_state_tree,
+                         &iocom->router->statewr_tree,
+                         state);
+               pthread_mutex_unlock(&iocom->mtx);
+               state->flags |= HAMMER2_STATE_INSERTED;
+       }
+       pthread_mutex_unlock(&iocom->mtx);
+       if (msg == NULL) {
+               msg = malloc(sizeof(*msg));
+               bzero(msg, sizeof(*msg));
+               msg->aux_data = NULL;
+               msg->aux_size = 0;
+       }
+       if (msg->aux_size != aux_size) {
+               if (msg->aux_data) {
+                       free(msg->aux_data);
+                       msg->aux_data = NULL;
+                       msg->aux_size = 0;
+               }
+               if (aux_size) {
+                       msg->aux_data = malloc(aux_size);
+                       msg->aux_size = aux_size;
+               }
+       }
+       hbytes = (cmd & HAMMER2_MSGF_SIZE) * HAMMER2_MSG_ALIGN;
+       if (hbytes)
+               bzero(&msg->any.head, hbytes);
+       msg->hdr_size = hbytes;
+       msg->any.head.cmd = cmd;
+       msg->any.head.aux_descr = 0;
+       msg->any.head.aux_crc = 0;
+       msg->router = router;
+       if (state) {
+               msg->state = state;
+               state->msg = msg;
+               msg->any.head.msgid = state->msgid;
+       }
+       return (msg);
+}
+
+/*
+ * Free a message so it can be reused afresh.
+ *
+ * NOTE: aux_size can be 0 with a non-NULL aux_data.
+ */
+static
+void
+hammer2_msg_free_locked(hammer2_msg_t *msg)
+{
+       hammer2_iocom_t *iocom = msg->router->iocom;
+
+       msg->state = NULL;
+       if (msg->aux_data)
+               TAILQ_INSERT_TAIL(&iocom->freeq_aux, msg, qentry);
+       else
+               TAILQ_INSERT_TAIL(&iocom->freeq, msg, qentry);
+}
+
+void
+hammer2_msg_free(hammer2_msg_t *msg)
+{
+       hammer2_iocom_t *iocom = msg->router->iocom;
+
+       pthread_mutex_lock(&iocom->mtx);
+       hammer2_msg_free_locked(msg);
+       pthread_mutex_unlock(&iocom->mtx);
+}
+
+/*
+ * I/O core loop for an iocom.
+ *
+ * Thread localized, iocom->mtx not held.
+ */
+void
+hammer2_iocom_core(hammer2_iocom_t *iocom)
+{
+       struct pollfd fds[3];
+       char dummybuf[256];
+       hammer2_msg_t *msg;
+       int timeout;
+       int count;
+       int wi; /* wakeup pipe */
+       int si; /* socket */
+       int ai; /* alt bulk path socket */
+
+       while ((iocom->flags & HAMMER2_IOCOMF_EOF) == 0) {
+               if ((iocom->flags & (HAMMER2_IOCOMF_RWORK |
+                                    HAMMER2_IOCOMF_WWORK |
+                                    HAMMER2_IOCOMF_PWORK |
+                                    HAMMER2_IOCOMF_SWORK |
+                                    HAMMER2_IOCOMF_ARWORK |
+                                    HAMMER2_IOCOMF_AWWORK)) == 0) {
+                       /*
+                        * Only poll if no immediate work is pending.
+                        * Otherwise we are just wasting our time calling
+                        * poll.
+                        */
+                       timeout = 5000;
+
+                       count = 0;
+                       wi = -1;
+                       si = -1;
+                       ai = -1;
+
+                       /*
+                        * Always check the inter-thread pipe, e.g.
+                        * for iocom->txmsgq work.
+                        */
+                       wi = count++;
+                       fds[wi].fd = iocom->wakeupfds[0];
+                       fds[wi].events = POLLIN;
+                       fds[wi].revents = 0;
+
+                       /*
+                        * Check the socket input/output direction as
+                        * requested
+                        */
+                       if (iocom->flags & (HAMMER2_IOCOMF_RREQ |
+                                           HAMMER2_IOCOMF_WREQ)) {
+                               si = count++;
+                               fds[si].fd = iocom->sock_fd;
+                               fds[si].events = 0;
+                               fds[si].revents = 0;
+
+                               if (iocom->flags & HAMMER2_IOCOMF_RREQ)
+                                       fds[si].events |= POLLIN;
+                               if (iocom->flags & HAMMER2_IOCOMF_WREQ)
+                                       fds[si].events |= POLLOUT;
+                       }
+
+                       /*
+                        * Check the alternative fd for work.
+                        */
+                       if (iocom->alt_fd >= 0) {
+                               ai = count++;
+                               fds[ai].fd = iocom->alt_fd;
+                               fds[ai].events = POLLIN;
+                               fds[ai].revents = 0;
+                       }
+                       poll(fds, count, timeout);
+
+                       if (wi >= 0 && (fds[wi].revents & POLLIN))
+                               iocom->flags |= HAMMER2_IOCOMF_PWORK;
+                       if (si >= 0 && (fds[si].revents & POLLIN))
+                               iocom->flags |= HAMMER2_IOCOMF_RWORK;
+                       if (si >= 0 && (fds[si].revents & POLLOUT))
+                               iocom->flags |= HAMMER2_IOCOMF_WWORK;
+                       if (wi >= 0 && (fds[wi].revents & POLLOUT))
+                               iocom->flags |= HAMMER2_IOCOMF_WWORK;
+                       if (ai >= 0 && (fds[ai].revents & POLLIN))
+                               iocom->flags |= HAMMER2_IOCOMF_ARWORK;
+               } else {
+                       /*
+                        * Always check the pipe
+                        */
+                       iocom->flags |= HAMMER2_IOCOMF_PWORK;
+               }
+
+               if (iocom->flags & HAMMER2_IOCOMF_SWORK) {
+                       iocom->flags &= ~HAMMER2_IOCOMF_SWORK;
+                       iocom->router->signal_callback(iocom->router);
+               }
+
+               /*
+                * Pending message queues from other threads wake us up
+                * with a write to the wakeupfds[] pipe.  We have to clear
+                * the pipe with a dummy read.
+                */
+               if (iocom->flags & HAMMER2_IOCOMF_PWORK) {
+                       iocom->flags &= ~HAMMER2_IOCOMF_PWORK;
+                       read(iocom->wakeupfds[0], dummybuf, sizeof(dummybuf));
+                       iocom->flags |= HAMMER2_IOCOMF_RWORK;
+                       iocom->flags |= HAMMER2_IOCOMF_WWORK;
+                       if (TAILQ_FIRST(&iocom->router->txmsgq))
+                               hammer2_iocom_flush1(iocom);
+               }
+
+               /*
+                * Message write sequencing
+                */
+               if (iocom->flags & HAMMER2_IOCOMF_WWORK)
+                       hammer2_iocom_flush1(iocom);
+
+               /*
+                * Message read sequencing.  Run this after the write
+                * sequencing in case the write sequencing allowed another
+                * auto-DELETE to occur on the read side.
+                */
+               if (iocom->flags & HAMMER2_IOCOMF_RWORK) {
+                       while ((iocom->flags & HAMMER2_IOCOMF_EOF) == 0 &&
+                              (msg = hammer2_ioq_read(iocom)) != NULL) {
+                               if (DebugOpt) {
+                                       fprintf(stderr, "receive %s\n",
+                                               hammer2_msg_str(msg));
+                               }
+                               iocom->router->rcvmsg_callback(msg);
+                               hammer2_state_cleanuprx(iocom, msg);
+                       }
+               }
+
+               if (iocom->flags & HAMMER2_IOCOMF_ARWORK) {
+                       iocom->flags &= ~HAMMER2_IOCOMF_ARWORK;
+                       iocom->router->altmsg_callback(iocom);
+               }
+       }
+}
+
+/*
+ * Make sure there's enough room in the FIFO to hold the
+ * needed data.
+ *
+ * Assume worst case encrypted form is 2x the size of the
+ * plaintext equivalent.
+ */
+static
+size_t
+hammer2_ioq_makeroom(hammer2_ioq_t *ioq, size_t needed)
+{
+       size_t bytes;
+       size_t nmax;
+
+       bytes = ioq->fifo_cdx - ioq->fifo_beg;
+       nmax = sizeof(ioq->buf) - ioq->fifo_end;
+       if (bytes + nmax / 2 < needed) {
+               if (bytes) {
+                       bcopy(ioq->buf + ioq->fifo_beg,
+                             ioq->buf,
+                             bytes);
+               }
+               ioq->fifo_cdx -= ioq->fifo_beg;
+               ioq->fifo_beg = 0;
+               if (ioq->fifo_cdn < ioq->fifo_end) {
+                       bcopy(ioq->buf + ioq->fifo_cdn,
+                             ioq->buf + ioq->fifo_cdx,
+                             ioq->fifo_end - ioq->fifo_cdn);
+               }
+               ioq->fifo_end -= ioq->fifo_cdn - ioq->fifo_cdx;
+               ioq->fifo_cdn = ioq->fifo_cdx;
+               nmax = sizeof(ioq->buf) - ioq->fifo_end;
+       }
+       return(nmax);
+}
+
+/*
+ * Read the next ready message from the ioq, issuing I/O if needed.
+ * Caller should retry on a read-event when NULL is returned.
+ *
+ * If an error occurs during reception a HAMMER2_LNK_ERROR msg will
+ * be returned for each open transaction, then the ioq and iocom
+ * will be errored out and a non-transactional HAMMER2_LNK_ERROR
+ * msg will be returned as the final message.  The caller should not call
+ * us again after the final message is returned.
+ *
+ * Thread localized, iocom->mtx not held.
+ */
+hammer2_msg_t *
+hammer2_ioq_read(hammer2_iocom_t *iocom)
+{
+       hammer2_ioq_t *ioq = &iocom->ioq_rx;
+       hammer2_msg_t *msg;
+       hammer2_msg_hdr_t *head;
+       hammer2_state_t *state;
+       ssize_t n;
+       size_t bytes;
+       size_t nmax;
+       uint32_t xcrc32;
+       int error;
+
+again:
+       iocom->flags &= ~(HAMMER2_IOCOMF_RREQ | HAMMER2_IOCOMF_RWORK);
+
+       /*
+        * If a message is already pending we can just remove and
+        * return it.  Message state has already been processed.
+        * (currently not implemented)
+        */
+       if ((msg = TAILQ_FIRST(&ioq->msgq)) != NULL) {
+               TAILQ_REMOVE(&ioq->msgq, msg, qentry);
+               return (msg);
+       }
+
+       /*
+        * If the stream is errored out we stop processing it.
+        */
+       if (ioq->error)
+               goto skip;
+
+       /*
+        * Message read in-progress (msg is NULL at the moment).  We don't
+        * allocate a msg until we have its core header.
+        */
+       nmax = sizeof(ioq->buf) - ioq->fifo_end;
+       bytes = ioq->fifo_cdx - ioq->fifo_beg;          /* already decrypted */
+       msg = ioq->msg;
+
+       switch(ioq->state) {
+       case HAMMER2_MSGQ_STATE_HEADER1:
+               /*
+                * Load the primary header, fail on any non-trivial read
+                * error or on EOF.  Since the primary header is the same
+                * size is the message alignment it will never straddle
+                * the end of the buffer.
+                */
+               nmax = hammer2_ioq_makeroom(ioq, sizeof(msg->any.head));
+               if (bytes < sizeof(msg->any.head)) {
+                       n = read(iocom->sock_fd,
+                                ioq->buf + ioq->fifo_end,
+                                nmax);
+                       if (n <= 0) {
+                               if (n == 0) {
+                                       ioq->error = HAMMER2_IOQ_ERROR_EOF;
+                                       break;
+                               }
+                               if (errno != EINTR &&
+                                   errno != EINPROGRESS &&
+                                   errno != EAGAIN) {
+                                       ioq->error = HAMMER2_IOQ_ERROR_SOCK;
+                                       break;
+                               }
+                               n = 0;
+                               /* fall through */
+                       }
+                       ioq->fifo_end += (size_t)n;
+                       nmax -= (size_t)n;
+               }
+
+               /*
+                * Decrypt data received so far.  Data will be decrypted
+                * in-place but might create gaps in the FIFO.  Partial
+                * blocks are not immediately decrypted.
+                *
+                * WARNING!  The header might be in the wrong endian, we
+                *           do not fix it up until we get the entire
+                *           extended header.
+                */
+               if (iocom->flags & HAMMER2_IOCOMF_CRYPTED) {
+                       hammer2_crypto_decrypt(iocom, ioq);
+               } else {
+                       ioq->fifo_cdx = ioq->fifo_end;
+                       ioq->fifo_cdn = ioq->fifo_end;
+               }
+               bytes = ioq->fifo_cdx - ioq->fifo_beg;
+
+               /*
+                * Insufficient data accumulated (msg is NULL, caller will
+                * retry on event).
+                */
+               assert(msg == NULL);
+               if (bytes < sizeof(msg->any.head))
+                       break;
+
+               /*
+                * Check and fixup the core header.  Note that the icrc
+                * has to be calculated before any fixups, but the crc
+                * fields in the msg may have to be swapped like everything
+                * else.
+                */
+               head = (void *)(ioq->buf + ioq->fifo_beg);
+               if (head->magic != HAMMER2_MSGHDR_MAGIC &&
+                   head->magic != HAMMER2_MSGHDR_MAGIC_REV) {
+                       ioq->error = HAMMER2_IOQ_ERROR_SYNC;
+                       break;
+               }
+
+               /*
+                * Calculate the full header size and aux data size
+                */
+               if (head->magic == HAMMER2_MSGHDR_MAGIC_REV) {
+                       ioq->hbytes = (bswap32(head->cmd) & HAMMER2_MSGF_SIZE) *
+                                     HAMMER2_MSG_ALIGN;
+                       ioq->abytes = bswap32(head->aux_bytes) *
+                                     HAMMER2_MSG_ALIGN;
+               } else {
+                       ioq->hbytes = (head->cmd & HAMMER2_MSGF_SIZE) *
+                                     HAMMER2_MSG_ALIGN;
+                       ioq->abytes = head->aux_bytes * HAMMER2_MSG_ALIGN;
+               }
+               if (ioq->hbytes < sizeof(msg->any.head) ||
+                   ioq->hbytes > sizeof(msg->any) ||
+                   ioq->abytes > HAMMER2_MSGAUX_MAX) {
+                       ioq->error = HAMMER2_IOQ_ERROR_FIELD;
+                       break;
+               }
+
+               /*
+                * Allocate the message, the next state will fill it in.
+                */
+               msg = hammer2_msg_alloc(iocom->router, ioq->abytes, 0,
+                                       NULL, NULL);
+               ioq->msg = msg;
+
+               /*
+                * Fall through to the next state.  Make sure that the
+                * extended header does not straddle the end of the buffer.
+                * We still want to issue larger reads into our buffer,
+                * book-keeping is easier if we don't bcopy() yet.
+                *
+                * Make sure there is enough room for bloated encrypt data.
+                */
+               nmax = hammer2_ioq_makeroom(ioq, ioq->hbytes);
+               ioq->state = HAMMER2_MSGQ_STATE_HEADER2;
+               /* fall through */
+       case HAMMER2_MSGQ_STATE_HEADER2:
+               /*
+                * Fill out the extended header.
+                */
+               assert(msg != NULL);
+               if (bytes < ioq->hbytes) {
+                       n = read(iocom->sock_fd,
+                                ioq->buf + ioq->fifo_end,
+                                nmax);
+                       if (n <= 0) {
+                               if (n == 0) {
+                                       ioq->error = HAMMER2_IOQ_ERROR_EOF;
+                                       break;
+                               }
+                               if (errno != EINTR &&
+                                   errno != EINPROGRESS &&
+                                   errno != EAGAIN) {
+                                       ioq->error = HAMMER2_IOQ_ERROR_SOCK;
+                                       break;
+                               }
+                               n = 0;
+                               /* fall through */
+                       }
+                       ioq->fifo_end += (size_t)n;
+                       nmax -= (size_t)n;
+               }
+
+               if (iocom->flags & HAMMER2_IOCOMF_CRYPTED) {
+                       hammer2_crypto_decrypt(iocom, ioq);
+               } else {
+                       ioq->fifo_cdx = ioq->fifo_end;
+                       ioq->fifo_cdn = ioq->fifo_end;
+               }
+               bytes = ioq->fifo_cdx - ioq->fifo_beg;
+
+               /*
+                * Insufficient data accumulated (set msg NULL so caller will
+                * retry on event).
+                */
+               if (bytes < ioq->hbytes) {
+                       msg = NULL;
+                       break;
+               }
+
+               /*
+                * Calculate the extended header, decrypt data received
+                * so far.  Handle endian-conversion for the entire extended
+                * header.
+                */
+               head = (void *)(ioq->buf + ioq->fifo_beg);
+
+               /*
+                * Check the CRC.
+                */
+               if (head->magic == HAMMER2_MSGHDR_MAGIC_REV)
+                       xcrc32 = bswap32(head->hdr_crc);
+               else
+                       xcrc32 = head->hdr_crc;
+               head->hdr_crc = 0;
+               if (hammer2_icrc32(head, ioq->hbytes) != xcrc32) {
+                       ioq->error = HAMMER2_IOQ_ERROR_XCRC;
+                       fprintf(stderr, "BAD-XCRC(%08x,%08x) %s\n",
+                               xcrc32, hammer2_icrc32(head, ioq->hbytes),
+                               hammer2_msg_str(msg));
+                       assert(0);
+                       break;
+               }
+               head->hdr_crc = xcrc32;
+
+               if (head->magic == HAMMER2_MSGHDR_MAGIC_REV) {
+                       hammer2_bswap_head(head);
+               }
+
+               /*
+                * Copy the extended header into the msg and adjust the
+                * FIFO.
+                */
+               bcopy(head, &msg->any, ioq->hbytes);
+
+               /*
+                * We are either done or we fall-through.
+                */
+               if (ioq->abytes == 0) {
+                       ioq->fifo_beg += ioq->hbytes;
+                       break;
+               }
+
+               /*
+                * Must adjust bytes (and the state) when falling through.
+                * nmax doesn't change.
+                */
+               ioq->fifo_beg += ioq->hbytes;
+               bytes -= ioq->hbytes;
+               ioq->state = HAMMER2_MSGQ_STATE_AUXDATA1;
+               /* fall through */
+       case HAMMER2_MSGQ_STATE_AUXDATA1:
+               /*
+                * Copy the partial or complete payload from remaining
+                * bytes in the FIFO in order to optimize the makeroom call
+                * in the AUXDATA2 state.  We have to fall-through either
+                * way so we can check the crc.
+                *
+                * msg->aux_size tracks our aux data.
+                */
+               if (bytes >= ioq->abytes) {
+                       bcopy(ioq->buf + ioq->fifo_beg, msg->aux_data,
+                             ioq->abytes);
+                       msg->aux_size = ioq->abytes;
+                       ioq->fifo_beg += ioq->abytes;
+                       assert(ioq->fifo_beg <= ioq->fifo_cdx);
+                       assert(ioq->fifo_cdx <= ioq->fifo_cdn);
+                       bytes -= ioq->abytes;
+               } else if (bytes) {
+                       bcopy(ioq->buf + ioq->fifo_beg, msg->aux_data,
+                             bytes);
+                       msg->aux_size = bytes;
+                       ioq->fifo_beg += bytes;
+                       if (ioq->fifo_cdx < ioq->fifo_beg)
+                               ioq->fifo_cdx = ioq->fifo_beg;
+                       assert(ioq->fifo_beg <= ioq->fifo_cdx);
+                       assert(ioq->fifo_cdx <= ioq->fifo_cdn);
+                       bytes = 0;
+               } else {
+                       msg->aux_size = 0;
+               }
+               ioq->state = HAMMER2_MSGQ_STATE_AUXDATA2;
+               /* fall through */
+       case HAMMER2_MSGQ_STATE_AUXDATA2:
+               /*
+                * Make sure there is enough room for more data.
+                */
+               assert(msg);
+               nmax = hammer2_ioq_makeroom(ioq, ioq->abytes - msg->aux_size);
+
+               /*
+                * Read and decrypt more of the payload.
+                */
+               if (msg->aux_size < ioq->abytes) {
+                       assert(bytes == 0);
+                       n = read(iocom->sock_fd,
+                                ioq->buf + ioq->fifo_end,
+                                nmax);
+                       if (n <= 0) {
+                               if (n == 0) {
+                                       ioq->error = HAMMER2_IOQ_ERROR_EOF;
+                                       break;
+                               }
+                               if (errno != EINTR &&
+                                   errno != EINPROGRESS &&
+                                   errno != EAGAIN) {
+                                       ioq->error = HAMMER2_IOQ_ERROR_SOCK;
+                                       break;
+                               }
+                               n = 0;
+                               /* fall through */
+                       }
+                       ioq->fifo_end += (size_t)n;
+                       nmax -= (size_t)n;
+               }
+
+               if (iocom->flags & HAMMER2_IOCOMF_CRYPTED) {
+                       hammer2_crypto_decrypt(iocom, ioq);
+               } else {
+                       ioq->fifo_cdx = ioq->fifo_end;
+                       ioq->fifo_cdn = ioq->fifo_end;
+               }
+               bytes = ioq->fifo_cdx - ioq->fifo_beg;
+
+               if (bytes > ioq->abytes - msg->aux_size)
+                       bytes = ioq->abytes - msg->aux_size;
+
+               if (bytes) {
+                       bcopy(ioq->buf + ioq->fifo_beg,
+                             msg->aux_data + msg->aux_size,
+                             bytes);
+                       msg->aux_size += bytes;
+                       ioq->fifo_beg += bytes;
+               }
+
+               /*
+                * Insufficient data accumulated (set msg NULL so caller will
+                * retry on event).
+                */
+               if (msg->aux_size < ioq->abytes) {
+                       msg = NULL;
+                       break;
+               }
+               assert(msg->aux_size == ioq->abytes);
+
+               /*
+                * Check aux_crc, then we are done.
+                */
+               xcrc32 = hammer2_icrc32(msg->aux_data, msg->aux_size);
+               if (xcrc32 != msg->any.head.aux_crc) {
+                       ioq->error = HAMMER2_IOQ_ERROR_ACRC;
+                       break;
+               }
+               break;
+       case HAMMER2_MSGQ_STATE_ERROR:
+               /*
+                * Continued calls to drain recorded transactions (returning
+                * a LNK_ERROR for each one), before we return the final
+                * LNK_ERROR.
+                */
+               assert(msg == NULL);
+               break;
+       default:
+               /*
+                * We don't double-return errors, the caller should not
+                * have called us again after getting an error msg.
+                */
+               assert(0);
+               break;
+       }
+
+       /*
+        * Check the message sequence.  The iv[] should prevent any
+        * possibility of a replay but we add this check anyway.
+        */
+       if (msg && ioq->error == 0) {
+               if ((msg->any.head.salt & 255) != (ioq->seq & 255)) {
+                       ioq->error = HAMMER2_IOQ_ERROR_MSGSEQ;
+               } else {
+                       ++ioq->seq;
+               }
+       }
+
+       /*
+        * Process transactional state for the message.
+        */
+       if (msg && ioq->error == 0) {
+               error = hammer2_state_msgrx(msg);
+               if (error) {
+                       if (error == HAMMER2_IOQ_ERROR_EALREADY) {
+                               hammer2_msg_free(msg);
+                               goto again;
+                       }
+                       ioq->error = error;
+               }
+       }
+
+       /*
+        * Handle error, RREQ, or completion
+        *
+        * NOTE: nmax and bytes are invalid at this point, we don't bother
+        *       to update them when breaking out.
+        */
+       if (ioq->error) {
+skip:
+               /*
+                * An unrecoverable error causes all active receive
+                * transactions to be terminated with a LNK_ERROR message.
+                *
+                * Once all active transactions are exhausted we set the
+                * iocom ERROR flag and return a non-transactional LNK_ERROR
+                * message, which should cause master processing loops to
+                * terminate.
+                */
+               assert(ioq->msg == msg);
+               if (msg) {
+                       hammer2_msg_free(msg);
+                       ioq->msg = NULL;
+               }
+
+               /*
+                * No more I/O read processing
+                */
+               ioq->state = HAMMER2_MSGQ_STATE_ERROR;
+
+               /*
+                * Simulate a remote LNK_ERROR DELETE msg for any open
+                * transactions, ending with a final non-transactional
+                * LNK_ERROR (that the session can detect) when no
+                * transactions remain.
+                */
+               msg = hammer2_msg_alloc(iocom->router, 0, 0, NULL, NULL);
+               bzero(&msg->any.head, sizeof(msg->any.head));
+               msg->any.head.magic = HAMMER2_MSGHDR_MAGIC;
+               msg->any.head.cmd = HAMMER2_LNK_ERROR;
+               msg->any.head.error = ioq->error;
+
+               pthread_mutex_lock(&iocom->mtx);
+               hammer2_iocom_drain(iocom);
+               if ((state = RB_ROOT(&iocom->router->staterd_tree)) != NULL) {
+                       /*
+                        * Active remote transactions are still present.
+                        * Simulate the other end sending us a DELETE.
+                        */
+                       if (state->rxcmd & HAMMER2_MSGF_DELETE) {
+                               hammer2_msg_free(msg);
+                               msg = NULL;
+                       } else {
+                               /*state->txcmd |= HAMMER2_MSGF_DELETE;*/
+                               msg->state = state;
+                               msg->router = state->router;
+                               msg->any.head.msgid = state->msgid;
+                               msg->any.head.cmd |= HAMMER2_MSGF_ABORT |
+                                                    HAMMER2_MSGF_DELETE;
+                       }
+               } else if ((state = RB_ROOT(&iocom->router->statewr_tree)) !=
+                          NULL) {
+                       /*
+                        * Active local transactions are still present.
+                        * Simulate the other end sending us a DELETE.
+                        */
+                       if (state->rxcmd & HAMMER2_MSGF_DELETE) {
+                               hammer2_msg_free(msg);
+                               msg = NULL;
+                       } else {
+                               msg->state = state;
+                               msg->router = state->router;
+                               msg->any.head.msgid = state->msgid;
+                               msg->any.head.cmd |= HAMMER2_MSGF_ABORT |
+                                                    HAMMER2_MSGF_DELETE |
+                                                    HAMMER2_MSGF_REPLY;
+                               if ((state->rxcmd & HAMMER2_MSGF_CREATE) == 0) {
+                                       msg->any.head.cmd |=
+                                                    HAMMER2_MSGF_CREATE;
+                               }
+                       }
+               } else {
+                       /*
+                        * No active local or remote transactions remain.
+                        * Generate a final LNK_ERROR and flag EOF.
+                        */
+                       msg->state = NULL;
+                       iocom->flags |= HAMMER2_IOCOMF_EOF;
+                       fprintf(stderr, "EOF ON SOCKET %d\n", iocom->sock_fd);
+               }
+               pthread_mutex_unlock(&iocom->mtx);
+
+               /*
+                * For the iocom error case we want to set RWORK to indicate
+                * that more messages might be pending.
+                *
+                * It is possible to return NULL when there is more work to
+                * do because each message has to be DELETEd in both
+                * directions before we continue on with the next (though
+                * this could be optimized).  The transmit direction will
+                * re-set RWORK.
+                */
+               if (msg)
+                       iocom->flags |= HAMMER2_IOCOMF_RWORK;
+       } else if (msg == NULL) {
+               /*
+                * Insufficient data received to finish building the message,
+                * set RREQ and return NULL.
+                *
+                * Leave ioq->msg intact.
+                * Leave the FIFO intact.
+                */
+               iocom->flags |= HAMMER2_IOCOMF_RREQ;
+       } else {
+               /*
+                * Return msg.
+                *
+                * The fifo has already been advanced past the message.
+                * Trivially reset the FIFO indices if possible.
+                *
+                * clear the FIFO if it is now empty and set RREQ to wait
+                * for more from the socket.  If the FIFO is not empty set
+                * TWORK to bypass the poll so we loop immediately.
+                */
+               if (ioq->fifo_beg == ioq->fifo_cdx &&
+                   ioq->fifo_cdn == ioq->fifo_end) {
+                       iocom->flags |= HAMMER2_IOCOMF_RREQ;
+                       ioq->fifo_cdx = 0;
+                       ioq->fifo_cdn = 0;
+                       ioq->fifo_beg = 0;
+                       ioq->fifo_end = 0;
+               } else {
+                       iocom->flags |= HAMMER2_IOCOMF_RWORK;
+               }
+               ioq->state = HAMMER2_MSGQ_STATE_HEADER1;
+               ioq->msg = NULL;
+       }
+       return (msg);
+}
+
+/*
+ * Calculate the header and data crc's and write a low-level message to
+ * the connection.  If aux_crc is non-zero the aux_data crc is already
+ * assumed to have been set.
+ *
+ * A non-NULL msg is added to the queue but not necessarily flushed.
+ * Calling this function with msg == NULL will get a flush going.
+ *
+ * Caller must hold iocom->mtx.
+ */
+void
+hammer2_iocom_flush1(hammer2_iocom_t *iocom)
+{
+       hammer2_ioq_t *ioq = &iocom->ioq_tx;
+       hammer2_msg_t *msg;
+       uint32_t xcrc32;
+       int hbytes;
+       hammer2_msg_queue_t tmpq;
+
+       iocom->flags &= ~(HAMMER2_IOCOMF_WREQ | HAMMER2_IOCOMF_WWORK);
+       TAILQ_INIT(&tmpq);
+       pthread_mutex_lock(&iocom->mtx);
+       while ((msg = TAILQ_FIRST(&iocom->router->txmsgq)) != NULL) {
+               TAILQ_REMOVE(&iocom->router->txmsgq, msg, qentry);
+               TAILQ_INSERT_TAIL(&tmpq, msg, qentry);
+       }
+       pthread_mutex_unlock(&iocom->mtx);
+
+       while ((msg = TAILQ_FIRST(&tmpq)) != NULL) {
+               /*
+                * Process terminal connection errors.
+                */
+               TAILQ_REMOVE(&tmpq, msg, qentry);
+               if (ioq->error) {
+                       TAILQ_INSERT_TAIL(&ioq->msgq, msg, qentry);
+                       ++ioq->msgcount;
+                       continue;
+               }
+
+               /*
+                * Finish populating the msg fields.  The salt ensures that
+                * the iv[] array is ridiculously randomized and we also
+                * re-seed our PRNG every 32768 messages just to be sure.
+                */
+               msg->any.head.magic = HAMMER2_MSGHDR_MAGIC;
+               msg->any.head.salt = (random() << 8) | (ioq->seq & 255);
+               ++ioq->seq;
+               if ((ioq->seq & 32767) == 0)
+                       srandomdev();
+
+               /*
+                * Calculate aux_crc if 0, then calculate hdr_crc.
+                */
+               if (msg->aux_size && msg->any.head.aux_crc == 0) {
+                       assert((msg->aux_size & HAMMER2_MSG_ALIGNMASK) == 0);
+                       xcrc32 = hammer2_icrc32(msg->aux_data, msg->aux_size);
+                       msg->any.head.aux_crc = xcrc32;
+               }
+               msg->any.head.aux_bytes = msg->aux_size / HAMMER2_MSG_ALIGN;
+               assert((msg->aux_size & HAMMER2_MSG_ALIGNMASK) == 0);
+
+               hbytes = (msg->any.head.cmd & HAMMER2_MSGF_SIZE) *
+                        HAMMER2_MSG_ALIGN;
+               msg->any.head.hdr_crc = 0;
+               msg->any.head.hdr_crc = hammer2_icrc32(&msg->any.head, hbytes);
+
+               /*
+                * Enqueue the message (the flush codes handles stream
+                * encryption).
+                */
+               TAILQ_INSERT_TAIL(&ioq->msgq, msg, qentry);
+               ++ioq->msgcount;
+       }
+       hammer2_iocom_flush2(iocom);
+}
+
+/*
+ * Thread localized, iocom->mtx not held by caller.
+ */
+void
+hammer2_iocom_flush2(hammer2_iocom_t *iocom)
+{
+       hammer2_ioq_t *ioq = &iocom->ioq_tx;
+       hammer2_msg_t *msg;
+       ssize_t n;
+       struct iovec iov[HAMMER2_IOQ_MAXIOVEC];
+       size_t nact;
+       size_t hbytes;
+       size_t abytes;
+       size_t hoff;
+       size_t aoff;
+       int iovcnt;
+
+       if (ioq->error) {
+               hammer2_iocom_drain(iocom);
+               return;
+       }
+
+       /*
+        * Pump messages out the connection by building an iovec.
+        *
+        * ioq->hbytes/ioq->abytes tracks how much of the first message
+        * in the queue has been successfully written out, so we can
+        * resume writing.
+        */
+       iovcnt = 0;
+       nact = 0;
+       hoff = ioq->hbytes;
+       aoff = ioq->abytes;
+
+       TAILQ_FOREACH(msg, &ioq->msgq, qentry) {
+               hbytes = (msg->any.head.cmd & HAMMER2_MSGF_SIZE) *
+                        HAMMER2_MSG_ALIGN;
+               abytes = msg->aux_size;
+               assert(hoff <= hbytes && aoff <= abytes);
+
+               if (hoff < hbytes) {
+                       iov[iovcnt].iov_base = (char *)&msg->any.head + hoff;
+                       iov[iovcnt].iov_len = hbytes - hoff;
+                       nact += hbytes - hoff;
+                       ++iovcnt;
+                       if (iovcnt == HAMMER2_IOQ_MAXIOVEC)
+                               break;
+               }
+               if (aoff < abytes) {
+                       assert(msg->aux_data != NULL);
+                       iov[iovcnt].iov_base = (char *)msg->aux_data + aoff;
+                       iov[iovcnt].iov_len = abytes - aoff;
+                       nact += abytes - aoff;
+                       ++iovcnt;
+                       if (iovcnt == HAMMER2_IOQ_MAXIOVEC)
+                               break;
+               }
+               hoff = 0;
+               aoff = 0;
+       }
+       if (iovcnt == 0)
+               return;
+
+       /*
+        * Encrypt and write the data.  The crypto code will move the
+        * data into the fifo and adjust the iov as necessary.  If
+        * encryption is disabled the iov is left alone.
+        *
+        * May return a smaller iov (thus a smaller n), with aggregated
+        * chunks.  May reduce nmax to what fits in the FIFO.
+        *
+        * This function sets nact to the number of original bytes now
+        * encrypted, adding to the FIFO some number of bytes that might
+        * be greater depending on the crypto mechanic.  iov[] is adjusted
+        * to point at the FIFO if necessary.
+        *
+        * NOTE: The return value from the writev() is the post-encrypted
+        *       byte count, not the plaintext count.
+        */
+       if (iocom->flags & HAMMER2_IOCOMF_CRYPTED) {
+               /*
+                * Make sure the FIFO has a reasonable amount of space
+                * left (if not completely full).
+                */
+               if (ioq->fifo_beg > sizeof(ioq->buf) / 2 &&
+                   sizeof(ioq->buf) - ioq->fifo_end >= HAMMER2_MSG_ALIGN * 2) {
+                       bcopy(ioq->buf + ioq->fifo_beg, ioq->buf,
+                             ioq->fifo_end - ioq->fifo_beg);
+                       ioq->fifo_cdx -= ioq->fifo_beg;
+                       ioq->fifo_cdn -= ioq->fifo_beg;
+                       ioq->fifo_end -= ioq->fifo_beg;
+                       ioq->fifo_beg = 0;
+               }
+
+               iovcnt = hammer2_crypto_encrypt(iocom, ioq, iov, iovcnt, &nact);
+               n = writev(iocom->sock_fd, iov, iovcnt);
+               if (n > 0) {
+                       ioq->fifo_beg += n;
+                       ioq->fifo_cdn += n;
+                       ioq->fifo_cdx += n;
+                       if (ioq->fifo_beg == ioq->fifo_end) {
+                               ioq->fifo_beg = 0;
+                               ioq->fifo_cdn = 0;
+                               ioq->fifo_cdx = 0;
+                               ioq->fifo_end = 0;
+                       }
+               }
+       } else {
+               n = writev(iocom->sock_fd, iov, iovcnt);
+               if (n > 0)
+                       nact = n;
+               else
+                       nact = 0;
+       }
+
+       /*
+        * Clean out the transmit queue based on what we successfully
+        * sent (nact is the plaintext count).  ioq->hbytes/abytes
+        * represents the portion of the first message previously sent.
+        */
+       while ((msg = TAILQ_FIRST(&ioq->msgq)) != NULL) {
+               hbytes = (msg->any.head.cmd & HAMMER2_MSGF_SIZE) *
+                        HAMMER2_MSG_ALIGN;
+               abytes = msg->aux_size;
+
+               if ((size_t)nact < hbytes - ioq->hbytes) {
+                       ioq->hbytes += nact;
+                       nact = 0;
+                       break;
+               }
+               nact -= hbytes - ioq->hbytes;
+               ioq->hbytes = hbytes;
+               if ((size_t)nact < abytes - ioq->abytes) {
+                       ioq->abytes += nact;
+                       nact = 0;
+                       break;
+               }
+               nact -= abytes - ioq->abytes;
+
+               TAILQ_REMOVE(&ioq->msgq, msg, qentry);
+               --ioq->msgcount;
+               ioq->hbytes = 0;
+               ioq->abytes = 0;
+
+               hammer2_state_cleanuptx(msg);
+       }
+       assert(nact == 0);
+
+       /*
+        * Process the return value from the write w/regards to blocking.
+        */
+       if (n < 0) {
+               if (errno != EINTR &&
+                   errno != EINPROGRESS &&
+                   errno != EAGAIN) {
+                       /*
+                        * Fatal write error
+                        */
+                       ioq->error = HAMMER2_IOQ_ERROR_SOCK;
+                       hammer2_iocom_drain(iocom);
+               } else {
+                       /*
+                        * Wait for socket buffer space
+                        */
+                       iocom->flags |= HAMMER2_IOCOMF_WREQ;
+               }
+       } else {
+               iocom->flags |= HAMMER2_IOCOMF_WREQ;
+       }
+       if (ioq->error) {
+               hammer2_iocom_drain(iocom);
+       }
+}
+
+/*
+ * Kill pending msgs on ioq_tx and adjust the flags such that no more
+ * write events will occur.  We don't kill read msgs because we want
+ * the caller to pull off our contrived terminal error msg to detect
+ * the connection failure.
+ *
+ * Thread localized, iocom->mtx not held by caller.
+ */
+void
+hammer2_iocom_drain(hammer2_iocom_t *iocom)
+{
+       hammer2_ioq_t *ioq = &iocom->ioq_tx;
+       hammer2_msg_t *msg;
+
+       iocom->flags &= ~(HAMMER2_IOCOMF_WREQ | HAMMER2_IOCOMF_WWORK);
+       ioq->hbytes = 0;
+       ioq->abytes = 0;
+
+       while ((msg = TAILQ_FIRST(&ioq->msgq)) != NULL) {
+               TAILQ_REMOVE(&ioq->msgq, msg, qentry);
+               --ioq->msgcount;
+               hammer2_state_cleanuptx(msg);
+       }
+}
+
+/*
+ * Write a message to an iocom, with additional state processing.
+ */
+void
+hammer2_msg_write(hammer2_msg_t *msg)
+{
+       hammer2_iocom_t *iocom = msg->router->iocom;
+       hammer2_state_t *state;
+       char dummy;
+
+       /*
+        * Handle state processing, create state if necessary.
+        */
+       pthread_mutex_lock(&iocom->mtx);
+       if ((state = msg->state) != NULL) {
+               /*
+                * Existing transaction (could be reply).  It is also
+                * possible for this to be the first reply (CREATE is set),
+                * in which case we populate state->txcmd.
+                *
+                * state->txcmd is adjusted to hold the final message cmd,
+                * and we also be sure to set the CREATE bit here.  We did
+                * not set it in hammer2_msg_alloc() because that would have
+                * not been serialized (state could have gotten ripped out
+                * from under the message prior to it being transmitted).
+                */
+               if ((msg->any.head.cmd & (HAMMER2_MSGF_CREATE |
+                                         HAMMER2_MSGF_REPLY)) ==
+                   HAMMER2_MSGF_CREATE) {
+                       state->txcmd = msg->any.head.cmd & ~HAMMER2_MSGF_DELETE;
+               }
+               msg->any.head.msgid = state->msgid;
+               assert(((state->txcmd ^ msg->any.head.cmd) &
+                       HAMMER2_MSGF_REPLY) == 0);
+               if (msg->any.head.cmd & HAMMER2_MSGF_CREATE)
+                       state->txcmd = msg->any.head.cmd & ~HAMMER2_MSGF_DELETE;
+       } else {
+               msg->any.head.msgid = 0;
+               /* XXX set spanid by router */
+       }
+       msg->any.head.source = 0;
+       msg->any.head.target = msg->router->target;
+
+       /*
+        * Queue it for output, wake up the I/O pthread.  Note that the
+        * I/O thread is responsible for generating the CRCs and encryption.
+        */
+       TAILQ_INSERT_TAIL(&iocom->router->txmsgq, msg, qentry);
+       dummy = 0;
+       write(iocom->wakeupfds[1], &dummy, 1);  /* XXX optimize me */
+       pthread_mutex_unlock(&iocom->mtx);
+}
+
+/*
+ * This is a shortcut to formulate a reply to msg with a simple error code,
+ * It can reply to and terminate a transaction, or it can reply to a one-way
+ * messages.  A HAMMER2_LNK_ERROR command code is utilized to encode
+ * the error code (which can be 0).  Not all transactions are terminated
+ * with HAMMER2_LNK_ERROR status (the low level only cares about the
+ * MSGF_DELETE flag), but most are.
+ *
+ * Replies to one-way messages are a bit of an oxymoron but the feature
+ * is used by the debug (DBG) protocol.
+ *
+ * The reply contains no extended data.
+ */
+void
+hammer2_msg_reply(hammer2_msg_t *msg, uint32_t error)
+{
+       hammer2_iocom_t *iocom = msg->router->iocom;
+       hammer2_state_t *state = msg->state;
+       hammer2_msg_t *nmsg;
+       uint32_t cmd;
+
+
+       /*
+        * Reply with a simple error code and terminate the transaction.
+        */
+       cmd = HAMMER2_LNK_ERROR;
+
+       /*
+        * Check if our direction has even been initiated yet, set CREATE.
+        *
+        * Check what direction this is (command or reply direction).  Note
+        * that txcmd might not have been initiated yet.
+        *
+        * If our direction has already been closed we just return without
+        * doing anything.
+        */
+       if (state) {
+               if (state->txcmd & HAMMER2_MSGF_DELETE)
+                       return;
+               if (state->txcmd & HAMMER2_MSGF_REPLY)
+                       cmd |= HAMMER2_MSGF_REPLY;
+               cmd |= HAMMER2_MSGF_DELETE;
+       } else {
+               if ((msg->any.head.cmd & HAMMER2_MSGF_REPLY) == 0)
+                       cmd |= HAMMER2_MSGF_REPLY;
+       }
+
+       /*
+        * Allocate the message and associate it with the existing state.
+        * We cannot pass MSGF_CREATE to msg_alloc() because that may
+        * allocate new state.  We have our state already.
+        */
+       nmsg = hammer2_msg_alloc(iocom->router, 0, cmd, NULL, NULL);
+       if (state) {
+               if ((state->txcmd & HAMMER2_MSGF_CREATE) == 0)
+                       nmsg->any.head.cmd |= HAMMER2_MSGF_CREATE;
+       }
+       nmsg->any.head.error = error;
+       nmsg->state = state;
+       hammer2_msg_write(nmsg);
+}
+
+/*
+ * Similar to hammer2_msg_reply() but leave the transaction open.  That is,
+ * we are generating a streaming reply or an intermediate acknowledgement
+ * of some sort as part of the higher level protocol, with more to come
+ * later.
+ */
+void
+hammer2_msg_result(hammer2_msg_t *msg, uint32_t error)
+{
+       hammer2_iocom_t *iocom = msg->router->iocom;
+       hammer2_state_t *state = msg->state;
+       hammer2_msg_t *nmsg;
+       uint32_t cmd;
+
+
+       /*
+        * Reply with a simple error code and terminate the transaction.
+        */
+       cmd = HAMMER2_LNK_ERROR;
+
+       /*
+        * Check if our direction has even been initiated yet, set CREATE.
+        *
+        * Check what direction this is (command or reply direction).  Note
+        * that txcmd might not have been initiated yet.
+        *
+        * If our direction has already been closed we just return without
+        * doing anything.
+        */
+       if (state) {
+               if (state->txcmd & HAMMER2_MSGF_DELETE)
+                       return;
+               if (state->txcmd & HAMMER2_MSGF_REPLY)
+                       cmd |= HAMMER2_MSGF_REPLY;
+               /* continuing transaction, do not set MSGF_DELETE */
+       } else {
+               if ((msg->any.head.cmd & HAMMER2_MSGF_REPLY) == 0)
+                       cmd |= HAMMER2_MSGF_REPLY;
+       }
+
+       nmsg = hammer2_msg_alloc(iocom->router, 0, cmd, NULL, NULL);
+       if (state) {
+               if ((state->txcmd & HAMMER2_MSGF_CREATE) == 0)
+                       nmsg->any.head.cmd |= HAMMER2_MSGF_CREATE;
+       }
+       nmsg->any.head.error = error;
+       nmsg->state = state;
+       hammer2_msg_write(nmsg);
+}
+
+/*
+ * Terminate a transaction given a state structure by issuing a DELETE.
+ */
+void
+hammer2_state_reply(hammer2_state_t *state, uint32_t error)
+{
+       hammer2_msg_t *nmsg;
+       uint32_t cmd = HAMMER2_LNK_ERROR | HAMMER2_MSGF_DELETE;
+
+       /*
+        * Nothing to do if we already transmitted a delete
+        */
+       if (state->txcmd & HAMMER2_MSGF_DELETE)
+               return;
+
+       /*
+        * Set REPLY if the other end initiated the command.  Otherwise
+        * we are the command direction.
+        */
+       if (state->txcmd & HAMMER2_MSGF_REPLY)
+               cmd |= HAMMER2_MSGF_REPLY;
+
+       nmsg = hammer2_msg_alloc(state->iocom->router, 0, cmd, NULL, NULL);
+       if (state) {
+               if ((state->txcmd & HAMMER2_MSGF_CREATE) == 0)
+                       nmsg->any.head.cmd |= HAMMER2_MSGF_CREATE;
+       }
+       nmsg->any.head.error = error;
+       nmsg->state = state;
+       hammer2_msg_write(nmsg);
+}
+
+/************************************************************************
+ *                     TRANSACTION STATE HANDLING                      *
+ ************************************************************************
+ *
+ */
+
+/*
+ * Process state tracking for a message after reception, prior to
+ * execution.
+ *
+ * Called with msglk held and the msg dequeued.
+ *
+ * All messages are called with dummy state and return actual state.
+ * (One-off messages often just return the same dummy state).
+ *
+ * May request that caller discard the message by setting *discardp to 1.
+ * The returned state is not used in this case and is allowed to be NULL.
+ *
+ * --
+ *
+ * These routines handle persistent and command/reply message state via the
+ * CREATE and DELETE flags.  The first message in a command or reply sequence
+ * sets CREATE, the last message in a command or reply sequence sets DELETE.
+ *
+ * There can be any number of intermediate messages belonging to the same
+ * sequence sent inbetween the CREATE message and the DELETE message,
+ * which set neither flag.  This represents a streaming command or reply.
+ *
+ * Any command message received with CREATE set expects a reply sequence to
+ * be returned.  Reply sequences work the same as command sequences except the
+ * REPLY bit is also sent.  Both the command side and reply side can
+ * degenerate into a single message with both CREATE and DELETE set.  Note
+ * that one side can be streaming and the other side not, or neither, or both.
+ *
+ * The msgid is unique for the initiator.  That is, two sides sending a new
+ * message can use the same msgid without colliding.
+ *
+ * --
+ *
+ * ABORT sequences work by setting the ABORT flag along with normal message
+ * state.  However, ABORTs can also be sent on half-closed messages, that is
+ * even if the command or reply side has already sent a DELETE, as long as
+ * the message has not been fully closed it can still send an ABORT+DELETE
+ * to terminate the half-closed message state.
+ *
+ * Since ABORT+DELETEs can race we silently discard ABORT's for message
+ * state which has already been fully closed.  REPLY+ABORT+DELETEs can
+ * also race, and in this situation the other side might have already
+ * initiated a new unrelated command with the same message id.  Since
+ * the abort has not set the CREATE flag the situation can be detected
+ * and the message will also be discarded.
+ *
+ * Non-blocking requests can be initiated with ABORT+CREATE[+DELETE].
+ * The ABORT request is essentially integrated into the command instead
+ * of being sent later on.  In this situation the command implementation
+ * detects that CREATE and ABORT are both set (vs ABORT alone) and can
+ * special-case non-blocking operation for the command.
+ *
+ * NOTE!  Messages with ABORT set without CREATE or DELETE are considered
+ *       to be mid-stream aborts for command/reply sequences.  ABORTs on
+ *       one-way messages are not supported.
+ *
+ * NOTE!  If a command sequence does not support aborts the ABORT flag is
+ *       simply ignored.
+ *
+ * --
+ *
+ * One-off messages (no reply expected) are sent with neither CREATE or DELETE
+ * set.  One-off messages cannot be aborted and typically aren't processed
+ * by these routines.  The REPLY bit can be used to distinguish whether a
+ * one-off message is a command or reply.  For example, one-off replies
+ * will typically just contain status updates.
+ */
+static int
+hammer2_state_msgrx(hammer2_msg_t *msg)
+{
+       hammer2_iocom_t *iocom = msg->router->iocom;
+       hammer2_state_t *state;
+       hammer2_state_t dummy;
+       int error;
+
+       /*
+        * Lock RB tree and locate existing persistent state, if any.
+        *
+        * If received msg is a command state is on staterd_tree.
+        * If received msg is a reply state is on statewr_tree.
+        */
+       dummy.msgid = msg->any.head.msgid;
+       pthread_mutex_lock(&iocom->mtx);
+       if (msg->any.head.cmd & HAMMER2_MSGF_REPLY) {
+               state = RB_FIND(hammer2_state_tree,
+                               &iocom->router->statewr_tree, &dummy);
+       } else {
+               state = RB_FIND(hammer2_state_tree,
+                               &iocom->router->staterd_tree, &dummy);
+       }
+       msg->state = state;
+       pthread_mutex_unlock(&iocom->mtx);
+
+       /*
+        * Short-cut one-off or mid-stream messages (state may be NULL).
+        */
+       if ((msg->any.head.cmd & (HAMMER2_MSGF_CREATE | HAMMER2_MSGF_DELETE |
+                                 HAMMER2_MSGF_ABORT)) == 0) {
+               return(0);
+       }
+
+       /*
+        * Switch on CREATE, DELETE, REPLY, and also handle ABORT from
+        * inside the case statements.
+        */
+       switch(msg->any.head.cmd & (HAMMER2_MSGF_CREATE | HAMMER2_MSGF_DELETE |
+                                   HAMMER2_MSGF_REPLY)) {
+       case HAMMER2_MSGF_CREATE:
+       case HAMMER2_MSGF_CREATE | HAMMER2_MSGF_DELETE:
+               /*
+                * New persistant command received.
+                */
+               if (state) {
+                       fprintf(stderr, "duplicate-trans %s\n",
+                               hammer2_msg_str(msg));
+                       error = HAMMER2_IOQ_ERROR_TRANS;
+                       assert(0);
+                       break;
+               }
+               state = malloc(sizeof(*state));
+               bzero(state, sizeof(*state));
+               state->iocom = iocom;
+               state->flags = HAMMER2_STATE_DYNAMIC;
+               state->msg = msg;
+               state->txcmd = HAMMER2_MSGF_REPLY;
+               state->rxcmd = msg->any.head.cmd & ~HAMMER2_MSGF_DELETE;
+               state->flags |= HAMMER2_STATE_INSERTED;
+               state->msgid = msg->any.head.msgid;
+               state->router = msg->router;
+               msg->state = state;
+               pthread_mutex_lock(&iocom->mtx);
+               RB_INSERT(hammer2_state_tree,
+                         &iocom->router->staterd_tree, state);
+               pthread_mutex_unlock(&iocom->mtx);
+               error = 0;
+               if (DebugOpt) {
+                       fprintf(stderr, "create state %p id=%08x on iocom staterd %p\n",
+                               state, (uint32_t)state->msgid, iocom);
+               }
+               break;
+       case HAMMER2_MSGF_DELETE:
+               /*
+                * Persistent state is expected but might not exist if an
+                * ABORT+DELETE races the close.
+                */
+               if (state == NULL) {
+                       if (msg->any.head.cmd & HAMMER2_MSGF_ABORT) {
+                               error = HAMMER2_IOQ_ERROR_EALREADY;
+                       } else {
+                               fprintf(stderr, "missing-state %s\n",
+                                       hammer2_msg_str(msg));
+                               error = HAMMER2_IOQ_ERROR_TRANS;
+                       assert(0);
+                       }
+                       break;
+               }
+
+               /*
+                * Handle another ABORT+DELETE case if the msgid has already
+                * been reused.
+                */
+               if ((state->rxcmd & HAMMER2_MSGF_CREATE) == 0) {
+                       if (msg->any.head.cmd & HAMMER2_MSGF_ABORT) {
+                               error = HAMMER2_IOQ_ERROR_EALREADY;
+                       } else {
+                               fprintf(stderr, "reused-state %s\n",
+                                       hammer2_msg_str(msg));
+                               error = HAMMER2_IOQ_ERROR_TRANS;
+                       assert(0);
+                       }
+                       break;
+               }
+               error = 0;
+               break;
+       default:
+               /*
+                * Check for mid-stream ABORT command received, otherwise
+                * allow.
+                */
+               if (msg->any.head.cmd & HAMMER2_MSGF_ABORT) {
+                       if (state == NULL ||
+                           (state->rxcmd & HAMMER2_MSGF_CREATE) == 0) {
+                               error = HAMMER2_IOQ_ERROR_EALREADY;
+                               break;
+                       }
+               }
+               error = 0;
+               break;
+       case HAMMER2_MSGF_REPLY | HAMMER2_MSGF_CREATE:
+       case HAMMER2_MSGF_REPLY | HAMMER2_MSGF_CREATE | HAMMER2_MSGF_DELETE:
+               /*
+                * When receiving a reply with CREATE set the original
+                * persistent state message should already exist.
+                */
+               if (state == NULL) {
+                       fprintf(stderr, "no-state(r) %s\n",
+                               hammer2_msg_str(msg));
+                       error = HAMMER2_IOQ_ERROR_TRANS;
+                       assert(0);
+                       break;
+               }
+               assert(((state->rxcmd ^ msg->any.head.cmd) &
+                       HAMMER2_MSGF_REPLY) == 0);
+               state->rxcmd = msg->any.head.cmd & ~HAMMER2_MSGF_DELETE;
+               error = 0;
+               break;
+       case HAMMER2_MSGF_REPLY | HAMMER2_MSGF_DELETE:
+               /*
+                * Received REPLY+ABORT+DELETE in case where msgid has
+                * already been fully closed, ignore the message.
+                */
+               if (state == NULL) {
+                       if (msg->any.head.cmd & HAMMER2_MSGF_ABORT) {
+                               error = HAMMER2_IOQ_ERROR_EALREADY;
+                       } else {
+                               fprintf(stderr, "no-state(r,d) %s\n",
+                                       hammer2_msg_str(msg));
+                               error = HAMMER2_IOQ_ERROR_TRANS;
+                       assert(0);
+                       }
+                       break;
+               }
+
+               /*
+                * Received REPLY+ABORT+DELETE in case where msgid has
+                * already been reused for an unrelated message,
+                * ignore the message.
+                */
+               if ((state->rxcmd & HAMMER2_MSGF_CREATE) == 0) {
+                       if (msg->any.head.cmd & HAMMER2_MSGF_ABORT) {
+                               error = HAMMER2_IOQ_ERROR_EALREADY;
+                       } else {
+                               fprintf(stderr, "reused-state(r,d) %s\n",
+                                       hammer2_msg_str(msg));
+                               error = HAMMER2_IOQ_ERROR_TRANS;
+                       assert(0);
+                       }
+                       break;
+               }
+               error = 0;
+               break;
+       case HAMMER2_MSGF_REPLY:
+               /*
+                * Check for mid-stream ABORT reply received to sent command.
+                */
+               if (msg->any.head.cmd & HAMMER2_MSGF_ABORT) {
+                       if (state == NULL ||
+                           (state->rxcmd & HAMMER2_MSGF_CREATE) == 0) {
+                               error = HAMMER2_IOQ_ERROR_EALREADY;
+                               break;
+                       }
+               }
+               error = 0;
+               break;
+       }
+       return (error);
+}
+
+void
+hammer2_state_cleanuprx(hammer2_iocom_t *iocom, hammer2_msg_t *msg)
+{
+       hammer2_state_t *state;
+
+       if ((state = msg->state) == NULL) {
+               /*
+                * Free a non-transactional message, there is no state
+                * to worry about.
+                */
+               hammer2_msg_free(msg);
+       } else if (msg->any.head.cmd & HAMMER2_MSGF_DELETE) {
+               /*
+                * Message terminating transaction, destroy the related
+                * state, the original message, and this message (if it
+                * isn't the original message due to a CREATE|DELETE).
+                */
+               pthread_mutex_lock(&iocom->mtx);
+               state->rxcmd |= HAMMER2_MSGF_DELETE;
+               if (state->txcmd & HAMMER2_MSGF_DELETE) {
+                       if (state->msg == msg)
+                               state->msg = NULL;
+                       assert(state->flags & HAMMER2_STATE_INSERTED);
+                       if (state->rxcmd & HAMMER2_MSGF_REPLY) {
+                               assert(msg->any.head.cmd & HAMMER2_MSGF_REPLY);
+                               RB_REMOVE(hammer2_state_tree,
+                                         &iocom->router->statewr_tree, state);
+                       } else {
+                               assert((msg->any.head.cmd & HAMMER2_MSGF_REPLY) == 0);
+                               RB_REMOVE(hammer2_state_tree,
+                                         &iocom->router->staterd_tree, state);
+                       }
+                       state->flags &= ~HAMMER2_STATE_INSERTED;
+                       hammer2_state_free(state);
+               } else {
+                       ;
+               }
+               pthread_mutex_unlock(&iocom->mtx);
+               hammer2_msg_free(msg);
+       } else if (state->msg != msg) {
+               /*
+                * Message not terminating transaction, leave state intact
+                * and free message if it isn't the CREATE message.
+                */
+               hammer2_msg_free(msg);
+       }
+}
+
+static void
+hammer2_state_cleanuptx(hammer2_msg_t *msg)
+{
+       hammer2_iocom_t *iocom = msg->router->iocom;
+       hammer2_state_t *state;
+
+       if ((state = msg->state) == NULL) {
+               hammer2_msg_free(msg);
+       } else if (msg->any.head.cmd & HAMMER2_MSGF_DELETE) {
+               pthread_mutex_lock(&iocom->mtx);
+               state->txcmd |= HAMMER2_MSGF_DELETE;
+               if (state->rxcmd & HAMMER2_MSGF_DELETE) {
+                       if (state->msg == msg)
+                               state->msg = NULL;
+                       assert(state->flags & HAMMER2_STATE_INSERTED);
+                       if (state->txcmd & HAMMER2_MSGF_REPLY) {
+                               assert(msg->any.head.cmd & HAMMER2_MSGF_REPLY);
+                               RB_REMOVE(hammer2_state_tree,
+                                         &iocom->router->staterd_tree, state);
+                       } else {
+                               assert((msg->any.head.cmd & HAMMER2_MSGF_REPLY) == 0);
+                               RB_REMOVE(hammer2_state_tree,
+                                         &iocom->router->statewr_tree, state);
+                       }
+                       state->flags &= ~HAMMER2_STATE_INSERTED;
+                       hammer2_state_free(state);
+               } else {
+                       ;
+               }
+               pthread_mutex_unlock(&iocom->mtx);
+               hammer2_msg_free(msg);
+       } else if (state->msg != msg) {
+               hammer2_msg_free(msg);
+       }
+}
+
+/*
+ * Called with iocom locked
+ */
+void
+hammer2_state_free(hammer2_state_t *state)
+{
+       hammer2_iocom_t *iocom = state->iocom;
+       hammer2_msg_t *msg;
+       char dummy;
+
+       if (DebugOpt) {
+               fprintf(stderr, "terminate state %p id=%08x\n",
+                       state, (uint32_t)state->msgid);
+       }
+       assert(state->any.any == NULL);
+       msg = state->msg;
+       state->msg = NULL;
+       if (msg)
+               hammer2_msg_free_locked(msg);
+       free(state);
+
+       /*
+        * When an iocom error is present we are trying to close down the
+        * iocom, but we have to wait for all states to terminate before
+        * we can do so.  The iocom rx code will terminate the receive side
+        * for all transactions by simulating incoming DELETE messages,
+        * but the state doesn't go away until both sides are terminated.
+        *
+        * We may have to wake up the rx code.
+        */
+       if (iocom->ioq_rx.error &&
+           RB_EMPTY(&iocom->router->staterd_tree) &&
+           RB_EMPTY(&iocom->router->statewr_tree)) {
+               dummy = 0;
+               write(iocom->wakeupfds[1], &dummy, 1);
+       }
+}
+
+/************************************************************************
+ *                             ROUTING                                 *
+ ************************************************************************
+ *
+ * Incoming messages are routed by their spanid, matched up against
+ * outgoing LNK_SPANs managed by h2span_relay structures (see msg_lnk.c).
+ * Any replies run through the same router.
+ *
+ * Originated messages are routed by their spanid, matched up against
+ * incoming LNK_SPANs managed by h2span_link structures (see msg_lnk.c).
+ * Replies come back through the same route.
+ *
+ * Keep in mind that ALL MESSAGE TRAFFIC pertaining to a particular
+ * transaction runs through the same route.  Commands and replies both.
+ *
+ * An originated message will use a different routing spanid to
+ * reach a target node than a message which originates from that node.
+ * They might use the same physical pipes (each pipe can have multiple
+ * SPANs and RELAYs), but the routes are distinct from the perspective
+ * of the router.
+ */
+hammer2_router_t *
+hammer2_router_alloc(void)
+{
+       hammer2_router_t *router;
+
+       router = hammer2_alloc(sizeof(*router));
+       TAILQ_INIT(&router->txmsgq);
+       return (router);
+}
+
+void
+hammer2_router_connect(hammer2_router_t *router)
+{
+       hammer2_router_t *tmp;
+
+       assert(router->link || router->relay);
+       assert((router->flags & HAMMER2_ROUTER_CONNECTED) == 0);
+
+       pthread_mutex_lock(&router_mtx);
+       if (router->link)
+               tmp = RB_INSERT(hammer2_router_tree, &router_ltree, router);
+       else
+               tmp = RB_INSERT(hammer2_router_tree, &router_rtree, router);
+       assert(tmp == NULL);
+       router->flags |= HAMMER2_ROUTER_CONNECTED;
+       pthread_mutex_unlock(&router_mtx);
+}
+
+void
+hammer2_router_disconnect(hammer2_router_t **routerp)
+{
+       hammer2_router_t *router;
+
+       router = *routerp;
+       assert(router->link || router->relay);
+       assert(router->flags & HAMMER2_ROUTER_CONNECTED);
+
+       pthread_mutex_lock(&router_mtx);
+       if (router->link)
+               RB_REMOVE(hammer2_router_tree, &router_ltree, router);
+       else
+               RB_REMOVE(hammer2_router_tree, &router_rtree, router);
+       router->flags &= ~HAMMER2_ROUTER_CONNECTED;
+       *routerp = NULL;
+       pthread_mutex_unlock(&router_mtx);
+}
+
+#if 0
+/*
+ * XXX
+ */
+hammer2_router_t *
+hammer2_route_msg(hammer2_msg_t *msg)
+{
+}
+#endif
+
+/************************************************************************
+ *                             DEBUGGING                               *
+ ************************************************************************/
+
+const char *
+hammer2_basecmd_str(uint32_t cmd)
+{
+       static char buf[64];
+       char protobuf[32];
+       char cmdbuf[32];
+       const char *protostr;
+       const char *cmdstr;
+
+       switch(cmd & HAMMER2_MSGF_PROTOS) {
+       case HAMMER2_MSG_PROTO_LNK:
+               protostr = "LNK_";
+               break;
+       case HAMMER2_MSG_PROTO_DBG:
+               protostr = "DBG_";
+               break;
+       case HAMMER2_MSG_PROTO_DOM:
+               protostr = "DOM_";
+               break;
+       case HAMMER2_MSG_PROTO_CAC:
+               protostr = "CAC_";
+               break;
+       case HAMMER2_MSG_PROTO_QRM:
+               protostr = "QRM_";
+               break;
+       case HAMMER2_MSG_PROTO_BLK:
+               protostr = "BLK_";
+               break;
+       case HAMMER2_MSG_PROTO_VOP:
+               protostr = "VOP_";
+               break;
+       default:
+               snprintf(protobuf, sizeof(protobuf), "%x_",
+                       (cmd & HAMMER2_MSGF_PROTOS) >> 20);
+               protostr = protobuf;
+               break;
+       }
+
+       switch(cmd & (HAMMER2_MSGF_PROTOS |
+                     HAMMER2_MSGF_CMDS |
+                     HAMMER2_MSGF_SIZE)) {
+       case HAMMER2_LNK_PAD:
+               cmdstr = "PAD";
+               break;
+       case HAMMER2_LNK_PING:
+               cmdstr = "PING";
+               break;
+       case HAMMER2_LNK_AUTH:
+               cmdstr = "AUTH";
+               break;
+       case HAMMER2_LNK_CONN:
+               cmdstr = "CONN";
+               break;
+       case HAMMER2_LNK_SPAN:
+               cmdstr = "SPAN";
+               break;
+       case HAMMER2_LNK_VOLCONF:
+               cmdstr = "VOLCONF";
+               break;
+       case HAMMER2_LNK_ERROR:
+               if (cmd & HAMMER2_MSGF_DELETE)
+                       cmdstr = "RETURN";
+               else
+                       cmdstr = "RESULT";
+               break;
+       case HAMMER2_DBG_SHELL:
+               cmdstr = "SHELL";
+               break;
+       default:
+               snprintf(cmdbuf, sizeof(cmdbuf),
+                        "%06x", (cmd & (HAMMER2_MSGF_PROTOS |
+                                        HAMMER2_MSGF_CMDS |
+                                        HAMMER2_MSGF_SIZE)));
+               cmdstr = cmdbuf;
+               break;
+       }
+       snprintf(buf, sizeof(buf), "%s%s", protostr, cmdstr);
+       return (buf);
+}
+
+const char *
+hammer2_msg_str(hammer2_msg_t *msg)
+{
+       hammer2_state_t *state;
+       static char buf[256];
+       char errbuf[16];
+       char statebuf[64];
+       char flagbuf[64];
+       const char *statestr;
+       const char *errstr;
+       uint32_t basecmd;
+       int i;
+
+       /*
+        * Parse the state
+        */
+       if ((state = msg->state) != NULL) {
+               basecmd = (state->rxcmd & HAMMER2_MSGF_REPLY) ?
+                         state->txcmd : state->rxcmd;
+               snprintf(statebuf, sizeof(statebuf),
+                        " %s=%s,L=%s%s,R=%s%s",
+                        ((state->txcmd & HAMMER2_MSGF_REPLY) ?
+                               "rcvcmd" : "sndcmd"),
+                        hammer2_basecmd_str(basecmd),
+                        ((state->txcmd & HAMMER2_MSGF_CREATE) ? "C" : ""),
+                        ((state->txcmd & HAMMER2_MSGF_DELETE) ? "D" : ""),
+                        ((state->rxcmd & HAMMER2_MSGF_CREATE) ? "C" : ""),
+                        ((state->rxcmd & HAMMER2_MSGF_DELETE) ? "D" : "")
+               );
+               statestr = statebuf;
+       } else {
+               statestr = "";
+       }
+
+       /*
+        * Parse the error
+        */
+       switch(msg->any.head.error) {
+       case 0:
+               errstr = "";
+               break;
+       case HAMMER2_IOQ_ERROR_SYNC:
+               errstr = "err=IOQ:NOSYNC";
+               break;
+       case HAMMER2_IOQ_ERROR_EOF:
+               errstr = "err=IOQ:STREAMEOF";
+               break;
+       case HAMMER2_IOQ_ERROR_SOCK:
+               errstr = "err=IOQ:SOCKERR";
+               break;
+       case HAMMER2_IOQ_ERROR_FIELD:
+               errstr = "err=IOQ:BADFIELD";
+               break;
+       case HAMMER2_IOQ_ERROR_HCRC:
+               errstr = "err=IOQ:BADHCRC";
+               break;
+       case HAMMER2_IOQ_ERROR_XCRC:
+               errstr = "err=IOQ:BADXCRC";
+               break;
+       case HAMMER2_IOQ_ERROR_ACRC:
+               errstr = "err=IOQ:BADACRC";
+               break;
+       case HAMMER2_IOQ_ERROR_STATE:
+               errstr = "err=IOQ:BADSTATE";
+               break;
+       case HAMMER2_IOQ_ERROR_NOPEER:
+               errstr = "err=IOQ:PEERCONFIG";
+               break;
+       case HAMMER2_IOQ_ERROR_NORKEY:
+               errstr = "err=IOQ:BADRKEY";
+               break;
+       case HAMMER2_IOQ_ERROR_NOLKEY:
+               errstr = "err=IOQ:BADLKEY";
+               break;
+       case HAMMER2_IOQ_ERROR_KEYXCHGFAIL:
+               errstr = "err=IOQ:BADKEYXCHG";
+               break;
+       case HAMMER2_IOQ_ERROR_KEYFMT:
+               errstr = "err=IOQ:BADFMT";
+               break;
+       case HAMMER2_IOQ_ERROR_BADURANDOM:
+               errstr = "err=IOQ:BADRANDOM";
+               break;
+       case HAMMER2_IOQ_ERROR_MSGSEQ:
+               errstr = "err=IOQ:BADSEQ";
+               break;
+       case HAMMER2_IOQ_ERROR_EALREADY:
+               errstr = "err=IOQ:DUPMSG";
+               break;
+       case HAMMER2_IOQ_ERROR_TRANS:
+               errstr = "err=IOQ:BADTRANS";
+               break;
+       case HAMMER2_IOQ_ERROR_IVWRAP:
+               errstr = "err=IOQ:IVWRAP";
+               break;
+       case HAMMER2_IOQ_ERROR_MACFAIL:
+               errstr = "err=IOQ:MACFAIL";
+               break;
+       case HAMMER2_IOQ_ERROR_ALGO:
+               errstr = "err=IOQ:ALGOFAIL";
+               break;
+       case HAMMER2_MSG_ERR_NOSUPP:
+               errstr = "err=NOSUPPORT";
+               break;
+       default:
+               snprintf(errbuf, sizeof(errbuf),
+                        " err=%d", msg->any.head.error);
+               errstr = errbuf;
+               break;
+       }
+
+       /*
+        * Message flags
+        */
+       i = 0;
+       if (msg->any.head.cmd & (HAMMER2_MSGF_CREATE | HAMMER2_MSGF_DELETE |
+                                HAMMER2_MSGF_ABORT | HAMMER2_MSGF_REPLY)) {
+               flagbuf[i++] = '|';
+               if (msg->any.head.cmd & HAMMER2_MSGF_CREATE)
+                       flagbuf[i++] = 'C';
+               if (msg->any.head.cmd & HAMMER2_MSGF_DELETE)
+                       flagbuf[i++] = 'D';
+               if (msg->any.head.cmd & HAMMER2_MSGF_REPLY)
+                       flagbuf[i++] = 'R';
+               if (msg->any.head.cmd & HAMMER2_MSGF_ABORT)
+                       flagbuf[i++] = 'A';
+       }
+       flagbuf[i] = 0;
+
+       /*
+        * Generate the buf
+        */
+       snprintf(buf, sizeof(buf),
+               "msg=%s%s %s id=%08x src=%08x tgt=%08x %s",
+                hammer2_basecmd_str(msg->any.head.cmd),
+                flagbuf,
+                errstr,
+                (uint32_t)(intmax_t)msg->any.head.msgid,   /* for brevity */
+                (uint32_t)(intmax_t)msg->any.head.source,  /* for brevity */
+                (uint32_t)(intmax_t)msg->any.head.target,  /* for brevity */
+                statestr);
+
+       return(buf);
+}
diff --git a/sbin/hammer2/msg_lnk.c b/sbin/hammer2/msg_lnk.c
new file mode 100644 (file)
index 0000000..cb5afb7
--- /dev/null
@@ -0,0 +1,1257 @@
+/*
+ * Copyright (c) 2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * LNK_SPAN PROTOCOL SUPPORT FUNCTIONS
+ *
+ * This code supports the LNK_SPAN protocol.  Essentially all PFS's
+ * clients and services rendezvous with the userland hammer2 service and
+ * open LNK_SPAN transactions using a message header linkid of 0,
+ * registering any PFS's they have connectivity to with us.
+ *
+ * --
+ *
+ * Each registration maintains its own open LNK_SPAN message transaction.
+ * The SPANs are collected, aggregated, and retransmitted over available
+ * connections through the maintainance of additional LNK_SPAN message
+ * transactions on each link.
+ *
+ * The msgid for each active LNK_SPAN transaction we receive allows us to
+ * send a message to the target PFS (which might be one of many belonging
+ * to the same cluster), by specifying that msgid as the linkid in any
+ * message we send to the target PFS.
+ *
+ * Similarly the msgid we allocate for any LNK_SPAN transaction we transmit
+ * (and remember we will maintain multiple open LNK_SPAN transactions on
+ * each connection representing the topology span, so every node sees every
+ * other node as a separate open transaction).  So, similarly the msgid for
+ * these active transactions which we initiated can be used by the other
+ * end to route messages through us to another node, ultimately winding up
+ * at the identified hammer2 PFS.  We have to adjust the spanid in the message
+ * header at each hop to be representative of the outgoing LNK_SPAN we
+ * are forwarding the message through.
+ *
+ * --
+ *
+ * If we were to retransmit every LNK_SPAN transaction we receive it would
+ * create a huge mess, so we have to aggregate all received LNK_SPAN
+ * transactions, sort them by the fsid (the cluster) and sub-sort them by
+ * the pfs_fsid (individual nodes in the cluster), and only retransmit
+ * (create outgoing transactions) for a subset of the nearest distance-hops
+ * for each individual node.
+ *
+ * The higher level protocols can then issue transactions to the nodes making
+ * up a cluster to perform all actions required.
+ *
+ * --
+ *
+ * Since this is a large topology and a spanning tree protocol, links can
+ * go up and down all the time.  Any time a link goes down its transaction
+ * is closed.  The transaction has to be closed on both ends before we can
+ * delete (and potentially reuse) the related spanid.  The LNK_SPAN being
+ * closed may have been propagated out to other connections and those related
+ * LNK_SPANs are also closed.  Ultimately all routes via the lost LNK_SPAN
+ * go away, ultimately reaching all sources and all targets.
+ *
+ * Any messages in-transit using a route that goes away will be thrown away.
+ * Open transactions are only tracked at the two end-points.  When a link
+ * failure propagates to an end-point the related open transactions lose
+ * their spanid and are automatically aborted.
+ *
+ * It is important to note that internal route nodes cannot just associate
+ * a lost LNK_SPAN transaction with another route to the same destination.
+ * Message transactions MUST be serialized and MUST be ordered.  All messages
+ * for a transaction must run over the same route.  So if the route used by
+ * an active transaction is lost, the related messages will be fully aborted
+ * and the higher protocol levels will retry as appropriate.
+ *
+ * FULLY ABORTING A ROUTED MESSAGE is handled via link-failure propagation
+ * back to the originator.  Only the originator keeps tracks of a message.
+ * Routers just pass it through.  If a route is lost during transit the
+ * message is simply thrown away.
+ *
+ * It is also important to note that several paths to the same PFS can be
+ * propagated along the same link, which allows concurrency and even
+ * redundancy over several network interfaces or via different routes through
+ * the topology.  Any given transaction will use only a single route but busy
+ * servers will often have hundreds of transactions active simultaniously,
+ * so having multiple active paths through the network topology for A<->B
+ * will improve performance.
+ *
+ * --
+ *
+ * Most protocols consolidate operations rather than simply relaying them.
+ * This is particularly true of LEAF protocols (such as strict HAMMER2
+ * clients), of which there can be millions connecting into the cluster at
+ * various points.  The SPAN protocol is not used for these LEAF elements.
+ *
+ * Instead the primary service they connect to implements a proxy for the
+ * client protocols so the core topology only has to propagate a couple of
+ * LNK_SPANs and not millions.  LNK_SPANs are meant to be used only for
+ * core master nodes and satellite slaves and cache nodes.
+ */
+
+#include "hammer2.h"
+
+/*
+ * Maximum spanning tree distance.  This has the practical effect of
+ * stopping tail-chasing closed loops when a feeder span is lost.
+ */
+#define HAMMER2_SPAN_MAXDIST   16
+
+/*
+ * RED-BLACK TREE DEFINITIONS
+ *
+ * We need to track:
+ *
+ * (1) shared fsid's (a cluster).
+ * (2) unique fsid's (a node in a cluster) <--- LNK_SPAN transactions.
+ *
+ * We need to aggegate all active LNK_SPANs, aggregate, and create our own
+ * outgoing LNK_SPAN transactions on each of our connections representing
+ * the aggregated state.
+ *
+ * h2span_connect      - list of iocom connections who wish to receive SPAN
+ *                       propagation from other connections.  Might contain
+ *                       a filter string.  Only iocom's with an open
+ *                       LNK_CONN transactions are applicable for SPAN
+ *                       propagation.
+ *
+ * h2span_relay                - List of links relayed (via SPAN).  Essentially
+ *                       each relay structure represents a LNK_SPAN
+ *                       transaction that we initiated, verses h2span_link
+ *                       which is a LNK_SPAN transaction that we received.
+ *
+ * --
+ *
+ * h2span_cluster      - Organizes the shared fsid's.  One structure for
+ *                       each cluster.
+ *
+ * h2span_node         - Organizes the nodes in a cluster.  One structure
+ *                       for each unique {cluster,node}, aka {fsid, pfs_fsid}.
+ *
+ * h2span_link         - Organizes all incoming and outgoing LNK_SPAN message
+ *                       transactions related to a node.
+ *
+ *                       One h2span_link structure for each incoming LNK_SPAN
+ *                       transaction.  Links selected for propagation back
+ *                       out are also where the outgoing LNK_SPAN messages
+ *                       are indexed into (so we can propagate changes).
+ *
+ *                       The h2span_link's use a red-black tree to sort the
+ *                       distance hop metric for the incoming LNK_SPAN.  We
+ *                       then select the top N for outgoing.  When the
+ *                       topology changes the top N may also change and cause
+ *                       new outgoing LNK_SPAN transactions to be opened
+ *                       and less desireable ones to be closed, causing
+ *                       transactional aborts within the message flow in
+ *                       the process.
+ *
+ * Also note           - All outgoing LNK_SPAN message transactions are also
+ *                       entered into a red-black tree for use by the routing
+ *                       function.  This is handled by msg.c in the state
+ *                       code, not here.
+ */
+
+struct h2span_link;
+struct h2span_relay;
+TAILQ_HEAD(h2span_media_queue, h2span_media);
+TAILQ_HEAD(h2span_connect_queue, h2span_connect);
+TAILQ_HEAD(h2span_relay_queue, h2span_relay);
+
+RB_HEAD(h2span_cluster_tree, h2span_cluster);
+RB_HEAD(h2span_node_tree, h2span_node);
+RB_HEAD(h2span_link_tree, h2span_link);
+RB_HEAD(h2span_relay_tree, h2span_relay);
+
+/*
+ * This represents a media
+ */
+struct h2span_media {
+       TAILQ_ENTRY(h2span_media) entry;
+       uuid_t  mediaid;
+       int     refs;
+       struct h2span_media_config {
+               hammer2_copy_data_t     copy_run;
+               hammer2_copy_data_t     copy_pend;
+               pthread_t               thread;
+               pthread_cond_t          cond;
+               int                     ctl;
+               int                     fd;
+               hammer2_iocom_t         iocom;
+               pthread_t               iocom_thread;
+               enum { H2MC_STOPPED, H2MC_CONNECT, H2MC_RUNNING } state;
+       } config[HAMMER2_COPYID_COUNT];
+};
+
+typedef struct h2span_media_config h2span_media_config_t;
+
+#define H2CONFCTL_STOP         0x00000001
+#define H2CONFCTL_UPDATE       0x00000002
+
+/*
+ * Received LNK_CONN transaction enables SPAN protocol over connection.
+ * (may contain filter).  Typically one for each mount and several may
+ * share the same media.
+ */
+struct h2span_connect {
+       TAILQ_ENTRY(h2span_connect) entry;
+       struct h2span_relay_tree tree;
+       struct h2span_media *media;
+       hammer2_state_t *state;
+};
+
+/*
+ * All received LNK_SPANs are organized by cluster (pfs_clid),
+ * node (pfs_fsid), and link (received LNK_SPAN transaction).
+ */
+struct h2span_cluster {
+       RB_ENTRY(h2span_cluster) rbnode;
+       struct h2span_node_tree tree;
+       uuid_t  pfs_clid;               /* shared fsid */
+       int     refs;                   /* prevents destruction */
+};
+
+struct h2span_node {
+       RB_ENTRY(h2span_node) rbnode;
+       struct h2span_link_tree tree;
+       struct h2span_cluster *cls;
+       uuid_t  pfs_fsid;               /* unique fsid */
+       char label[64];
+};
+
+struct h2span_link {
+       RB_ENTRY(h2span_link) rbnode;
+       hammer2_state_t *state;         /* state<->link */
+       struct h2span_node *node;       /* related node */
+       int32_t dist;
+       struct h2span_relay_queue relayq; /* relay out */
+       struct hammer2_router *router;  /* route out this link */
+};
+
+/*
+ * Any LNK_SPAN transactions we receive which are relayed out other
+ * connections utilize this structure to track the LNK_SPAN transaction
+ * we initiate on the other connections, if selected for relay.
+ *
+ * In many respects this is the core of the protocol... actually figuring
+ * out what LNK_SPANs to relay.  The spanid used for relaying is the
+ * address of the 'state' structure, which is why h2span_relay has to
+ * be entered into a RB-TREE based at h2span_connect (so we can look
+ * up the spanid to validate it).
+ *
+ * NOTE: Messages can be received via the LNK_SPAN transaction the
+ *      relay maintains, and can be replied via relay->router, but
+ *      messages are NOT initiated via a relay.  Messages are initiated
+ *      via incoming links (h2span_link's).
+ *
+ *      relay->link represents the link being relayed, NOT the LNK_SPAN
+ *      transaction the relay is holding open.
+ */
+struct h2span_relay {
+       RB_ENTRY(h2span_relay) rbnode;  /* from h2span_connect */
+       TAILQ_ENTRY(h2span_relay) entry; /* from link */
+       struct h2span_connect *conn;
+       hammer2_state_t *state;         /* transmitted LNK_SPAN */
+       struct h2span_link *link;       /* LNK_SPAN being relayed */
+       struct hammer2_router   *router;/* route out this relay */
+};
+
+
+typedef struct h2span_media h2span_media_t;
+typedef struct h2span_connect h2span_connect_t;
+typedef struct h2span_cluster h2span_cluster_t;
+typedef struct h2span_node h2span_node_t;
+typedef struct h2span_link h2span_link_t;
+typedef struct h2span_relay h2span_relay_t;
+
+static
+int
+h2span_cluster_cmp(h2span_cluster_t *cls1, h2span_cluster_t *cls2)
+{
+       return(uuid_compare(&cls1->pfs_clid, &cls2->pfs_clid, NULL));
+}
+
+static
+int
+h2span_node_cmp(h2span_node_t *node1, h2span_node_t *node2)
+{
+       return(uuid_compare(&node1->pfs_fsid, &node2->pfs_fsid, NULL));
+}
+
+/*
+ * Sort/subsort must match h2span_relay_cmp() under any given node
+ * to make the aggregation algorithm easier, so the best links are
+ * in the same sorted order as the best relays.
+ *
+ * NOTE: We cannot use link*->state->msgid because this msgid is created
+ *      by each remote host and thus might wind up being the same.
+ */
+static
+int
+h2span_link_cmp(h2span_link_t *link1, h2span_link_t *link2)
+{
+       if (link1->dist < link2->dist)
+               return(-1);
+       if (link1->dist > link2->dist)
+               return(1);
+#if 1
+       if ((uintptr_t)link1->state < (uintptr_t)link2->state)
+               return(-1);
+       if ((uintptr_t)link1->state > (uintptr_t)link2->state)
+               return(1);
+#else
+       if (link1->state->msgid < link2->state->msgid)
+               return(-1);
+       if (link1->state->msgid > link2->state->msgid)
+               return(1);
+#endif
+       return(0);
+}
+
+/*
+ * Relay entries are sorted by node, subsorted by distance and link
+ * address (so we can match up the conn->tree relay topology with
+ * a node's link topology).
+ */
+static
+int
+h2span_relay_cmp(h2span_relay_t *relay1, h2span_relay_t *relay2)
+{
+       h2span_link_t *link1 = relay1->link;
+       h2span_link_t *link2 = relay2->link;
+
+       if ((intptr_t)link1->node < (intptr_t)link2->node)
+               return(-1);
+       if ((intptr_t)link1->node > (intptr_t)link2->node)
+               return(1);
+       if (link1->dist < link2->dist)
+               return(-1);
+       if (link1->dist > link2->dist)
+               return(1);
+#if 1
+       if ((uintptr_t)link1->state < (uintptr_t)link2->state)
+               return(-1);
+       if ((uintptr_t)link1->state > (uintptr_t)link2->state)
+               return(1);
+#else
+       if (link1->state->msgid < link2->state->msgid)
+               return(-1);
+       if (link1->state->msgid > link2->state->msgid)
+               return(1);
+#endif
+       return(0);
+}
+
+RB_PROTOTYPE_STATIC(h2span_cluster_tree, h2span_cluster,
+            rbnode, h2span_cluster_cmp);
+RB_PROTOTYPE_STATIC(h2span_node_tree, h2span_node,
+            rbnode, h2span_node_cmp);
+RB_PROTOTYPE_STATIC(h2span_link_tree, h2span_link,
+            rbnode, h2span_link_cmp);
+RB_PROTOTYPE_STATIC(h2span_relay_tree, h2span_relay,
+            rbnode, h2span_relay_cmp);
+
+RB_GENERATE_STATIC(h2span_cluster_tree, h2span_cluster,
+            rbnode, h2span_cluster_cmp);
+RB_GENERATE_STATIC(h2span_node_tree, h2span_node,
+            rbnode, h2span_node_cmp);
+RB_GENERATE_STATIC(h2span_link_tree, h2span_link,
+            rbnode, h2span_link_cmp);
+RB_GENERATE_STATIC(h2span_relay_tree, h2span_relay,
+            rbnode, h2span_relay_cmp);
+
+/*
+ * Global mutex protects cluster_tree lookups, connq, mediaq.
+ */
+static pthread_mutex_t cluster_mtx;
+static struct h2span_cluster_tree cluster_tree = RB_INITIALIZER(cluster_tree);
+static struct h2span_connect_queue connq = TAILQ_HEAD_INITIALIZER(connq);
+static struct h2span_media_queue mediaq = TAILQ_HEAD_INITIALIZER(mediaq);
+
+static void hammer2_lnk_span(hammer2_msg_t *msg);
+static void hammer2_lnk_conn(hammer2_msg_t *msg);
+static void hammer2_lnk_relay(hammer2_msg_t *msg);
+static void hammer2_relay_scan(h2span_connect_t *conn, h2span_node_t *node);
+static void hammer2_relay_delete(h2span_relay_t *relay);
+
+static void *hammer2_volconf_thread(void *info);
+static void hammer2_volconf_stop(h2span_media_config_t *conf);
+static void hammer2_volconf_start(h2span_media_config_t *conf,
+                               const char *hostname);
+
+void
+hammer2_msg_lnk_signal(hammer2_router_t *router __unused)
+{
+       pthread_mutex_lock(&cluster_mtx);
+       hammer2_relay_scan(NULL, NULL);
+       pthread_mutex_unlock(&cluster_mtx);
+}
+
+/*
+ * Receive a HAMMER2_MSG_PROTO_LNK message.  This only called for
+ * one-way and opening-transactions since state->func will be assigned
+ * in all other cases.
+ */
+void
+hammer2_msg_lnk(hammer2_msg_t *msg)
+{
+       switch(msg->any.head.cmd & HAMMER2_MSGF_BASECMDMASK) {
+       case HAMMER2_LNK_CONN:
+               hammer2_lnk_conn(msg);
+               break;
+       case HAMMER2_LNK_SPAN:
+               hammer2_lnk_span(msg);
+               break;
+       default:
+               fprintf(stderr,
+                       "MSG_PROTO_LNK: Unknown msg %08x\n", msg->any.head.cmd);
+               hammer2_msg_reply(msg, HAMMER2_MSG_ERR_NOSUPP);
+               /* state invalid after reply */
+               break;
+       }
+}
+
+void
+hammer2_lnk_conn(hammer2_msg_t *msg)
+{
+       hammer2_state_t *state = msg->state;
+       h2span_media_t *media;
+       h2span_media_config_t *conf;
+       h2span_connect_t *conn;
+       h2span_relay_t *relay;
+       char *alloc = NULL;
+       int i;
+
+       pthread_mutex_lock(&cluster_mtx);
+
+       switch(msg->any.head.cmd & HAMMER2_MSGF_TRANSMASK) {
+       case HAMMER2_LNK_CONN | HAMMER2_MSGF_CREATE:
+       case HAMMER2_LNK_CONN | HAMMER2_MSGF_CREATE | HAMMER2_MSGF_DELETE:
+               /*
+                * On transaction start we allocate a new h2span_connect and
+                * acknowledge the request, leaving the transaction open.
+                * We then relay priority-selected SPANs.
+                */
+               fprintf(stderr, "LNK_CONN(%08x): %s/%s\n",
+                       (uint32_t)msg->any.head.msgid,
+                       hammer2_uuid_to_str(&msg->any.lnk_conn.pfs_clid,
+                                           &alloc),
+                       msg->any.lnk_conn.label);
+               free(alloc);
+
+               conn = hammer2_alloc(sizeof(*conn));
+
+               RB_INIT(&conn->tree);
+               conn->state = state;
+               state->func = hammer2_lnk_conn;
+               state->any.conn = conn;
+               TAILQ_INSERT_TAIL(&connq, conn, entry);
+
+               /*
+                * Set up media
+                */
+               TAILQ_FOREACH(media, &mediaq, entry) {
+                       if (uuid_compare(&msg->any.lnk_conn.mediaid,
+                                        &media->mediaid, NULL) == 0) {
+                               break;
+                       }
+               }
+               if (media == NULL) {
+                       media = hammer2_alloc(sizeof(*media));
+                       media->mediaid = msg->any.lnk_conn.mediaid;
+                       TAILQ_INSERT_TAIL(&mediaq, media, entry);
+               }
+               conn->media = media;
+               ++media->refs;
+
+               if ((msg->any.head.cmd & HAMMER2_MSGF_DELETE) == 0) {
+                       hammer2_msg_result(msg, 0);
+                       hammer2_router_signal(msg->router);
+                       break;
+               }
+               /* FALL THROUGH */
+       case HAMMER2_LNK_CONN | HAMMER2_MSGF_DELETE:
+       case HAMMER2_LNK_ERROR | HAMMER2_MSGF_DELETE:
+deleteconn:
+               /*
+                * On transaction terminate we clean out our h2span_connect
+                * and acknowledge the request, closing the transaction.
+                */
+               fprintf(stderr, "LNK_CONN: Terminated\n");
+               conn = state->any.conn;
+               assert(conn);
+
+               /*
+                * Clean out the media structure. If refs drops to zero we
+                * also clean out the media config threads.  These threads
+                * maintain span connections to other hammer2 service daemons.
+                */
+               media = conn->media;
+               if (--media->refs == 0) {
+                       fprintf(stderr, "Shutting down media spans\n");
+                       for (i = 0; i < HAMMER2_COPYID_COUNT; ++i) {
+                               conf = &media->config[i];
+
+                               if (conf->thread == NULL)
+                                       continue;
+                               conf->ctl = H2CONFCTL_STOP;
+                               pthread_cond_signal(&conf->cond);
+                       }
+                       for (i = 0; i < HAMMER2_COPYID_COUNT; ++i) {
+                               conf = &media->config[i];
+
+                               if (conf->thread == NULL)
+                                       continue;
+                               pthread_mutex_unlock(&cluster_mtx);
+                               pthread_join(conf->thread, NULL);
+                               pthread_mutex_lock(&cluster_mtx);
+                               conf->thread = NULL;
+                               pthread_cond_destroy(&conf->cond);
+                       }
+                       fprintf(stderr, "Media shutdown complete\n");
+                       TAILQ_REMOVE(&mediaq, media, entry);
+                       hammer2_free(media);
+               }
+
+               /*
+                * Clean out all relays.  This requires terminating each
+                * relay transaction.
+                */
+               while ((relay = RB_ROOT(&conn->tree)) != NULL) {
+                       hammer2_relay_delete(relay);
+               }
+
+               /*
+                * Clean out conn
+                */
+               conn->media = NULL;
+               conn->state = NULL;
+               msg->state->any.conn = NULL;
+               TAILQ_REMOVE(&connq, conn, entry);
+               hammer2_free(conn);
+
+               hammer2_msg_reply(msg, 0);
+               /* state invalid after reply */
+               break;
+       case HAMMER2_LNK_VOLCONF:
+               /*
+                * One-way volume-configuration message is transmitted
+                * over the open LNK_CONN transaction.
+                */
+               fprintf(stderr, "RECEIVED VOLCONF\n");
+               if (msg->any.lnk_volconf.index < 0 ||
+                   msg->any.lnk_volconf.index >= HAMMER2_COPYID_COUNT) {
+                       fprintf(stderr, "VOLCONF: ILLEGAL INDEX %d\n",
+                               msg->any.lnk_volconf.index);
+                       break;
+               }
+               if (msg->any.lnk_volconf.copy.path[sizeof(msg->any.lnk_volconf.copy.path) - 1] != 0 ||
+                   msg->any.lnk_volconf.copy.path[0] == 0) {
+                       fprintf(stderr, "VOLCONF: ILLEGAL PATH %d\n",
+                               msg->any.lnk_volconf.index);
+                       break;
+               }
+               conn = msg->state->any.conn;
+               if (conn == NULL) {
+                       fprintf(stderr, "VOLCONF: LNK_CONN is missing\n");
+                       break;
+               }
+               conf = &conn->media->config[msg->any.lnk_volconf.index];
+               conf->copy_pend = msg->any.lnk_volconf.copy;
+               conf->ctl |= H2CONFCTL_UPDATE;
+               if (conf->thread == NULL) {
+                       fprintf(stderr, "VOLCONF THREAD STARTED\n");
+                       pthread_cond_init(&conf->cond, NULL);
+                       pthread_create(&conf->thread, NULL,
+                                      hammer2_volconf_thread, (void *)conf);
+               }
+               pthread_cond_signal(&conf->cond);
+               break;
+       default:
+               /*
+                * Failsafe
+                */
+               if (msg->any.head.cmd & HAMMER2_MSGF_DELETE)
+                       goto deleteconn;
+               hammer2_msg_reply(msg, HAMMER2_MSG_ERR_NOSUPP);
+               break;
+       }
+       pthread_mutex_unlock(&cluster_mtx);
+}
+
+void
+hammer2_lnk_span(hammer2_msg_t *msg)
+{
+       hammer2_state_t *state = msg->state;
+       h2span_cluster_t dummy_cls;
+       h2span_node_t dummy_node;
+       h2span_cluster_t *cls;
+       h2span_node_t *node;
+       h2span_link_t *slink;
+       h2span_relay_t *relay;
+       char *alloc = NULL;
+
+       assert((msg->any.head.cmd & HAMMER2_MSGF_REPLY) == 0);
+
+       pthread_mutex_lock(&cluster_mtx);
+
+       /*
+        * On transaction start we initialize the tracking infrastructure
+        */
+       if (msg->any.head.cmd & HAMMER2_MSGF_CREATE) {
+               assert(state->func == NULL);
+               state->func = hammer2_lnk_span;
+
+               msg->any.lnk_span.label[sizeof(msg->any.lnk_span.label)-1] = 0;
+
+               /*
+                * Find the cluster
+                */
+               dummy_cls.pfs_clid = msg->any.lnk_span.pfs_clid;
+               cls = RB_FIND(h2span_cluster_tree, &cluster_tree, &dummy_cls);
+               if (cls == NULL) {
+                       cls = hammer2_alloc(sizeof(*cls));
+                       cls->pfs_clid = msg->any.lnk_span.pfs_clid;
+                       RB_INIT(&cls->tree);
+                       RB_INSERT(h2span_cluster_tree, &cluster_tree, cls);
+               }
+
+               /*
+                * Find the node
+                */
+               dummy_node.pfs_fsid = msg->any.lnk_span.pfs_fsid;
+               node = RB_FIND(h2span_node_tree, &cls->tree, &dummy_node);
+               if (node == NULL) {
+                       node = hammer2_alloc(sizeof(*node));
+                       node->pfs_fsid = msg->any.lnk_span.pfs_fsid;
+                       node->cls = cls;
+                       RB_INIT(&node->tree);
+                       RB_INSERT(h2span_node_tree, &cls->tree, node);
+                       snprintf(node->label, sizeof(node->label),
+                                "%s", msg->any.lnk_span.label);
+               }
+
+               /*
+                * Create the link
+                */
+               assert(state->any.link == NULL);
+               slink = hammer2_alloc(sizeof(*slink));
+               TAILQ_INIT(&slink->relayq);
+               slink->node = node;
+               slink->dist = msg->any.lnk_span.dist;
+               slink->state = state;
+               state->any.link = slink;
+
+               /*
+                * Embedded router structure in link for message forwarding.
+                *
+                * The spanning id for the router is the message id of
+                * the SPAN link it is embedded in, allowing messages to
+                * be routed via &slink->router.
+                */
+               slink->router = hammer2_router_alloc();
+               slink->router->iocom = state->iocom;
+               slink->router->link = slink;
+               slink->router->target = state->msgid;
+               hammer2_router_connect(slink->router);
+
+               RB_INSERT(h2span_link_tree, &node->tree, slink);
+
+               fprintf(stderr, "LNK_SPAN(thr %p): %p %s/%s dist=%d\n",
+                       msg->router->iocom,
+                       slink,
+                       hammer2_uuid_to_str(&msg->any.lnk_span.pfs_clid,
+                                           &alloc),
+                       msg->any.lnk_span.label,
+                       msg->any.lnk_span.dist);
+               free(alloc);
+#if 0
+               hammer2_relay_scan(NULL, node);
+#endif
+               hammer2_router_signal(msg->router);
+       }
+
+       /*
+        * On transaction terminate we remove the tracking infrastructure.
+        */
+       if (msg->any.head.cmd & HAMMER2_MSGF_DELETE) {
+               slink = state->any.link;
+               assert(slink != NULL);
+               node = slink->node;
+               cls = node->cls;
+
+               fprintf(stderr, "LNK_DELE(thr %p): %p %s/%s dist=%d\n",
+                       msg->router->iocom,
+                       slink,
+                       hammer2_uuid_to_str(&cls->pfs_clid, &alloc),
+                       state->msg->any.lnk_span.label,
+                       state->msg->any.lnk_span.dist);
+               free(alloc);
+
+               /*
+                * Remove the router from consideration
+                */
+               hammer2_router_disconnect(&slink->router);
+
+               /*
+                * Clean out all relays.  This requires terminating each
+                * relay transaction.
+                */
+               while ((relay = TAILQ_FIRST(&slink->relayq)) != NULL) {
+                       hammer2_relay_delete(relay);
+               }
+
+               /*
+                * Clean out the topology
+                */
+               RB_REMOVE(h2span_link_tree, &node->tree, slink);
+               if (RB_EMPTY(&node->tree)) {
+                       RB_REMOVE(h2span_node_tree, &cls->tree, node);
+                       if (RB_EMPTY(&cls->tree) && cls->refs == 0) {
+                               RB_REMOVE(h2span_cluster_tree,
+                                         &cluster_tree, cls);
+                               hammer2_free(cls);
+                       }
+                       node->cls = NULL;
+                       hammer2_free(node);
+                       node = NULL;
+               }
+               state->any.link = NULL;
+               slink->state = NULL;
+               slink->node = NULL;
+               hammer2_free(slink);
+
+               /*
+                * We have to terminate the transaction
+                */
+               hammer2_state_reply(state, 0);
+               /* state invalid after reply */
+
+               /*
+                * If the node still exists issue any required updates.  If
+                * it doesn't then all related relays have already been
+                * removed and there's nothing left to do.
+                */
+#if 0
+               if (node)
+                       hammer2_relay_scan(NULL, node);
+#endif
+               if (node)
+                       hammer2_router_signal(msg->router);
+       }
+
+       pthread_mutex_unlock(&cluster_mtx);
+}
+
+/*
+ * Messages received on relay SPANs.  These are open transactions so it is
+ * in fact possible for the other end to close the transaction.
+ *
+ * XXX MPRACE on state structure
+ */
+static void
+hammer2_lnk_relay(hammer2_msg_t *msg)
+{
+       hammer2_state_t *state = msg->state;
+       h2span_relay_t *relay;
+
+       assert(msg->any.head.cmd & HAMMER2_MSGF_REPLY);
+
+       if (msg->any.head.cmd & HAMMER2_MSGF_DELETE) {
+               pthread_mutex_lock(&cluster_mtx);
+               if ((relay = state->any.relay) != NULL) {
+                       hammer2_relay_delete(relay);
+               } else {
+                       hammer2_state_reply(state, 0);
+               }
+               pthread_mutex_unlock(&cluster_mtx);
+       }
+}
+
+/*
+ * Update relay transactions for SPANs.
+ *
+ * Called with cluster_mtx held.
+ */
+static void hammer2_relay_scan_specific(h2span_node_t *node,
+                                       h2span_connect_t *conn);
+
+static void
+hammer2_relay_scan(h2span_connect_t *conn, h2span_node_t *node)
+{
+       h2span_cluster_t *cls;
+
+       if (node) {
+               /*
+                * Iterate specific node
+                */
+               TAILQ_FOREACH(conn, &connq, entry)
+                       hammer2_relay_scan_specific(node, conn);
+       } else {
+               /*
+                * Full iteration.
+                *
+                * Iterate cluster ids, nodes, and either a specific connection
+                * or all connections.
+                */
+               RB_FOREACH(cls, h2span_cluster_tree, &cluster_tree) {
+                       /*
+                        * Iterate node ids
+                        */
+                       RB_FOREACH(node, h2span_node_tree, &cls->tree) {
+                               /*
+                                * Synchronize the node's link (received SPANs)
+                                * with each connection's relays.
+                                */
+                               if (conn) {
+                                       hammer2_relay_scan_specific(node, conn);
+                               } else {
+                                       TAILQ_FOREACH(conn, &connq, entry) {
+                                           hammer2_relay_scan_specific(node,
+                                                                       conn);
+                                       }
+                                       assert(conn == NULL);
+                               }
+                       }
+               }
+       }
+}
+
+/*
+ * Update the relay'd SPANs for this (node, conn).
+ *
+ * Iterate links and adjust relays to match.  We only propagate the top link
+ * for now (XXX we want to propagate the top two).
+ *
+ * The hammer2_relay_scan_cmp() function locates the first relay element
+ * for any given node.  The relay elements will be sub-sorted by dist.
+ */
+struct relay_scan_info {
+       h2span_node_t *node;
+       h2span_relay_t *relay;
+};
+
+static int
+hammer2_relay_scan_cmp(h2span_relay_t *relay, void *arg)
+{
+       struct relay_scan_info *info = arg;
+
+       if ((intptr_t)relay->link->node < (intptr_t)info->node)
+               return(-1);
+       if ((intptr_t)relay->link->node > (intptr_t)info->node)
+               return(1);
+       return(0);
+}
+
+static int
+hammer2_relay_scan_callback(h2span_relay_t *relay, void *arg)
+{
+       struct relay_scan_info *info = arg;
+
+       info->relay = relay;
+       return(-1);
+}
+
+static void
+hammer2_relay_scan_specific(h2span_node_t *node, h2span_connect_t *conn)
+{
+       struct relay_scan_info info;
+       h2span_relay_t *relay;
+       h2span_relay_t *next_relay;
+       h2span_link_t *slink;
+       int count = 2;
+
+       info.node = node;
+       info.relay = NULL;
+
+       /*
+        * Locate the first related relay for the node on this connection.
+        * relay will be NULL if there were none.
+        */
+       RB_SCAN(h2span_relay_tree, &conn->tree,
+               hammer2_relay_scan_cmp, hammer2_relay_scan_callback, &info);
+       relay = info.relay;
+       info.relay = NULL;
+       if (relay)
+               assert(relay->link->node == node);
+
+       if (DebugOpt > 8)
+               fprintf(stderr, "relay scan for connection %p\n", conn);
+
+       /*
+        * Iterate the node's links (received SPANs) in distance order,
+        * lowest (best) dist first.
+        */
+       /* fprintf(stderr, "LOOP\n"); */
+       RB_FOREACH(slink, h2span_link_tree, &node->tree) {
+               /*
+               fprintf(stderr, "SLINK %p RELAY %p(%p)\n",
+                       slink, relay, relay ? relay->link : NULL);
+               */
+               /*
+                * PROPAGATE THE BEST LINKS OVER THE SPECIFIED CONNECTION.
+                *
+                * Track relays while iterating the best links and construct
+                * missing relays when necessary.
+                *
+                * (If some prior better link was removed it would have also
+                *  removed the relay, so the relay can only match exactly or
+                *  be worse).
+                */
+               if (relay && relay->link == slink) {
+                       /*
+                        * Match, relay already in-place, get the next
+                        * relay to match against the next slink.
+                        */
+                       relay = RB_NEXT(h2span_relay_tree, &conn->tree, relay);
+                       if (--count == 0)
+                               break;
+               } else if (slink->dist > HAMMER2_SPAN_MAXDIST) {
+                       /*
+                        * No match but span distance is too great,
+                        * do not relay.  This prevents endless closed
+                        * loops with ever-incrementing distances when
+                        * the seed span is lost in the graph.
+                        *
+                        * All later spans will also be too far away so
+                        * we can break out of the loop.
+                        */
+                       break;
+               } else if (slink->state->iocom == conn->state->iocom) {
+                       /*
+                        * No match but we would transmit a LNK_SPAN
+                        * out the same connection it came in on, which
+                        * can be trivially optimized out.
+                        */
+                       break;
+               } else {
+                       /*
+                        * No match, distance is ok, construct a new relay.
+                        * (slink is better than relay).
+                        */
+                       hammer2_msg_t *msg;
+
+                       assert(relay == NULL ||
+                              relay->link->node != slink->node ||
+                              relay->link->dist >= slink->dist);
+                       relay = hammer2_alloc(sizeof(*relay));
+                       relay->conn = conn;
+                       relay->link = slink;
+
+                       msg = hammer2_msg_alloc(conn->state->iocom->router, 0,
+                                               HAMMER2_LNK_SPAN |
+                                               HAMMER2_MSGF_CREATE,
+                                               hammer2_lnk_relay, relay);
+                       relay->state = msg->state;
+                       relay->router = hammer2_router_alloc();
+                       relay->router->iocom = relay->state->iocom;
+                       relay->router->relay = relay;
+                       relay->router->target = relay->state->msgid;
+
+                       msg->any.lnk_span = slink->state->msg->any.lnk_span;
+                       msg->any.lnk_span.dist = slink->dist + 1;
+
+                       hammer2_router_connect(relay->router);
+
+                       RB_INSERT(h2span_relay_tree, &conn->tree, relay);
+                       TAILQ_INSERT_TAIL(&slink->relayq, relay, entry);
+
+                       hammer2_msg_write(msg);
+
+                       fprintf(stderr,
+                               "RELAY SPAN %p RELAY %p ON CLS=%p NODE=%p DIST=%d "
+                               "FD %d state %p\n",
+                               slink,
+                               relay,
+                               node->cls, node, slink->dist,
+                               conn->state->iocom->sock_fd, relay->state);
+
+                       /*
+                        * Match (created new relay), get the next relay to
+                        * match against the next slink.
+                        */
+                       relay = RB_NEXT(h2span_relay_tree, &conn->tree, relay);
+                       if (--count == 0)
+                               break;
+               }
+       }
+
+       /*
+        * Any remaining relay's belonging to this connection which match
+        * the node are in excess of the current aggregate spanning state
+        * and should be removed.
+        */
+       while (relay && relay->link->node == node) {
+               next_relay = RB_NEXT(h2span_relay_tree, &conn->tree, relay);
+               hammer2_relay_delete(relay);
+               relay = next_relay;
+       }
+}
+
+static
+void
+hammer2_relay_delete(h2span_relay_t *relay)
+{
+       fprintf(stderr,
+               "RELAY DELETE %p RELAY %p ON CLS=%p NODE=%p DIST=%d FD %d STATE %p\n",
+               relay->link,
+               relay,
+               relay->link->node->cls, relay->link->node,
+               relay->link->dist,
+               relay->conn->state->iocom->sock_fd, relay->state);
+
+       hammer2_router_disconnect(&relay->router);
+
+       RB_REMOVE(h2span_relay_tree, &relay->conn->tree, relay);
+       TAILQ_REMOVE(&relay->link->relayq, relay, entry);
+
+       if (relay->state) {
+               relay->state->any.relay = NULL;
+               hammer2_state_reply(relay->state, 0);
+               /* state invalid after reply */
+               relay->state = NULL;
+       }
+       relay->conn = NULL;
+       relay->link = NULL;
+       hammer2_free(relay);
+}
+
+static void *
+hammer2_volconf_thread(void *info)
+{
+       h2span_media_config_t *conf = info;
+
+       pthread_mutex_lock(&cluster_mtx);
+       while ((conf->ctl & H2CONFCTL_STOP) == 0) {
+               if (conf->ctl & H2CONFCTL_UPDATE) {
+                       fprintf(stderr, "VOLCONF UPDATE\n");
+                       conf->ctl &= ~H2CONFCTL_UPDATE;
+                       if (bcmp(&conf->copy_run, &conf->copy_pend,
+                                sizeof(conf->copy_run)) == 0) {
+                               fprintf(stderr, "VOLCONF: no changes\n");
+                               continue;
+                       }
+                       /*
+                        * XXX TODO - auto reconnect on lookup failure or
+                        *              connect failure or stream failure.
+                        */
+
+                       pthread_mutex_unlock(&cluster_mtx);
+                       hammer2_volconf_stop(conf);
+                       conf->copy_run = conf->copy_pend;
+                       if (conf->copy_run.copyid != 0 &&
+                           strncmp(conf->copy_run.path, "span:", 5) == 0) {
+                               hammer2_volconf_start(conf,
+                                                     conf->copy_run.path + 5);
+                       }
+                       pthread_mutex_lock(&cluster_mtx);
+                       fprintf(stderr, "VOLCONF UPDATE DONE state %d\n", conf->state);
+               }
+               if (conf->state == H2MC_CONNECT) {
+                       hammer2_volconf_start(conf, conf->copy_run.path + 5);
+                       pthread_mutex_unlock(&cluster_mtx);
+                       sleep(5);
+                       pthread_mutex_lock(&cluster_mtx);
+               } else {
+                       pthread_cond_wait(&conf->cond, &cluster_mtx);
+               }
+       }
+       pthread_mutex_unlock(&cluster_mtx);
+       hammer2_volconf_stop(conf);
+       return(NULL);
+}
+
+static
+void
+hammer2_volconf_stop(h2span_media_config_t *conf)
+{
+       switch(conf->state) {
+       case H2MC_STOPPED:
+               break;
+       case H2MC_CONNECT:
+               conf->state = H2MC_STOPPED;
+               break;
+       case H2MC_RUNNING:
+               shutdown(conf->fd, SHUT_WR);
+               pthread_join(conf->iocom_thread, NULL);
+               conf->iocom_thread = NULL;
+               break;
+       }
+}
+
+static
+void
+hammer2_volconf_start(h2span_media_config_t *conf, const char *hostname)
+{
+       hammer2_master_service_info_t *info;
+
+       switch(conf->state) {
+       case H2MC_STOPPED:
+       case H2MC_CONNECT:
+               conf->fd = hammer2_connect(hostname);
+               if (conf->fd < 0) {
+                       fprintf(stderr, "Unable to connect to %s\n", hostname);
+                       conf->state = H2MC_CONNECT;
+               } else {
+                       info = malloc(sizeof(*info));
+                       bzero(info, sizeof(*info));
+                       info->fd = conf->fd;
+                       info->detachme = 0;
+                       conf->state = H2MC_RUNNING;
+                       pthread_create(&conf->iocom_thread, NULL,
+                                      master_service, info);
+               }
+               break;
+       case H2MC_RUNNING:
+               break;
+       }
+}
+
+/************************************************************************
+ *                     ROUTER AND MESSAGING HANDLES                    *
+ ************************************************************************
+ *
+ * Basically the idea here is to provide a stable data structure which
+ * can be localized to the caller for higher level protocols to work with.
+ * Depends on the context, these hammer2_handle's can be pooled by use-case
+ * and remain persistent through a client (or mount point's) life.
+ */
+
+#if 0
+/*
+ * Obtain a stable handle on a cluster given its uuid.  This ties directly
+ * into the global cluster topology, creating the structure if necessary
+ * (even if the uuid does not exist or does not exist yet), and preventing
+ * the structure from getting ripped out from under us while we hold a
+ * pointer to it.
+ */
+h2span_cluster_t *
+hammer2_cluster_get(uuid_t *pfs_clid)
+{
+       h2span_cluster_t dummy_cls;
+       h2span_cluster_t *cls;
+
+       dummy_cls.pfs_clid = *pfs_clid;
+       pthread_mutex_lock(&cluster_mtx);
+       cls = RB_FIND(h2span_cluster_tree, &cluster_tree, &dummy_cls);
+       if (cls)
+               ++cls->refs;
+       pthread_mutex_unlock(&cluster_mtx);
+       return (cls);
+}
+
+void
+hammer2_cluster_put(h2span_cluster_t *cls)
+{
+       pthread_mutex_lock(&cluster_mtx);
+       assert(cls->refs > 0);
+       --cls->refs;
+       if (RB_EMPTY(&cls->tree) && cls->refs == 0) {
+               RB_REMOVE(h2span_cluster_tree,
+                         &cluster_tree, cls);
+               hammer2_free(cls);
+       }
+       pthread_mutex_unlock(&cluster_mtx);
+}
+
+/*
+ * Obtain a stable handle to a specific cluster node given its uuid.
+ * This handle does NOT lock in the route to the node and is typically
+ * used as part of the hammer2_handle_*() API to obtain a set of
+ * stable nodes.
+ */
+h2span_node_t *
+hammer2_node_get(h2span_cluster_t *cls, uuid_t *pfs_fsid)
+{
+}
+
+#endif
+
+#if 0
+/*
+ * Acquire a persistent router structure given the cluster and node ids.
+ * Messages can be transacted via this structure while held.  If the route
+ * is lost messages will return failure.
+ */
+hammer2_router_t *
+hammer2_router_get(uuid_t *pfs_clid, uuid_t *pfs_fsid)
+{
+}
+
+/*
+ * Release previously acquired router.
+ */
+void
+hammer2_router_put(hammer2_router_t *router)
+{
+}
+#endif
+
+/************************************************************************
+ *                             DEBUGGER                                *
+ ************************************************************************/
+/*
+ * Dumps the spanning tree
+ */
+void
+shell_tree(hammer2_router_t *router, char *cmdbuf __unused)
+{
+       h2span_cluster_t *cls;
+       h2span_node_t *node;
+       h2span_link_t *slink;
+       char *uustr = NULL;
+
+       pthread_mutex_lock(&cluster_mtx);
+       RB_FOREACH(cls, h2span_cluster_tree, &cluster_tree) {
+               router_printf(router, "Cluster %s\n",
+                            hammer2_uuid_to_str(&cls->pfs_clid, &uustr));
+               RB_FOREACH(node, h2span_node_tree, &cls->tree) {
+                       router_printf(router, "    Node %s (%s)\n",
+                                hammer2_uuid_to_str(&node->pfs_fsid, &uustr),
+                                node->label);
+                       RB_FOREACH(slink, h2span_link_tree, &node->tree) {
+                               router_printf(router, "\tLink dist=%d via %d\n",
+                                            slink->dist,
+                                            slink->state->iocom->sock_fd);
+                       }
+               }
+       }
+       pthread_mutex_unlock(&cluster_mtx);
+       if (uustr)
+               free(uustr);
+#if 0
+       TAILQ_FOREACH(conn, &connq, entry) {
+       }
+#endif
+}
diff --git a/sbin/hammer2/network.h b/sbin/hammer2/network.h
new file mode 100644 (file)
index 0000000..565d387
--- /dev/null
@@ -0,0 +1,306 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <openssl/rsa.h>       /* public/private key functions */
+#include <openssl/pem.h>       /* public/private key file load */
+#include <openssl/err.h>
+#include <openssl/evp.h>       /* aes_256_cbc functions */
+
+/***************************************************************************
+ *                             CRYPTO HANDSHAKE                           *
+ ***************************************************************************
+ *
+ * The initial public-key exchange is implementing by transmitting a
+ * 512-byte buffer to the other side in a symmetrical fashion.  This
+ * buffer contains the following:
+ *
+ * (1) A random session key.  512 bits is specified.  We use aes_256_cbc()
+ *     and initialize the key with the first 256 bits and the iv[] with
+ *     the second.  Note that the transmitted and received session
+ *     keys are XOR'd together to create the session key used for
+ *     communications (so even if the verifier is compromised the session
+ *     will still be gobbly gook if the public key has not been completely
+ *     broken).
+ *
+ * (2) A verifier to determine that the decode was successful.  It encodes
+ *     an XOR of each group of 4 bytes from the session key.
+ *
+ * (3) Additional configuration and additional random data.
+ *
+ *     - The hammer2 message header magic for endian detect
+ *
+ *     - The hammer2 protocol version.  The two sides agree on the
+ *      smaller of the two.
+ *
+ *     - All unused fields (junk*) are filled with random data.
+ *
+ * This structure must be exactly 512 bytes and expects to use 256-byte
+ * RSA keys.
+ */
+struct hammer2_handshake {
+       char pad1[8];           /* 000 */
+       uint16_t magic;         /* 008 HAMMER2_MSGHDR_MAGIC for endian detect */
+       uint16_t version;       /* 00A hammer2 protocol version */
+       uint32_t flags;         /* 00C protocol extension flags */
+       uint8_t sess[64];       /* 010 512-bit session key */
+       uint8_t verf[16];       /* 050 verifier = ~sess */
+       char quickmsg[32];      /* 060 reason for connecting */
+       char junk080[128];      /* 080-0FF */
+       char pad2[8];           /* 100-107 */
+       char junk100[256-8];    /* 108-1FF */
+};
+
+typedef struct hammer2_handshake hammer2_handshake_t;
+
+
+#define HAMMER2_CRYPTO_CHUNK_SIZE              HAMMER2_MSG_ALIGN
+#define HAMMER2_MAX_IV_SIZE                    32
+
+#define HAMMER2_CRYPTO_GCM_IV_FIXED_SIZE       4
+#define HAMMER2_CRYPTO_GCM_IV_SIZE             12
+#define HAMMER2_CRYPTO_GCM_KEY_SIZE            32
+#define HAMMER2_CRYPTO_GCM_TAG_SIZE            16
+
+#define HAMMER2_CRYPTO_ALGO_GCM_IDX            0
+
+#define HAMMER2_CRYPTO_ALGO                    HAMMER2_CRYPTO_ALGO_GCM_IDX
+
+
+
+
+/***************************************************************************
+ *                             LOW LEVEL MESSAGING                        *
+ ***************************************************************************
+ *
+ * hammer2_msg - A standalone copy of a message, typically referenced by
+ *              or embedded in other structures, or used with I/O queues.
+ *
+ * These structures are strictly temporary, so they do not have to be
+ * particularly optimized for size.  All possible message headers are
+ * directly embedded (any), and the message may contain a reference
+ * to allocated auxillary data.  The structure is recycled quite often
+ * by a connection.
+ *
+ * This structure is typically not used for storing persistent message
+ * state (see hammer2_persist for that).
+ */
+struct hammer2_iocom;
+struct hammer2_persist;
+struct hammer2_state;
+struct hammer2_router;
+struct hammer2_msg;
+
+TAILQ_HEAD(hammer2_state_queue, hammer2_state);
+TAILQ_HEAD(hammer2_msg_queue, hammer2_msg);
+RB_HEAD(hammer2_state_tree, hammer2_state);
+RB_HEAD(hammer2_router_tree, hammer2_router);
+
+struct h2span_link;
+struct h2span_relay;
+struct h2span_connect;
+
+struct hammer2_state {
+       RB_ENTRY(hammer2_state) rbnode;         /* indexed by msgid */
+       struct hammer2_iocom *iocom;
+       struct hammer2_router *router;          /* if routed */
+       uint32_t        txcmd;                  /* mostly for CMDF flags */
+       uint32_t        rxcmd;                  /* mostly for CMDF flags */
+       uint64_t        msgid;                  /* {spanid,msgid} uniq */
+       int             flags;
+       int             error;
+       struct hammer2_msg *msg;
+       void (*func)(struct hammer2_msg *);
+       union {
+               void *any;
+               struct h2span_link *link;
+               struct h2span_connect *conn;
+               struct h2span_relay *relay;
+       } any;
+};
+
+#define HAMMER2_STATE_INSERTED 0x0001
+#define HAMMER2_STATE_DYNAMIC  0x0002
+#define HAMMER2_STATE_NODEID   0x0004          /* manages a node id */
+
+struct hammer2_msg {
+       TAILQ_ENTRY(hammer2_msg) qentry;
+       struct hammer2_router *router;
+       struct hammer2_state *state;
+       size_t          hdr_size;
+       size_t          aux_size;
+       char            *aux_data;
+       hammer2_msg_any_t any;
+};
+
+typedef struct hammer2_state hammer2_state_t;
+typedef struct hammer2_msg hammer2_msg_t;
+typedef struct hammer2_msg_queue hammer2_msg_queue_t;
+
+int hammer2_state_cmp(hammer2_state_t *state1, hammer2_state_t *state2);
+RB_PROTOTYPE(hammer2_state_tree, hammer2_state, rbnode, hammer2_state_cmp);
+
+/*
+ * hammer2_ioq - An embedded component of hammer2_connect, holds state
+ * for the buffering and parsing of incoming and outgoing messages.
+ *
+ * cdx - beg  - processed buffer data, encrypted or decrypted
+ * end - cdn  - unprocessed buffer data not yet encrypted or decrypted
+ */
+struct hammer2_ioq {
+       enum { HAMMER2_MSGQ_STATE_HEADER1,
+              HAMMER2_MSGQ_STATE_HEADER2,
+              HAMMER2_MSGQ_STATE_AUXDATA1,
+              HAMMER2_MSGQ_STATE_AUXDATA2,
+              HAMMER2_MSGQ_STATE_ERROR } state;
+       size_t          fifo_beg;               /* buffered data */
+       size_t          fifo_cdx;               /* cdx-beg processed */
+       size_t          fifo_cdn;               /* end-cdn unprocessed */
+       size_t          fifo_end;
+       size_t          hbytes;                 /* header size */
+       size_t          abytes;                 /* aux_data size */
+       int             error;
+       int             seq;                    /* salt sequencer */
+       int             msgcount;
+       EVP_CIPHER_CTX  ctx;
+       char            iv[HAMMER2_MAX_IV_SIZE]; /* encrypt or decrypt iv[] */
+       hammer2_msg_t   *msg;
+       hammer2_msg_queue_t msgq;
+       char            buf[HAMMER2_MSGBUF_SIZE]; /* staging buffer */
+};
+
+typedef struct hammer2_ioq hammer2_ioq_t;
+
+#define HAMMER2_IOQ_ERROR_SYNC         1       /* bad magic / out of sync */
+#define HAMMER2_IOQ_ERROR_EOF          2       /* unexpected EOF */
+#define HAMMER2_IOQ_ERROR_SOCK         3       /* read() error on socket */
+#define HAMMER2_IOQ_ERROR_FIELD                4       /* invalid field */
+#define HAMMER2_IOQ_ERROR_HCRC         5       /* core header crc bad */
+#define HAMMER2_IOQ_ERROR_XCRC         6       /* ext header crc bad */
+#define HAMMER2_IOQ_ERROR_ACRC         7       /* aux data crc bad */
+#define HAMMER2_IOQ_ERROR_STATE                8       /* bad state */
+#define HAMMER2_IOQ_ERROR_NOPEER       9       /* bad socket peer */
+#define HAMMER2_IOQ_ERROR_NORKEY       10      /* no remote keyfile found */
+#define HAMMER2_IOQ_ERROR_NOLKEY       11      /* no local keyfile found */
+#define HAMMER2_IOQ_ERROR_KEYXCHGFAIL  12      /* key exchange failed */
+#define HAMMER2_IOQ_ERROR_KEYFMT       13      /* key file format problem */
+#define HAMMER2_IOQ_ERROR_BADURANDOM   14      /* /dev/urandom is bad */
+#define HAMMER2_IOQ_ERROR_MSGSEQ       15      /* message sequence error */
+#define HAMMER2_IOQ_ERROR_EALREADY     16      /* ignore this message */
+#define HAMMER2_IOQ_ERROR_TRANS                17      /* state transaction issue */
+#define HAMMER2_IOQ_ERROR_IVWRAP       18      /* IVs exhaused */
+#define HAMMER2_IOQ_ERROR_MACFAIL      19      /* MAC of encryption algorithm failed */
+#define HAMMER2_IOQ_ERROR_ALGO         20      /* Misc. encryption algorithm error */
+
+#define HAMMER2_IOQ_MAXIOVEC    16
+
+/*
+ * hammer2_router - governs the routing of a message.  Passed into
+ *                 hammer2_msg_write.
+ *
+ * The router is either connected to an iocom (socket) directly, or
+ * connected to a SPAN transaction (h2span_link structure for outgoing)
+ * or to a SPAN transaction (h2span_relay structure for incoming).
+ */
+struct hammer2_router {
+       RB_ENTRY(hammer2_router) rbnode;        /* indexed by target */
+       struct hammer2_iocom *iocom;
+       struct h2span_link   *link;             /* may be NULL */
+       struct h2span_relay  *relay;            /* may be NULL */
+       void    (*signal_callback)(struct hammer2_router *);
+       void    (*rcvmsg_callback)(struct hammer2_msg *);
+       void    (*altmsg_callback)(struct hammer2_iocom *);
+       struct hammer2_state_tree staterd_tree; /* active messages */
+       struct hammer2_state_tree statewr_tree; /* active messages */
+       hammer2_msg_queue_t txmsgq;             /* tx msgq from remote */
+       uint64_t        target;                 /* for routing */
+       int             flags;
+       int             refs;                   /* refs prevent destruction */
+};
+
+#define HAMMER2_ROUTER_CONNECTED       0x0001  /* on global RB tree */
+#define HAMMER2_ROUTER_DELETED         0x0002  /* parent structure destroyed */
+
+typedef struct hammer2_router hammer2_router_t;
+
+int hammer2_router_cmp(hammer2_router_t *router1, hammer2_router_t *router2);
+RB_PROTOTYPE(hammer2_router_tree, hammer2_router, rbnode, hammer2_router_cmp);
+
+/*
+ * hammer2_iocom - governs a messaging stream connection
+ */
+struct hammer2_iocom {
+       hammer2_ioq_t   ioq_rx;
+       hammer2_ioq_t   ioq_tx;
+       hammer2_msg_queue_t freeq;              /* free msgs hdr only */
+       hammer2_msg_queue_t freeq_aux;          /* free msgs w/aux_data */
+       int     sock_fd;                        /* comm socket or pipe */
+       int     alt_fd;                         /* thread signal, tty, etc */
+       int     wakeupfds[2];                   /* pipe wakes up iocom thread */
+       int     flags;
+       int     rxmisc;
+       int     txmisc;
+       struct hammer2_router *router;
+       pthread_mutex_t mtx;                    /* mutex for state*tree/rmsgq */
+};
+
+typedef struct hammer2_iocom hammer2_iocom_t;
+
+#define HAMMER2_IOCOMF_EOF     0x00000001      /* EOF or ERROR on desc */
+#define HAMMER2_IOCOMF_RREQ    0x00000002      /* request read-data event */
+#define HAMMER2_IOCOMF_WREQ    0x00000004      /* request write-avail event */
+#define HAMMER2_IOCOMF_RWORK   0x00000008      /* immediate work pending */
+#define HAMMER2_IOCOMF_WWORK   0x00000010      /* immediate work pending */
+#define HAMMER2_IOCOMF_PWORK   0x00000020      /* immediate work pending */
+#define HAMMER2_IOCOMF_ARWORK  0x00000040      /* immediate work pending */
+#define HAMMER2_IOCOMF_AWWORK  0x00000080      /* immediate work pending */
+#define HAMMER2_IOCOMF_SWORK   0x00000100      /* immediate work pending */
+#define HAMMER2_IOCOMF_CRYPTED 0x00000200      /* encrypt enabled */
+
+/*
+ * Crypto algorithm table and related typedefs.
+ */
+
+typedef int (*algo_init_fn)(hammer2_ioq_t *, char *, int, char *, int, int);
+typedef int (*algo_enc_fn)(hammer2_ioq_t *, char *, char *, int, int *);
+typedef int (*algo_dec_fn)(hammer2_ioq_t *, char *, char *, int, int *);
+
+struct crypto_algo {
+       const char      *name;
+       int             keylen;
+       int             taglen;
+       algo_init_fn    init;
+       algo_enc_fn     enc_chunk;
+       algo_dec_fn     dec_chunk;
+};
diff --git a/sbin/hammer2/subs.c b/sbin/hammer2/subs.c
new file mode 100644 (file)
index 0000000..d04c410
--- /dev/null
@@ -0,0 +1,335 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "hammer2.h"
+
+/*
+ * Obtain a file descriptor that the caller can execute ioctl()'s on.
+ */
+int
+hammer2_ioctl_handle(const char *sel_path)
+{
+       struct hammer2_ioc_version info;
+       int fd;
+
+       if (sel_path == NULL)
+ &nb