hammer2 - Early messaging infrastructure
authorMatthew Dillon <dillon@apollo.backplane.com>
Fri, 13 Apr 2012 02:43:31 +0000 (19:43 -0700)
committerMatthew Dillon <dillon@apollo.backplane.com>
Fri, 13 Apr 2012 02:43:31 +0000 (19:43 -0700)
* Implement the core message read/write loop, using poll() and
  non-blocking I/O.  This code can also handle auxiliary events from
  a pipe or tty.

  The initial implementation is very simple and does not yet do message
  tracking and recording.

* Implement piecemeal message parsing and sanity/crc checks.

* Implement piecemeal message generation.

* Implement simple debug messaging for testing and future debugging.

  Debug commands and messages use link-local messages (source=0, target=0),
  in one-way mode (msgid=0), so they're trivial to formulate.

  hammer2 -d node (run master listener in debug mode)
  hammer2 debug (in another xterm connect to it)

  Debug messages are really simple: sending a debug command sends a
  line buffer, and the replies contain buffers to write to stdout.

  The target node is responsible for providing the prompt, so we have
  positive feedback that the debug stream is working.

sbin/hammer2/Makefile
sbin/hammer2/cmd_debug.c [new file with mode: 0644]
sbin/hammer2/cmd_leaf.c [new file with mode: 0644]
sbin/hammer2/cmd_node.c [new file with mode: 0644]
sbin/hammer2/hammer2.h
sbin/hammer2/icrc.c [new file with mode: 0644]
sbin/hammer2/main.c
sbin/hammer2/msg.c [new file with mode: 0644]
sbin/hammer2/network.h [new file with mode: 0644]
sbin/hammer2/subs.c
sys/vfs/hammer2/hammer2_network.h [new file with mode: 0644]

index 742a93c..49b4689 100644 (file)
@@ -1,6 +1,7 @@
 PROG=  hammer2
-SRCS=  main.c subs.c
-SRCS+= cmd_remote.c cmd_snapshot.c cmd_pfs.c cmd_helper.c
+SRCS=  main.c subs.c icrc.c msg.c
+SRCS+= cmd_remote.c cmd_snapshot.c cmd_pfs.c
+SRCS+= cmd_node.c cmd_leaf.c cmd_debug.c
 #MAN=  hammer2.8
 NOMAN= TRUE
 
diff --git a/sbin/hammer2/cmd_debug.c b/sbin/hammer2/cmd_debug.c
new file mode 100644 (file)
index 0000000..230d886
--- /dev/null
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "hammer2.h"
+
+static void debug_recv(hammer2_iocom_t *iocom);
+static void debug_send(hammer2_iocom_t *iocom);
+static void debug_tty(hammer2_iocom_t *iocom);
+static void hammer2_debug_parse(hammer2_iocom_t *iocom,
+                               hammer2_msg_t *msg, char *cmdbuf);
+
+/*
+ * Connect to HAMMER2_LISTEN_PORT and run an interactive debug shell
+ * session over the connection.
+ *
+ * An initial empty HAMMER2_DBG_SHELL message is queued, after which
+ * hammer2_iocom_core() drives the session using debug_recv(),
+ * debug_send() and debug_tty() as callbacks (descriptor 0 is passed
+ * to hammer2_iocom_init() as the alternate descriptor).
+ *
+ * Returns 0 once the session has ended, or 1 if the socket could not
+ * be created or the connection could not be established.
+ */
+int
+cmd_debug(void)
+{
+       struct sockaddr_in lsin;
+       struct hammer2_iocom iocom;
+       hammer2_msg_t *msg;
+       int fd;
+
+       /*
+        * Acquire socket and set options
+        */
+       if ((fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
+               fprintf(stderr, "cmd_debug: socket(): %s\n",
+                       strerror(errno));
+               return 1;
+       }
+
+       /*
+        * Connect to the target.  The address is left as 0.
+        * NOTE(review): this presumably relies on the stack treating
+        * a connect to address 0 as the local host -- confirm.
+        */
+       bzero(&lsin, sizeof(lsin));
+       lsin.sin_family = AF_INET;
+       lsin.sin_addr.s_addr = 0;
+       lsin.sin_port = htons(HAMMER2_LISTEN_PORT);
+       if (connect(fd, (struct sockaddr *)&lsin, sizeof(lsin)) < 0) {
+               /*
+                * Report first, close second: close() may overwrite
+                * errno and we want the connect() failure reason.
+                */
+               fprintf(stderr, "debug: connect failed: %s\n",
+                       strerror(errno));
+               close(fd);
+               return 1;
+       }
+
+       /*
+        * Run the session.  The remote end transmits our prompt.
+        */
+       hammer2_iocom_init(&iocom, fd, 0);
+       printf("debug: connected\n");
+
+       msg = hammer2_iocom_allocmsg(&iocom, HAMMER2_DBG_SHELL, 0);
+       hammer2_ioq_write(&iocom, msg);
+
+       hammer2_iocom_core(&iocom, debug_recv, debug_send, debug_tty);
+       fprintf(stderr, "debug: disconnected\n");
+       close(fd);
+       return 0;
+}
+
+/*
+ * Callback from hammer2_iocom_core() when messages might be present
+ * on the socket.
+ *
+ * Reads messages until hammer2_ioq_read() returns NULL or the EOF flag
+ * is set on the iocom, dispatching on the command code:
+ *
+ *     LNK_ERROR          - the error code is reported on stderr
+ *     DBG_SHELL          - discarded, this side only sends commands
+ *     DBG_SHELL | REPLY  - the aux data is copied to stdout
+ *     anything else      - answered with HAMMER2_MSG_ERR_UNKNOWN
+ *
+ * Any receive-side error recorded in the iocom is reported on the way
+ * out.
+ */
+static
+void
+debug_recv(hammer2_iocom_t *iocom)
+{
+       hammer2_msg_t *msg;
+
+       while ((iocom->flags & HAMMER2_IOCOMF_EOF) == 0 &&
+              (msg = hammer2_ioq_read(iocom)) != NULL) {
+               switch(msg->any.head.cmd & HAMMER2_MSGF_CMDSWMASK) {
+               case HAMMER2_LNK_ERROR:
+                       /*
+                        * NOTE(review): unlike the DBG_SHELL cases the
+                        * message is not released here.  Confirm whether
+                        * hammer2_ioq_read() retains ownership of
+                        * LNK_ERROR messages or whether this leaks.
+                        */
+                       fprintf(stderr, "Link Error: %d\n",
+                               msg->any.head.error);
+                       break;
+               case HAMMER2_DBG_SHELL:
+                       /*
+                        * We send the commands, not accept them.
+                        */
+                       hammer2_iocom_freemsg(iocom, msg);
+                       break;
+               case HAMMER2_DBG_SHELL | HAMMER2_MSGF_REPLY:
+                       /*
+                        * A reply from the remote is data we copy to stdout.
+                        * The last byte is forced to NUL so strlen() cannot
+                        * run off the end of the buffer.
+                        */
+                       if (msg->aux_size) {
+                               msg->aux_data[msg->aux_size - 1] = 0;
+                               write(1, msg->aux_data, strlen(msg->aux_data));
+                       }
+                       hammer2_iocom_freemsg(iocom, msg);
+                       break;
+               default:
+                       /*
+                        * Unknown commands get an error reply; an unknown
+                        * reply is treated as a programming error.
+                        */
+                       assert((msg->any.head.cmd & HAMMER2_MSGF_REPLY) == 0);
+                       fprintf(stderr, "Unknown message: %08x\n",
+                               msg->any.head.cmd);
+                       hammer2_ioq_reply_term(iocom, msg,
+                                              HAMMER2_MSG_ERR_UNKNOWN);
+                       break;
+               }
+       }
+       if (iocom->ioq_rx.error) {
+               fprintf(stderr, "debug_recv: comm error %d\n",
+                       iocom->ioq_rx.error);
+       }
+}
+
+/*
+ * Callback from hammer2_iocom_core() when messages might be transmittable
+ * to the socket.
+ *
+ * Calls hammer2_ioq_write() without supplying a new message.
+ * NOTE(review): presumably a NULL message just pushes out whatever is
+ * already queued on the iocom -- confirm against hammer2_ioq_write().
+ */
+static
+void
+debug_send(hammer2_iocom_t *iocom)
+{
+       hammer2_ioq_write(iocom, NULL);
+}
+
+/*
+ * Callback handed to hammer2_iocom_core() as the alternate-descriptor
+ * handler.  Reads one line from stdin and queues it as a
+ * HAMMER2_DBG_SHELL message whose aux data is the line with any
+ * trailing newline removed, NUL terminator included.
+ *
+ * When no line can be read the EOF flag is set on the iocom.
+ */
+static
+void
+debug_tty(hammer2_iocom_t *iocom)
+{
+       hammer2_msg_t *cmdmsg;
+       char line[256];
+       size_t nbytes;
+
+       if (fgets(line, sizeof(line), stdin) == NULL) {
+               /*
+                * Normal EOF: flag it, but leave the error codes alone.
+                */
+               iocom->flags |= HAMMER2_IOCOMF_EOF;
+               return;
+       }
+
+       /*
+        * Drop the newline, then count the terminator as payload.
+        */
+       nbytes = strlen(line);
+       if (nbytes > 0 && line[nbytes - 1] == '\n') {
+               --nbytes;
+               line[nbytes] = 0;
+       }
+       nbytes += 1;
+
+       cmdmsg = hammer2_iocom_allocmsg(iocom, HAMMER2_DBG_SHELL, nbytes);
+       bcopy(line, cmdmsg->aux_data, nbytes);
+       hammer2_ioq_write(iocom, cmdmsg);
+}
+
+/*
+ * This is called from the master node to process a received debug
+ * shell message.
+ *
+ * A reply message has its text written to descriptor 2.  A command
+ * message is handed to hammer2_debug_parse() and is then followed by
+ * a "debug> " prompt sent back to the originator.  The received
+ * message is freed in both cases.
+ */
+void
+hammer2_debug_remote(hammer2_iocom_t *iocom, hammer2_msg_t *msg)
+{
+       char *text = NULL;
+
+       /*
+        * Force NUL termination, but only when the buffer really holds
+        * bytes.  With aux_size == 0 the index would fall outside the
+        * buffer, so such a message is treated as carrying no text.
+        */
+       if (msg->aux_data && msg->aux_size) {
+               msg->aux_data[msg->aux_size - 1] = 0;
+               text = msg->aux_data;
+       }
+
+       if (msg->any.head.cmd & HAMMER2_MSGF_REPLY) {
+               /*
+                * A reply just prints out the string.  No newline is added
+                * (it is expected to be embedded if desired).
+                */
+               if (text)
+                       write(2, text, strlen(text));
+               hammer2_iocom_freemsg(iocom, msg);
+       } else {
+               /*
+                * Otherwise this is a command which we must process.
+                * When we are finished we generate a final reply.
+                * A NULL text is handled by the parser as an empty
+                * command.
+                */
+               hammer2_debug_parse(iocom, msg, text);
+               iocom_printf(iocom, msg, "debug> ");
+               hammer2_iocom_freemsg(iocom, msg);
+       }
+}
+
+/*
+ * Interpret one debug shell command line.
+ *
+ * The first space/tab delimited word of cmdbuf selects the command;
+ * cmdbuf is modified in place by strsep().  A missing or empty word
+ * produces no output, "help" and "?" list the available commands, and
+ * anything else is reported back as unrecognized.  All output is sent
+ * with iocom_printf() using msg as the message being answered.
+ */
+static void
+hammer2_debug_parse(hammer2_iocom_t *iocom, hammer2_msg_t *msg, char *cmdbuf)
+{
+       char *word;
+
+       word = strsep(&cmdbuf, " \t");
+       if (word == NULL || word[0] == 0)
+               return;
+
+       if (strcmp(word, "help") == 0 || strcmp(word, "?") == 0) {
+               iocom_printf(iocom, msg,
+                            "help        Command help\n");
+               return;
+       }
+       iocom_printf(iocom, msg, "Unrecognized command: %s\n", word);
+}
+
+/*
+ * Format a string printf-style and send it as the aux data of a
+ * HAMMER2_DBG_SHELL message via hammer2_ioq_reply().
+ *
+ * The formatted text is limited to a 1024 byte buffer (terminator
+ * included).  The outgoing header is copied wholesale from msg, the
+ * message being answered, and its aux_icrc field is then zeroed.
+ */
+void
+iocom_printf(hammer2_iocom_t *iocom, hammer2_msg_t *msg, const char *ctl, ...)
+{
+       char text[1024];
+       hammer2_msg_t *reply;
+       size_t nbytes;
+       va_list ap;
+
+       va_start(ap, ctl);
+       vsnprintf(text, sizeof(text), ctl, ap);
+       va_end(ap);
+
+       /*
+        * The payload carries the terminating NUL as well.
+        */
+       nbytes = strlen(text) + 1;
+       reply = hammer2_iocom_allocmsg(iocom, HAMMER2_DBG_SHELL, nbytes);
+       bcopy(text, reply->aux_data, nbytes);
+
+       /*
+        * The header copy has to follow the allocation because it
+        * overwrites the header the allocator set up.
+        */
+       reply->any.head = msg->any.head;
+       reply->any.head.aux_icrc = 0;
+
+       hammer2_ioq_reply(iocom, reply);
+}
diff --git a/sbin/hammer2/cmd_leaf.c b/sbin/hammer2/cmd_leaf.c
new file mode 100644 (file)
index 0000000..a9026e2
--- /dev/null
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "hammer2.h"
+
+/*
+ * Start-up the leaf daemon for a PFS on this machine.
+ *
+ * One leaf daemon is run for each mounted PFS.  The daemon may multi-thread
+ * to improve performance if desired.  The daemon performs the following
+ * functions:
+ *
+ *     (1) Makes and maintains connections to all cluster nodes found for
+ *         the PFS, retrieved from the REMOTE configuration stored in
+ *         the HAMMER2 mount.  A localhost connection is always implied
+ *         (using the backbone), but also having more direct connections
+ *         can result in higher performance.
+ *
+ *         This also includes any required encryption or authentication.
+ *
+ *     (2) Runs the spanning tree protocol as a leaf, meaning that
+ *         the leaf daemon does not serve as a relay and the individual
+ *         connections made in (1) do not cross-connect.
+ *
+ *     (3) Obtains the PFS's registration and makes it available to the
+ *         cluster via the spanning tree protocol.
+ *
+ *     (4) Creates a communications pipe to the HAMMER2 VFS in the kernel
+ *         (installed via ioctl()) which the HAMMER2 VFS uses to accept and
+ *         communicate high-level requests.
+ *
+ *     (5) Performs all complex high-level messaging protocol operations,
+ *         such as quorum operations, maintains persistent cache state,
+ *         and so on and so forth.
+ *
+ * As you may have noted, the leaf daemon serves as an intermediary between
+ * the kernel and the rest of the cluster.  The kernel will issue high level
+ * protocol commands to the leaf which performs the protocol and sends a
+ * response.  The kernel does NOT have to deal with the quorum or other
+ * complex maintenance.
+ *
+ * Basically the kernel is simply another client from the point of view
+ * of the high-level protocols, requesting cache state locks and such from
+ * the leaf (in a degenerate situation one master lock is all that is needed).
+ * If the kernel PFS has local media storage that storage can be used for
+ * numerous purposes, such as caching, and in the degenerate non-clustered
+ * case simply represents the one-and-only master copy of the filesystem.
+ */
+int
+cmd_leaf(const char *sel_info)
+{
+       int ioctl_fd;
+
+       /*
+        * An ioctl descriptor for the selected PFS is needed before
+        * anything else can be done; give up with exit code 1 if it
+        * cannot be obtained.
+        */
+       ioctl_fd = hammer2_ioctl_handle(sel_info);
+       if (ioctl_fd < 0)
+               return(1);
+
+       /*
+        * The daemon that interconnects the in-kernel HAMMER2 PFS with
+        * the master-node daemon is not started yet, the call remains
+        * disabled:
+        *
+        *     hammer2_demon(helper_pfs_interlink, (void *)(intptr_t)fd);
+        *
+        * The descriptor is only closed here when NormalExit is set.
+        */
+       if (NormalExit)
+               close(ioctl_fd);
+
+       return 0;
+}
+
+#if 0
+/*
+ * LEAF interconnect between PFS and the messaging core.  We create a
+ * socket connection to the messaging core, register the PFS with the
+ * core, and then pass the messaging descriptor to the kernel.
+ *
+ * The kernel takes over operation of the interconnect until the filesystem
+ * is unmounted or the descriptor is lost or explicitly terminated via
+ * a hammer2 command.
+ *
+ * This is essentially a localhost connection, so we don't have to worry
+ * about encryption.  Any encryption will be handled by the messaging
+ * core.
+ *
+ * NOTE: this function is compiled out (#if 0) and is only a stub.  The
+ * description above states the intent; the body below merely recovers
+ * the descriptor from the thread argument and returns.
+ */
+static
+void *
+leaf_connect(void *data)
+{
+       int fd;
+
+       /* the descriptor arrives packed into the thread argument */
+       fd = (int)(intptr_t)data;
+
+       return (NULL);
+}
+#endif
diff --git a/sbin/hammer2/cmd_node.c b/sbin/hammer2/cmd_node.c
new file mode 100644 (file)
index 0000000..4ba95cb
--- /dev/null
@@ -0,0 +1,214 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "hammer2.h"
+
+static void *node_master_accept(void *data);
+static void *node_master_service(void *data);
+static void node_master_recv(hammer2_iocom_t *iocom);
+static void node_master_send(hammer2_iocom_t *iocom);
+
+/*
+ * Start-up the master listener daemon for the machine.
+ *
+ * The master listener serves as a rendezvous point in the cluster, accepting
+ * connections, performing registrations and authentications, maintaining
+ * the spanning tree, and keeping track of message state so disconnects can
+ * be handled properly.
+ *
+ * Once authenticated only low-level messaging protocols (which includes
+ * tracking persistent messages) are handled by this daemon.  This daemon
+ * does not run the higher level quorum or locking protocols.
+ *
+ * This daemon can also be told to maintain connections to other nodes,
+ * forming a messaging backbone, which in turn allows PFS's (if desired) to
+ * simply connect to the master daemon via localhost if desired.
+ * Backbones are specified via /etc/hammer2.conf.
+ *
+ * Returns 1 if the listen socket cannot be created or put into the
+ * listening state, otherwise 0.  A failed bind() also returns 0 because
+ * it is taken to mean that a listener is already running.
+ */
+int
+cmd_node(void)
+{
+       struct sockaddr_in lsin;
+       int on;
+       int lfd;
+
+       /*
+        * Acquire socket and set options
+        */
+       if ((lfd = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
+               fprintf(stderr, "node_master_listen: socket(): %s\n",
+                       strerror(errno));
+               return 1;
+       }
+       on = 1;
+       setsockopt(lfd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on));
+
+       /*
+        * Setup listen port and try to bind.  If the bind fails we assume
+        * that a master listener process is already running and silently
+        * fail.
+        */
+       bzero(&lsin, sizeof(lsin));
+       lsin.sin_family = AF_INET;
+       lsin.sin_addr.s_addr = INADDR_ANY;
+       lsin.sin_port = htons(HAMMER2_LISTEN_PORT);
+       if (bind(lfd, (struct sockaddr *)&lsin, sizeof(lsin)) < 0) {
+               close(lfd);
+               fprintf(stderr, "master listen: daemon already running\n");
+               return 0;
+       }
+
+       /*
+        * Do not start the accept thread on a socket that failed to
+        * enter the listening state.  The error is reported before
+        * close() so errno still describes the listen() failure.
+        */
+       if (listen(lfd, 50) < 0) {
+               fprintf(stderr, "master listen: listen(): %s\n",
+                       strerror(errno));
+               close(lfd);
+               return 1;
+       }
+       fprintf(stderr, "master listen: startup\n");
+
+       /*
+        * Fork and disconnect the controlling terminal and parent process,
+        * executing the specified function as a pthread.
+        *
+        * Returns to the original process which can then continue running.
+        * In debug mode this call will create the pthread without forking
+        * and set NormalExit to 0, instead of fork.
+        */
+       hammer2_demon(node_master_accept, (void *)(intptr_t)lfd);
+       if (NormalExit)
+               close(lfd);
+       return 0;
+}
+
+/*
+ * Master listen/accept thread.  Accept connections on the master socket,
+ * starting a pthread running node_master_service() for each one.
+ *
+ * data carries the listen descriptor packed into a pointer.  The loop
+ * ends, and the thread returns NULL, on the first accept() failure
+ * other than EINTR.
+ */
+static
+void *
+node_master_accept(void *data)
+{
+       struct sockaddr_in asin;
+       socklen_t alen;
+       pthread_t thread;
+       int lfd = (int)(intptr_t)data;
+       int fd;
+       int error;
+
+       /*
+        * Nobody waits for us
+        */
+       setproctitle("hammer2 master listen");
+       pthread_detach(pthread_self());
+
+       /*
+        * Accept connections and create pthreads to handle them.  No
+        * validation of the peer address is performed here.
+        */
+       for (;;) {
+               alen = sizeof(asin);
+               fd = accept(lfd, (struct sockaddr *)&asin, &alen);
+               if (fd < 0) {
+                       if (errno == EINTR)
+                               continue;
+                       break;
+               }
+               fprintf(stderr, "node_master_accept: accept fd %d\n", fd);
+
+               /*
+                * pthread_create() returns the error number directly.
+                * If no thread could be started nothing owns the new
+                * descriptor, so close it instead of leaking it.
+                */
+               error = pthread_create(&thread, NULL,
+                              node_master_service, (void *)(intptr_t)fd);
+               if (error) {
+                       fprintf(stderr,
+                               "node_master_accept: pthread_create(): %s\n",
+                               strerror(error));
+                       close(fd);
+                       continue;
+               }
+
+               /*
+                * Nobody joins the service threads either; detach them
+                * so their resources are reclaimed when they exit.
+                */
+               pthread_detach(thread);
+       }
+       return (NULL);
+}
+
+/*
+ * Per-connection service thread, started by node_master_accept().
+ *
+ * data carries the accepted socket descriptor packed into a pointer.
+ * An iocom is set up on it (with -1 as the alternate descriptor) and
+ * run with node_master_recv()/node_master_send() as callbacks until
+ * hammer2_iocom_core() returns, at which point the final rx/tx error
+ * codes are logged and the socket is closed.
+ */
+static
+void *
+node_master_service(void *data)
+{
+       hammer2_iocom_t iocom;
+       int sock = (int)(intptr_t)data;
+
+       hammer2_iocom_init(&iocom, sock, -1);
+       hammer2_iocom_core(&iocom, node_master_recv, node_master_send, NULL);
+
+       fprintf(stderr,
+               "iocom on fd %d terminated error rx=%d, tx=%d\n",
+               sock, iocom.ioq_rx.error, iocom.ioq_tx.error);
+       close(sock);
+
+       return (NULL);
+}
+
+/*
+ * Receive callback for hammer2_iocom_core() on a master-node
+ * connection, invoked when messages might be present on the socket.
+ *
+ * Each message read is logged and then dispatched: link errors are
+ * ignored, debug shell commands and replies go to
+ * hammer2_debug_remote(), and everything else is answered with
+ * HAMMER2_MSG_ERR_UNKNOWN.  A pending receive error is reported once
+ * the loop has finished.
+ */
+static
+void
+node_master_recv(hammer2_iocom_t *iocom)
+{
+       hammer2_msg_t *msg;
+
+       for (;;) {
+               /*
+                * Stop on EOF before attempting another read.
+                */
+               if (iocom->flags & HAMMER2_IOCOMF_EOF)
+                       break;
+               msg = hammer2_ioq_read(iocom);
+               if (msg == NULL)
+                       break;
+
+               fprintf(stderr, "MSG RECEIVED: %08x error %d\n",
+                       msg->any.head.cmd, msg->any.head.error);
+
+               switch(msg->any.head.cmd & HAMMER2_MSGF_CMDSWMASK) {
+               case HAMMER2_DBG_SHELL:
+               case HAMMER2_DBG_SHELL | HAMMER2_MSGF_REPLY:
+                       hammer2_debug_remote(iocom, msg);
+                       break;
+               case HAMMER2_LNK_ERROR:
+                       break;
+               default:
+                       hammer2_ioq_reply_term(iocom, msg,
+                                              HAMMER2_MSG_ERR_UNKNOWN);
+                       break;
+               }
+       }
+
+       if (iocom->ioq_rx.error) {
+               fprintf(stderr,
+                       "node_master_recv: comm error %d\n",
+                       iocom->ioq_rx.error);
+       }
+}
+
+/*
+ * Callback from hammer2_iocom_core() when messages might be transmittable
+ * to the socket.
+ *
+ * Calls hammer2_ioq_write() without supplying a new message.
+ * NOTE(review): presumably a NULL message just pushes out whatever is
+ * already queued on the iocom -- confirm against hammer2_ioq_write().
+ */
+static
+void
+node_master_send(hammer2_iocom_t *iocom)
+{
+       hammer2_ioq_write(iocom, NULL);
+}
index 7f451e5..bfacbab 100644 (file)
  * Rollup headers for hammer2 utility
  */
 #include <sys/types.h>
+#include <sys/uio.h>
 #include <sys/mount.h>
 #include <sys/file.h>
 #include <sys/socket.h>
 #include <sys/time.h>
 #include <sys/wait.h>
 #include <sys/tty.h>
+#include <sys/endian.h>
 
 #include <netinet/in.h>
 #include <netinet/tcp.h>
 #include <vfs/hammer2/hammer2_disk.h>
 #include <vfs/hammer2/hammer2_mount.h>
 #include <vfs/hammer2/hammer2_ioctl.h>
+#include <vfs/hammer2/hammer2_network.h>
 
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdarg.h>
 #include <stddef.h>
 
-#include <ctype.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <signal.h>
 #include <string.h>
 #include <unistd.h>
-#include <pthread.h>
+#include <ctype.h>
 #include <uuid.h>
+#include <assert.h>
+#include <pthread.h>
+#include <poll.h>
+
+#include "network.h"
 
 extern int DebugOpt;
 extern int NormalExit;
 
 int hammer2_ioctl_handle(const char *sel_path);
-void hammer2_disconnect(void *(*func)(void *), void *arg);
+void hammer2_demon(void *(*func)(void *), void *arg);
+void hammer2_bswap_head(hammer2_msg_hdr_t *head);
 
 int cmd_remote_connect(const char *sel_path, const char *url);
 int cmd_remote_disconnect(const char *sel_path, const char *url);
@@ -82,4 +90,31 @@ int cmd_pfs_create(const char *sel_path, const char *name,
                        uint8_t pfs_type, const char *uuid_str);
 int cmd_pfs_delete(const char *sel_path, const char *name);
 
-int cmd_helper(const char *sel_path);
+int cmd_node(void);
+int cmd_leaf(const char *sel_path);
+int cmd_debug(void);
+
+void hammer2_ioq_init(hammer2_iocom_t *iocom, hammer2_ioq_t *ioq);
+void hammer2_ioq_done(hammer2_iocom_t *iocom, hammer2_ioq_t *ioq);
+void hammer2_iocom_init(hammer2_iocom_t *iocom, int sock_fd, int alt_fd);
+void hammer2_iocom_done(hammer2_iocom_t *iocom);
+hammer2_msg_t *hammer2_iocom_allocmsg(hammer2_iocom_t *iocom,
+                       uint32_t cmd, int aux_size);
+void hammer2_iocom_reallocmsg(hammer2_iocom_t *iocom, hammer2_msg_t *msg,
+                       int aux_size);
+void hammer2_iocom_freemsg(hammer2_iocom_t *iocom, hammer2_msg_t *msg);
+
+void hammer2_iocom_core(hammer2_iocom_t *iocom,
+                       void (*iocom_recvmsg)(hammer2_iocom_t *),
+                       void (*iocom_sendmsg)(hammer2_iocom_t *),
+                       void (*iocom_altmsg)(hammer2_iocom_t *));
+hammer2_msg_t *hammer2_ioq_read(hammer2_iocom_t *iocon);
+void hammer2_ioq_write(hammer2_iocom_t *iocon, hammer2_msg_t *msg);
+void hammer2_ioq_reply(hammer2_iocom_t *iocom, hammer2_msg_t *msg);
+void hammer2_ioq_reply_term(hammer2_iocom_t *iocom, hammer2_msg_t *msg,
+                       uint16_t error);
+void hammer2_ioq_write_drain(hammer2_iocom_t *iocon);
+
+void hammer2_debug_remote(hammer2_iocom_t *iocom, hammer2_msg_t *msg);
+void iocom_printf(hammer2_iocom_t *iocom, hammer2_msg_t *msg,
+                       const char *ctl, ...);
diff --git a/sbin/hammer2/icrc.c b/sbin/hammer2/icrc.c
new file mode 100644 (file)
index 0000000..82cadcd
--- /dev/null
@@ -0,0 +1,147 @@
+/*-
+ * Copyright (c) 2005-2010 Daniel Braniss <danny@cs.huji.ac.il>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+/*
+ | iSCSI
+ | $Id: isc_subr.c 560 2009-05-07 07:37:49Z danny $
+ */
+
+#include <sys/types.h>
+#include <sys/uuid.h>
+
+#include <vfs/hammer2/hammer2_disk.h>
+
+/*****************************************************************/
+/*                                                               */
+/* CRC LOOKUP TABLE                                              */
+/* ================                                              */
+/* The following CRC lookup table was generated automagically    */
+/* by the Rocksoft^tm Model CRC Algorithm Table Generation       */
+/* Program V1.0 using the following model parameters:            */
+/*                                                               */
+/*    Width   : 4 bytes.                                         */
+/*    Poly    : 0x1EDC6F41L                                      */
+/*    Reverse : TRUE.                                            */
+/*                                                               */
+/* For more information on the Rocksoft^tm Model CRC Algorithm,  */
+/* see the document titled "A Painless Guide to CRC Error        */
+/* Detection Algorithms" by Ross Williams                        */
+/* (ross@guest.adelaide.edu.au.). This document is likely to be  */
+/* in the FTP archive "ftp.adelaide.edu.au/pub/rocksoft".        */
+/*                                                               */
+/*****************************************************************/
+
+/*
+ * 256-entry lookup table used by hammer2_icrc32() and hammer2_icrc32c().
+ * It is indexed with the low byte of (running crc ^ input byte).
+ *
+ * NOTE(review): nothing visible in this file writes to the table, so it
+ * could probably be declared const -- confirm there are no other users.
+ */
+static uint32_t crc32Table[256] = {
+    0x00000000L, 0xF26B8303L, 0xE13B70F7L, 0x1350F3F4L,
+    0xC79A971FL, 0x35F1141CL, 0x26A1E7E8L, 0xD4CA64EBL,
+    0x8AD958CFL, 0x78B2DBCCL, 0x6BE22838L, 0x9989AB3BL,
+    0x4D43CFD0L, 0xBF284CD3L, 0xAC78BF27L, 0x5E133C24L,
+    0x105EC76FL, 0xE235446CL, 0xF165B798L, 0x030E349BL,
+    0xD7C45070L, 0x25AFD373L, 0x36FF2087L, 0xC494A384L,
+    0x9A879FA0L, 0x68EC1CA3L, 0x7BBCEF57L, 0x89D76C54L,
+    0x5D1D08BFL, 0xAF768BBCL, 0xBC267848L, 0x4E4DFB4BL,
+    0x20BD8EDEL, 0xD2D60DDDL, 0xC186FE29L, 0x33ED7D2AL,
+    0xE72719C1L, 0x154C9AC2L, 0x061C6936L, 0xF477EA35L,
+    0xAA64D611L, 0x580F5512L, 0x4B5FA6E6L, 0xB93425E5L,
+    0x6DFE410EL, 0x9F95C20DL, 0x8CC531F9L, 0x7EAEB2FAL,
+    0x30E349B1L, 0xC288CAB2L, 0xD1D83946L, 0x23B3BA45L,
+    0xF779DEAEL, 0x05125DADL, 0x1642AE59L, 0xE4292D5AL,
+    0xBA3A117EL, 0x4851927DL, 0x5B016189L, 0xA96AE28AL,
+    0x7DA08661L, 0x8FCB0562L, 0x9C9BF696L, 0x6EF07595L,
+    0x417B1DBCL, 0xB3109EBFL, 0xA0406D4BL, 0x522BEE48L,
+    0x86E18AA3L, 0x748A09A0L, 0x67DAFA54L, 0x95B17957L,
+    0xCBA24573L, 0x39C9C670L, 0x2A993584L, 0xD8F2B687L,
+    0x0C38D26CL, 0xFE53516FL, 0xED03A29BL, 0x1F682198L,
+    0x5125DAD3L, 0xA34E59D0L, 0xB01EAA24L, 0x42752927L,
+    0x96BF4DCCL, 0x64D4CECFL, 0x77843D3BL, 0x85EFBE38L,
+    0xDBFC821CL, 0x2997011FL, 0x3AC7F2EBL, 0xC8AC71E8L,
+    0x1C661503L, 0xEE0D9600L, 0xFD5D65F4L, 0x0F36E6F7L,
+    0x61C69362L, 0x93AD1061L, 0x80FDE395L, 0x72966096L,
+    0xA65C047DL, 0x5437877EL, 0x4767748AL, 0xB50CF789L,
+    0xEB1FCBADL, 0x197448AEL, 0x0A24BB5AL, 0xF84F3859L,
+    0x2C855CB2L, 0xDEEEDFB1L, 0xCDBE2C45L, 0x3FD5AF46L,
+    0x7198540DL, 0x83F3D70EL, 0x90A324FAL, 0x62C8A7F9L,
+    0xB602C312L, 0x44694011L, 0x5739B3E5L, 0xA55230E6L,
+    0xFB410CC2L, 0x092A8FC1L, 0x1A7A7C35L, 0xE811FF36L,
+    0x3CDB9BDDL, 0xCEB018DEL, 0xDDE0EB2AL, 0x2F8B6829L,
+    0x82F63B78L, 0x709DB87BL, 0x63CD4B8FL, 0x91A6C88CL,
+    0x456CAC67L, 0xB7072F64L, 0xA457DC90L, 0x563C5F93L,
+    0x082F63B7L, 0xFA44E0B4L, 0xE9141340L, 0x1B7F9043L,
+    0xCFB5F4A8L, 0x3DDE77ABL, 0x2E8E845FL, 0xDCE5075CL,
+    0x92A8FC17L, 0x60C37F14L, 0x73938CE0L, 0x81F80FE3L,
+    0x55326B08L, 0xA759E80BL, 0xB4091BFFL, 0x466298FCL,
+    0x1871A4D8L, 0xEA1A27DBL, 0xF94AD42FL, 0x0B21572CL,
+    0xDFEB33C7L, 0x2D80B0C4L, 0x3ED04330L, 0xCCBBC033L,
+    0xA24BB5A6L, 0x502036A5L, 0x4370C551L, 0xB11B4652L,
+    0x65D122B9L, 0x97BAA1BAL, 0x84EA524EL, 0x7681D14DL,
+    0x2892ED69L, 0xDAF96E6AL, 0xC9A99D9EL, 0x3BC21E9DL,
+    0xEF087A76L, 0x1D63F975L, 0x0E330A81L, 0xFC588982L,
+    0xB21572C9L, 0x407EF1CAL, 0x532E023EL, 0xA145813DL,
+    0x758FE5D6L, 0x87E466D5L, 0x94B49521L, 0x66DF1622L,
+    0x38CC2A06L, 0xCAA7A905L, 0xD9F75AF1L, 0x2B9CD9F2L,
+    0xFF56BD19L, 0x0D3D3E1AL, 0x1E6DCDEEL, 0xEC064EEDL,
+    0xC38D26C4L, 0x31E6A5C7L, 0x22B65633L, 0xD0DDD530L,
+    0x0417B1DBL, 0xF67C32D8L, 0xE52CC12CL, 0x1747422FL,
+    0x49547E0BL, 0xBB3FFD08L, 0xA86F0EFCL, 0x5A048DFFL,
+    0x8ECEE914L, 0x7CA56A17L, 0x6FF599E3L, 0x9D9E1AE0L,
+    0xD3D3E1ABL, 0x21B862A8L, 0x32E8915CL, 0xC083125FL,
+    0x144976B4L, 0xE622F5B7L, 0xF5720643L, 0x07198540L,
+    0x590AB964L, 0xAB613A67L, 0xB831C993L, 0x4A5A4A90L,
+    0x9E902E7BL, 0x6CFBAD78L, 0x7FAB5E8CL, 0x8DC0DD8FL,
+    0xE330A81AL, 0x115B2B19L, 0x020BD8EDL, 0xF0605BEEL,
+    0x24AA3F05L, 0xD6C1BC06L, 0xC5914FF2L, 0x37FACCF1L,
+    0x69E9F0D5L, 0x9B8273D6L, 0x88D28022L, 0x7AB90321L,
+    0xAE7367CAL, 0x5C18E4C9L, 0x4F48173DL, 0xBD23943EL,
+    0xF36E6F75L, 0x0105EC76L, 0x12551F82L, 0xE03E9C81L,
+    0x34F4F86AL, 0xC69F7B69L, 0xD5CF889DL, 0x27A40B9EL,
+    0x79B737BAL, 0x8BDCB4B9L, 0x988C474DL, 0x6AE7C44EL,
+    0xBE2DA0A5L, 0x4C4623A6L, 0x5F16D052L, 0xAD7D5351L
+};
+
+/*
+ * Compute the table-driven CRC of size bytes starting at buf.
+ *
+ * The accumulator starts out as all ones (a zero seed, inverted), each
+ * input byte is folded in through crc32Table, and the result is
+ * inverted again before it is returned.
+ */
+uint32_t
+hammer2_icrc32(const void *buf, size_t size)
+{
+     const uint8_t *bytes = buf;
+     uint32_t accum = 0xffffffff;
+     size_t i;
+
+     for (i = 0; i < size; ++i)
+         accum = (accum >> 8) ^ crc32Table[(accum ^ bytes[i]) & 0xff];
+     return accum ^ 0xffffffff;
+}
+
+/*
+ * Same computation as hammer2_icrc32() but seeded with a caller
+ * supplied crc instead of zero.
+ *
+ * The seed is inverted on entry and the result is inverted on exit, so
+ * the value returned for one buffer can be passed back in as the seed
+ * when checksumming the buffer that follows it.
+ */
+uint32_t
+hammer2_icrc32c(const void *buf, size_t size, uint32_t crc)
+{
+     const uint8_t *bytes = buf;
+     size_t i;
+
+     crc ^= 0xffffffff;
+     for (i = 0; i < size; ++i)
+         crc = (crc >> 8) ^ crc32Table[(crc ^ bytes[i]) & 0xff];
+     crc ^= 0xffffffff;
+     return crc;
+}
index 81ef955..570948b 100644 (file)
@@ -54,7 +54,7 @@ main(int ac, char **av)
        /*
         * Core options
         */
-       while ((ch = getopt(ac, av, "aqs:t:u:")) != -1) {
+       while ((ch = getopt(ac, av, "adqs:t:u:")) != -1) {
                switch(ch) {
                case 'a':
                        all_opt = 1;
@@ -172,13 +172,44 @@ main(int ac, char **av)
                 * Create snapshot with optional pfs_type and optional
                 * label override.
                 */
-       } else if (strcmp(av[0], "helper") == 0) {
+       } else if (strcmp(av[0], "node") == 0) {
                /*
-                * Typically run as a daemon, this multi-threaded helper
-                * subsystem manages socket communications for the
-                * filesystem.
+                * Start the master node daemon.  This daemon accepts
+                * connections from local and remote clients, implements
+                * and maintains the spanning tree protocol, and manages
+                * the core messaging protocol.
                 */
-               ecode = cmd_helper(sel_path);
+               ecode = cmd_node();
+       } else if (strcmp(av[0], "leaf") == 0) {
+               /*
+                * Start the management daemon for a specific PFS.
+                *
+                * This will typically connect to the local master node
+                * daemon, register the PFS, and then pass its side of
+                * the socket descriptor to the kernel HAMMER2 VFS via an
+                * ioctl().  The process and/or thread context remains in the
+                * kernel until the PFS is unmounted or the connection is
+                * lost, then returns from the ioctl.
+                *
+                * It is possible to connect directly to a remote master node
+                * instead of the local master node in situations where
+                * encryption is not desired or no local master node is
+                * desired.  This is not recommended because it represents
+                * a single point of failure for the PFS's communications.
+                *
+                * Direct kernel<->kernel communication between HAMMER2 VFSs
+                * is theoretically possible for directly-connected
+                * registrations (i.e. where the spanning tree is degenerate),
+                * but not recommended.  We specifically try to reduce the
+                * complexity of the HAMMER2 VFS kernel code.
+                */
+               ecode = cmd_leaf(sel_path);
+       } else if (strcmp(av[0], "debug") == 0) {
+               /*
+                * Connect to the command line monitor in the hammer2 master
+                * node for the machine using HAMMER2_DBG_SHELL messages.
+                */
+               ecode = cmd_debug();
        } else {
                fprintf(stderr, "Unrecognized command: %s\n", av[0]);
                usage(1);
diff --git a/sbin/hammer2/msg.c b/sbin/hammer2/msg.c
new file mode 100644 (file)
index 0000000..fb01809
--- /dev/null
@@ -0,0 +1,877 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "hammer2.h"
+
+/*
+ * Put a low-level ioq into its pristine state: the whole structure is
+ * zeroed, the message queue starts out empty, and the state machine is
+ * armed at HEADER1.  The iocom argument is not used.
+ */
+void
+hammer2_ioq_init(hammer2_iocom_t *iocom __unused, hammer2_ioq_t *ioq)
+{
+       bzero(ioq, sizeof(*ioq));
+       TAILQ_INIT(&ioq->msgq);
+       ioq->state = HAMMER2_MSGQ_STATE_HEADER1;
+}
+
+/*
+ * Hand every message still owned by the ioq back to the iocom via
+ * hammer2_iocom_freemsg(): first everything sitting on the message
+ * queue, then the partially built in-progress message, if any.
+ */
+void
+hammer2_ioq_done(hammer2_iocom_t *iocom, hammer2_ioq_t *ioq)
+{
+       hammer2_msg_t *pending;
+
+       /* Drain the queue from the head until nothing is left. */
+       for (;;) {
+               pending = TAILQ_FIRST(&ioq->msgq);
+               if (pending == NULL)
+                       break;
+               TAILQ_REMOVE(&ioq->msgq, pending, entry);
+               hammer2_iocom_freemsg(iocom, pending);
+       }
+
+       /* Detach the in-progress message before releasing it. */
+       pending = ioq->msg;
+       if (pending != NULL) {
+               ioq->msg = NULL;
+               hammer2_iocom_freemsg(iocom, pending);
+       }
+}
+
+/*
+ * Initialize a low-level communications channel.
+ *
+ * The iocom is zeroed, both free lists start out empty, and the rx and
+ * tx ioqs are initialized.  The initial flags are RREQ | WIDLE.
+ *
+ * sock_fd - message socket; switched to non-blocking mode when >= 0.
+ * alt_fd  - optional auxiliary descriptor (recorded as-is, its mode is
+ *           not changed here).
+ */
+void
+hammer2_iocom_init(hammer2_iocom_t *iocom, int sock_fd, int alt_fd)
+{
+       bzero(iocom, sizeof(*iocom));
+
+       TAILQ_INIT(&iocom->freeq);
+       TAILQ_INIT(&iocom->freeq_aux);
+       iocom->sock_fd = sock_fd;
+       iocom->alt_fd = alt_fd;
+       iocom->flags = HAMMER2_IOCOMF_RREQ | HAMMER2_IOCOMF_WIDLE;
+       hammer2_ioq_init(iocom, &iocom->ioq_rx);
+       hammer2_ioq_init(iocom, &iocom->ioq_tx);
+
+       if (sock_fd >= 0) {
+               int fl;
+
+               /*
+                * F_SETFL replaces the whole set of file status flags,
+                * so fetch the current flags and OR in O_NONBLOCK rather
+                * than clobbering whatever was already set.  If the
+                * query fails fall back to setting O_NONBLOCK alone.
+                */
+               fl = fcntl(sock_fd, F_GETFL, 0);
+               if (fl < 0)
+                       fl = 0;
+               fcntl(sock_fd, F_SETFL, fl | O_NONBLOCK);
+       }
+#if 0
+       /* if line buffered our single fgets() should be fine */
+       if (alt_fd >= 0)
+               fcntl(alt_fd, F_SETFL, O_NONBLOCK);
+#endif
+}
+
+/*
+ * Tear down an iocom.
+ *
+ * The socket descriptor field is invalidated (the descriptor itself is
+ * not closed here), both ioqs are drained back onto the free lists, and
+ * then every cached message on the free lists is released to the system
+ * allocator.
+ */
+void
+hammer2_iocom_done(hammer2_iocom_t *iocom)
+{
+       hammer2_msg_t *msg;
+
+       iocom->sock_fd = -1;
+       hammer2_ioq_done(iocom, &iocom->ioq_rx);
+       hammer2_ioq_done(iocom, &iocom->ioq_tx);
+
+       /*
+        * Both lists must be drained with a loop; freeing only the first
+        * entry would leak every other cached message.
+        */
+       while ((msg = TAILQ_FIRST(&iocom->freeq)) != NULL) {
+               TAILQ_REMOVE(&iocom->freeq, msg, entry);
+               free(msg);
+       }
+       while ((msg = TAILQ_FIRST(&iocom->freeq_aux)) != NULL) {
+               TAILQ_REMOVE(&iocom->freeq_aux, msg, entry);
+               free(msg->aux_data);
+               msg->aux_data = NULL;
+               free(msg);
+       }
+}
+
+/*
+ * Allocate a message for transmission.
+ *
+ * cmd      - command word; its HAMMER2_MSGF_SIZE field selects how many
+ *            header bytes are zeroed, and the word is stored in the
+ *            header.
+ * aux_size - requested auxiliary payload size in bytes, rounded up to
+ *            the message alignment.  Zero means no payload.
+ *
+ * A cached message is reused when one is available on the matching free
+ * list (freeq_aux for messages with a payload, freeq otherwise), else a
+ * new one is allocated.  The payload buffer is (re)allocated whenever its
+ * recorded size differs from the aligned request.  Its contents are not
+ * initialized.
+ *
+ * Returns the message with flags cleared.  Allocation failure and an
+ * out-of-range header size are treated as fatal (assert).
+ */
+hammer2_msg_t *
+hammer2_iocom_allocmsg(hammer2_iocom_t *iocom, uint32_t cmd, int aux_size)
+{
+       hammer2_msg_t *msg;
+       int hbytes;
+
+       if (aux_size) {
+               aux_size = (aux_size + HAMMER2_MSG_ALIGNMASK) &
+                          ~HAMMER2_MSG_ALIGNMASK;
+               if ((msg = TAILQ_FIRST(&iocom->freeq_aux)) != NULL)
+                       TAILQ_REMOVE(&iocom->freeq_aux, msg, entry);
+       } else {
+               if ((msg = TAILQ_FIRST(&iocom->freeq)) != NULL)
+                       TAILQ_REMOVE(&iocom->freeq, msg, entry);
+       }
+       if (msg == NULL) {
+               msg = malloc(sizeof(*msg));
+               assert(msg != NULL);
+               msg->aux_data = NULL;
+               msg->aux_size = 0;
+       }
+       if (msg->aux_size != aux_size) {
+               if (msg->aux_data) {
+                       free(msg->aux_data);
+                       msg->aux_data = NULL;
+                       msg->aux_size = 0;
+               }
+               if (aux_size) {
+                       msg->aux_data = malloc(aux_size);
+                       assert(msg->aux_data != NULL);
+                       msg->aux_size = aux_size;
+               }
+       }
+       msg->flags = 0;
+
+       /*
+        * The header size comes straight from the command word.  Make
+        * sure it cannot run the bzero() past the embedded header union
+        * (the receive path rejects the same condition).
+        */
+       hbytes = (cmd & HAMMER2_MSGF_SIZE) * HAMMER2_MSG_ALIGN;
+       assert(hbytes <= (int)sizeof(msg->any));
+       bzero(&msg->any.head, hbytes);
+       msg->any.head.cmd = cmd;
+
+       return (msg);
+}
+
+/*
+ * Resize the auxiliary payload buffer of an existing message.
+ *
+ * The request is rounded up to the message alignment.  When the rounded
+ * size is non-zero and differs from the recorded size the old buffer is
+ * discarded and a fresh (uninitialized) one is allocated.  A zero request
+ * leaves any existing buffer untouched.  The message flags are always
+ * cleared.  The iocom argument is not used.
+ */
+void
+hammer2_iocom_reallocmsg(hammer2_iocom_t *iocom __unused, hammer2_msg_t *msg,
+                        int aux_size)
+{
+       int wanted;
+
+       wanted = (aux_size + HAMMER2_MSG_ALIGNMASK) & ~HAMMER2_MSG_ALIGNMASK;
+       if (wanted != 0 && wanted != msg->aux_size) {
+               /* free(NULL) is a no-op, no need to test the pointer */
+               free(msg->aux_data);
+               msg->aux_data = malloc(wanted);
+               msg->aux_size = wanted;
+       }
+       msg->flags = 0;
+}
+
+/*
+ * Return a message to the iocom's cache instead of freeing it.  Messages
+ * that still own a payload buffer are appended to freeq_aux, all others
+ * to freeq.
+ */
+void
+hammer2_iocom_freemsg(hammer2_iocom_t *iocom, hammer2_msg_t *msg)
+{
+       if (msg->aux_data == NULL) {
+               TAILQ_INSERT_TAIL(&iocom->freeq, msg, entry);
+       } else {
+               TAILQ_INSERT_TAIL(&iocom->freeq_aux, msg, entry);
+       }
+}
+
+/*
+ * I/O core loop for an iocom.
+ *
+ * Records the three callbacks in the iocom and then loops until the
+ * EOF flag is set in iocom->flags.  Each iteration polls the socket
+ * (and the alt descriptor when alt_fd >= 0) and dispatches:
+ *
+ *  - recvmsg_func when the socket is readable, or unconditionally when
+ *    RREQ is clear (no need to wait for a read-event).
+ *  - sendmsg_func when WIDLE is clear and either the socket is writable
+ *    or WREQ is clear (no need to wait for a write-event).
+ *  - altmsg_func when the alt descriptor is readable.
+ *
+ * The poll timeout is 5 seconds unless one of the "no need to wait"
+ * conditions applies, in which case poll() is only used to sample the
+ * descriptors (timeout 0).
+ */
+void
+hammer2_iocom_core(hammer2_iocom_t *iocom,
+                  void (*recvmsg_func)(hammer2_iocom_t *),
+                  void (*sendmsg_func)(hammer2_iocom_t *),
+                  void (*altmsg_func)(hammer2_iocom_t *))
+{
+       struct pollfd fds[2];
+       int timeout;
+       int nfds;
+
+       iocom->recvmsg_callback = recvmsg_func;
+       iocom->sendmsg_callback = sendmsg_func;
+       iocom->altmsg_callback = altmsg_func;
+
+       while ((iocom->flags & HAMMER2_IOCOMF_EOF) == 0) {
+               /*
+                * The timeout must be re-armed on every pass, otherwise
+                * a single zero-timeout iteration would turn the loop
+                * into a permanent busy-poll.
+                */
+               timeout = 5000;
+
+               fds[0].fd = iocom->sock_fd;
+               fds[0].events = 0;
+               fds[0].revents = 0;
+
+               if (iocom->flags & HAMMER2_IOCOMF_RREQ)
+                       fds[0].events |= POLLIN;
+               else
+                       timeout = 0;
+               if ((iocom->flags & HAMMER2_IOCOMF_WIDLE) == 0) {
+                       if (iocom->flags & HAMMER2_IOCOMF_WREQ)
+                               fds[0].events |= POLLOUT;
+                       else
+                               timeout = 0;
+               }
+
+               /*
+                * fds[1] is fully initialized (events is assigned, not
+                * OR'd into stack garbage) even when it is not polled so
+                * the revents test below is always well defined.
+                */
+               fds[1].fd = iocom->alt_fd;
+               fds[1].events = POLLIN;
+               fds[1].revents = 0;
+               nfds = (iocom->alt_fd >= 0) ? 2 : 1;
+
+               if (poll(fds, nfds, timeout) < 0) {
+                       /* e.g. EINTR: treat as "no events reported" */
+                       fds[0].revents = 0;
+                       fds[1].revents = 0;
+               }
+
+               if ((fds[0].revents & POLLIN) ||
+                   (iocom->flags & HAMMER2_IOCOMF_RREQ) == 0) {
+                       iocom->recvmsg_callback(iocom);
+               }
+               if ((iocom->flags & HAMMER2_IOCOMF_WIDLE) == 0) {
+                       if ((fds[0].revents & POLLOUT) ||
+                           (iocom->flags & HAMMER2_IOCOMF_WREQ) == 0) {
+                               iocom->sendmsg_callback(iocom);
+                       }
+               }
+               if (iocom->alt_fd >= 0 && (fds[1].revents & POLLIN))
+                       iocom->altmsg_callback(iocom);
+       }
+}
+
+/*
+ * Read the next ready message from the ioq, issuing I/O if needed.
+ * Caller should retry on a read-event when NULL is returned.
+ *
+ * If an error occurs during reception a HAMMER2_LNK_ERROR msg will
+ * be returned (and the caller must not call us again after that).
+ *
+ * Reception is a small state machine driven off iocom->rxbuf, which is
+ * used as a FIFO delimited by ioq->fifo_beg and ioq->fifo_end:
+ *
+ *  HEADER1  - accumulate and validate the core header, allocate the msg
+ *  HEADER2  - accumulate and validate the extended header
+ *  AUXDATA1 - copy whatever payload bytes are already in the FIFO
+ *  AUXDATA2 - read the rest of the payload directly into msg->aux_data
+ *
+ * On return the RREQ flag in iocom->flags tells the caller whether it
+ * has to wait for a read-event (set) or may call again right away
+ * because more data is already buffered (clear).
+ */
+hammer2_msg_t *
+hammer2_ioq_read(hammer2_iocom_t *iocom)
+{
+       hammer2_ioq_t *ioq = &iocom->ioq_rx;
+       hammer2_msg_t *msg;
+       hammer2_msg_hdr_t *head;
+       ssize_t n;
+       int bytes;
+       int flags;
+       int nmax;
+       uint16_t xcrc16;
+       uint32_t xcrc32;
+
+       /*
+        * If a message is already pending we can just remove and
+        * return it.
+        */
+       if ((msg = TAILQ_FIRST(&ioq->msgq)) != NULL) {
+               TAILQ_REMOVE(&ioq->msgq, msg, entry);
+               return(msg);
+       }
+
+       /*
+        * Message read in-progress (msg is NULL at the moment).  We don't
+        * allocate a msg until we have its core header.
+        */
+       bytes = ioq->fifo_end - ioq->fifo_beg;
+       nmax = sizeof(iocom->rxbuf) - ioq->fifo_end;
+       msg = ioq->msg;
+
+       switch(ioq->state) {
+       case HAMMER2_MSGQ_STATE_HEADER1:
+               /*
+                * Load the primary header, fail on any non-trivial read
+                * error or on EOF.  Since the primary header is the same
+                * size as the message alignment it will never straddle
+                * the end of the buffer.
+                */
+               if (bytes < (int)sizeof(msg->any.head)) {
+                       n = read(iocom->sock_fd,
+                                iocom->rxbuf + ioq->fifo_end,
+                                nmax);
+                       if (n <= 0) {
+                               if (n == 0) {
+                                       ioq->error = HAMMER2_IOQ_ERROR_EOF;
+                                       break;
+                               }
+                               if (errno != EINTR &&
+                                   errno != EINPROGRESS &&
+                                   errno != EAGAIN) {
+                                       ioq->error = HAMMER2_IOQ_ERROR_SOCK;
+                                       break;
+                               }
+                               n = 0;
+                               /* fall through */
+                       }
+                       ioq->fifo_end += n;
+                       bytes += n;
+                       nmax -= n;
+               }
+
+               /*
+                * Insufficient data accumulated (msg is NULL, caller will
+                * retry on event).
+                */
+               assert(msg == NULL);
+               if (bytes < (int)sizeof(msg->any.head))
+                       break;
+
+               flags = 0;
+               head = (void *)(iocom->rxbuf + ioq->fifo_beg);
+
+               /*
+                * XXX Decrypt the core header
+                */
+
+               /*
+                * Check and fixup the core header.  Note that the icrc
+                * has to be calculated before any fixups, but the crc
+                * fields in the msg may have to be swapped like everything
+                * else.
+                */
+               if (head->magic != HAMMER2_MSGHDR_MAGIC &&
+                   head->magic != HAMMER2_MSGHDR_MAGIC_REV) {
+                       ioq->error = HAMMER2_IOQ_ERROR_SYNC;
+                       break;
+               }
+
+               xcrc32 = hammer2_icrc32((char *)head + HAMMER2_MSGHDR_CRCOFF,
+                                       HAMMER2_MSGHDR_CRCBYTES);
+               if (head->magic == HAMMER2_MSGHDR_MAGIC_REV) {
+                       hammer2_bswap_head(head);
+                       flags |= HAMMER2_MSGX_BSWAPPED;
+               }
+               xcrc16 = (uint16_t)xcrc32 ^ (uint16_t)(xcrc32 >> 16);
+               if (xcrc16 != head->icrc1) {
+                       ioq->error = HAMMER2_IOQ_ERROR_HCRC;
+                       break;
+               }
+
+               /*
+                * Calculate the full header size and aux data size
+                */
+               ioq->hbytes = (head->cmd & HAMMER2_MSGF_SIZE) *
+                             HAMMER2_MSG_ALIGN;
+               ioq->abytes = head->aux_bytes * HAMMER2_MSG_ALIGN;
+               if (ioq->hbytes < (int)sizeof(msg->any.head) ||
+                   ioq->hbytes > (int)sizeof(msg->any) ||
+                   ioq->abytes > HAMMER2_MSGAUX_MAX) {
+                       ioq->error = HAMMER2_IOQ_ERROR_FIELD;
+                       break;
+               }
+
+               /*
+                * Finally allocate the message and copy the core header
+                * to the embedded extended header.
+                */
+               if (ioq->abytes) {
+                       if ((msg = TAILQ_FIRST(&iocom->freeq_aux)) != NULL) {
+                               TAILQ_REMOVE(&iocom->freeq_aux, msg, entry);
+                       } else {
+                               msg = malloc(sizeof(*msg));
+                               msg->aux_data = NULL;
+                               msg->aux_size = 0;
+                       }
+                       if (msg->aux_size != ioq->abytes) {
+                               if (msg->aux_data) {
+                                       free(msg->aux_data);
+                                       msg->aux_data = NULL;
+                               }
+                               msg->aux_data = malloc(ioq->abytes);
+                               /* msg->aux_size = ioq->abytes; */
+                       }
+               } else {
+                       if ((msg = TAILQ_FIRST(&iocom->freeq)) != NULL) {
+                               TAILQ_REMOVE(&iocom->freeq, msg, entry);
+                       } else {
+                               msg = malloc(sizeof(*msg));
+                               msg->aux_data = NULL;
+                               /* msg->aux_size = 0; */
+                       }
+               }
+               msg->aux_size = 0;      /* data copied so far */
+               msg->flags = flags;
+               ioq->msg = msg;
+
+               /*
+                * We are either done or we fall-through
+                */
+               if (ioq->hbytes == sizeof(msg->any.head) && ioq->abytes == 0) {
+                       bcopy(head, &msg->any.head, sizeof(msg->any.head));
+                       ioq->fifo_beg += ioq->hbytes;
+                       break;
+               }
+
+               /*
+                * Fall through to the next state.  Make sure that the
+                * extended header does not straddle the end of the buffer.
+                * We still want to issue larger reads into our buffer,
+                * book-keeping is easier if we don't bcopy() yet.
+                */
+               if (bytes + nmax < ioq->hbytes) {
+                       bcopy(iocom->rxbuf + ioq->fifo_beg, iocom->rxbuf,
+                             bytes);
+                       ioq->fifo_beg = 0;
+                       ioq->fifo_end = bytes;
+                       nmax = sizeof(iocom->rxbuf) - ioq->fifo_end;
+               }
+               ioq->state = HAMMER2_MSGQ_STATE_HEADER2;
+               /* fall through */
+       case HAMMER2_MSGQ_STATE_HEADER2:
+               /*
+                * Fill out the extended header.
+                *
+                * The data must be appended to the receive FIFO (rxbuf),
+                * not to the msg: the byte accounting (fifo_end / bytes)
+                * and the header parsed below both refer to rxbuf.
+                */
+               assert(msg != NULL);
+               if (bytes < ioq->hbytes) {
+                       n = read(iocom->sock_fd,
+                                iocom->rxbuf + ioq->fifo_end,
+                                nmax);
+                       if (n <= 0) {
+                               if (n == 0) {
+                                       ioq->error = HAMMER2_IOQ_ERROR_EOF;
+                                       break;
+                               }
+                               if (errno != EINTR &&
+                                   errno != EINPROGRESS &&
+                                   errno != EAGAIN) {
+                                       ioq->error = HAMMER2_IOQ_ERROR_SOCK;
+                                       break;
+                               }
+                               n = 0;
+                               /* fall through */
+                       }
+                       ioq->fifo_end += n;
+                       bytes += n;
+                       nmax -= n;
+               }
+
+               /*
+                * Insufficient data accumulated (set msg NULL so caller will
+                * retry on event).
+                */
+               if (bytes < ioq->hbytes) {
+                       msg = NULL;
+                       break;
+               }
+
+               /*
+                * XXX Decrypt the extended header
+                */
+               head = (void *)(iocom->rxbuf + ioq->fifo_beg);
+
+               /*
+                * Check the crc on the extended header
+                */
+               if (ioq->hbytes > (int)sizeof(hammer2_msg_hdr_t)) {
+                       xcrc32 = hammer2_icrc32(head + 1,
+                                               ioq->hbytes - sizeof(*head));
+                       xcrc16 = (uint16_t)xcrc32 ^ (uint16_t)(xcrc32 >> 16);
+                       if (head->icrc2 != xcrc16) {
+                               ioq->error = HAMMER2_IOQ_ERROR_XCRC;
+                               break;
+                       }
+               }
+
+               /*
+                * Copy the extended header into the msg and adjust the
+                * FIFO.
+                */
+               bcopy(head, &msg->any, ioq->hbytes);
+
+               /*
+                * We are either done or we fall-through.
+                */
+               if (ioq->abytes == 0) {
+                       ioq->fifo_beg += ioq->hbytes;
+                       break;
+               }
+
+               /*
+                * Must adjust nmax and bytes (and the state) when falling
+                * through.
+                */
+               ioq->fifo_beg += ioq->hbytes;
+               nmax -= ioq->hbytes;
+               bytes -= ioq->hbytes;
+               ioq->state = HAMMER2_MSGQ_STATE_AUXDATA1;
+               /* fall through */
+       case HAMMER2_MSGQ_STATE_AUXDATA1:
+               /*
+                * Copy the partial or complete payload from remaining
+                * bytes in the FIFO.  We have to fall-through either
+                * way so we can check the crc.
+                */
+               assert(msg->aux_size == 0);
+               if (bytes >= ioq->abytes) {
+                       bcopy(iocom->rxbuf + ioq->fifo_beg, msg->aux_data,
+                             ioq->abytes);
+                       msg->aux_size = ioq->abytes;
+                       ioq->fifo_beg += ioq->abytes;
+                       bytes -= ioq->abytes;
+               } else if (bytes) {
+                       bcopy(iocom->rxbuf + ioq->fifo_beg, msg->aux_data,
+                             bytes);
+                       msg->aux_size = bytes;
+                       ioq->fifo_beg += bytes;
+                       bytes = 0;
+               }
+               ioq->state = HAMMER2_MSGQ_STATE_AUXDATA2;
+               /* fall through */
+       case HAMMER2_MSGQ_STATE_AUXDATA2:
+               /*
+                * Read the remainder of the payload directly into the
+                * msg->aux_data buffer.
+                */
+               assert(msg);
+               if (msg->aux_size < ioq->abytes) {
+                       assert(bytes == 0);
+                       n = read(iocom->sock_fd,
+                                msg->aux_data + msg->aux_size,
+                                ioq->abytes - msg->aux_size);
+                       if (n <= 0) {
+                               if (n == 0) {
+                                       ioq->error = HAMMER2_IOQ_ERROR_EOF;
+                                       break;
+                               }
+                               if (errno != EINTR &&
+                                   errno != EINPROGRESS &&
+                                   errno != EAGAIN) {
+                                       ioq->error = HAMMER2_IOQ_ERROR_SOCK;
+                                       break;
+                               }
+                               n = 0;
+                               /* fall through */
+                       }
+                       msg->aux_size += n;
+               }
+
+               /*
+                * Insufficient data accumulated (set msg NULL so caller will
+                * retry on event).
+                */
+               if (msg->aux_size < ioq->abytes) {
+                       msg = NULL;
+                       break;
+               }
+               assert(msg->aux_size == ioq->abytes);
+
+               /*
+                * XXX Decrypt the data
+                */
+
+               /*
+                * Check aux_icrc, then we are done.
+                */
+               xcrc32 = hammer2_icrc32(msg->aux_data, msg->aux_size);
+               if (xcrc32 != msg->any.head.aux_icrc) {
+                       ioq->error = HAMMER2_IOQ_ERROR_ACRC;
+                       break;
+               }
+               break;
+       case HAMMER2_MSGQ_STATE_ERROR:
+       default:
+               /*
+                * We don't double-return errors, the caller should not
+                * have called us again after getting an error msg.
+                */
+               assert(0);
+               break;
+       }
+
+       /*
+        * Handle error, RREQ, or completion
+        *
+        * NOTE: nmax and bytes are invalid at this point, we don't bother
+        *       to update them when breaking out.
+        */
+       if (ioq->error) {
+               /*
+                * An unrecoverable error occurred during processing,
+                * return a special error message.  Try to leave the
+                * ioq state alone for post-mortem debugging.
+                *
+                * Link error messages are returned as one-way messages,
+                * so no flags get set.  Source and target is 0 (link-level),
+                * msgid is 0 (link-level).  All we really need to do is
+                * set up magic, cmd, and error.
+                */
+               if (msg == NULL) {
+                       if ((msg = TAILQ_FIRST(&iocom->freeq)) != NULL) {
+                               TAILQ_REMOVE(&iocom->freeq, msg, entry);
+                       } else {
+                               msg = malloc(sizeof(*msg));
+                               msg->aux_data = NULL;
+                               msg->aux_size = 0;
+                       }
+                       assert(ioq->msg == NULL);
+               } else {
+                       assert(ioq->msg == msg);
+                       ioq->msg = NULL;
+               }
+               if (msg->aux_data) {
+                       free(msg->aux_data);
+                       msg->aux_data = NULL;
+                       msg->aux_size = 0;
+               }
+               bzero(&msg->any.head, sizeof(msg->any.head));
+               msg->any.head.magic = HAMMER2_MSGHDR_MAGIC;
+               msg->any.head.cmd = HAMMER2_LNK_ERROR;
+               msg->any.head.error = ioq->error;
+               ioq->state = HAMMER2_MSGQ_STATE_ERROR;
+               iocom->flags |= HAMMER2_IOCOMF_EOF;
+       } else if (msg == NULL) {
+               /*
+                * Insufficient data received to finish building the message,
+                * set RREQ and return NULL.
+                *
+                * Leave ioq->msg intact.
+                * Leave the FIFO intact: bytes of a partially received
+                * header are still sitting between fifo_beg and fifo_end
+                * and must survive until the next read-event, so the
+                * indices are deliberately not reset here.
+                */
+               iocom->flags |= HAMMER2_IOCOMF_RREQ;
+       } else {
+               /*
+                * Return msg, clear the FIFO if it is now empty.
+                * Flag RREQ if the caller needs to wait for a read-event
+                * or not.
+                *
+                * The fifo has already been advanced past the message.
+                * Trivially reset the FIFO indices if possible.
+                */
+               if (ioq->fifo_beg == ioq->fifo_end) {
+                       iocom->flags |= HAMMER2_IOCOMF_RREQ;
+                       ioq->fifo_beg = 0;
+                       ioq->fifo_end = 0;
+               } else {
+                       iocom->flags &= ~HAMMER2_IOCOMF_RREQ;
+               }
+               ioq->state = HAMMER2_MSGQ_STATE_HEADER1;
+               ioq->msg = NULL;
+       }
+       return (msg);
+}
+
+/*
+ * Calculate the header and data crc's and write a low-level message to
+ * the connection.  If aux_icrc is non-zero the aux_data crc is already
+ * assumed to have been set.
+ *
+ * A non-NULL msg is added to the queue but not necessarily flushed.
+ * Calling this function with msg == NULL will get a flush going.
+ */
+void
+hammer2_ioq_write(hammer2_iocom_t *iocom, hammer2_msg_t *msg)
+{
+       hammer2_ioq_t *ioq = &iocom->ioq_tx;
+       ssize_t nmax;
+       ssize_t nact;
+       int hbytes;
+       int abytes;
+       int hoff;
+       int aoff;
+       uint16_t xcrc16;
+       uint32_t xcrc32;
+       struct iovec iov[HAMMER2_IOQ_MAXIOVEC];
+       int n;
+
+       if (ioq->error) {
+               if (msg) {
+                       TAILQ_INSERT_TAIL(&ioq->msgq, msg, entry);
+                       ++ioq->msgcount;
+               }
+               hammer2_ioq_write_drain(iocom);
+               return;
+       }
+
+       if (msg) {
+               /*
+                * Finish populating the msg fields
+                */
+               msg->any.head.magic = HAMMER2_MSGHDR_MAGIC;
+               msg->any.head.salt = (random() << 8) | (ioq->seq & 255);
+               ++ioq->seq;
+
+               /*
+                * Calculate aux_icrc if 0, calculate icrc2, and finally
+                * calculate icrc1.
+                */
+               if (msg->aux_size && msg->any.head.aux_icrc == 0) {
+                       assert((msg->aux_size & HAMMER2_MSG_ALIGNMASK) == 0);
+                       xcrc32 = hammer2_icrc32(msg->aux_data, msg->aux_size);
+                       msg->any.head.aux_icrc = xcrc32;
+               }
+               msg->any.head.aux_bytes = msg->aux_size / HAMMER2_MSG_ALIGN;
+               assert((msg->aux_size & HAMMER2_MSG_ALIGNMASK) == 0);
+
+               if ((msg->any.head.cmd & HAMMER2_MSGF_SIZE) >
+                   sizeof(msg->any.head) / HAMMER2_MSG_ALIGN) {
+                       hbytes = (msg->any.head.cmd & HAMMER2_MSGF_SIZE) *
+                               HAMMER2_MSG_ALIGN;
+                       hbytes -= sizeof(msg->any.head);
+                       xcrc32 = hammer2_icrc32(&msg->any.head + 1, hbytes);
+                       xcrc16 = (uint16_t)xcrc32 ^ (uint16_t)(xcrc32 >> 16);
+                       msg->any.head.icrc2 = xcrc16;
+               } else {
+                       msg->any.head.icrc2 = 0;
+               }
+               xcrc32 = hammer2_icrc32(msg->any.buf + HAMMER2_MSGHDR_CRCOFF,
+                                       HAMMER2_MSGHDR_CRCBYTES);
+               xcrc16 = (uint16_t)xcrc32 ^ (uint16_t)(xcrc32 >> 16);
+               msg->any.head.icrc1 = xcrc16;
+
+               /*
+                * XXX Encrypt the message
+                */
+
+               /*
+                * Enqueue the message, stop now if we already know that
+                * we can't write.
+                */
+               TAILQ_INSERT_TAIL(&ioq->msgq, msg, entry);
+               ++ioq->msgcount;
+               iocom->flags &= ~HAMMER2_IOCOMF_WIDLE;
+               if (iocom->flags & HAMMER2_IOCOMF_WREQ)
+                       return;
+
+               /*
+                * Flush if we can aggregate several msgs, otherwise
+                * we will wait for the global flush (msg == NULL).
+                */
+               if (ioq->msgcount < HAMMER2_IOQ_MAXIOVEC / 2)
+                       return;
+       } else if (iocom->flags &= HAMMER2_IOCOMF_WIDLE) {
+               /*
+                * Nothing to do if WIDLE is set.
+                */
+               assert(TAILQ_FIRST(&ioq->msgq) == NULL);
+               return;
+       }
+
+       /*
+        * Pump messages out the connection by building an iovec.
+        */
+       n = 0;
+       nmax = 0;
+
+       TAILQ_FOREACH(msg, &ioq->msgq, entry) {
+               hoff = 0;
+               hbytes = (msg->any.head.cmd & HAMMER2_MSGF_SIZE) *
+                        HAMMER2_MSG_ALIGN;
+               aoff = 0;
+               abytes = msg->aux_size;
+               if (n == 0) {
+                       hoff += ioq->hbytes;
+                       aoff += ioq->abytes;
+               }
+               if (hbytes - hoff > 0) {
+                       iov[n].iov_base = (char *)&msg->any.head + hoff;
+                       iov[n].iov_len = hbytes - hoff;
+                       nmax += hbytes - hoff;
+                       ++n;
+                       if (n == HAMMER2_IOQ_MAXIOVEC)
+                               break;
+               }
+               if (abytes - aoff > 0) {
+                       assert(msg->aux_data != NULL);
+                       iov[n].iov_base = msg->aux_data + aoff;
+                       iov[n].iov_len = abytes - aoff;
+                       nmax += abytes - aoff;
+                       ++n;
+                       if (n == HAMMER2_IOQ_MAXIOVEC)
+                               break;
+               }
+       }
+       if (n == 0)
+               return;
+
+       /*
+        * Execute the writev() then figure out what happened.
+        */
+       nact = writev(iocom->sock_fd, iov, n);
+       if (nact < 0) {
+               if (errno != EINTR &&
+                   errno != EINPROGRESS &&
+                   errno != EAGAIN) {
+                       ioq->error = HAMMER2_IOQ_ERROR_SOCK;
+                       hammer2_ioq_write_drain(iocom);
+               } else {
+                       iocom->flags |= HAMMER2_IOCOMF_WREQ;
+               }
+               return;
+       }
+       if (nact == nmax)
+               iocom->flags &= ~HAMMER2_IOCOMF_WREQ;
+       else
+               iocom->flags |= HAMMER2_IOCOMF_WREQ;
+
+       while ((msg = TAILQ_FIRST(&ioq->msgq)) != NULL) {
+               hbytes = (msg->any.head.cmd & HAMMER2_MSGF_SIZE) *
+                        HAMMER2_MSG_ALIGN;
+               abytes = msg->aux_size;
+
+               if (nact < hbytes - ioq->hbytes) {
+                       ioq->hbytes += nact;
+                       break;
+               }
+               nact -= hbytes - ioq->hbytes;
+               ioq->hbytes = hbytes;
+               if (nact < abytes - ioq->abytes) {
+                       ioq->abytes += nact;
+                       break;
+               }
+               nact -= abytes - ioq->abytes;
+
+               TAILQ_REMOVE(&ioq->msgq, msg, entry);
+               --ioq->msgcount;
+               ioq->hbytes = 0;
+               ioq->abytes = 0;
+               if (msg->aux_data)
+                       TAILQ_INSERT_TAIL(&iocom->freeq_aux, msg, entry);
+               else
+                       TAILQ_INSERT_TAIL(&iocom->freeq, msg, entry);
+       }
+       if (msg == NULL) {
+               iocom->flags |= HAMMER2_IOCOMF_WIDLE;
+               iocom->flags &= ~HAMMER2_IOCOMF_WREQ;
+       }
+       if (ioq->error) {
+               iocom->flags |= HAMMER2_IOCOMF_EOF |
+                               HAMMER2_IOCOMF_WIDLE;
+               iocom->flags &= ~HAMMER2_IOCOMF_WREQ;
+       }
+}
+
+/*
+ * Discard every message still queued for transmit on iocom->ioq_tx,
+ * handing each one to hammer2_iocom_freemsg(), then mark the write
+ * side idle (WIDLE set, WREQ cleared) so no further write events are
+ * requested.
+ *
+ * The receive queue is deliberately left alone: the caller is expected
+ * to pull the contrived terminal error msg off of it in order to notice
+ * that the connection failed.
+ */
+void
+hammer2_ioq_write_drain(hammer2_iocom_t *iocom)
+{
+       hammer2_ioq_t *txq;
+       hammer2_msg_t *victim;
+
+       txq = &iocom->ioq_tx;
+       for (;;) {
+               victim = TAILQ_FIRST(&txq->msgq);
+               if (victim == NULL)
+                       break;
+               TAILQ_REMOVE(&txq->msgq, victim, entry);
+               txq->msgcount -= 1;
+               hammer2_iocom_freemsg(iocom, victim);
+       }
+
+       /* Write side is now empty: idle, and no write-avail event wanted. */
+       iocom->flags = (iocom->flags | HAMMER2_IOCOMF_WIDLE) &
+                      ~HAMMER2_IOCOMF_WREQ;
+}
+
+/*
+ * Turn a message around and send it back the way it came.
+ *
+ * The header's (source) and (target) linkids are exchanged, the REPLY
+ * flag in (cmd) is toggled, and the message is handed to
+ * hammer2_ioq_write() for transmission.  Any other header fields the
+ * reply needs must already have been filled in by the caller.
+ */
+void
+hammer2_ioq_reply(hammer2_iocom_t *iocom, hammer2_msg_t *msg)
+{
+       uint16_t saved_target = msg->any.head.target;
+
+       msg->any.head.target = msg->any.head.source;
+       msg->any.head.source = saved_target;
+       msg->any.head.cmd ^= HAMMER2_MSGF_REPLY;
+
+       hammer2_ioq_write(iocom, msg);
+}
+
+/*
+ * Send a terminating reply for a message, reporting (error).
+ *
+ * Only messages with the CREATE flag set are replied to; for those,
+ * CREATE|DELETE is or'd into (cmd), (error) is stored in the header,
+ * and the message goes out via hammer2_ioq_reply().  A message without
+ * CREATE is left completely untouched by this function.
+ */
+void
+hammer2_ioq_reply_term(hammer2_iocom_t *iocom, hammer2_msg_t *msg,
+                      uint16_t error)
+{
+       if ((msg->any.head.cmd & HAMMER2_MSGF_CREATE) == 0)
+               return;
+
+       msg->any.head.error = error;
+       msg->any.head.cmd |= HAMMER2_MSGF_CREATE | HAMMER2_MSGF_DELETE;
+       hammer2_ioq_reply(iocom, msg);
+}
diff --git a/sbin/hammer2/network.h b/sbin/hammer2/network.h
new file mode 100644 (file)
index 0000000..d599cd3
--- /dev/null
@@ -0,0 +1,234 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/***************************************************************************
+ *                             LOW LEVEL MESSAGING                        *
+ ***************************************************************************
+ *
+ * hammer2_msg - A standalone copy of a message, typically referenced by
+ *              or embedded in other structures, or used with I/O queues.
+ *
+ * These structures are strictly temporary, so they do not have to be
+ * particularly optimized for size.  All possible message headers are
+ * directly embedded (any), and the message may contain a reference
+ * to allocated auxiliary data.  The structure is recycled quite often
+ * by a connection.
+ *
+ * This structure is typically not used for storing persistent message
+ * state (see hammer2_pmsg for that).
+ */
+struct hammer2_msg {
+       TAILQ_ENTRY(hammer2_msg) entry; /* linkage on an ioq msgq or an iocom free queue */
+       char            *aux_data;      /* aux-data if any, else NULL */
+       int             aux_size;       /* bytes of aux_data to transmit */
+       int             flags;          /* presumably HAMMER2_MSGX_* - TODO confirm */
+       hammer2_any_t   any;            /* raw extended msg header */
+};
+
+typedef struct hammer2_msg hammer2_msg_t;
+
+/* Queue head type used for ioq message queues and the iocom free lists. */
+TAILQ_HEAD(hammer2_msg_queue, hammer2_msg);
+typedef struct hammer2_msg_queue hammer2_msg_queue_t;
+
+/*
+ * NOTE(review): presumably marks a msg whose header was byte-swapped on
+ * receipt (see hammer2_bswap_head()) - confirm against msg.c.
+ */
+#define HAMMER2_MSGX_BSWAPPED  0x0001
+
+/*
+ * hammer2_ioq - An embedded component of hammer2_iocom (see its ioq_rx
+ * and ioq_tx members), holds state for the buffering and parsing of
+ * incoming and outgoing messages.
+ *
+ * On the transmit side hbytes/abytes record how much of the header and
+ * aux data of the first message on msgq has already been written, so a
+ * short writev() can be resumed; msgcount tracks the number of messages
+ * currently on msgq.
+ */
+struct hammer2_ioq {
+       enum { HAMMER2_MSGQ_STATE_HEADER1,
+              HAMMER2_MSGQ_STATE_HEADER2,
+              HAMMER2_MSGQ_STATE_AUXDATA1,
+              HAMMER2_MSGQ_STATE_AUXDATA2,
+              HAMMER2_MSGQ_STATE_ERROR } state;
+       int             fifo_beg;               /* buffered data */
+       int             fifo_end;
+       int             hbytes;                 /* header size */
+       int             abytes;                 /* aux_data size */
+       int             error;                  /* HAMMER2_IOQ_ERROR_* or 0 */
+       int             seq;                    /* salt sequencer */
+       int             msgcount;               /* number of msgs on msgq */
+       hammer2_msg_t   *msg;                   /* NOTE(review): presumably msg being parsed - confirm */
+       hammer2_msg_queue_t msgq;               /* queued messages */
+};
+
+typedef struct hammer2_ioq hammer2_ioq_t;
+
+/* Values stored in hammer2_ioq.error. */
+#define HAMMER2_IOQ_ERROR_SYNC 1               /* bad magic / out of sync */
+#define HAMMER2_IOQ_ERROR_EOF  2               /* unexpected EOF */
+#define HAMMER2_IOQ_ERROR_SOCK 3               /* read()/writev() error on socket */
+#define HAMMER2_IOQ_ERROR_FIELD        4               /* invalid field */
+#define HAMMER2_IOQ_ERROR_HCRC 5               /* core header crc bad */
+#define HAMMER2_IOQ_ERROR_XCRC 6               /* ext header crc bad */
+#define HAMMER2_IOQ_ERROR_ACRC 7               /* aux data crc bad */
+#define HAMMER2_IOQ_ERROR_STATE        8               /* bad state */
+
+/*
+ * Cap on the number of iovec entries built for a single writev().  Each
+ * queued message contributes up to two entries (header, aux data).
+ */
+#define HAMMER2_IOQ_MAXIOVEC    16
+
+/*
+ * hammer2_iocom - governs a messaging stream connection
+ *
+ * Holds the receive and transmit queue state, the free lists that
+ * transmitted messages are recycled onto, the descriptors being serviced
+ * and the HAMMER2_IOCOMF_* state flags.
+ */
+struct hammer2_iocom {
+       hammer2_ioq_t   ioq_rx;                 /* receive-side state */
+       hammer2_ioq_t   ioq_tx;                 /* transmit-side state */
+       hammer2_msg_queue_t freeq;              /* free msgs hdr only */
+       hammer2_msg_queue_t freeq_aux;          /* free msgs w/aux_data */
+       /*
+        * NOTE(review): presumably invoked by the core loop when a message
+        * has been received, when the socket is writable, and when alt_fd
+        * has an event, respectively - confirm against the core loop.
+        */
+       void    (*recvmsg_callback)(struct hammer2_iocom *);
+       void    (*sendmsg_callback)(struct hammer2_iocom *);
+       void    (*altmsg_callback)(struct hammer2_iocom *);
+       int     sock_fd;                        /* comm socket or pipe */
+       int     alt_fd;                         /* thread signal, tty, etc */
+       int     flags;                          /* HAMMER2_IOCOMF_* */
+       char    rxbuf[HAMMER2_MSGBUF_SIZE];     /* for ioq_rx only */
+};
+
+typedef struct hammer2_iocom hammer2_iocom_t;
+
+#define HAMMER2_IOCOMF_EOF     0x00000001      /* EOF or ERROR on desc */
+#define HAMMER2_IOCOMF_RREQ    0x00000002      /* request read-data event */
+#define HAMMER2_IOCOMF_WREQ    0x00000004      /* request write-avail event */
+#define HAMMER2_IOCOMF_WIDLE   0x00000008      /* tx queue empty, nothing to write */
+#define HAMMER2_IOCOMF_SIGNAL  0x00000010
+
+/***************************************************************************
+ *                             HIGH LEVEL MESSAGING                       *
+ ***************************************************************************
+ *
+ */
+
+#if 0
+
+
+
+/*
+ * The global registration structure consolidates information accumulated
+ * via the spanning tree algorithm and tells us which connection (link)
+ * is the best path to get to any given registration.
+ *
+ * glob_node   - Splay entry for this registration in the global index
+ *               of all registrations.
+ *
+ * glob_entry  - tailq entry when this registration's best_span element
+ *               has changed state.
+ *
+ * span_list   - Head of a simple list of spanning tree entries which
+ *               we use to determine the best link.
+ *
+ * best_span   - Which of the span structure on span_list is the best
+ *               one.
+ *
+ * source_root - Splay tree root indexing all messages sent from this
+ *               registration.  The messages are indexed by
+ *               {linkid,msgid} XXX
+ *
+ * target_root - Splay tree root indexing all messages being sent to
+ *               this registration.  The messages are indexed by
+ *               {linkid,msgid}. XXX
+ *
+ *
+ * Whenever spanning tree data causes a registration's best_span field to
+ * change that registration is transmitted as spanning tree data to every
+ * active link.  Note that pure clients to the cluster, of which there can
+ * be millions, typically do not transmit spanning tree data to each other.
+ *
+ * Each registration is assigned a unique linkid local to the node (another
+ * node might assign a different linkid to the same registration).  This
+ * linkid must be persistent as long as messages are active and is used
+ * to identify the message source and target.
+ */
+/* List head type for the per-registration list of hammer2_span entries. */
+TAILQ_HEAD(hammer2_span_list, hammer2_span);
+typedef struct hammer2_span_list hammer2_span_list_t;
+
+/*
+ * Global registration record.  This definition sits inside an #if 0
+ * region and is not compiled.
+ */
+struct hammer2_reg {
+       SPLAY_ENTRY(hammer2_reg) glob_node;     /* index of registrations */
+       TAILQ_ENTRY(hammer2_reg) glob_entry;    /* when modified */
+       hammer2_span_list_t     span_list;      /* list of hammer2_span's */
+       hammer2_span_t          *best_span;     /* best span entry */
+       hammer2_pmsg_splay_head_t source_root;  /* msgs sent from reg */
+       hammer2_pmsg_splay_head_t target_root;  /* msgs sent to reg */
+       uuid_t  pfs_id;                         /* key field */
+       uuid_t  pfs_fsid;                       /* key field */
+       uint32_t linkid;                        /* node-local id for this registration */
+       int     flags;                          /* presumably HAMMER2_PROTO_REGF_* - confirm */
+       int     refs;                           /* presumably a reference count - confirm */
+};
+
+/*
+ * NOTE(review): presumably set while the registration is queued via
+ * glob_entry because its best_span changed - confirm.
+ */
+#define HAMMER2_PROTO_REGF_MODIFIED    0x0001
+
+/*
+ * Each link (connection) collects spanning tree data received via the
+ * link and stores it in these span structures.
+ */
+struct hammer2_span {
+       TAILQ_ENTRY(hammer2_span)       span_entry;     /* from hammer2_reg */
+       SPLAY_ENTRY(hammer2_span)       span_node;      /* from hammer2_link */
+       hammer2_reg_t                   *reg;           /* registration whose span_list holds this entry */
+       hammer2_link_t                  *link;          /* link the span data was received via */
+       int                             weight;         /* presumably path weight used to pick best_span - confirm */
+};
+
+/*
+ * Most hammer2 messages represent transactions and have persistent state
+ * which must be recorded.  Some messages, such as cache states and inode
+ * representations are very long-lasting transactions.
+ *
+ * Each node in the graph must keep track of the message state in order
+ * to perform the proper action when a connection is lost.  To do this
+ * the message is indexed on the source and target (global) registration,
+ * and the actual span element the message was received on and transmitted
+ * to is recorded (allowing us to retrieve the physical links involved).
+ *
+ * The {source_reg, target_reg, msgid} uniquely identifies a message.  Any
+ * streaming operations using the same msgid use the same rendezvous.
+ *
+ * It is important to note that recorded state must use the same physical
+ * link (and thus the same chain of links across the graph) as was 'forged'
+ * by the initial message for that msgid.  If the source span a message is
+ * received on does not match the recorded source, or the recorded target
+ * is no longer routeable, the message will be returned or generate an ABORT
+ * with LINKFAIL as appropriate.
+ */
+/*
+ * Persistent message state record.  This definition sits inside an
+ * #if 0 region and is not compiled.
+ */
+struct hammer2_pmsg {
+       SPLAY_ENTRY(hammer2_pmsg) source_reg;   /* index on the source registration */
+       SPLAY_ENTRY(hammer2_pmsg) target_reg;   /* index on the target registration */
+       hammer2_span_t  *source;                /* span the message was received on */
+       hammer2_span_t  *target;                /* span the message was transmitted to */
+       /*
+        * NOTE(review): 16 bits here, but hammer2_msg_hdr.msgid is a
+        * uint32_t - confirm the intended width before enabling this code.
+        */
+       uint16_t        msgid;
+       void            *aux_data;              /* allocated aux data */
+       hammer2_msg_any_t any;                  /* dynamically allocated */
+};
+
+#endif
index 751bf8a..23935f6 100644 (file)
@@ -62,8 +62,13 @@ hammer2_ioctl_handle(const char *sel_path)
        return (fd);
 }
 
+/*
+ * Execute the specified function as a detached independent process/daemon,
+ * unless we are in debug mode.  If we are in debug mode the function is
+ * executed as a pthread in the current process.
+ */
 void
-hammer2_disconnect(void *(*func)(void *), void *arg)
+hammer2_demon(void *(*func)(void *), void *arg)
 {
        pthread_t thread = NULL;
        pid_t pid;
@@ -133,3 +138,24 @@ hammer2_disconnect(void *(*func)(void *), void *arg)
        pthread_exit(NULL);
        _exit(2);       /* NOT REACHED */
 }
+
+/*
+ * Byte-swap every field of the core message header in place.  Only the
+ * core header (hammer2_msg_hdr) is converted; any extended header that
+ * follows it is left as-is.
+ */
+void
+hammer2_bswap_head(hammer2_msg_hdr_t *head)
+{
+       /* 16-bit fields */
+       head->magic = bswap16(head->magic);
+       head->icrc1 = bswap16(head->icrc1);
+       head->source = bswap16(head->source);
+       head->target = bswap16(head->target);
+       head->error = bswap16(head->error);
+       head->resv05 = bswap16(head->resv05);
+       head->icrc2 = bswap16(head->icrc2);
+       head->aux_bytes = bswap16(head->aux_bytes);
+
+       /* 32-bit fields */
+       head->salt = bswap32(head->salt);
+       head->msgid = bswap32(head->msgid);
+       head->cmd = bswap32(head->cmd);
+       head->aux_icrc = bswap32(head->aux_icrc);
+}
diff --git a/sys/vfs/hammer2/hammer2_network.h b/sys/vfs/hammer2/hammer2_network.h
new file mode 100644 (file)
index 0000000..5045857
--- /dev/null
@@ -0,0 +1,457 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#ifndef VFS_HAMMER2_NETWORK_H_
+#define VFS_HAMMER2_NETWORK_H_
+
+#ifndef _VFS_HAMMER2_DISK_H_
+#include "hammer2_disk.h"
+#endif
+
+/*
+ * Mesh network protocol structures.
+ *
+ * The mesh is constructed from point-to-point streaming links with varying
+ * levels of interconnectedness, forming a graph.  When a link is established
+ * link id #0 is reserved for link-level communications.  This link is used
+ * for authentication, registration, ping, further link id negotiations,
+ * spanning tree, and so on.
+ *
+ * The spanning tree forms a weighted shortest-path-first graph amongst
+ * those nodes with sufficient administrative rights to relay between
+ * registrations.  Each link maintains a full reachability set, aggregates
+ * it, and retransmits via the shortest path.  However, leaf nodes (even leaf
+ * nodes with multiple connections) can opt not to be part of the spanning
+ * tree and typically (due to administrative rights) their registrations
+ * are not reported to other leafs.
+ *
+ * All message responses follow the SAME PATH that the original message
+ * followed, but in reverse.  This is an absolute requirement since messages
+ * expecting replies record persistent state at each hop.
+ *
+ * Message state is handled by the CREATE, DELETE, REPLY, and ABORT
+ * flags.  Message state is typically recorded at the end points and
+ * at each hop until a DELETE is received from both sides.
+ *
+ * One-way messages such as those used by spanning tree commands are not
+ * recorded.  These are sent with no flags set.  Aborts and replies are not
+ * possible.
+ *
+ * A normal message with no persistent state is sent with CREATE|DELETE and
+ * the response is returned with REPLY|CREATE|DELETE.  A normal command can
+ * be aborted by sending an ABORT message to the msgid that is in progress.
+ * An ABORT sent by the originator must still wait for the reply from the
+ * target and, since we've already sent the DELETE with our CREATE|DELETE,
+ * may also cross the REPLY|CREATE|DELETE message in the opposite direction.
+ * In this situation the message state has been destroyed on the target and
+ * the target ignores the ABORT (because CREATE is not set, and
+ * differentiated from one-way messages because ABORT is set).
+ *
+ * A command which has persistent state must maintain a persistent message.
+ * For example, a lock or cache state request.  A persistent message is
+ * initiated with just CREATE and the initial response is returned with
+ * REPLY|CREATE.  Successive messages are sent with no flags and responses
+ * with just REPLY.  The DELETE flag acts like a half-close (and degenerately
+ * works the same as it does for normal messages).  This flag can be set
+ * in the initial command or any successive command, and in the initial reply
+ * or in any successive reply.  The recorded message state is destroyed when
+ * both sides have sent a DELETE.
+ *
+ * Aborts for persistent messages work in the same fashion as they do for
+ * normal messages, except that the target can also initiate an ABORT
+ * by using ABORT|REPLY.  The target has one restriction, however, it cannot
+ * send an ABORT with the CREATE flag set (i.e. as the initial reply),
+ * because if the originator reuses the msgid the originator would not
+ * then be able to determine that the ABORT is associated with the previous
+ * session and not the new session.
+ *
+ * If a link failure occurs any active or persistent messages will be
+ * auto-replied to the originator, and auto-aborted to the target.
+ *
+ * Additional features:
+ *
+ *     ABORT+CREATE    - This may be used to make a non-blocking request.
+ *                       The target receives the normal command and is free
+ *                       to ignore the ABORT flag, but may use it as an
+ *                       indication that a non-blocking request is being
+ *                       made.  The target must still reply the message of
+ *                       course.  Works for normal and persistent messages
+ *                       but does NOT work for one-way messages (because
+ *                       ABORT alone without recorded msgid state has to be
+ *                       ignored).
+ *
+ *     ABORT           - ABORT messages are allowed to bypass input queues.
+ *                       Normal ABORTs are sent without the DELETE flag,
+ *                       even for normal messages which had already set the
+ *                       DELETE flag in the initial message.  This allows
+ *                       the normal DELETE half-close operation to proceed
+ *                       so an ABORT is basically advisory and the originator
+ *                       must still wait for a reply.  Aborts are also
+ *                       advisory when sent by targets.
+ *
+ *                       ABORT messages cannot be used with one-way messages
+ *                       as this would cause such messages to be ignored.
+ *
+ *     ABORT+DELETE    - This is a special form of ABORT that allows the
+ *                       recorded message state on the sender and on all
+ *                       hops the message is relayed through to be destroyed
+ *                       on the fly, as if a two-way DELETE had occurred.
+ *                       It will cause an auto-reply or auto-abort to be
+ *                       issued as if the link had been lost, but allows
+ *                       the link to remain live.
+ *
+ *                       This form is basically like a socket close(),
+ *                       where you aren't just sending an EOF but you are
+ *                       completely aborting the request in both directions.
+ *
+ *                       This form cannot be used with CREATE as that could
+ *                       generate a false reply if msgid is reused and
+ *                       crosses the abort over the wire.
+ *
+ *                       ABORT messages cannot be used with one-way messages
+ *                       as this would cause such messages to be ignored.
+ *
+ *     SUBSTREAMS      - Persistent messages coupled with the fact that
+ *                       all commands and responses run through a single
+ *                       chain of relays over reliable streams allows one
+ *                       to treat persistent message updates as a data
+ *                       stream and use the DELETE flag or an ABORT to
+ *                       indicate EOF.
+ *
+ *
+ *                     NEGOTIATION OF {source} AND {target}
+ *
+ * In this discussion 'originator' describes the original sender of a message
+ * and not the relays in between, while 'sender' describes the last relay.
+ * The two mean the same thing only when the originator IS the last relay.
+ *
+ * The {source} field is sender-localized.  The sender assigns this field
+ * based on which connection the message originally came from.  The initial
+ * message as sent by the originator sets source=0.  This also means that a
+ * leaf connection will always send messages with source=0.
+ *
+ * The {source} field must be re-localized at each hop, since messages
+ * coming from multiple connections to a node will use conflicting
+ * {source} values.  This can lead to linkid exhaustion which is discussed
+ * a few paragraphs down.
+ *
+ * The {target} field is sender-allocated.  Messages sent to {target} are
+ * preceded by a FORGE message to {target} which associates a registration
+ * with {target}, or UNFORGE to delete the association.
+ *
+ * The msgid field is 32 bits (remember some messages have long-lived
+ * persistent state so this is important!).  One-way messages always use
+ * msgid=0.
+ *
+ *                             LINKID EXHAUSTION
+ *
+ * Because {source} must be re-localized at each hop it is possible to run
+ * out of link identifiers.  At the same time we want to allow millions of
+ * client/leaf connections, and 'millions' is a lot bigger than 65535.
+ *
+ * We also have a problem with the persistent message state... If a single
+ * client's vnode cache has a million vnodes that can represent a million
+ * persistent cache states.  Multiply by a million clients and ... oops!
+ *
+ * To solve these problems leafs connect into protocol-aggregators rather
+ * than directly to the cluster.  The linkid and core message protocols only
+ * occur within the cluster and not by the leafs.  A leaf can still connect
+ * to multiple aggregators for redundancy if it desires but may have to
+ * pick and choose which inodes go where since acquiring a cache state lock
+ * over one connection will cause conflicts to be invalidated on the other.
+ * In other words, there are limitations to this approach.
+ *
+ * A protocol aggregator takes any number of connections and aggregates
+ * the operations down to a single linkid.  For example, this means that
+ * the protocol aggregator is responsible for maintaining all the cache
+ * state and performing crunches to reduce the overall amount of state
+ * down to something the cluster core can handle.
+ *
+ * --
+ *
+ * All message headers are 32-byte aligned and sized (all command and
+ * response structures must be 32-byte aligned), and all transports must
+ * support message headers up to HAMMER2_MSGHDR_MAX.  The msg structure
+ * can handle up to 8160 bytes but to keep things fairly clean we limit
+ * message headers to 2048 bytes.
+ *
+ * Any in-band data is padded to a 32-byte alignment and placed directly
+ * after the extended header (after the higher-level cmd/rep structure).
+ * The actual unaligned size of the in-band data is encoded in the aux_bytes
+ * field in this case.  Maximum data sizes are negotiated during registration.
+ *
+ * Use of out-of-band data must be negotiated.  In this case bit 31 of
+ * aux_bytes will be set and the remaining bits will contain information
+ * specific to the out-of-band transfer (such as DMA channel, slot, etc).
+ *
+ * (must be 32 bytes exactly to match the alignment requirement and to
+ *  support pad records in shared-memory FIFO schemes)
+ */
+/*
+ * Core message header.  The twelve fields below total exactly 32 bytes
+ * (three 8-byte groups of 2+2+4 and one of 4+2+2).
+ */
+struct hammer2_msg_hdr {
+       uint16_t        magic;          /* sanity, synchronization, endian */
+       uint16_t        icrc1;          /* core header crc, covers &salt onward */
+       uint32_t        salt;           /* random salt helps crypto/replay */
+
+       uint16_t        source;         /* source linkid */
+       uint16_t        target;         /* target linkid */
+       uint32_t        msgid;          /* message id */
+
+       uint32_t        cmd;            /* flags | proto | cmd | hdr_size / 32 */
+       uint16_t        error;          /* error field */
+       uint16_t        resv05;         /* reserved */
+
+       uint16_t        icrc2;          /* extended header crc (after base) */
+       uint16_t        aux_bytes;      /* aux data descriptor or size / 32 */
+       uint32_t        aux_icrc;       /* aux data iscsi crc */
+};
+
+typedef struct hammer2_msg_hdr hammer2_msg_hdr_t;
+
+/*
+ * MAGIC_REV is MAGIC with its two bytes exchanged.
+ *
+ * CRCOFF/CRCBYTES delimit the part of the core header covered by icrc1:
+ * everything from (salt) through the end of the structure.  The sender
+ * computes a 32-bit crc over that range and folds it to 16 bits by
+ * xor'ing the two halves together.
+ */
+#define HAMMER2_MSGHDR_MAGIC           0x4832
+#define HAMMER2_MSGHDR_MAGIC_REV       0x3248
+#define HAMMER2_MSGHDR_CRCOFF          offsetof(hammer2_msg_hdr_t, salt)
+#define HAMMER2_MSGHDR_CRCBYTES                (sizeof(hammer2_msg_hdr_t) -    \
+                                        HAMMER2_MSGHDR_CRCOFF)
+
+/*
+ * Administrative protocol limits.
+ *
+ * MSGBUF_SIZE works out to 8192 bytes, a power of 2, which is what makes
+ * MSGBUF_MASK (size - 1) usable as an index mask.  The userland
+ * hammer2_iocom rxbuf[] is declared with MSGBUF_SIZE bytes.
+ */
+#define HAMMER2_MSGHDR_MAX             2048    /* msg struct max is 8192-32 */
+#define HAMMER2_MSGAUX_MAX             65536   /* msg struct max is 2MB-32 */
+#define HAMMER2_MSGBUF_SIZE            (HAMMER2_MSGHDR_MAX * 4)
+#define HAMMER2_MSGBUF_MASK            (HAMMER2_MSGBUF_SIZE - 1)
+
+/*
+ * The message (cmd) field also encodes various flags and the total size
+ * of the message header.  This allows the protocol processors to validate
+ * persistency and structural settings for every command simply by
+ * switch()ing on the (cmd) field.
+ *
+ * Layout of (cmd), per the masks below:
+ *
+ *     bits 31-24      flags                   (HAMMER2_MSGF_FLAGS)
+ *     bits 23-20      protocol                (HAMMER2_MSGF_PROTOS)
+ *     bits 19-8       command                 (HAMMER2_MSGF_CMDS)
+ *     bits 7-0        header size in units of HAMMER2_MSG_ALIGN bytes
+ *                                             (HAMMER2_MSGF_SIZE)
+ */
+#define HAMMER2_MSGF_CREATE            0x80000000U     /* msg start */
+#define HAMMER2_MSGF_DELETE            0x40000000U     /* msg end */
+#define HAMMER2_MSGF_REPLY             0x20000000U     /* reply path */
+#define HAMMER2_MSGF_ABORT             0x10000000U     /* abort req */
+#define HAMMER2_MSGF_AUXOOB            0x08000000U     /* aux-data is OOB */
+#define HAMMER2_MSGF_FLAG2             0x04000000U
+#define HAMMER2_MSGF_FLAG1             0x02000000U
+#define HAMMER2_MSGF_FLAG0             0x01000000U
+
+#define HAMMER2_MSGF_FLAGS             0xFF000000U     /* all flags */
+#define HAMMER2_MSGF_PROTOS            0x00F00000U     /* all protos */
+#define HAMMER2_MSGF_CMDS              0x000FFF00U     /* all cmds */
+#define HAMMER2_MSGF_SIZE              0x000000FFU     /* N*32 */
+
+/*
+ * Protocol, command and size fields plus the REPLY flag; every other
+ * flag bit is excluded.
+ */
+#define HAMMER2_MSGF_CMDSWMASK         (HAMMER2_MSGF_CMDS |    \
+                                        HAMMER2_MSGF_SIZE |    \
+                                        HAMMER2_MSGF_PROTOS |  \
+                                        HAMMER2_MSGF_REPLY)
+
+/* Protocol ids, already positioned in the HAMMER2_MSGF_PROTOS field. */
+#define HAMMER2_MSG_PROTO_LNK          0x00000000U
+#define HAMMER2_MSG_PROTO_DBG          0x00100000U
+#define HAMMER2_MSG_PROTO_CAC          0x00200000U
+#define HAMMER2_MSG_PROTO_QRM          0x00300000U
+#define HAMMER2_MSG_PROTO_BLK          0x00400000U
+#define HAMMER2_MSG_PROTO_VOP          0x00500000U
+
+/*
+ * Message command constructors, sans flags
+ *
+ * HAMMER2_MSG_DOALIGN(bytes) rounds (bytes) up to the next multiple of
+ * HAMMER2_MSG_ALIGN.
+ *
+ * HAMMER2_MSG_HDR_ENCODE(elm) takes a struct tag (it expands to
+ * sizeof(struct elm)) and yields that structure's size in units of
+ * HAMMER2_MSG_ALIGN bytes, rounded up.
+ *
+ * Each HAMMER2_MSG_<proto>(cmd, elm) constructor or's together the
+ * protocol id, (cmd) shifted left by 8 into the command field, and the
+ * encoded header size.  Nothing here range-checks (cmd) or the encoded
+ * size against their field widths.
+ */
+#define HAMMER2_MSG_ALIGN              32
+#define HAMMER2_MSG_ALIGNMASK          (HAMMER2_MSG_ALIGN - 1)
+#define HAMMER2_MSG_DOALIGN(bytes)     (((bytes) + HAMMER2_MSG_ALIGNMASK) & \
+                                        ~HAMMER2_MSG_ALIGNMASK)
+#define HAMMER2_MSG_HDR_ENCODE(elm)    ((sizeof(struct elm) +          \
+                                         HAMMER2_MSG_ALIGNMASK) /      \
+                                        HAMMER2_MSG_ALIGN)
+
+#define HAMMER2_MSG_LNK(cmd, elm)      (HAMMER2_MSG_PROTO_LNK |        \
+                                        ((cmd) << 8) |                 \
+                                        HAMMER2_MSG_HDR_ENCODE(elm))
+
+#define HAMMER2_MSG_DBG(cmd, elm)      (HAMMER2_MSG_PROTO_DBG |        \
+                                        ((cmd) << 8) |                 \
+                                        HAMMER2_MSG_HDR_ENCODE(elm))
+
+#define HAMMER2_MSG_CAC(cmd, elm)      (HAMMER2_MSG_PROTO_CAC |        \
+                                        ((cmd) << 8) |                 \
+                                        HAMMER2_MSG_HDR_ENCODE(elm))
+
+#define HAMMER2_MSG_QRM(cmd, elm)      (HAMMER2_MSG_PROTO_QRM |        \
+                                        ((cmd) << 8) |                 \
+                                        HAMMER2_MSG_HDR_ENCODE(elm))
+
+#define HAMMER2_MSG_BLK(cmd, elm)      (HAMMER2_MSG_PROTO_BLK |        \
+                                        ((cmd) << 8) |                 \
+                                        HAMMER2_MSG_HDR_ENCODE(elm))
+
+#define HAMMER2_MSG_VOP(cmd, elm)      (HAMMER2_MSG_PROTO_VOP |        \
+                                        ((cmd) << 8) |                 \
+                                        HAMMER2_MSG_HDR_ENCODE(elm))
+
+/*
+ * Link layer ops basically talk to just the other side of a direct
+ * connection.
+ *
+ * PAD         - One-way message on link-0, ignored by target.  Used to
+ *               pad message buffers on shared-memory transports.  Not
+ *               typically used with TCP.
+ *
+ * AUTHn       - Authenticate the connection, negotiate administrative
+ *               rights & encryption, protocol class, etc.  Only PAD and
+ *               AUTH messages (not even PING) are accepted until
+ *               authentication is complete.  This message also identifies
+ *               the host.
+ *
+ * PING                - One-way message on link-0, keep-alive, run by both sides
+ *               typically 1/sec on idle link, link is lost after 10 seconds
+ *               of inactivity.
+ *
+ * HSPAN       - One-way message on link-0, host-spanning tree message.
+ *               Connection and authentication status is propagated using
+ *               these messages on a per-connection basis.  Works like SPAN
+ *               but is only used for general status.  See the hammer2
+ *               'rinfo' command.
+ *
+ * SPAN                - One-way message on link-0, spanning tree message adds,
+ *               drops, or updates a remote registration.  Sent by both
+ *               sides, delta changes only.  Visibility into remote
+ *               registrations may be limited and received registrations
+ *               may be filtered depending on administrative controls.
+ *
+ *               A multiply-connected node maintains SPAN information on
+ *               each link independently and then retransmits an aggregation
+ *               of the shortest-weighted path for each registration to
+ *               all links when a received change adjusts the path.
+ *
+ *               The leaf protocol also uses this to make a PFS available
+ *               to the cluster (e.g. on-mount).
+ *
+ * ERROR       - NOTE(review): defined below with the highest command
+ *               number (0xFFF) and a bare message header, but its
+ *               semantics are not described here -- TODO document.
+ *
+ * PAD, PING and ERROR are sized from the bare message header; AUTH,
+ * HSPAN and SPAN are each sized from their own structure.
+ */
+#define HAMMER2_LNK_PAD                HAMMER2_MSG_LNK(0x000, hammer2_msg_hdr)
+#define HAMMER2_LNK_PING       HAMMER2_MSG_LNK(0x001, hammer2_msg_hdr)
+#define HAMMER2_LNK_AUTH       HAMMER2_MSG_LNK(0x010, hammer2_lnk_auth)
+#define HAMMER2_LNK_HSPAN      HAMMER2_MSG_LNK(0x011, hammer2_lnk_hspan)
+#define HAMMER2_LNK_SPAN       HAMMER2_MSG_LNK(0x012, hammer2_lnk_span)
+#define HAMMER2_LNK_ERROR      HAMMER2_MSG_LNK(0xFFF, hammer2_msg_hdr)
+
+/*
+ * Debug layer ops operate on any link
+ *
+ * SHELL       - Persist stream, access the debug shell on the target
+ *               registration.  Multiple shells can be operational.
+ *
+ *               The structure defined here consists of the message
+ *               header only; it adds no fields of its own.
+ */
+#define HAMMER2_DBG_SHELL      HAMMER2_MSG_DBG(0x001, hammer2_dbg_shell)
+
+struct hammer2_dbg_shell {
+       hammer2_msg_hdr_t       head;   /* message header, sole member */
+};
+typedef struct hammer2_dbg_shell hammer2_dbg_shell_t;
+
+/*
+ * Cache layer ops operate on any link, link-0 may be used when the
+ * directly connected target is the desired registration.
+ *
+ * LOCK                - Persist state, blockable, abortable.
+ *
+ *               Obtain cache state (MODIFIED, EXCLUSIVE, SHARED, or INVAL)
+ *               in any of four domains (TREE, INUM, ATTR, DIRENT) for a
+ *               particular key relative to cache state already owned.
+ *
+ *               TREE - Affects the entire sub-tree at the specified element
+ *                      and will cause existing cache state owned by
+ *                      other nodes to be adjusted such that the request
+ *                      can be granted.
+ *
+ *               INUM - Only affects inode creation/deletion of an existing
+ *                      element or a new element, by inumber and/or name.
+ *                      Typically can be held for very long periods of time
+ *                      (think the vnode cache), directly relates to
+ *                      hammer2_chain structures representing inodes.
+ *
+ *               ATTR - Only affects an inode's attributes, such as
+ *                      ownership, modes, etc.  Used for lookups, chdir,
+ *                      open, etc.  mtime has no effect.
+ *
+ *               DIRENT - Only affects an inode's attributes plus the
+ *                      attributes or names related to any directory entry
+ *                      directly under this inode (non-recursively).  Can
+ *                      be retained for medium periods of time when doing
+ *                      directory scans.
+ *
+ *               This function may block and can be aborted.  You may be
+ *               granted cache state that is more broad than the state you
+ *               requested (e.g. a different set of domains and/or an element
+ *               at a higher layer in the tree).  When quorum operations
+ *               are used you may have to reconcile these grants to the
+ *               lowest common denominator.
+ *
+ *               In order to grant your request either you or the target
+ *               (or both) may have to obtain a quorum agreement.  Deadlock
+ *               resolution may be required.  When doing it yourself you
+ *               will typically maintain an active message to each master
+ *               node in the system.  You can only grant the cache state
+ *               when a quorum of nodes agree.
+ *
+ *               The cache state includes transaction id information which
+ *               can be used to resolve data requests.
+ *
+ *               NOTE(review): the command word is sized from
+ *               struct hammer2_cac_lock, which is not declared next to
+ *               this definition -- confirm it exists before the macro
+ *               is used.
+ */
+#define HAMMER2_CAC_LOCK       HAMMER2_MSG_CAC(0x001, hammer2_cac_lock)
+
+/*
+ * Quorum layer ops operate on any link, link-0 may be used when the
+ * directly connected target is the desired registration.
+ *
+ * COMMIT      - Persist state, blockable, abortable
+ *
+ *               Issue a COMMIT in two phases.  A quorum must acknowledge
+ *               the operation to proceed to phase-2.  Message-update to
+ *               proceed to phase-2.
+ *
+ *               NOTE(review): the command word is sized from
+ *               struct hammer2_qrm_commit, which is not declared next to
+ *               this definition -- confirm it exists before the macro
+ *               is used.
+ */
+#define HAMMER2_QRM_COMMIT     HAMMER2_MSG_QRM(0x001, hammer2_qrm_commit)
+
+/*
+ * General message errors
+ *
+ *     0x00 - 0x1F     Local iocomm errors
+ *     0x20 - 0x2F     Global errors
+ *
+ * HAMMER2_MSG_ERR_UNKNOWN is the first code in the global range.
+ */
+#define HAMMER2_MSG_ERR_UNKNOWN                0x20
+
+union hammer2_any {
+       char                    buf[HAMMER2_MSGHDR_MAX];        /* raw byte view */
+       hammer2_msg_hdr_t       head;   /* same storage viewed as a header */
+};
+
+typedef union hammer2_any hammer2_any_t;
+
+#endif